## Goal of the project

The goal was to test the Python package pgmpy. The objective was to build a model that predicts how likely a given project is to be fully funded (reaching at least 100% of its funding goal), given only information that was available at the moment the project was created.


In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import glob
import re

# Directory holding the raw CSV exports.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# other cells may read it.
dir = '../input/'

# Read every CSV once and concatenate in a single call: growing a DataFrame
# inside the loop re-copies all previously loaded rows on each iteration.
# Sorting the paths makes the row order (and therefore the seeded train/test
# mask built later from row positions) independent of the directory listing.
csv_paths = sorted(glob.glob(dir + '*.csv'))
frames = [pd.read_csv(f) for f in csv_paths]
data = pd.concat(frames) if frames else pd.DataFrame()

# Columns that are dropped before any analysis.
useless_columns = ["id", "url", "category_url", "igg_image_url", "compressed_image_url", "card_type",
                   "category_slug", "source_url", "friend_team_members", "friend_contributors",
                   "partner_name", "in_forever_funding"]
data = data.drop(useless_columns, axis=1)

# Dropped as a leak column (presumably a rounded version of the funding
# percentage that defines the target -- confirm against the dataset).
leak_column = ['nearest_five_percent']
data = data.drop(leak_column, axis=1)

# Keep only rows whose amt_time_left is exactly 'No time left', then drop the
# now-constant column.
data = data[data.amt_time_left == 'No time left']
data = data.drop('amt_time_left', axis=1)

def Remove_Non_Numeric(column):
    """Return the string form of `column` with every non-digit character removed.

    The value is converted with str() first, so non-string inputs are accepted;
    anything without digits yields an empty string.
    """
    digits = re.findall(r"\d", str(column))
    return "".join(digits)

# Reduce both money/percentage columns to their digit characters only.
data['balance'] = data['balance'].apply(Remove_Non_Numeric)
data['collected_percentage'] = data['collected_percentage'].apply(Remove_Non_Numeric)

# Discard rows whose percentage is blank after stripping, then convert the
# remaining digit strings to floats.
percentage_text = data['collected_percentage'].values
has_percentage = (percentage_text != '') & (percentage_text != ' ')
data = data[has_percentage]
data['collected_percentage'] = data['collected_percentage'].apply(float)

def Clean_Funding(column):
    """Map a textual flag to a number, ignoring case.

    Returns 1 when the text contains "true", -1 when it contains "false"
    (and not "true"), and 0 otherwise.
    """
    lowered = column.lower()
    if "true" in lowered:
        return 1
    if "false" in lowered:
        return -1
    return 0
    
import re
from nltk.corpus import stopwords

# Holds the English stop-word set once it has been built, so the word list is
# not fetched and converted to a set again for every title.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, building it on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def clean_text(text):
    """Normalise a title for bag-of-words counting.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    removed, and the remaining words are joined with single spaces.

    Args:
        text: The raw title as a string.

    Returns:
        The cleaned title; an empty string when no word survives.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    stops = _english_stopwords()
    meaningful_words = [w for w in letters_only.lower().split() if w not in stops]
    return " ".join(meaningful_words)

# Clean every title; str() comes first so non-string values are accepted by clean_text.
new_titles = data.title.map(str).map(clean_text)
data.title = new_titles

# Remove rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()

# Target flag: 1 when the campaign collected at least 100% of its goal, else 0.
data['suc'] = (data.collected_percentage >= 100).astype('int64')

In [None]:
# Number of rows and columns in `data` at this point.
data.shape

In [None]:
# Preview the first ten rows.
data.head(10)

In [None]:
# Percentage of rows with suc == 1, and its complement.
success_rate = data.suc.sum()/data.suc.count()*100

print('Success rate')
print(success_rate)
print('')
print('Failure rate')
print(100 - success_rate)

In [None]:
# List the distinct currency codes present in the data.
print(data.currency_code.unique())

In [None]:
# Distinct category names, followed by how many there are.
category_names = data.category_name.unique()
print(category_names)
print(len(category_names))

In [None]:
# Project counts per currency, split by the `suc` flag.
ax_cur = sns.countplot(y="currency_code", hue="suc", data=data)

In [None]:
# Project counts per category, split by the `suc` flag, drawn on an
# 11.7 x 8.27 inch figure.
a4_dims = (11.7, 8.27)
fig, ax_cat = plt.subplots(figsize=a4_dims)
sns.countplot(
    y="category_name",
    hue="suc",
    data=data,
    ax=ax_cat,
)

In [None]:
# The ten most frequent categories with their row counts.
data['category_name'].value_counts().head(10)

In [None]:
# Same count plot as for all categories, restricted to the ten most frequent ones.
top_cat = data['category_name'].value_counts().head(10).index
top_cat_rows = data[data.category_name.isin(top_cat)]

a4_dims = (11.7, 8.27)
fig, ax_cat = plt.subplots(figsize=a4_dims)
sns.countplot(
    y="category_name",
    hue="suc",
    data=top_cat_rows,
    ax=ax_cat,
)

In [None]:
# Target share of rows assigned to the training set.
trainRatio = 0.75
# Fixed seed so the random split below is repeatable.
np.random.seed(1230)
# Boolean mask with one entry per row of `data` (by position): True marks a
# training row. Each row is drawn independently, so the training share is only
# approximately trainRatio.
trainIdx = np.random.rand(len(data)) < trainRatio

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-level bag-of-words counter. No custom tokenizer, preprocessor or
# stop-word list is supplied (titles are cleaned beforehand), and the
# vocabulary is capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [None]:
# Fit the vocabulary on the training rows only and build their title/word count matrix.
titles = vectorizer.fit_transform(data[trainIdx].title)

In [None]:
# Vocabulary learned from the training titles, in matrix-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version has it.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Total occurrences of each word over all training titles (column sums).
counts = np.sum(titles, axis=0)

# `columns` is given as a list: a set has no defined order and is rejected by
# recent pandas versions.
Word_Count = pd.DataFrame(counts.transpose(), columns=['word_counts'])
Word_Count['Word'] = words
Word_Count = Word_Count.sort_values(by="word_counts", ascending=False)
#Word_Count = Word_Count.set_index('Word')

In [None]:
# Training rows that reached (>= 100%) / missed (< 100%) their funding goal.
reached_goal = data.collected_percentage >= 100
missed_goal = data.collected_percentage < 100
success_projectIdx = np.logical_and(trainIdx, reached_goal)
failed_projectIdx = np.logical_and(trainIdx, missed_goal)

In [None]:
# Count the training vocabulary (`words`) inside the successful training
# titles. A separate vectorizer with a fixed vocabulary is used on purpose:
# refitting the shared `vectorizer` with max_features=5000 on this subset
# would select a different top-5000 vocabulary, so training words outside it
# would come out of the later merge as NaN, and the refit would also overwrite
# the vocabulary fitted on the full training set.
success_vectorizer = CountVectorizer(analyzer="word", vocabulary=list(words))
success_titles = success_vectorizer.fit_transform(data[success_projectIdx].title)
# With a fixed vocabulary the matrix columns follow the order of `words`.
success_words = list(words)
counts = np.sum(success_titles, axis=0)

# `columns` is a list: a set has no defined order and is rejected by recent pandas.
success_Word_Count = pd.DataFrame(counts.transpose(), columns=['suc_counts'])
success_Word_Count['Word'] = success_words
success_Word_Count = success_Word_Count.sort_values(by="suc_counts", ascending=False)


In [None]:
# Count the training vocabulary (`words`) inside the failed training titles.
# (Equivalently this could be derived as total count minus success count.)
# A separate vectorizer with a fixed vocabulary is used on purpose: refitting
# the shared `vectorizer` with max_features=5000 on this subset would select a
# different top-5000 vocabulary, so training words outside it would come out
# of the later merge as NaN.
failed_vectorizer = CountVectorizer(analyzer="word", vocabulary=list(words))
failed_titles = failed_vectorizer.fit_transform(data[failed_projectIdx].title)
# With a fixed vocabulary the matrix columns follow the order of `words`.
failed_words = list(words)
counts = np.sum(failed_titles, axis=0)

# `columns` is a list: a set has no defined order and is rejected by recent pandas.
failed_Word_Count = pd.DataFrame(counts.transpose(), columns=['failed_counts'])
failed_Word_Count['Word'] = failed_words
failed_Word_Count = failed_Word_Count.sort_values(by="failed_counts", ascending=False)

In [None]:
# Attach the per-class counts to the overall table, keeping every word of the
# overall table (left joins on "Word"), then order by total count.
Word_Count = (
    Word_Count
    .merge(failed_Word_Count, how='left', on="Word")
    .merge(success_Word_Count, how='left', on="Word")
    .sort_values(by="word_counts", ascending=False)
)

In [None]:
The first thing I tried was building a model based on the most popular words, but these turned out not to be informative enough to build a good model. For curiosity's sake, I left in the list of words that appear in more than 1% of the project titles, together with their success rates.

In [None]:
# A word that is absent from one of the per-class tables comes out of the left
# merges as NaN; treat that as zero occurrences so the rate stays defined.
count_columns = ["suc_counts", "failed_counts"]
Word_Count[count_columns] = Word_Count[count_columns].fillna(0)

# Raw score: (successes - failures) / total occurrences.
Word_Count['suc_rate'] = (Word_Count["suc_counts"] - Word_Count["failed_counts"])/ Word_Count['word_counts']

# Min-max rescale to [0, 1], then stretch to [-1, 1]. Series.min()/max() are
# used because the builtin min()/max() give order-dependent results as soon as
# a NaN is present.
lowest_rate = Word_Count['suc_rate'].min()
highest_rate = Word_Count['suc_rate'].max()
Word_Count['suc_rate'] = (Word_Count['suc_rate'] - lowest_rate)/(highest_rate - lowest_rate)
Word_Count['suc_rate'] = (Word_Count['suc_rate'] - 0.5)*2
Word_Count = Word_Count.sort_values(by="suc_rate", ascending=False)

# Keep words whose total count exceeds 1% of the number of training rows.
relevant_word_count = Word_Count[Word_Count.word_counts > trainIdx.sum()/100]
print(relevant_word_count)

In [None]:
# Testing how pgmpy performs

In [None]:
Quick reminder of what the data looks like

In [None]:
# Preview the first rows of `data`.
data.head()

As a way to test pgmpy, this was okay. pgmpy seems able to handle up to 10 features with acceptable performance (most of them binary), even though the training set was fairly large (about 150k rows). With more features, it quickly ran out of memory. This might be improved by changing the environment, but it is nevertheless a bad sign. Finally, it is not able to handle continuous variables for training, which is quite a drawback. As far as the model goes, pgmpy was able to come up with a decent model of which projects were more likely to fail given only the project title, the currency, and the category, without any major problems and without demanding a lot of time.

In [None]:
# Keep only words whose total count in the training titles is above 100,
# ordered from the highest to the lowest suc_rate, and show both ends.
is_frequent = Word_Count.word_counts > 100
non_specific_word_count = Word_Count[is_frequent].sort_values(by="suc_rate", ascending=False)
print(non_specific_word_count.head())
print(non_specific_word_count.tail())

# quidditch is actually a game from the harry potter series

In [None]:
We will now reduce this to "success clusters", i.e. groups of words in different success bands. Since we have around 1000 words, we will break them into groups of about 100 words.

In [None]:
dictionarySize = len(non_specific_word_count)
# Aim for six clusters, but never let the size drop to 0: with fewer than six
# words the integer division gives 0 and range() raises on a zero step.
clusterSize = max(1, dictionarySize // 6)

# Consecutive slices of the suc_rate-ordered table. When dictionarySize is not
# a multiple of clusterSize, a final shorter cluster holds the remainder.
wordCluster = [non_specific_word_count[start:start + clusterSize]
               for start in range(0, dictionarySize, clusterSize)]

In [None]:
from functools import partial

def matchCluster(cluster, title):
    """Return 1 if `title` shares at least one word with `cluster`, else 0.

    `cluster` must expose a `Word` column (read through `.Word.values`);
    `title` is split on single spaces and compared word by word.
    """
    title_words = title.split(' ')
    cluster_words = set(cluster.Word.values)
    return 0 if cluster_words.isdisjoint(title_words) else 1

# One binary column per word cluster: 1 when the title contains any word of
# that cluster, 0 otherwise.
for i, cluster in enumerate(wordCluster):
    matchClusterDF = partial(matchCluster, cluster)
    data['has_cluster_' + str(i)] = data.title.apply(matchClusterDF)

In [None]:
# First cluster: the head of the suc_rate-ordered word table.
wordCluster[0]

In [None]:
# Last cluster: the tail of the suc_rate-ordered word table.
wordCluster[-1]

In [None]:
from sklearn import preprocessing

# Replace the category and currency strings with integer codes. The fitted
# encoders are kept so the codes can be mapped back to the original labels.
le_cat = preprocessing.LabelEncoder()
data.category_name = le_cat.fit_transform(data.category_name)

le_cur = preprocessing.LabelEncoder()
data.currency_code = le_cur.fit_transform(data.currency_code)

In [None]:
import pgmpy

from pgmpy.estimators import ParameterEstimator, BayesianEstimator
from pgmpy.models import BayesianModel

# Edges of the network: every feature (currency, category and one indicator
# per word cluster) points directly at the target `suc`.
modelRelations = [
    ('currency_code', 'suc'),
    ('category_name', 'suc'),
]
modelRelations.extend(
    ('has_cluster_' + str(i), 'suc') for i in range(len(wordCluster))
)

model1 = BayesianModel(modelRelations)  # everything implies in suc

In [None]:
# Estimate the model parameters from the training rows only (selected by
# position through the boolean mask), using the Bayesian estimator with a
# BDeu prior.
model1.fit(data.iloc[trainIdx], estimator=BayesianEstimator, prior_type="BDeu", )

In [None]:
# creates evidence
def createsEvidence(df_point, Word_Count):
    """Build the evidence dictionary for a single row.

    Args:
        df_point: A row supporting lookup by column name.
        Word_Count: Accepted for call compatibility; not read here.

    Returns:
        A dict mapping 'currency_code', 'category_name' and one
        'has_cluster_<i>' key per entry of the global `wordCluster` to the
        row's value for that column.
    """
    feature_names = ['currency_code', 'category_name']
    feature_names += ['has_cluster_' + str(i) for i in range(len(wordCluster))]
    return {name: df_point[name] for name in feature_names}

In [None]:
from numpy import random
from sklearn import preprocessing

# Number of held-out rows to score.
sampleSize = 200

# Positions (not index labels) of the rows outside the training mask.
testPositions = np.where(~trainIdx)[0]
testSize = len(testPositions)

# Sample without replacement so that no row is scored and counted twice, and
# never ask for more rows than the held-out set contains.
testSample = random.choice(testSize, min(sampleSize, testSize), replace=False)
testIdx = testPositions[testSample]

In [None]:
from pgmpy.inference import VariableElimination

# The inference engine depends only on the fitted model, so build it once
# instead of once per scored row.
infer = VariableElimination(model1)

# Maps row position -> (probability taken from values[1] of the `suc` factor,
# actual `suc` flag of the row).
predictions = {}
for pred_idx in testIdx:
    row = data.iloc[pred_idx]
    evidence = createsEvidence(row, Word_Count)
    result = infer.query(['suc'], evidence = evidence)
    # NOTE(review): older pgmpy releases return a {variable: factor} mapping,
    # newer ones appear to return the factor itself -- both shapes are accepted
    # here; confirm against the installed version.
    factor = result['suc'] if isinstance(result, dict) else result
    predictions[pred_idx] = (factor.values[1], row.suc)

In [None]:
# Precision and recall for the "funded" class, using 0.5 as the decision
# threshold on the predicted probability.
totalTrue = 0        # rows whose actual flag is set
truePositive = 0     # rows predicted positive whose actual flag is set
totalPositive = 0    # rows predicted positive

for pred_idx in testIdx:
    prob_success, actual = predictions[pred_idx]
    predicted_success = prob_success > 0.5
    if actual:
        totalTrue += 1
        if predicted_success:
            truePositive += 1
    if predicted_success:
        totalPositive += 1

# precision = TP / predicted positives; recall = TP / actual positives.
# NaN is reported when a denominator is zero instead of raising.
precision = truePositive/totalPositive if totalPositive else float('nan')
recall = truePositive/totalTrue if totalTrue else float('nan')
print('')
print("precision: %.2f"%precision)
print("recall: %.2f"%recall)
     

In [None]:
# Precision and recall for the "not funded" class, using 0.5 as the decision
# threshold on the predicted probability.
totalFalse = 0       # rows whose actual flag is not set
trueNegative = 0     # rows predicted negative whose actual flag is not set
totalNegative = 0    # rows predicted negative

for pred_idx in testIdx:
    prob_success, actual = predictions[pred_idx]
    predicted_failure = prob_success < 0.5
    if not actual:
        totalFalse += 1
        if predicted_failure:
            trueNegative += 1
    if predicted_failure:
        totalNegative += 1

# precision = TN / predicted negatives; recall = TN / actual negatives.
# NaN is reported when a denominator is zero instead of raising.
precision = trueNegative/totalNegative if totalNegative else float('nan')
recall = trueNegative/totalFalse if totalFalse else float('nan')
print('')
print("precision: %.2f"%precision)
print("recall: %.2f"%recall)
        