In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 16, 8

from scipy.stats import norm
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier  

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('KS_train_data.csv', delimiter=',')
# df = pd.read_csv('KS_test_data.csv', delimiter=';')
# X = df.loc[:,'f1':'f100'].values
# y = [ bool(y) for y in df.loc[:,'loss'].values ]

In [11]:
df['delta_time_created'] = df.deadline - df.created_at
df['delta_time_launched'] = df.deadline - df.launched_at
df['delta_time_launched_days'] = df.delta_time_launched / 86400
df['delta_time_created_days'] = df.delta_time_launched / 86400
df['goal_converted_log'] = np.log(df.goal * df.fx_rate)
df['goal_per_day'] = df['goal_converted_log'] / df['delta_time_launched']
total_in_cat = {}
funded_in_cat = {}
rate_funded_cat = {}
for x in df.category.unique():
    total_in_cat[x] = df.loc[(df.category == x, 'project_id')].count()
    funded_in_cat[x] = df.loc[(df.category == x) & (df.funded == True), 'project_id'].count() 
    rate_funded_cat[x] = funded_in_cat[x] / total_in_cat[x]
df['rate_funded_cat'] = df.apply(lambda row: rate_funded_cat[row.category], axis=1)
df_dum = pd.get_dummies(df, columns=["category"], prefix=["cat_typ"] )
df.country = df.country.fillna('NA')
EU = ('GB', 'ES', 'FR', 'IT', 'NL', 'IS', 'CZ', 'FI', 'DE', 'IE', 'SJ', 'DK', 'SE', 'HU', 'NO', 'CY', 'CH', 'BE', 
          'LV', 'UA', 'AT', 'SI', 'LT', 'RO', 'RU', 'AX', 'MC', 'PT', 'GL', 'GR', 'SK', 'EE', 'BA', 'ME', 'LU', 'RS',
         'PL', 'MD', 'BG', 'HR', 'MK', 'BY', 'XK', 'FO', 'MT')
NA = ('US', 'CA', 'MX', 'CR', 'GT', 'HT', 'AG', 'JM', 'BZ', 'CU', 'SV', 'PR', 'PA', 'NI', 'DO', 'CW', 'VI', 'BB',
         'HN', 'LC', 'TT', 'BS', 'GP', 'VC', 'DM')
SA = ('AR', 'PE', 'SR', 'BR', 'BO', 'EC', 'CO', 'CL', 'VE', 'PY', 'GY', 'UY')
AF = ('KE', 'MW', 'ZA', 'RW', 'LR', 'EG', 'SN', 'NG', 'TZ', 'GH', 'GQ', 'ZM', 'MG', 'ET', 'MA', 'CD', 'BF', 'UG',
         'CI', 'DZ', 'ML', 'SD', 'ZW', 'CM', 'TN', 'NE', 'MZ', 'GN', 'SO', 'LY', 'DJ', 'GA', 'SS', 'GM', 'BJ', 'CF',
          'CG', 'NA')
AS = ('TH', 'ID', 'KH', 'IN', 'JP', 'TR', 'CN', 'MY', 'MN', 'IL', 'KR', 'PH', 'HK', 'SG', 'PS', 'TW', 'NP', 'IR',
         'QA', 'VN', 'IQ', 'AE', 'LK', 'GE', 'LB', 'AM', 'KZ', 'AF', 'KP', 'BD', 'PK', 'MM', 'BT', 'JO', 'MV', 'LA',
         'KW', 'SY', 'TJ', 'TL', 'YE', 'MO', 'KG')
AT = ('AQ')
OC = ('AU','NZ', 'PG', 'FJ', 'FM', 'CK', 'GU', 'NC', 'PF', 'VU' )
UNK = ('?')

def conditions(x):
    if x in EU:
        return "EU"
    elif x in NA:
        return "NA"
    elif x in SA:
        return "SA"
    elif x in AF:
        return "AF"
    elif x in AS:
        return "AS"
    elif x in AT:
        return "AT"
    elif x in OC:
        return "OC"
    else:
        return "UNK"

func = np.vectorize(conditions)
continents = func(df["country"])
df_dum["continents"] = continents
df_dum = pd.get_dummies(df_dum, columns=["continents"], prefix=["cont_type"] )
df_dum.columns



Index(['project_id', 'backers_count', 'blurb', 'converted_pledged_amount',
       'country', 'created_at', 'currency', 'deadline', 'fx_rate', 'goal',
       'launched_at', 'name', 'pledged', 'staff_pick', 'usd_pledged',
       'location', 'funded', 'subcategory', 'project_url', 'reward_url',
       'delta_time_created', 'delta_time_launched', 'delta_time_launched_days',
       'delta_time_created_days', 'goal_converted_log', 'goal_per_day',
       'rate_funded_cat', 'cont_type_AF', 'cont_type_AS', 'cont_type_AT',
       'cont_type_EU', 'cont_type_NA', 'cont_type_OC', 'cont_type_SA',
       'cont_type_UNK', 'cont_type_AF', 'cont_type_AS', 'cont_type_AT',
       'cont_type_EU', 'cont_type_NA', 'cont_type_OC', 'cont_type_SA',
       'cont_type_UNK', 'cat_typ_art', 'cat_typ_comics', 'cat_typ_crafts',
       'cat_typ_dance', 'cat_typ_design', 'cat_typ_fashion',
       'cat_typ_film & video', 'cat_typ_food', 'cat_typ_games',
       'cat_typ_journalism', 'cat_typ_music', 'cat_typ_photography'

In [12]:
cols = ['cont_type_AF', 'cont_type_AS', 'cont_type_AT', 'cont_type_EU',
       'cont_type_NA', 'cont_type_OC', 'cont_type_SA', 'cont_type_UNK',
       'cat_typ_art', 'cat_typ_comics', 'cat_typ_crafts',
       'cat_typ_dance', 'cat_typ_design', 'cat_typ_fashion',
       'cat_typ_film & video', 'cat_typ_food', 'cat_typ_games',
       'cat_typ_journalism', 'cat_typ_music', 'cat_typ_photography',
       'cat_typ_publishing', 'cat_typ_technology', 'cat_typ_theater', 
        'rate_funded_cat', 'delta_time_launched_days', 'goal_converted_log', 'staff_pick']
df_try = df_dum[cols]
df_try

Unnamed: 0,cont_type_AF,cont_type_AF.1,cont_type_AF.2,cont_type_AS,cont_type_AS.1,cont_type_AS.2,cont_type_AT,cont_type_AT.1,cont_type_AT.2,cont_type_EU,...,cat_typ_journalism,cat_typ_music,cat_typ_photography,cat_typ_publishing,cat_typ_technology,cat_typ_theater,rate_funded_cat,delta_time_launched_days,goal_converted_log,staff_pick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.558767,21.959491,7.937375,False
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.558767,40.000000,8.411833,False
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.558767,60.000000,8.517193,False
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.558767,21.000000,8.779557,False
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.558767,30.000000,9.615805,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.751055,30.000000,8.853665,False
99996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.751055,30.000000,10.126631,False
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.751055,36.687963,8.612503,True
99998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.751055,35.714074,10.126631,False


In [13]:
X = df_try
y = df['funded']

In [14]:
def contigency_matrix(true_y, predicted_y):
    # YOUR CODE HERE, Create TP, FP, TN, FN
    tp=fp=tn=fn=0
    for true, pred in zip(true_y, predicted_y):
        if pred == True:
            if pred == true:
                tp += 1
            else:
                fp += 1
        else:
            if pred == true:
                tn += 1
            else:
                fn += 1      
    matrix = np.array(([tp, fp], [tn, fn]))
    # Make sure your output fits the following format:
    # matrix = np.array(([TP, FP], [TN, FN]))
    return matrix

def accuracy(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fp+fn+tn == 0:
        return 0
    else:
        accuracy = (tp+tn)/(tp+fp+fn+tn)
        return accuracy
def precision(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fp == 0:
        return 0
    else:
        precision = tp/(tp+fp)
        return precision
def recall(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fn == 0:
        return 0
    else:
        recall = tp/(tp+fn)
        return recall
def f1(true_y, predicted_y):
    precision_v = precision(true_y, predicted_y)
    recall_v = recall(true_y, predicted_y)
    if precision_v+recall_v == 0:
        return 0
    else:
        f1 = 2*((precision_v*recall_v)/(precision_v+recall_v))
        return f1

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
y = np.array(y)
y = y.reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.1)
imp_median_X = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train)
X_train = imp_median_X.transform(X_train)
X_test = imp_median_X.transform(X_test)

imp_median_y = SimpleImputer(missing_values=np.nan, strategy='median').fit(y_train)
y_train = imp_median_y.transform(y_train)
y_test = imp_median_y.transform(y_test)

# fit scaler and scale features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train) 
X_test_scaled = scaler.transform(X_test)

    
def compute_scores(X_train,X_test,y_train,y_test, C):
    # fit logistic regression model
    logreg = LogisticRegression(C=C, solver='liblinear').fit(X_train,y_train.ravel())
    # predict y for train set
    pred_train = logreg.predict(X_train).tolist()
    # predict y for test set
    pred_test = logreg.predict(X_test).tolist()
            
    # calculate evaluation measures
    evaluation_measures = dict()
    evaluation_measures['accuracy_train'] = accuracy(y_train, pred_train)
    evaluation_measures['accuracy_test'] = accuracy(y_test, pred_test)
    
    evaluation_measures['precision_train'] = precision(y_train, pred_train)
    evaluation_measures['precision_test'] = precision(y_test, pred_test)
    
    evaluation_measures['recall_train'] = recall(y_train, pred_train)
    evaluation_measures['recall_test'] = recall(y_test, pred_test)
    
    evaluation_measures['f1_train'] = f1(y_train, pred_train)
    evaluation_measures['f1_test'] = f1(y_test, pred_test)
    
    return evaluation_measures
    
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
measures = pd.DataFrame()
for c in C:
    em = compute_scores(X_train_scaled,X_test_scaled,y_train,y_test, c)
    em = pd.Series(em)
    measures = measures.append(em, ignore_index=True)
measures.index = C
measures.index = measures.index.rename('C-value')
display(measures)



Unnamed: 0_level_0,accuracy_test,accuracy_train,f1_test,f1_train,precision_test,precision_train,recall_test,recall_train
C-value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0001,0.7029,0.704567,0.764337,0.762719,0.714307,0.718641,0.821904,0.812557
0.001,0.706,0.709256,0.766518,0.766159,0.717088,0.722774,0.823269,0.815086
0.01,0.7064,0.709211,0.766093,0.765831,0.718685,0.723267,0.820198,0.813717
0.1,0.7063,0.709233,0.766032,0.765819,0.718577,0.723337,0.820198,0.813603
1.0,0.7063,0.709222,0.766032,0.765817,0.718577,0.723317,0.820198,0.813622
10.0,0.7063,0.709222,0.766032,0.765817,0.718577,0.723317,0.820198,0.813622
100.0,0.7063,0.709222,0.766032,0.765817,0.718577,0.723317,0.820198,0.813622
1000.0,0.7063,0.709222,0.766032,0.765817,0.718577,0.723317,0.820198,0.813622


In [18]:
logreg = LogisticRegression(C=1, solver='liblinear').fit(X_train,y_train.ravel())
pred_train = logreg.predict(X_train).tolist()
contigency_matrix(y_train, pred_train)

array([[42805, 16694],
       [20744,  9757]])

In [20]:
import random

def upsample(y_train):
    # y_train is the 1d matrix of the labels in your training data, e.g.
    #       0     1     2     3     4   5     6     7     8   ... 
    # y = [True False False False True True False False False ... False]
    #
    # the function returns the position of the training data to be considered for the final training set.
    # e.g. if you decide from the True instances to select 0, 4 and 5, while from the False instances 1, 3, and 8
    # the outcome of the function will be [0, 1, 3, 4, 5, 8] (= sampled_indexes)
    falses = 0
    false_indexes = []
    true_indexes = []
    for index, value in enumerate(y_train):
        if value == False:
            falses += 1
            false_indexes.append(index)
        else:
            true_indexes.append(index)
    sampled_indexes = random.sample(true_indexes, falses) + false_indexes

    return sampled_indexes
    
def new_training_set(X_train, y_train, sampled_indexes):
    X_train_new = []
    y_train_new = []
    for index in sampled_indexes:
        X_train_new.append(X_train[index])
        y_train_new.append(y_train[index])
    return [np.array(X_train_new), np.array(y_train_new)]

In [22]:
sampled_indexes = upsample(y_train)
v = new_training_set(X_train_scaled, y_train, sampled_indexes)
X_train_up = v[0]
y_train_up = v[1]

C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
measures = pd.DataFrame()
for c in C:
    em = compute_scores(X_train_up,X_test_scaled,y_train_up,y_test, c)
    em = pd.Series(em)
    measures = measures.append(em, ignore_index=True)
measures.index = C
measures.index = measures.index.rename('C-value')
display(measures)


Unnamed: 0_level_0,accuracy_test,accuracy_train,f1_test,f1_train,precision_test,precision_train,recall_test,recall_train
C-value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0001,0.6895,0.685266,0.727225,0.689684,0.753779,0.680137,0.702478,0.699503
0.001,0.6903,0.688204,0.725468,0.689606,0.759325,0.686521,0.694501,0.692719
0.01,0.6894,0.687897,0.723321,0.688035,0.761155,0.687732,0.68907,0.688338
0.1,0.689,0.687871,0.722965,0.688025,0.76078,0.687685,0.68873,0.688365
1.0,0.6891,0.687817,0.723078,0.687971,0.760825,0.687632,0.6889,0.688311
10.0,0.6891,0.687817,0.723078,0.687971,0.760825,0.687632,0.6889,0.688311
100.0,0.6891,0.687817,0.723078,0.687971,0.760825,0.687632,0.6889,0.688311
1000.0,0.6891,0.687817,0.723078,0.687971,0.760825,0.687632,0.6889,0.688311


In [None]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.1)
imp_median_X = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train)
X_train = imp_median_X.transform(X_train)
X_test = imp_median_X.transform(X_test)

imp_median_y = SimpleImputer(missing_values=np.nan, strategy='median').fit(y_train)
y_train = imp_median_y.transform(y_train)
y_test = imp_median_y.transform(y_test)

# fit scaler and scale features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train) 
X_test_scaled = scaler.transform(X_test)

# upsamle training data
sampled_indexes = upsample(y_train)
v = new_training_set(X_train_scaled, y_train, sampled_indexes)
X_train_up = v[0]
y_train_up = v[1]

def compute_scores_neural(X_train,X_test,y_train,y_test, layers, activation_func):
    # fit neural network model
    mlp = MLPClassifier(max_iter=10000, hidden_layer_sizes=layers, activation=activation_func).fit(X_train, y_train.ravel())
    # predict y for train set
    pred_train = mlp.predict(X_train).tolist()
    # predict y for test set
    pred_test = mlp.predict(X_test).tolist()
            
    # calculate evaluation measures
    evaluation_measures = dict()
    evaluation_measures['accuracy_train'] = accuracy(y_train, pred_train)
    evaluation_measures['accuracy_test'] = accuracy(y_test, pred_test)
    
    evaluation_measures['precision_train'] = precision(y_train, pred_train)
    evaluation_measures['precision_test'] = precision(y_test, pred_test)
    
    evaluation_measures['recall_train'] = recall(y_train, pred_train)
    evaluation_measures['recall_test'] = recall(y_test, pred_test)
    
    evaluation_measures['f1_train'] = f1(y_train, pred_train)
    evaluation_measures['f1_test'] = f1(y_test, pred_test)
    
    return evaluation_measures

# create df
layers = [[30],[30,30],[30,30,30],[100], [100,100],[100,100,100],[200],[200,200],[200,200,200]]
activation_functions = ['logistic', 'tanh', 'relu']
measuresDict = dict()
for layer in layers:
    for activation_func in activation_functions:
        em = compute_scores_neural(X_train_up,X_test_scaled,y_train_up,y_test, layer, activation_func)
        if activation_func not in measuresDict.keys():
            measuresDict[activation_func] = {layer[0]:{len(layer): em.values()}}
        else:
            if layer[0] not in measuresDict[activation_func].keys():
                measuresDict[activation_func][layer[0]] = {len(layer): em.values()}
            else:
                measuresDict[activation_func][layer[0]][len(layer)]= em.values()

measures_ordered = {(activ_func, nodes, layers): list(values)
    for activ_func, nodes in measuresDict.items()
    for nodes, layers in nodes.items()
    for layers, values in layers.items()}
measures = pd.DataFrame(measures_ordered)
measures = measures.T
measures.columns = em.keys()
measures.index.set_names(['activation function', 'nodes per layer', 'layers'], inplace=True)
display(measures)