In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 16, 8

from scipy.stats import norm
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier  

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('KS_train_data.csv', delimiter=',')
#df_test = pd.read_csv('KS_test_data.csv', delimiter=';')
# X = df.loc[:,'f1':'f100'].values
# y = [ bool(y) for y in df.loc[:,'loss'].values ]

In [3]:
df.country = df.country.fillna('NA')
EU = ('GB', 'ES', 'FR', 'IT', 'NL', 'IS', 'CZ', 'FI', 'DE', 'IE', 'SJ', 'DK', 'SE', 'HU', 'NO', 'CY', 'CH', 'BE', 
          'LV', 'UA', 'AT', 'SI', 'LT', 'RO', 'RU', 'AX', 'MC', 'PT', 'GL', 'GR', 'SK', 'EE', 'BA', 'ME', 'LU', 'RS',
         'PL', 'MD', 'BG', 'HR', 'MK', 'BY', 'XK', 'FO', 'MT')
NA = ('US', 'CA', 'MX', 'CR', 'GT', 'HT', 'AG', 'JM', 'BZ', 'CU', 'SV', 'PR', 'PA', 'NI', 'DO', 'CW', 'VI', 'BB',
         'HN', 'LC', 'TT', 'BS', 'GP', 'VC', 'DM')
SA = ('AR', 'PE', 'SR', 'BR', 'BO', 'EC', 'CO', 'CL', 'VE', 'PY', 'GY', 'UY')
AF = ('KE', 'MW', 'ZA', 'RW', 'LR', 'EG', 'SN', 'NG', 'TZ', 'GH', 'GQ', 'ZM', 'MG', 'ET', 'MA', 'CD', 'BF', 'UG',
         'CI', 'DZ', 'ML', 'SD', 'ZW', 'CM', 'TN', 'NE', 'MZ', 'GN', 'SO', 'LY', 'DJ', 'GA', 'SS', 'GM', 'BJ', 'CF',
          'CG', 'NA')
AS = ('TH', 'ID', 'KH', 'IN', 'JP', 'TR', 'CN', 'MY', 'MN', 'IL', 'KR', 'PH', 'HK', 'SG', 'PS', 'TW', 'NP', 'IR',
         'QA', 'VN', 'IQ', 'AE', 'LK', 'GE', 'LB', 'AM', 'KZ', 'AF', 'KP', 'BD', 'PK', 'MM', 'BT', 'JO', 'MV', 'LA',
         'KW', 'SY', 'TJ', 'TL', 'YE', 'MO', 'KG')
AT = ('AQ')
OC = ('AU','NZ', 'PG', 'FJ', 'FM', 'CK', 'GU', 'NC', 'PF', 'VU' )
UNK = ('?')
def conditions(x):
    if x in EU:
        return "EU"
    elif x in NA:
        return "NA"
    elif x in SA:
        return "SA"
    elif x in AF:
        return "AF"
    elif x in AS:
        return "AS"
    elif x in AT:
        return "AT"
    elif x in OC:
        return "OC"
    else:
        return "UNK"

func = np.vectorize(conditions)
continents = func(df["country"])
df["continent"] = continents

In [4]:
total_in_cat = {}
funded_in_cat = {}
rate_funded_cat = {}
for x in df.category.unique():
    total_in_cat[x] = df.loc[(df.category == x, 'project_id')].count()
    funded_in_cat[x] = df.loc[(df.category == x) & (df.funded == True), 'project_id'].count() 
    rate_funded_cat[x] = funded_in_cat[x] / total_in_cat[x]
df['rate_funded_cat'] = df.apply(lambda row: rate_funded_cat[row.category], axis=1)

In [5]:
total_in_country = {}
funded_in_country = {}
rate_funded_country = {}
for x in df.country.unique():
    total_in_country[x] = df.loc[(df.country == x, 'project_id')].count()
    funded_in_country[x] = df.loc[(df.country == x) & (df.funded == True), 'project_id'].count() 
    rate_funded_country[x] = funded_in_country[x] / total_in_country[x]
    df['rate_funded_country'] = df.apply(lambda row: rate_funded_country[row.country], axis=1)

KeyError: 'GB'

In [7]:
total_in_continent = {}
funded_in_continent = {}
rate_funded_continent = {}
for x in df.continent.unique():
    total_in_continent[x] = df.loc[(df.continent == x, 'project_id')].count()
    funded_in_continent[x] = df.loc[(df.continent == x) & (df.funded == True), 'project_id'].count() 
    rate_funded_continent[x] = funded_in_continent[x] / total_in_continent[x]
df['rate_funded_continent'] = df.apply(lambda row: rate_funded_continent[row.continent], axis=1)

In [41]:
total_in_sub = {}
funded_in_sub = {}
rate_funded_sub = {}
for x in df.subcategory.unique():
    total_in_sub[x] = df.loc[(df.subcategory == x, 'project_id')].count()
    funded_in_sub[x] = df.loc[(df.subcategory == x) & (df.funded == True), 'project_id'].count() 
    rate_funded_sub[x] = funded_in_sub[x] / total_in_sub[x]
df['rate_funded_sub'] = df.apply(lambda row: rate_funded_sub[row.subcategory], axis=1)

In [42]:
df['delta_time_created'] = df.deadline - df.created_at
df['delta_time_launched'] = df.deadline - df.launched_at
df['delta_time_launched_days'] = df.delta_time_launched / 86400
df['delta_time_created_days'] = df.delta_time_launched / 86400
df['goal_converted_log'] = np.log(df.goal * df.fx_rate)
df['goal_per_day'] = df['goal_converted_log'] / df['delta_time_launched']
cols = ['rate_funded_sub','rate_funded_continent', 'rate_funded_cat', 
        'delta_time_launched_days', 'goal_converted_log', 'staff_pick']

In [43]:
new_df = df
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

vectorizer = CountVectorizer()
vectorizer.max_features = 100
vectorizer.max_df = .3

X = vectorizer.fit_transform(new_df.name.astype('U')) 
count_vect_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
new_df = pd.concat([new_df, count_vect_df], axis=1)
new_df.columns



Index(['project_id', 'backers_count', 'blurb', 'category',
       'converted_pledged_amount', 'country', 'created_at', 'currency',
       'deadline', 'fx_rate',
       ...
       'tour', 'up', 'us', 'video', 'we', 'web', 'with', 'world', 'you',
       'your'],
      dtype='object', length=132)

In [44]:
for word in vectorizer.get_feature_names():
    cols.append(word)


In [45]:
X = new_df[cols]
y = new_df['funded']


In [12]:
def contigency_matrix(true_y, predicted_y):
    # YOUR CODE HERE, Create TP, FP, TN, FN
    tp=fp=tn=fn=0
    for true, pred in zip(true_y, predicted_y):
        if pred == True:
            if pred == true:
                tp += 1
            else:
                fp += 1
        else:
            if pred == true:
                tn += 1
            else:
                fn += 1      
    matrix = np.array(([tp, fp], [tn, fn]))
    # Make sure your output fits the following format:
    # matrix = np.array(([TP, FP], [TN, FN]))
    return matrix

def accuracy(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fp+fn+tn == 0:
        return 0
    else:
        accuracy = (tp+tn)/(tp+fp+fn+tn)
        return accuracy
def precision(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fp == 0:
        return 0
    else:
        precision = tp/(tp+fp)
        return precision
def recall(true_y, predicted_y):
    matrix = contigency_matrix(true_y, predicted_y)
    tp = matrix[0][0]
    fp = matrix[0][1]
    tn = matrix[1][0]
    fn = matrix[1][1]
    if tp+fn == 0:
        return 0
    else:
        recall = tp/(tp+fn)
        return recall
def f1(true_y, predicted_y):
    precision_v = precision(true_y, predicted_y)
    recall_v = recall(true_y, predicted_y)
    if precision_v+recall_v == 0:
        return 0
    else:
        f1 = 2*((precision_v*recall_v)/(precision_v+recall_v))
        return f1

In [13]:
def polynomial(X, degree):
    
    ### BEGIN SOLUTION
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    ### END SOLUTION
    return X_poly

In [14]:
y = np.array(y)
y = y.reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.25)
imp_median_X = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train)
X_train = imp_median_X.transform(X_train)
X_test = imp_median_X.transform(X_test)

imp_median_y = SimpleImputer(missing_values=np.nan, strategy='median').fit(y_train)
y_train = imp_median_y.transform(y_train)
y_test = imp_median_y.transform(y_test)

# fit scaler and scale features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train) 
X_test_scaled = scaler.transform(X_test)

    
def compute_scores(X_train,X_test,y_train,y_test, C):
    # fit logistic regression model
    logreg = LogisticRegression(C=C, solver='liblinear').fit(X_train,y_train.ravel())
    # predict y for train set
    pred_train = logreg.predict(X_train).tolist()
    # predict y for test set
    pred_test = logreg.predict(X_test).tolist()
            
    # calculate evaluation measures
    evaluation_measures = dict()
    evaluation_measures['accuracy_train'] = accuracy(y_train, pred_train)
    evaluation_measures['accuracy_test'] = accuracy(y_test, pred_test)
    
    evaluation_measures['precision_train'] = precision(y_train, pred_train)
    evaluation_measures['precision_test'] = precision(y_test, pred_test)
    
    evaluation_measures['recall_train'] = recall(y_train, pred_train)
    evaluation_measures['recall_test'] = recall(y_test, pred_test)
    
    evaluation_measures['f1_train'] = f1(y_train, pred_train)
    evaluation_measures['f1_test'] = f1(y_test, pred_test)
    
    return evaluation_measures

# for power in [1, 2]:
#     X_train_poly = polynomial(X_train, power)
#     X_test_poly = polynomial(X_test, power)
# # Scale all features using the RobustScaler
# scaler = RobustScaler().fit(X_train_poly)
# X_train_scaled = scaler.transform(X_train_poly)
# X_test_scaled = scaler.transform(X_test_poly)
C = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 1e2, 1e3, 1e4]
measures = pd.DataFrame()
for c in C:
    em = compute_scores(X_train_scaled,X_test_scaled,y_train,y_test, c)
    em = pd.Series(em)
    measures = measures.append(em, ignore_index=True)
measures.index = C
measures.index = measures.index.rename('C-value')
display(measures)

Unnamed: 0_level_0,accuracy_test,accuracy_train,f1_test,f1_train,precision_test,precision_train,recall_test,recall_train
C-value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0001,0.78364,0.783333,0.813386,0.811209,0.826069,0.82466,0.801087,0.798189
0.001,0.79096,0.790933,0.820954,0.819471,0.827817,0.825382,0.814203,0.813645
0.01,0.7912,0.79168,0.821746,0.820665,0.825863,0.824032,0.817669,0.817325
0.1,0.79076,0.791733,0.821535,0.820814,0.824884,0.823705,0.818213,0.817943
1.0,0.79072,0.79168,0.821494,0.820764,0.824872,0.823674,0.818145,0.817874
10.0,0.79072,0.79168,0.821494,0.820764,0.824872,0.823674,0.818145,0.817874
100.0,0.79072,0.79168,0.821494,0.820764,0.824872,0.823674,0.818145,0.817874
1000.0,0.79072,0.79168,0.821494,0.820764,0.824872,0.823674,0.818145,0.817874
10000.0,0.79072,0.79168,0.821494,0.820764,0.824872,0.823674,0.818145,0.817874


In [15]:
logreg = LogisticRegression(C=1, solver='liblinear').fit(X_train,y_train.ravel())
pred_train = logreg.predict(X_train).tolist()
contigency_matrix(y_train, pred_train)

array([[35779,  7661],
       [23600,  7960]])

In [46]:

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3)
transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

clf = MLPClassifier(solver='adam', alpha=1e-2, hidden_layer_sizes=(5, 5), random_state=1)
clf.fit(X_train, y_train)
print(f'Training accuracy: {clf.score(X_train, y_train)}')
print(f'Testing accuracy: {clf.score(X_test, y_test)}')

Training accuracy: 0.7980714285714285
Testing accuracy: 0.7897666666666666


In [17]:
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

mlp = make_pipeline(RobustScaler(), MLPClassifier(solver='adam', alpha=1e-2, hidden_layer_sizes=(4,4), random_state=1))
scores = cross_validate(mlp, X, y, cv=5, scoring=['f1_macro', 'accuracy'], return_train_score=True)
scores.keys()


dict_keys(['fit_time', 'score_time', 'test_f1_macro', 'train_f1_macro', 'test_accuracy', 'train_accuracy'])

In [31]:
from sklearn.metrics import SCORERS
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [20]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))

Accuracy: 0.78 (+/- 0.05)
