The dependent variable is binary, 0 (for failure) and 1 (for success). As such, we will consider the following classification techniques:
* logistic regression
* KNN
* SVM
* kernel SVM
* Naive Bayes
* decision tree classification
* random forest classification

We will also consider reducing the number of variables (dimensionality reduction) via PCA.

Finally, we may tune the model hyperparameters using grid search.


In [1]:
#-----------------------------------------
# USER INPUTS


In [2]:
#-----------------------------------------
# IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

pd.options.display.max_columns = None # Shows all columns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [3]:
#-----------------------------------------
# IMPORT DATAFRAME

# na_filter=False turns off pandas' missing-value detection, so empty fields
# are read as '' rather than NaN and have to be checked for explicitly below.
df = pd.read_csv('data/df02.csv', sep=',', na_filter=False, index_col=0,
                 parse_dates=['launched_at'])

# Checks
# DataFrame.isna() is an alias of DataFrame.isnull(), so a single test covers
# both "null" and "NA" values.
if df.isnull().any().any():
    print('*** WARNING: Null values introduced with read_csv ***')
if (df == '').any().any():
    print('*** WARNING: Empty string (\'\') values introduced with read_csv ***')

In [4]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,launch_state,id,launched_at,category,country,goal,backers_count,pledged_ratio,funding_days,staff_pick,comics,crafts,dance,design,fashion,film & video,food,games,journalism,music,photography,publishing,technology,theater,AF,AG,AL,AM,AQ,AR,AT,AU,AX,AZ,BA,BB,BD,BE,BF,BG,BJ,BM,BO,BR,BS,BT,BW,BY,BZ,CA,CD,CG,CH,CI,CK,CL,CM,CN,CO,CR,CU,CV,CW,CY,CZ,DE,DJ,DK,DM,DO,DZ,EC,EE,EG,ES,ET,FI,FJ,FM,FO,FR,GA,GB,GD,GE,GH,GL,GM,GN,GP,GQ,GR,GT,GU,GY,HK,HN,HR,HT,HU,ID,IE,IL,IN,IQ,IR,IS,IT,JM,JO,JP,KE,KG,KH,KN,KP,KR,KW,KZ,LA,LB,LC,LK,LR,LS,LT,LU,LV,LY,MA,MC,MD,ME,MG,MK,ML,MM,MN,MO,MR,MT,MV,MW,MX,MY,MZ,NA,NE,NG,NI,NL,NO,NP,NZ,PA,PE,PF,PG,PH,PK,PL,PN,PR,PS,PT,PY,QA,RO,RS,RU,RW,SA,SC,SD,SE,SG,SI,SJ,SK,SL,SN,SO,SR,SS,SV,SX,SY,SZ,TC,TH,TL,TN,TO,TR,TT,TW,TZ,UA,UG,US,UY,VC,VE,VI,VN,VU,WS,XK,YE,ZA,ZM,ZW
0,0,895922629,2014-10-28 19:42:54,technology,US,3500.0,1,0.014286,30,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1190376005,2011-10-17 18:39:11,games,US,225.0,63,7.170978,16,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Columns that are dropped (together with the target) when the feature
# matrix X is built in the next step.
info_variables = ['id','launched_at','category','country']

In [6]:
# Echo the list to confirm its contents.
info_variables

['id', 'launched_at', 'category', 'country']

In [7]:
# Features: every column except the info columns and the target itself.
X = df.drop(columns=[*info_variables, 'launch_state'])
# Target: the binary launch_state column.
y = df['launch_state']

In [8]:
# (rows, columns) of the full frame.
df.shape

(141447, 212)

In [9]:
# (rows, columns) of the feature matrix; row count should match df.
X.shape

(141447, 207)

In [10]:
# Length of the target vector; should match the row count of X.
y.shape

(141447,)

In [11]:
#-----------------------------------------
# TRAIN/TEST SPLIT

# Hold out 25% of the rows as the test set; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [12]:
# Fraction of all rows that landed in the training features (expect ~0.75).
len(X_train) / len(df)

0.7499982325535359

In [13]:
# Fraction of all rows that landed in the training target (expect ~0.75).
len(y_train) / len(df)

0.7499982325535359

In [14]:
# Fraction of all rows that landed in the test features (expect ~0.25).
len(X_test) / len(df)

0.2500017674464641

In [15]:
# Fraction of all rows that landed in the test target (expect ~0.25).
len(y_test) / len(df)

0.2500017674464641

In [16]:
#==============================================================================
#
# LOGISTIC REGRESSION
#
#==============================================================================

In [17]:
#-----------------------------------------
# FEATURE SCALING

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
sc_X_LogReg = StandardScaler().fit(X_train)
X_train_LogReg = sc_X_LogReg.transform(X_train)
X_test_LogReg = sc_X_LogReg.transform(X_test)

In [18]:
#-----------------------------------------
# FIT MODEL

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# Create classifier
# Logistic regression, fitted on the standardized training features
classifier_LogReg = LogisticRegression(random_state=101)
classifier_LogReg.fit(X_train_LogReg, y_train)

end_clock = time.perf_counter()

# Elapsed seconds for the fit, kept for the results summary table
clock_fit_LogReg = end_clock-start_clock
print('Runtime, fit: ', clock_fit_LogReg, sep='')

Runtime, fit: 22.3610289


In [19]:
#-----------------------------------------
# PREDICT TEST RESULTS

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# Predict on the test features, standardized with the training-set scaler
y_pred_LogReg = classifier_LogReg.predict(X_test_LogReg)

end_clock = time.perf_counter()

# Elapsed seconds for prediction, kept for the results summary table
clock_predict_LogReg = end_clock-start_clock
print('Runtime, predict: ', clock_predict_LogReg, sep='')

Runtime, predict: 0.011629600000006235


In [20]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix (true labels first, predicted labels second)
cm_LogReg = confusion_matrix(y_test, y_pred_LogReg)

# Per-class precision / recall / f1 text report
cr_LogReg = classification_report(y_test, y_pred_LogReg)

# Matrix, a blank gap, then the report
print(cm_LogReg, "\n", cr_LogReg, sep="\n")

[[12004  2994]
 [ 2765 17599]]


             precision    recall  f1-score   support

          0       0.81      0.80      0.81     14998
          1       0.85      0.86      0.86     20364

avg / total       0.84      0.84      0.84     35362



In [21]:
# Test accuracy: correctly classified samples (matrix diagonal) over all samples.
acc_LogReg = np.trace(cm_LogReg) / cm_LogReg.sum()

In [22]:
# Show the hold-out accuracy of the logistic-regression model.
acc_LogReg

0.8371415643911544

In [23]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# NOTE(review): X_train_LogReg was standardized using the whole training set
# before being split into folds, so every validation fold has influenced the
# scaling statistics. Putting the scaler and classifier in a scikit-learn
# Pipeline and cross-validating on X_train would avoid that.
accuracies_LogReg = cross_val_score(
    estimator=classifier_LogReg, X=X_train_LogReg, y=y_train,
    cv=10)

end_clock = time.perf_counter()

# Elapsed seconds for all ten folds, kept for the results summary table
clock_10FCV_LogReg = end_clock-start_clock
print('Runtime, 10-fold CV: ', clock_10FCV_LogReg, sep='')

Runtime, 10-fold CV: 206.1631457


In [24]:
# Per-fold accuracies followed by their mean and spread.
print("Accuracies:")
print(accuracies_LogReg)
print('\n')
print("RESULTS:")
# Use the percent format spec rather than round(x, 4)*100, which leaks binary
# floating-point noise into the output (e.g. 1.4000000000000001%).
print(f"  - Mean accuracy: {accuracies_LogReg.mean():.2%}")
print(f"  - Accuracy std dev: {accuracies_LogReg.std():.2%}")

Accuracies:
[0.82769347 0.85964747 0.85785654 0.85144688 0.85474597 0.84220944
 0.8476765  0.87368024 0.86480626 0.82888658]


RESULTS:
  - Mean accuracy: 85.09%
  - Accuracy std dev: 1.4000000000000001%


In [25]:
# Unrounded mean of the ten cross-validation accuracies.
accuracies_LogReg.mean()

0.8508649360536339

In [26]:
# Scratch check: raw confusion matrix for the logistic-regression model.
cm_LogReg

array([[12004,  2994],
       [ 2765, 17599]], dtype=int64)

In [27]:
# Scratch check: total number of test samples (denominator of the accuracy).
cm_LogReg.sum()

35362

In [28]:
# Scratch check: number of correctly classified test samples (numerator of the accuracy).
cm_LogReg.diagonal().sum()

29603

In [32]:
# Scratch check: recorded fit time in seconds.
clock_fit_LogReg

22.3610289

In [33]:
# Scratch check: recorded 10-fold cross-validation time in seconds.
clock_10FCV_LogReg

206.1631457

In [31]:
# Scratch experiment: build a one-row frame from a list holding a single dict.
pd.DataFrame([{'one':1, 'two':2, 'three':3}])

Unnamed: 0,one,three,two
0,1,3,2


In [66]:
# One-row summary of the logistic-regression run: timings and accuracies.
df_results_LogReg = pd.DataFrame([dict(
    model='LogReg',
    time_fit=clock_fit_LogReg,
    time_predict=clock_predict_LogReg,
    time_10_fold_CV=clock_10FCV_LogReg,
    accuracy=acc_LogReg,
    acc_10_fold=accuracies_LogReg.mean(),
)])

In [68]:
# Show the summary row as constructed.
df_results_LogReg

Unnamed: 0,acc_10_fold,accuracy,model,time_10_fold_CV,time_fit,time_predict
0,0.850865,0.837142,LogReg,206.163146,22.361029,0.01163


In [70]:
# Put the columns in reporting order: model, timings, then accuracies.
df_results_LogReg = df_results_LogReg.loc[:, [
    'model', 'time_fit', 'time_predict', 'time_10_fold_CV',
    'accuracy', 'acc_10_fold',
]]

In [71]:
# Show the summary row after the columns were reordered.
df_results_LogReg

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,LogReg,22.361029,0.01163,206.163146,0.837142,0.850865


In [72]:
# Start the combined results table from the logistic-regression row.
# Plain assignment: both names refer to the same DataFrame object (no copy).
df_results = df_results_LogReg

In [73]:
# Show the combined results table (logistic regression only at this point).
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,LogReg,22.361029,0.01163,206.163146,0.837142,0.850865


In [39]:
#==============================================================================
#
# K NEAREST NEIGHBORS
#
#==============================================================================

In [40]:
#-----------------------------------------
# FEATURE SCALING

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
sc_X_knn = StandardScaler().fit(X_train)
X_train_knn = sc_X_knn.transform(X_train)
X_test_knn = sc_X_knn.transform(X_test)

In [41]:
#-----------------------------------------
# FIT MODEL

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# Create classifier
# KNN with 5 neighbours; Minkowski metric with p=2 is the Euclidean distance
classifier_knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
classifier_knn.fit(X_train_knn, y_train)

end_clock = time.perf_counter()

# Elapsed seconds for the fit, kept for the results summary table
clock_fit_knn = end_clock-start_clock
print('Runtime, fit: ', clock_fit_knn, sep='')

Runtime, fit: 45.58440949999999


In [43]:
#-----------------------------------------
# PREDICT TEST RESULTS

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# Predict on the test features, standardized with the training-set scaler
y_pred_knn = classifier_knn.predict(X_test_knn)

end_clock = time.perf_counter()

# Elapsed seconds for prediction, kept for the results summary table
clock_predict_knn = end_clock-start_clock
print('Runtime, predict: ', clock_predict_knn, sep='')

Runtime, predict: 949.5767655


In [44]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix (true labels first, predicted labels second)
cm_knn = confusion_matrix(y_test, y_pred_knn)

# Per-class precision / recall / f1 text report
cr_knn = classification_report(y_test, y_pred_knn)

# Matrix, a blank gap, then the report
print(cm_knn, "\n", cr_knn, sep="\n")

[[13101  1897]
 [ 1300 19064]]


             precision    recall  f1-score   support

          0       0.91      0.87      0.89     14998
          1       0.91      0.94      0.92     20364

avg / total       0.91      0.91      0.91     35362



In [76]:
# Test accuracy: correctly classified samples (matrix diagonal) over all samples.
acc_knn = np.trace(cm_knn) / cm_knn.sum()

In [52]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

# Rough up-front estimate: ten times the measured (fit + predict) seconds,
# converted to hours. NOTE(review): each fold fits on 9/10 of the training
# set and predicts on 1/10 of it, which differs from the full fit and the
# test-set prediction timed above, so treat this as an upper-bound guess.
# 10-fold cross validation time estimate:
print('10-fold CV estimated time: ', 
      round((clock_fit_knn + clock_predict_knn)*10/60/60, 2), 
      ' hours')

10-fold CV estimated time:  2.76  hours


In [53]:
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# replacement for measuring elapsed time.
start_clock = time.perf_counter()

# NOTE(review): X_train_knn was standardized using the whole training set
# before being split into folds, so every validation fold has influenced the
# scaling statistics. Putting the scaler and classifier in a scikit-learn
# Pipeline and cross-validating on X_train would avoid that.
accuracies_knn = cross_val_score(
    estimator=classifier_knn, X=X_train_knn, y=y_train,
    cv=10)

end_clock = time.perf_counter()

# Elapsed seconds for all ten folds, kept for the results summary table
clock_10FCV_knn = end_clock-start_clock
print('Runtime, 10-fold CV: ', clock_10FCV_knn, sep='')

Runtime, 10-fold CV: 2893.3973577999996


In [54]:
# Per-fold accuracies followed by their mean and spread.
print("Accuracies:")
print(accuracies_knn)
print('\n')
print("RESULTS:")
# Use the percent format spec rather than round(x, 4)*100, which leaks binary
# floating-point noise into the output (e.g. 0.35000000000000003%).
print(f"  - Mean accuracy: {accuracies_knn.mean():.2%}")
print(f"  - Accuracy std dev: {accuracies_knn.std():.2%}")

Accuracies:
[0.90432652 0.91337544 0.9120558  0.91026487 0.90960505 0.9080969
 0.90460929 0.91185897 0.9154332  0.90732535]


RESULTS:
  - Mean accuracy: 90.97%
  - Accuracy std dev: 0.35000000000000003%


In [None]:
# Disabled scratch code: the statement below is wrapped in a string literal,
# so nothing is assigned when this cell runs. It mirrors how the
# logistic-regression summary row is built earlier in the notebook.
"""df_results = pd.DataFrame([{'model':'LogReg',
              'time_fit':clock_fit_LogReg, 
              'time_predict':clock_predict_LogReg,
              'time_10_fold_CV':clock_10FCV_LogReg,
              'accuracy':acc_LogReg,
              'acc_10_fold':accuracies_LogReg.mean()}])"""

In [75]:
# Show the combined results table before the KNN row is added.
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,LogReg,22.361029,0.01163,206.163146,0.837142,0.850865


In [57]:
# Scratch one-row frame used to try out row-wise concatenation.
x = pd.DataFrame([dict(one=1, two=2, three=3)])

In [58]:
# Show the scratch frame.
x

Unnamed: 0,one,three,two
0,1,3,2


In [65]:
# Scratch experiment: stack a second row under x.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the rows and renumbers the index 0..n-1 in one step.
pd.concat([x, pd.DataFrame([{'one':10, 'two':20, 'three':30}])], ignore_index=True)

Unnamed: 0,one,three,two
0,1,3,2
1,10,30,20


In [None]:
# Scratch experiment: pass the existing results table through the DataFrame constructor.
pd.DataFrame(df_results)

In [79]:
# One-row summary of the KNN run, with columns in reporting order:
# model, timings, then accuracies.
df_results_knn = pd.DataFrame(
    [dict(
        model='KNN',
        time_fit=clock_fit_knn,
        time_predict=clock_predict_knn,
        time_10_fold_CV=clock_10FCV_knn,
        accuracy=acc_knn,
        acc_10_fold=accuracies_knn.mean(),
    )],
    columns=['model', 'time_fit', 'time_predict', 'time_10_fold_CV',
             'accuracy', 'acc_10_fold'],
)

In [80]:
# Show the KNN summary row.
df_results_knn

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,KNN,45.584409,949.576765,2893.397358,0.909592,0.909695


In [83]:
# Combine the per-model summary rows into one results table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Building from the per-model frames (df_results was initialised from
# df_results_LogReg) keeps this cell idempotent: re-running it does not add
# a duplicate KNN row.
df_results = pd.concat([df_results_LogReg, df_results_knn], ignore_index=True)

In [84]:
# Show the combined results table (logistic regression and KNN).
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,LogReg,22.361029,0.01163,206.163146,0.837142,0.850865
1,KNN,45.584409,949.576765,2893.397358,0.909592,0.909695


In [None]:
#==============================================================================
#
# SUPPORT VECTOR MACHINE
#
#==============================================================================