This is a supervised classification problem; the dependent variable is binary, 0 (for failure) and 1 (for success). As such, we will consider the following classification techniques:
* Naive-Bays
* logistic regression
* KNN
* SVM
* kernel SVM
* decision tree classification
* random forest classification

We will also consider reducing the variables via PCA

Finally, we may optimize parameters using grid_search.

In [2]:
#-----------------------------------------
# USER INPUTS


In [3]:
#-----------------------------------------
# IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import dill

pd.options.display.max_columns = None # Shows all columns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [6]:
#-----------------------------------------
# DUMP/LOAD SESSION

# ---- DUMP ----
#dill.dump_session('./04 Working - Models.db')

# ---- LOAD ----
#dill.load_session('./04 Working - Models.db')

In [3]:
#-----------------------------------------
# IMPORT DATAFRAME

df = pd.read_csv('data/df02.csv', sep=',', na_filter=False, index_col=0, 
                 parse_dates=['launched_at'])

# Checks
if (df.isnull().sum().sum() != 0):
    print('*** WARNING: Null values introduced with read_csv ***')
if (df.isna().sum().sum() != 0):
    print('*** WARNING: NA values introduced with read_csv ***')
if (df=='').sum().sum() != 0:
    print('*** WARNING: Empty string (\'\') values introduced with read_csv ***')

In [8]:
df.isnull().sum().sum()

0

In [9]:
df.isna().sum().sum()

0

In [13]:
(df == '').sum().sum()

0

In [14]:
info_variables = ['id','launched_at','category','country']

In [16]:
X = df.drop(columns=info_variables).drop(columns='launch_state')
y = df['launch_state']

In [20]:
#-----------------------------------------
# TRAIN/TEST SPLIT

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [26]:
#-----------------------------------------
# FEATURE SCALING

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
#==============================================================================
#
# NAIVE BAYES
#
#==============================================================================

In [27]:
#-----------------------------------------
# FIT MODEL

start_clock = time.clock()

# Naive-Bayes
classifier_nb = GaussianNB()
classifier_nb.fit(X_train, y_train)

end_clock = time.clock()

clock_fit_nb = end_clock - start_clock
print('Runtime, fit: ', round(clock_fit_nb, 2), ' sec', sep='')

Runtime, fit: 0.47 sec


In [28]:
#-----------------------------------------
# PREDICT TEST RESULTS

start_clock = time.clock()

y_pred_nb = classifier_nb.predict(X_test)

end_clock = time.clock()

clock_predict_nb = end_clock - start_clock
print('Runtime, predict: ', round(clock_predict_nb, 2), ' sec', sep='')

Runtime, predict: 0.18 sec


In [29]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix
cm_nb = confusion_matrix(y_test, y_pred_nb)

# Classification report
cr_nb = classification_report(y_test, y_pred_nb)

print(cm_nb)
print("\n")
print(cr_nb)

[[ 2077 12921]
 [  474 19890]]


             precision    recall  f1-score   support

          0       0.81      0.14      0.24     14998
          1       0.61      0.98      0.75     20364

avg / total       0.69      0.62      0.53     35362



In [30]:
acc_nb = cm_nb.diagonal().sum() / cm_nb.sum()
acc_nb

In [32]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

start_clock = time.clock()

accuracies_nb = cross_val_score(
    estimator=classifier_nb, X=X_train, y=y_train,
    cv=10)

end_clock = time.clock()

clock_10FCV_nb = end_clock - start_clock
print('Runtime, 10-fold CV: ', round(clock_10FCV_nb, 2), ' sec', sep='')

Runtime, 10-fold CV: 7.64 sec


In [33]:
print("Accuracies:")
print(accuracies_nb)
print('\n')
print("RESULTS:")
print(f"  - Mean accuracy: {round(accuracies_nb.mean(), 2)*100}%")
print(f"  - Accuracy std dev: {round(accuracies_nb.std(), 2)*100}%")

Accuracies:
[0.90017909 0.93185032 0.94391554 0.60401546 0.63860873 0.94269017
 0.63700632 0.63650075 0.61959084 0.60384652]


RESULTS:
  - Mean accuracy: 75.0%
  - Accuracy std dev: 15.0%


In [42]:
df_results_nb = pd.DataFrame([{
    'model':'Naive Bayes', 
    'time_fit':clock_fit_nb, 'time_predict':clock_predict_nb,
    'time_10_fold_CV':clock_10FCV_nb,
    'accuracy':acc_nb, 'acc_10_fold':accuracies_nb.mean()}])

In [44]:
df_results_nb = df_results_nb[['model','time_fit','time_predict','time_10_fold_CV','accuracy','acc_10_fold']]

In [46]:
df_results = df_results_nb

In [47]:
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,Naive Bayes,0.472809,0.183608,7.644587,0.621204,0.74582


In [48]:
#==============================================================================
#
# LOGISTIC REGRESSION
#
#==============================================================================

In [None]:
"""#-----------------------------------------
# FEATURE SCALING

sc_X_LogReg = StandardScaler()
X_train_LogReg = sc_X_LogReg.fit_transform(X_train)
X_test_LogReg = sc_X_LogReg.transform(X_test)"""

In [49]:
#-----------------------------------------
# FIT MODEL

start_clock = time.clock()

# Logistic regression
classifier_LogReg = LogisticRegression(random_state=101)
classifier_LogReg.fit(X_train, y_train)

end_clock = time.clock()

clock_fit_LogReg = end_clock - start_clock
print('Runtime, fit: ', round(clock_fit_LogReg, 2), ' sec', sep='')

Runtime, fit: 20.99 sec


In [50]:
#-----------------------------------------
# PREDICT TEST RESULTS

start_clock = time.clock()

y_pred_LogReg = classifier_LogReg.predict(X_test)

end_clock = time.clock()

clock_predict_LogReg = end_clock - start_clock
print('Runtime, predict: ', round(clock_predict_LogReg, 2), ' sec', sep='')

Runtime, predict: 0.01 sec


In [51]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix
cm_LogReg = confusion_matrix(y_test, y_pred_LogReg)

# Classification report
cr_LogReg = classification_report(y_test, y_pred_LogReg)

print(cm_LogReg)
print("\n")
print(cr_LogReg)

[[12004  2994]
 [ 2765 17599]]


             precision    recall  f1-score   support

          0       0.81      0.80      0.81     14998
          1       0.85      0.86      0.86     20364

avg / total       0.84      0.84      0.84     35362



In [52]:
acc_LogReg = cm_LogReg.diagonal().sum() / cm_LogReg.sum()

In [53]:
acc_LogReg

0.8371415643911544

In [54]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

start_clock = time.clock()

accuracies_LogReg = cross_val_score(
    estimator=classifier_LogReg, X=X_train, y=y_train,
    cv=10)

end_clock = time.clock()

clock_10FCV_LogReg = end_clock - start_clock
print('Runtime, 10-fold CV: ', round(clock_10FCV_LogReg, 2), ' sec', sep='')

Runtime, 10-fold CV: 205.52 sec


In [55]:
print("Accuracies:")
print(accuracies_LogReg)
print('\n')
print("RESULTS:")
print(f"  - Mean accuracy: {round(accuracies_LogReg.mean(), 2)*100}%")
print(f"  - Accuracy std dev: {round(accuracies_LogReg.std(), 2)*100}%")

Accuracies:
[0.82769347 0.85964747 0.85785654 0.85144688 0.85474597 0.84220944
 0.8476765  0.87368024 0.86480626 0.82888658]


RESULTS:
  - Mean accuracy: 85.0%
  - Accuracy std dev: 1.0%


In [57]:
df_results_LogReg = pd.DataFrame([{
    'model':'Logistic Regression', 
    'time_fit':clock_fit_LogReg, 'time_predict':clock_predict_LogReg,
    'time_10_fold_CV':clock_10FCV_LogReg,
    'accuracy':acc_LogReg, 'acc_10_fold':accuracies_LogReg.mean()}])
df_results_LogReg = df_results_LogReg[['model','time_fit','time_predict','time_10_fold_CV','accuracy','acc_10_fold']]

In [59]:
df_results = pd.DataFrame.append(df_results, df_results_LogReg).reset_index(drop=True)

In [60]:
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,Naive Bayes,0.472809,0.183608,7.644587,0.621204,0.74582
1,Logistic Regression,20.989989,0.01449,205.522094,0.837142,0.850865


In [None]:
#==============================================================================
#
# K NEAREST NEIGHBORS
#
#==============================================================================

In [None]:
"""#-----------------------------------------
# FEATURE SCALING

sc_X_knn = StandardScaler()
X_train_knn = sc_X_knn.fit_transform(X_train)
X_test_knn = sc_X_knn.transform(X_test)"""

In [61]:
#-----------------------------------------
# FIT MODEL

start_clock = time.clock()

# KNN
classifier_knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
classifier_knn.fit(X_train, y_train)

end_clock = time.clock()

clock_fit_knn = end_clock - start_clock
print('Runtime, fit: ', round(clock_fit_knn, 2), ' sec', sep='')

Runtime, fit: 43.69 sec


In [62]:
#-----------------------------------------
# PREDICT TEST RESULTS

start_clock = time.clock()

y_pred_knn = classifier_knn.predict(X_test)

end_clock = time.clock()

clock_predict_knn = end_clock - start_clock
print('Runtime, predict: ', round(clock_predict_knn, 2), ' sec', sep='')

Runtime, predict: 931.51 sec


In [63]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix
cm_knn = confusion_matrix(y_test, y_pred_knn)

# Classification report
cr_knn = classification_report(y_test, y_pred_knn)

print(cm_knn)
print("\n")
print(cr_knn)

[[13101  1897]
 [ 1300 19064]]


             precision    recall  f1-score   support

          0       0.91      0.87      0.89     14998
          1       0.91      0.94      0.92     20364

avg / total       0.91      0.91      0.91     35362



In [64]:
acc_knn = cm_knn.diagonal().sum() / cm_knn.sum()
acc_knn

In [65]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

# 10-fold cross validation time estimate:
print('10-fold CV estimated time: ', 
      round((clock_fit_knn + clock_predict_knn)*10/60/60, 2), 
      ' hours')

10-fold CV estimated time:  2.71  hours


In [66]:
start_clock = time.clock()

accuracies_knn = cross_val_score(
    estimator=classifier_knn, X=X_train, y=y_train,
    cv=10)

end_clock = time.clock()

clock_10FCV_knn = end_clock - start_clock
print('Runtime, 10-fold CV: ', round(clock_10FCV_knn, 2), ' sec', sep='')

Runtime, 10-fold CV: 2974.55 sec


In [67]:
print("Accuracies:")
print(accuracies_knn)
print('\n')
print("RESULTS:")
print(f"  - Mean accuracy: {round(accuracies_knn.mean(), 2)*100}%")
print(f"  - Accuracy std dev: {round(accuracies_knn.std(), 2)*100}%")

Accuracies:
[0.90432652 0.91337544 0.9120558  0.91026487 0.90960505 0.9080969
 0.90460929 0.91185897 0.9154332  0.90732535]


RESULTS:
  - Mean accuracy: 91.0%
  - Accuracy std dev: 0.0%


In [68]:
df_results_knn = pd.DataFrame([{
    'model':'K Nearest Neighbors', 
    'time_fit':clock_fit_knn, 'time_predict':clock_predict_knn,
    'time_10_fold_CV':clock_10FCV_knn,
    'accuracy':acc_knn, 'acc_10_fold':accuracies_knn.mean()}])
df_results_knn = df_results_knn[['model','time_fit','time_predict','time_10_fold_CV','accuracy','acc_10_fold']]

In [69]:
df_results = pd.DataFrame.append(df_results, df_results_knn).reset_index(drop=True)

In [70]:
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,Naive Bayes,0.472809,0.183608,7.644587,0.621204,0.74582
1,Logistic Regression,20.989989,0.01449,205.522094,0.837142,0.850865
2,K Nearest Neighbors,43.687085,931.513263,2974.550811,0.909592,0.909695


In [None]:
#==============================================================================
#
# SUPPORT VECTOR MACHINE
#
#==============================================================================

In [None]:
"""#-----------------------------------------
# FEATURE SCALING

sc_X_svm = StandardScaler()
X_train_svm = sc_X_svm.fit_transform(X_train)
X_test_svm = sc_X_svm.transform(X_test)"""

In [71]:
#-----------------------------------------
# FIT MODEL

start_clock = time.clock()

# SVM
classifier_svm = SVC(kernel="rbf", random_state=101)
classifier_svm.fit(X_train, y_train)

end_clock = time.clock()

clock_fit_svm = end_clock - start_clock
print('Runtime, fit: ', round(clock_fit_svm, 2), ' sec', sep='')

Runtime, fit: 2722.81 sec


In [72]:
#-----------------------------------------
# PREDICT TEST RESULTS

start_clock = time.clock()

y_pred_svm = classifier_svm.predict(X_test)

end_clock = time.clock()

clock_predict_svm = end_clock - start_clock
print('Runtime, predict: ', round(clock_predict_svm, 2), ' sec', sep='')

Runtime, predict: 553.27 sec


In [73]:
#-----------------------------------------
# EVALUATE MODEL

# Confusion matrix
cm_svm = confusion_matrix(y_test, y_pred_svm)

# Classification report
cr_svm = classification_report(y_test, y_pred_svm)

print(cm_svm)
print("\n")
print(cr_svm)

[[ 9722  5276]
 [ 3623 16741]]


             precision    recall  f1-score   support

          0       0.73      0.65      0.69     14998
          1       0.76      0.82      0.79     20364

avg / total       0.75      0.75      0.75     35362



In [74]:
acc_svm = cm_svm.diagonal().sum() / cm_svm.sum()
acc_svm

0.7483456818053278

In [75]:
#-----------------------------------------
# APPLY K-FOLD CROSS VALIDATION

# 10-fold cross validation time estimate:
print('10-fold CV estimated time: ', 
      round((clock_fit_svm + clock_predict_svm)*10/60/60, 2), 
      ' hours')

10-fold CV estimated time:  9.1  hours


In [76]:
# *** TIME-INTENSIVE ***
"""start_clock = time.clock()

accuracies_svm = cross_val_score(
    estimator=classifier_svm, X=X_train_svm, y=y_train,
    cv=10)

end_clock = time.clock()

clock_10FCV_svm = end_clock-start_clock
print('Runtime, 10-fold CV: ', clock_10FCV_svm, sep='')"""

"start_clock = time.clock()\n\naccuracies_svm = cross_val_score(\n    estimator=classifier_svm, X=X_train_svm, y=y_train,\n    cv=10)\n\nend_clock = time.clock()\n\nclock_10FCV_svm = end_clock-start_clock\nprint('Runtime, 10-fold CV: ', clock_10FCV_svm, sep='')"

In [122]:
try:
    clock_10FCV_svm
except:
    clock_10FCV_svm = None
    print("No 10-fold CV time to report")

No 10-fold CV time to report


In [113]:
try:
    accuracies_svm
except:
    accuracy_10FCV_mean_svm = None
    print("No K-fold CV accuracies to report")
else:
    accuracy_10FCV_mean_svm = accuracies_svm.mean()
    print("Accuracies:")
    print(accuracies_svm)
    print('\n')
    print("RESULTS:")
    print(f"  - Mean accuracy: {round(accuracies_svm.mean(),4)*100}%")
    print(f"  - Accuracy std dev: {round(accuracies_svm.std(),4)*100}%")

No K-fold CV accuracies to report


In [116]:
df_results_svm = pd.DataFrame([{
    'model':'SVM', 
    'time_fit':clock_fit_svm, 'time_predict':clock_predict_svm,
    'time_10_fold_CV':clock_10FCV_svm,
    'accuracy':acc_svm, 'acc_10_fold':accuracy_10FCV_mean_svm}])
df_results_svm = df_results_svm[['model','time_fit','time_predict','time_10_fold_CV','accuracy','acc_10_fold']]

In [117]:
df_results = pd.DataFrame.append(df_results, df_results_svm).reset_index(drop=True)

In [5]:
df_results

Unnamed: 0,model,time_fit,time_predict,time_10_fold_CV,accuracy,acc_10_fold
0,Naive Bayes,0.472809,0.183608,7.64459,0.621204,0.74582
1,Logistic Regression,20.989989,0.01449,205.522,0.837142,0.850865
2,K Nearest Neighbors,43.687085,931.513263,2974.55,0.909592,0.909695
3,SVM,2722.813777,553.268978,,0.748346,
