In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [2]:
import os
import dill
import warnings

import numpy as np
import pandas as pd
import time as time

from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [3]:
# To Read out 1000 rows rather than the default value. 
pd.set_option('display.max_rows', 1000)

# To Read out 1000 columns rather than the default value. 
pd.set_option('display.max_columns', 1000)

# Not prinitng the warnings
warnings.filterwarnings('ignore')

# Setting a random seed for reproducability
np.random.seed(7)

# Setting up the k-fold
kFold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 7)

# Setting the working directory for data and output files
cwd = os.getcwd()
os.chdir(cwd + '/data/')

# Setting the environment for temporary results
%env JOBLIB_TEMP_FOLDER = /tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [4]:
# Problem 1
df = pd.read_csv('kplr_dr25_inj1_plti.csv', header = 0)

print('Dataset Size:')
print(df.shape)
print()

temp_df = df.iloc[:, 0:15]
df_drop = temp_df[temp_df.isnull().any(axis=1)]
temp_df = temp_df.drop(df_drop.index.values)
temp_df = temp_df[temp_df.Recovered != 2]

print('Cleaned Dataset Size:')
print(temp_df.shape)
print()

X = temp_df.iloc[:, 1:14]
Y = temp_df.iloc[:, 14]

print('Input Size:', X.shape)
print('Output Size:', Y.shape)

Dataset Size:
(146294, 25)

Cleaned Dataset Size:
(145671, 15)

Input Size: (145671, 13)
Output Size: (145671,)


In [5]:
'''
# Problem 2
df = pd.read_csv('kplr_dr25_inj1_tces.csv', header = 0)

print('Dataset Size: ')
print(df.shape)
print()

cols = ['TCE_ID', 'KIC', 'Disp', 'Score', 'period', 'epoch', 'NTL', 'SS', 
        'CO', 'EM', 'Expected_MES', 'MES', 'NTran', 'depth', 'duration', 'Rp',
        'Rs', 'Ts', 'logg', 'a', 'Rp/Rs', 'a/Rs', 'impact', 'SNR_DV', 'Sp',
        'Fit_Prov']
df = df[cols]
df.columns

df['Disp'] = df['Disp'].replace('PC', 1)
df['Disp'] = df['Disp'].replace('FP', 0)

X = df.iloc[:, 10:25]
Y = df.iloc[:, 2]

print('Input Size:', X.shape)
print('Output Size:', Y.shape)
'''

"\n# Problem 2\ndf = pd.read_csv('kplr_dr25_inj1_tces.csv', header = 0)\n\nprint('Dataset Size: ')\nprint(df.shape)\nprint()\n\ncols = ['TCE_ID', 'KIC', 'Disp', 'Score', 'period', 'epoch', 'NTL', 'SS', \n        'CO', 'EM', 'Expected_MES', 'MES', 'NTran', 'depth', 'duration', 'Rp',\n        'Rs', 'Ts', 'logg', 'a', 'Rp/Rs', 'a/Rs', 'impact', 'SNR_DV', 'Sp',\n        'Fit_Prov']\ndf = df[cols]\ndf.columns\n\ndf['Disp'] = df['Disp'].replace('PC', 1)\ndf['Disp'] = df['Disp'].replace('FP', 0)\n\nX = df.iloc[:, 10:25]\nY = df.iloc[:, 2]\n\nprint('Input Size:', X.shape)\nprint('Output Size:', Y.shape)\n"

In [6]:
abc = AdaBoostClassifier()
dtc = DecisionTreeClassifier()
etc = ExtraTreesClassifier()
gbc = GradientBoostingClassifier()
gnb = GaussianNB()
gpc = GaussianProcessClassifier()
knc = KNeighborsClassifier()
mlp = MLPClassifier()
rfc = RandomForestClassifier()
svc = SVC()
xgb = XGBClassifier()

In [7]:
start = time.time()
abc_predict = cross_val_predict(abc, X, Y, cv = kFold)
abc_predict_proba = pd.DataFrame(cross_val_predict(abc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
abc_accuracy = metrics.accuracy_score(Y, abc_predict)  
abc_precision = metrics.precision_score(Y, abc_predict, pos_label=1)
abc_recall = metrics.recall_score(Y, abc_predict, pos_label=1)  
abc_f1 = metrics.f1_score(Y, abc_predict, pos_label=1)
abc_auroc = metrics.roc_auc_score(Y, abc_predict_proba[1])
abc_aurpc = metrics.average_precision_score(Y, abc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  127.51


In [8]:
start = time.time()
dtc_predict = cross_val_predict(dtc, X, Y, cv = kFold)
dtc_predict_proba = pd.DataFrame(cross_val_predict(dtc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
dtc_accuracy = metrics.accuracy_score(Y, dtc_predict)  
dtc_precision = metrics.precision_score(Y, dtc_predict, pos_label=1)
dtc_recall = metrics.recall_score(Y, dtc_predict, pos_label=1)  
dtc_f1 = metrics.f1_score(Y, dtc_predict, pos_label=1)
dtc_auroc = metrics.roc_auc_score(Y, dtc_predict_proba[1])
dtc_aurpc = metrics.average_precision_score(Y, dtc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  31.76


In [9]:
start = time.time()
etc_predict = cross_val_predict(etc, X, Y, cv = kFold)
etc_predict_proba = pd.DataFrame(cross_val_predict(etc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
etc_accuracy = metrics.accuracy_score(Y, etc_predict)  
etc_precision = metrics.precision_score(Y, etc_predict, pos_label=1)
etc_recall = metrics.recall_score(Y, etc_predict, pos_label=1)  
etc_f1 = metrics.f1_score(Y, etc_predict, pos_label=1)
etc_auroc = metrics.roc_auc_score(Y, etc_predict_proba[1])
etc_aurpc = metrics.average_precision_score(Y, etc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  17.82


In [10]:
start = time.time()
gbc_predict = cross_val_predict(gbc, X, Y, cv = kFold)
gbc_predict_proba = pd.DataFrame(cross_val_predict(gbc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
gbc_accuracy = metrics.accuracy_score(Y, gbc_predict)  
gbc_precision = metrics.precision_score(Y, gbc_predict, pos_label=1)
gbc_recall = metrics.recall_score(Y, gbc_predict, pos_label=1)  
gbc_f1 = metrics.f1_score(Y, gbc_predict, pos_label=1)
gbc_auroc = metrics.roc_auc_score(Y, gbc_predict_proba[1])
gbc_aurpc = metrics.average_precision_score(Y, gbc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  235.79


In [11]:
start = time.time()
gnb_predict = cross_val_predict(gnb, X, Y, cv = kFold)
gnb_predict_proba = pd.DataFrame(cross_val_predict(gnb, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
gnb_accuracy = metrics.accuracy_score(Y, gnb_predict)  
gnb_precision = metrics.precision_score(Y, gnb_predict, pos_label=1)
gnb_recall = metrics.recall_score(Y, gnb_predict, pos_label=1)  
gnb_f1 = metrics.f1_score(Y, gnb_predict, pos_label=1)
gnb_auroc = metrics.roc_auc_score(Y, gnb_predict_proba[1])
gnb_aurpc = metrics.average_precision_score(Y, gnb_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  1.24


In [12]:
'''
start = time.time()
gpc_predict = cross_val_predict(gpc, X, Y, cv = kFold)
gpc_predict_proba = pd.DataFrame(cross_val_predict(gpc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
gpc_accuracy = metrics.accuracy_score(Y, gpc_predict)  
gpc_precision = metrics.precision_score(Y, gpc_predict, pos_label=1)
gpc_recall = metrics.recall_score(Y, gpc_predict, pos_label=1)  
gpc_f1 = metrics.f1_score(Y, gpc_predict, pos_label=1)
gpc_auroc = metrics.roc_auc_score(Y, gpc_predict_proba[1])
gpc_aurpc = metrics.average_precision_score(Y, gpc_predict, pos_label=1)

print('Time: ', round(end - start, 2))
'''

"\nstart = time.time()\ngpc_predict = cross_val_predict(gpc, X, Y, cv = kFold)\ngpc_predict_proba = pd.DataFrame(cross_val_predict(gpc, X, Y, cv = kFold, method='predict_proba'))\nend = time.time()\n\n# Store metrics\ngpc_accuracy = metrics.accuracy_score(Y, gpc_predict)  \ngpc_precision = metrics.precision_score(Y, gpc_predict, pos_label=1)\ngpc_recall = metrics.recall_score(Y, gpc_predict, pos_label=1)  \ngpc_f1 = metrics.f1_score(Y, gpc_predict, pos_label=1)\ngpc_auroc = metrics.roc_auc_score(Y, gpc_predict_proba[1])\ngpc_aurpc = metrics.average_precision_score(Y, gpc_predict, pos_label=1)\n\nprint('Time: ', round(end - start, 2))\n"

In [13]:
start = time.time()
knc_predict = cross_val_predict(knc, X, Y, cv = kFold)
knc_predict_proba = pd.DataFrame(cross_val_predict(knc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
knc_accuracy = metrics.accuracy_score(Y, knc_predict)  
knc_precision = metrics.precision_score(Y, knc_predict, pos_label=1)
knc_recall = metrics.recall_score(Y, knc_predict, pos_label=1)  
knc_f1 = metrics.f1_score(Y, knc_predict, pos_label=1)
knc_auroc = metrics.roc_auc_score(Y, knc_predict_proba[1])
knc_aurpc = metrics.average_precision_score(Y, knc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  23.17


In [14]:
start = time.time()
mlp_predict = cross_val_predict(mlp, X, Y, cv = kFold)
mlp_predict_proba = pd.DataFrame(cross_val_predict(mlp, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
mlp_accuracy = metrics.accuracy_score(Y, mlp_predict)  
mlp_precision = metrics.precision_score(Y, mlp_predict, pos_label=1)
mlp_recall = metrics.recall_score(Y, mlp_predict, pos_label=1)  
mlp_f1 = metrics.f1_score(Y, mlp_predict, pos_label=1)
mlp_auroc = metrics.roc_auc_score(Y, mlp_predict_proba[1])
mlp_aurpc = metrics.average_precision_score(Y, mlp_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  809.28


In [15]:
start = time.time()
rfc_predict = cross_val_predict(rfc, X, Y, cv = kFold)
rfc_predict_proba = pd.DataFrame(cross_val_predict(rfc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
rfc_accuracy = metrics.accuracy_score(Y, rfc_predict)  
rfc_precision = metrics.precision_score(Y, rfc_predict, pos_label=1)
rfc_recall = metrics.recall_score(Y, rfc_predict, pos_label=1)  
rfc_f1 = metrics.f1_score(Y, rfc_predict, pos_label=1)
rfc_auroc = metrics.roc_auc_score(Y, rfc_predict_proba[1])
rfc_aurpc = metrics.average_precision_score(Y, rfc_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  49.82


In [16]:
'''
start = time.time()
svc_predict = cross_val_predict(svc, X, Y, cv = kFold)
svc_predict_proba = pd.DataFrame(cross_val_predict(svc, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
svc_accuracy = metrics.accuracy_score(Y, svc_predict)  
svc_precision = metrics.precision_score(Y, svc_predict, pos_label=1)
svc_recall = metrics.recall_score(Y, svc_predict, pos_label=1)  
svc_f1 = metrics.f1_score(Y, svc_predict, pos_label=1)
svc_auroc = metrics.roc_auc_score(Y, svc_predict_proba[1])
svc_aurpc = metrics.average_precision_score(Y, svc_predict, pos_label=1)

print('Time: ', round(end - start, 2))
'''

"\nstart = time.time()\nsvc_predict = cross_val_predict(svc, X, Y, cv = kFold)\nsvc_predict_proba = pd.DataFrame(cross_val_predict(svc, X, Y, cv = kFold, method='predict_proba'))\nend = time.time()\n\n# Store metrics\nsvc_accuracy = metrics.accuracy_score(Y, svc_predict)  \nsvc_precision = metrics.precision_score(Y, svc_predict, pos_label=1)\nsvc_recall = metrics.recall_score(Y, svc_predict, pos_label=1)  \nsvc_f1 = metrics.f1_score(Y, svc_predict, pos_label=1)\nsvc_auroc = metrics.roc_auc_score(Y, svc_predict_proba[1])\nsvc_aurpc = metrics.average_precision_score(Y, svc_predict, pos_label=1)\n\nprint('Time: ', round(end - start, 2))\n"

In [17]:
start = time.time() 
xgb_predict = cross_val_predict(xgb, X, Y, cv = kFold)
xgb_predict_proba = pd.DataFrame(cross_val_predict(xgb, X, Y, cv = kFold, method='predict_proba'))
end = time.time()

# Store metrics
xgb_accuracy = metrics.accuracy_score(Y, xgb_predict)  
xgb_precision = metrics.precision_score(Y, xgb_predict, pos_label=1)
xgb_recall = metrics.recall_score(Y, xgb_predict, pos_label=1)  
xgb_f1 = metrics.f1_score(Y, xgb_predict, pos_label=1)
xgb_auroc = metrics.roc_auc_score(Y, xgb_predict_proba[1])
xgb_aurpc = metrics.average_precision_score(Y, xgb_predict, pos_label=1)

print('Time: ', round(end - start, 2))

Time:  166.35


In [18]:
predictions = {
    'ADABoost': abc_predict,
    'ExtraTrees': etc_predict,
    'RandomForest': rfc_predict,
    'GradientBoosting': gbc_predict,
    'XGBoost': xgb_predict,
    'DecisionTree': dtc_predict,
    'MultiLayerPerceptron': mlp_predict,
    'KNeighbors': knc_predict,
    'NaiveBayes': gnb_predict
} 

predictions = pd.DataFrame(predictions)

In [19]:
# Store metrics
predictions['Aggregate'] = predictions.mean(axis = 1)
aggregate_auroc = metrics.roc_auc_score(Y, predictions['Aggregate'])

predictions['Aggregate'] = round(predictions['Aggregate']).astype(int)
aggregate_accuracy = metrics.accuracy_score(Y, predictions['Aggregate'])  
aggregate_precision = metrics.precision_score(Y, predictions['Aggregate'], pos_label = 1)
aggregate_recall = metrics.recall_score(Y, predictions['Aggregate'], pos_label = 1)  
aggregate_f1 = metrics.f1_score(Y, predictions['Aggregate'], pos_label = 1)
aggregate_aurpc = metrics.average_precision_score(Y, predictions['Aggregate'], pos_label = 1)

In [20]:
# Model comparison
models = pd.DataFrame({
    'Model': ['ADA Boost', 'Extra Trees', 'Random Forest', 'Gradient Boosting', 'XG Boost', 'Decision Tree',
              'Multi Layer Perceptron', 'K Neighbors', 'Naive Bayes', 'Aggregate'],
    'Accuracy' : [abc_accuracy, etc_accuracy, rfc_accuracy, gbc_accuracy, xgb_accuracy, dtc_accuracy,
                  mlp_accuracy, knc_accuracy, gnb_accuracy, aggregate_accuracy],
    'F1' : [abc_f1, etc_f1, rfc_f1, gbc_f1, xgb_f1, dtc_f1, mlp_f1, knc_f1, gnb_f1, aggregate_f1],
    'AUROC' : [abc_auroc, etc_auroc, rfc_auroc, gbc_auroc, xgb_auroc, dtc_auroc, mlp_auroc, knc_auroc, 
               gnb_auroc, aggregate_auroc],
    'AURPC' : [abc_aurpc, etc_aurpc, rfc_aurpc, gbc_aurpc, xgb_aurpc, dtc_aurpc, mlp_aurpc, knc_aurpc,
               gnb_aurpc, aggregate_aurpc],
    'Precision': [abc_precision, etc_precision, rfc_precision, gbc_precision, xgb_precision, dtc_precision, mlp_precision, 
                  knc_precision, gnb_precision, aggregate_precision],
    'Recall' : [abc_recall, etc_recall, rfc_recall, gbc_recall, xgb_recall, dtc_recall, mlp_recall, knc_recall,
                gnb_recall, aggregate_recall]
})
# Print table and sort by test precision
models = models.sort_values(by = 'Accuracy', ascending = False)

blankIndex = [''] * len(models)
models.index = blankIndex
models

Unnamed: 0,Model,Accuracy,F1,AUROC,AURPC,Precision,Recall
,Gradient Boosting,0.88945,0.821452,0.956928,0.727988,0.815915,0.827063
,XG Boost,0.889312,0.821752,0.956764,0.727676,0.81387,0.829787
,Aggregate,0.889223,0.820885,0.924018,0.727509,0.816255,0.825568
,ADA Boost,0.887013,0.819431,0.953368,0.722775,0.805565,0.833784
,Random Forest,0.881033,0.799774,0.942286,0.710306,0.828788,0.772722
,Extra Trees,0.868423,0.771552,0.931884,0.683323,0.827589,0.722623
,Decision Tree,0.845707,0.749599,0.820379,0.638436,0.748115,0.751088
,Multi Layer Perceptron,0.83852,0.747333,0.930184,0.627974,0.720129,0.776674
,K Neighbors,0.717404,0.487399,0.719908,0.413898,0.551032,0.43694
,Naive Bayes,0.455932,0.521889,0.860136,0.355842,0.357559,0.96573


In [21]:
models.to_csv('baseline.csv', sep = ',', encoding = 'utf-8', index = False)