In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
# Load the KOI table, index it by KOI name, and recode the disposition label
# as a boolean target that is True only for rows marked 'CONFIRMED'.
data = (
    pd.read_csv('data/koi_data.csv')
    .set_index('kepoi_name')
    .assign(koi_disposition=lambda frame: frame['koi_disposition'].eq('CONFIRMED'))
)
data.head()

Unnamed: 0_level_0,koi_disposition,koi_period,koi_impact,koi_duration,koi_depth,koi_ror,koi_srho,koi_prad,koi_sma,koi_incl,...,koi_fwm_srao,koi_fwm_sdeco,koi_fwm_prao,koi_fwm_pdeco,koi_dicco_mra,koi_dicco_mdec,koi_dicco_msky,koi_dikco_mra,koi_dikco_mdec,koi_dikco_msky
kepoi_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
K00752.01,True,9.48804,0.146,2.9575,615.8,0.02234,3.20796,2.26,0.0853,89.66,...,0.43,0.94,-0.0002,-0.00055,-0.01,0.2,0.2,0.08,0.31,0.32
K00752.02,True,54.41838,0.586,4.507,874.8,0.02795,3.02368,2.83,0.2734,89.57,...,-0.63,1.23,0.00066,-0.00105,0.39,0.0,0.39,0.49,0.12,0.5
K00754.01,False,1.73695,1.276,2.40641,8079.2,0.38739,0.2208,33.46,0.0267,67.09,...,-0.111,0.002,0.00302,-0.00142,-0.249,0.147,0.289,-0.257,0.099,0.276
K00755.01,True,2.52559,0.701,1.6545,603.3,0.02406,1.98635,2.75,0.0374,85.41,...,-0.01,0.23,8e-05,-7e-05,0.03,-0.09,0.1,0.07,0.02,0.07
K00114.01,False,7.36179,1.169,5.022,233.7,0.18339,0.00485,39.21,0.082,60.92,...,-13.45,24.09,0.00303,-0.00555,-4.506,7.71,8.93,-4.537,7.713,8.948


In [3]:
# Summary statistics per column: the features span very different scales
# (compare the std/max rows), which is why the models are fit on standardized data.
data.describe()

Unnamed: 0,koi_period,koi_impact,koi_duration,koi_depth,koi_ror,koi_srho,koi_prad,koi_sma,koi_incl,koi_teq,...,koi_fwm_srao,koi_fwm_sdeco,koi_fwm_prao,koi_fwm_pdeco,koi_dicco_mra,koi_dicco_mdec,koi_dicco_msky,koi_dikco_mra,koi_dikco_mdec,koi_dikco_msky
count,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,...,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0,5202.0
mean,37.032237,0.717106,5.607025,21340.318993,0.235205,3.41537,112.230798,0.158146,81.181413,1143.721069,...,-0.355681,-0.805629,-0.000263,0.000439,-0.049743,-0.087413,1.930251,-0.038402,-0.098738,1.920226
std,88.417985,2.628207,6.962634,66989.80855,2.586213,25.131368,3699.799318,0.241792,16.308839,775.788868,...,10.978677,14.741473,0.065707,0.077519,2.46567,2.746534,3.147553,2.465094,2.734732,3.142764
min,0.30694,0.0,0.1046,0.8,0.00129,4e-05,0.08,0.0072,2.29,92.0,...,-275.6,-397.62,-4.0,-0.8,-21.5,-75.9,0.0,-23.6,-76.6,0.0
25%,2.213962,0.226,2.50025,176.8,0.013058,0.176092,1.46,0.033,81.93,615.25,...,-0.5,-0.57,-0.00024,-0.00024,-0.27,-0.2915,0.12825,-0.26525,-0.32,0.18
50%,7.386755,0.61,3.8055,495.95,0.024185,0.748045,2.6,0.07365,87.89,948.0,...,0.0,-0.03,0.0,0.0,0.0,0.0,0.46,-0.007,-0.018,0.453
75%,23.448117,0.92375,6.00075,2120.525,0.17126,2.267063,21.645,0.1582,89.52,1482.0,...,0.5,0.45,0.00026,0.00028,0.23,0.23,2.57,0.22625,0.25,2.42
max,1071.23262,100.806,138.54,864260.0,99.87065,918.75239,200346.0,2.0345,90.0,9791.0,...,97.78,98.78,1.19,5.0,45.68,27.5,88.6,46.57,31.2,89.6


In [4]:
# Target is the boolean disposition column; the features are every other column.
y = data['koi_disposition']
x = data.drop(columns='koi_disposition')

In [5]:
def create_pipeline(model):
    """Wrap an estimator in a pipeline that standardizes the features first.

    Parameters
    ----------
    model : estimator
        Object used as the final ``'clf'`` step of the pipeline.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``'std'`` (a new ``StandardScaler``) followed by
        ``'clf'`` (the given ``model``).
    """
    # Steps are given as (name, estimator) tuples, the form the Pipeline API
    # documents, instead of two-element lists.
    return Pipeline(steps=[('std', StandardScaler()),
                           ('clf', model)])

In [6]:
def create_model(experiments, model_name, model_obj, model_params=None):
    """Instantiate a model and record it in the experiments registry.

    Parameters
    ----------
    experiments : dict
        Registry holding lists under the keys ``'model_name'``,
        ``'model_object'`` and ``'model_params'``; it is extended in place.
    model_name : str
        Label stored for this experiment.
    model_obj : callable
        Class or factory, called as ``model_obj(**model_params)``.
    model_params : dict, optional
        Keyword arguments for ``model_obj``. ``None`` (the default) means
        "no arguments".

    Returns
    -------
    dict
        The same ``experiments`` object that was passed in.
    """
    # A `{}` default would be one dict shared by every call that omits the
    # argument; because the dict is stored in the registry, mutating one record
    # would silently alter the others. Take a fresh copy per experiment instead.
    params = {} if model_params is None else dict(model_params)

    experiments['model_name'].append(model_name)
    experiments['model_object'].append(model_obj(**params))
    experiments['model_params'].append(params)

    return experiments

In [7]:
# Fixed seed handed to every estimator that accepts `random_state` for its
# internal randomness (trees, forests, boosting), so reruns build the same models.
SEED = 42

# One entry per model family: (label, estimator class, hyper-parameter settings).
# Registration order follows this table top to bottom.
experiment_grid = [
    # Naive Bayes baseline
    ('NaiveBayes', GaussianNB, [{}]),

    # Decision Tree experiments
    ('DecisionTree', DecisionTreeClassifier,
     [{'max_depth': depth, 'random_state': SEED} for depth in (5, 10, 15, 20, None)]),

    # SVM experiments
    ('SVM', SVC, [
        {'kernel': 'linear'},
        {'kernel': 'sigmoid'},
        {'kernel': 'poly', 'degree': 2},
        {'kernel': 'poly', 'degree': 3},
        {'kernel': 'poly', 'degree': 4},
        {'kernel': 'rbf'},
    ]),

    # KNN experiments
    ('KNN', KNeighborsClassifier,
     [{'n_neighbors': k} for k in (1, 2, 5, 10, 50)]),

    # Random Forest experiments
    ('RandomForest', RandomForestClassifier,
     [{'n_estimators': n, 'random_state': SEED} for n in (50, 100, 200, 500)]),

    # Gradient Tree Boosting experiments
    ('GradientTreeBoosting', GradientBoostingClassifier,
     [{'n_estimators': n, 'random_state': SEED} for n in (50, 100, 200, 500)]),
]

experiments = {'model_name': [], 'model_object': [], 'model_params': []}

for model_name, model_class, param_settings in experiment_grid:
    for model_params in param_settings:
        experiments = create_model(experiments, model_name, model_class, model_params)

In [8]:
# Build one pipeline per registered model, keeping the registration order.
experiments['pipeline'] = [create_pipeline(model) for model in experiments['model_object']]

In [None]:
# Cross-validate every pipeline on (x, y) with cv=5; each entry is the array of
# per-fold scores returned by cross_val_score.
experiments['accuracy_scores'] = [
    cross_val_score(pipeline, x, y, cv=5, n_jobs=-1)
    for pipeline in experiments['pipeline']
]

In [None]:
# Reduce each experiment's per-fold scores to a mean and a standard deviation.
experiments['accuracy_mean'] = [np.mean(scores) for scores in experiments['accuracy_scores']]
experiments['accuracy_std'] = [np.std(scores) for scores in experiments['accuracy_scores']]

In [None]:
# Gather every recorded field into one table, one row per experiment.
results = pd.DataFrame(experiments)
# Bare last expression: the notebook renders the frame as a rich table
# instead of the plain-text dump that print() produces.
results

In [None]:
# Leaderboard: only the readable columns, best mean score first. Left as the
# cell's last expression so the notebook renders it as a rich table.
summary_columns = ['model_name', 'model_params', 'accuracy_mean', 'accuracy_std']
results[summary_columns].sort_values('accuracy_mean', ascending=False)