In [1]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:

# Load the table, drop columns that are entirely empty, then drop every row
# that still contains a missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

# Partition on the label column: rows whose koi_disposition is 'CANDIDATE'
# go to unknown_df, every other row goes to known_df.
is_candidate = df['koi_disposition'] == 'CANDIDATE'
unknown_df = df[is_candidate]
known_df = df[~is_candidate]
known_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6982,FALSE POSITIVE,0,1,0,0,1.012174,2.860000e-06,-2.860000e-06,131.908350,0.003960,...,-312,4.096,0.209,-0.171,1.716,0.478,-0.478,294.10941,44.691078,14.712
6983,FALSE POSITIVE,0,1,0,0,21.513523,2.714000e-04,-2.714000e-04,132.335600,0.012200,...,-141,3.508,0.187,-0.153,3.318,0.665,-0.813,287.46786,37.966640,10.630
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# Features are every column except the label; the label is `koi_disposition`.
X = known_df.drop('koi_disposition', axis=1)
y = known_df['koi_disposition']

# First split: 80% train / 20% test, seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.80)

# Persist the (unscaled) 80/20 split. Note that the X_train / y_train stored
# here are the full 80% partition, i.e. taken BEFORE the hyperparameter set is
# carved out below.
# Create the output directory first so a fresh checkout does not fail on open().
os.makedirs('pickle_jar', exist_ok=True)
ensemble_split = (X_train, X_test, y_train, y_test)
with open('pickle_jar/ensemble_split.pickle', 'wb') as kosher_dill:
    pickle.dump(ensemble_split, kosher_dill)

# Second split: hold out 25% of the training partition as a hyperparameter
# tuning set. X_train / y_train are rebound to the remaining 75%, so the final
# proportions of the known data are 60% train / 20% tuning / 20% test.
X_train, X_hp_train, y_train, y_hp_train = train_test_split(X_train, y_train, random_state=0, train_size=0.75)


# Pre-processing

Scale the data using the MinMaxScaler. (No feature selection is performed in this notebook; every non-label column is kept as a feature.)

In [7]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training partition only, then apply that same fitted
# transform to the tuning and test partitions so no information from them
# influences the scaling.
minmax_scaler = MinMaxScaler().fit(X_train)

X_train_minmax = minmax_scaler.transform(X_train)
X_hp_train_minmax = minmax_scaler.transform(X_hp_train)
X_test_minmax = minmax_scaler.transform(X_test)

# Persist the scaled feature matrices together with their labels.
# Tuple order: train X, tuning X, test X, train y, tuning y, test y.
split_data = (X_train_minmax, X_hp_train_minmax, X_test_minmax, y_train, y_hp_train, y_test)
with open('pickle_jar/split_data.pickle', 'wb') as kosher_dill:
    pickle.dump(split_data, kosher_dill)

# Persist the CANDIDATE rows as-is (unscaled, label column still present).
# NOTE(review): the fitted `minmax_scaler` itself is not saved anywhere in this
# notebook; anything that later feeds `unknown_df` to a model trained on the
# scaled features would need the same scaler - confirm how consumers obtain it.
with open('pickle_jar/unknown_df.pickle', 'wb') as bread_butter:
    pickle.dump(unknown_df, bread_butter)

# Train the Model



In [8]:
# Baseline: KNN with default hyperparameters, fit on the MinMax-scaled
# training features and scored on the scaled train and test partitions.
# NOTE(review): the author's original remark here was "ESP - Model performs
# better without scaling", yet this cell fits and scores on the *scaled*
# matrices - confirm which input was intended.
knn_model = KNeighborsClassifier(n_jobs=-3).fit(X_train_minmax, y_train)

train_score = knn_model.score(X_train_minmax, y_train)
print(f"Training Data Score: {train_score}")
test_score = knn_model.score(X_test_minmax, y_test)
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9921408362150267
Testing Data Score: 0.9868049010367578


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [10]:
# Create the GridSearchCV model
# Search space: 7 * 2 * 4 * 5 * 3 = 840 candidate settings.
# 'n_jobs': [1] pins each candidate estimator to a single worker; presumably
# this avoids nesting parallelism inside the already-parallel search (confirm).
# NOTE(review): per the scikit-learn docs, `algorithm` and `leaf_size` select
# how neighbours are looked up (speed/memory); searching them may multiply the
# number of fits without changing scores - verify before trimming the grid.
param_grid = dict(
    n_neighbors=list(range(2, 9)),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=list(range(25, 50, 5)),
    p=[1, 2, 3],
    n_jobs=[1],
)

# cv=7: the author chose a high fold count because the candidate scores are
# very close and a representative average was wanted.
grid = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=7,
    n_jobs=-3,
    verbose=3,
)

In [11]:
# Run the cross-validated search on the scaled hyperparameter-tuning partition
# (not on the training or test partitions).
grid.fit(X=X_hp_train_minmax, y=y_hp_train)

Fitting 7 folds for each of 840 candidates, totalling 5880 fits
[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done  20 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-3)]: Done 260 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-3)]: Done 900 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-3)]: Done 1796 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-3)]: Done 2948 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-3)]: Done 4356 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 5880 out of 5880 | elapsed:  1.8min finished


GridSearchCV(cv=7, estimator=KNeighborsClassifier(n_jobs=-3), n_jobs=-3,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [25, 30, 35, 40, 45], 'n_jobs': [1],
                         'n_neighbors': [2, 3, 4, 5, 6, 7, 8], 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [12]:
# Report the winning parameter combination and its mean cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
print(best_params)
print(best_score)


{'algorithm': 'auto', 'leaf_size': 25, 'n_jobs': 1, 'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}
0.9896367574565553


In [14]:
# Rebuild the classifier from the hyperparameters the grid search selected,
# instead of hand-copying them from the printed output (hand-copied values go
# stale silently whenever the data or the grid changes).
# A fresh estimator is created rather than reusing `grid.best_estimator_`
# because the search was fit on the tuning partition, while the final model
# must be fit on the training partition.
# `grid.best_params_` also carries 'algorithm' and 'n_jobs' (both are keys of
# the search grid), so those are now set explicitly from the search result.
knn_model = KNeighborsClassifier(**grid.best_params_)
knn_model = knn_model.fit(X_train_minmax, y_train)

# Score on the scaled training and test partitions for comparison with the
# untuned baseline.
print(f"Training Data Score: {knn_model.score(X_train_minmax, y_train)}")
print(f"Testing Data Score: {knn_model.score(X_test_minmax, y_test)}")

Training Data Score: 0.9930839358692235
Testing Data Score: 0.9877474081055608


# Save the Model

In [15]:
# Serialize the tuned, fitted KNN classifier to disk with pickle.
model_path = 'pickle_jar/knn_model.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(knn_model, model_file)