In [11]:
import pandas as pd
from os import path
from sklearn.neighbors import KNeighborsClassifier

## Load and clean the data

In [38]:
filename = path.join(".", "data", "exoplanet_data.csv")

df = pd.read_csv(filename)
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [72]:
# Use the seven most important features identified in the random forest model
data = df[['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_model_snr', 'koi_prad', 'koi_prad_err2', 'koi_duration_err2']]
data.head()

Unnamed: 0,koi_fpflag_co,koi_fpflag_nt,koi_fpflag_ss,koi_model_snr,koi_prad,koi_prad_err2,koi_duration_err2
0,0,0,0,25.8,2.83,-0.19,-0.116
1,0,0,1,76.3,14.6,-1.31,-0.0341
2,0,0,1,505.6,33.46,-2.83,-0.00537
3,0,0,0,40.9,2.75,-0.35,-0.042
4,0,0,0,40.2,2.77,-0.3,-0.0673


## Split and scale the data

In [73]:
# Split the data into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=12)

In [74]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Find the best K

In [75]:
train_scores = []
test_scores = []
for k in range(1, 32, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 1, Train/Test Score: 1.000/0.824
k: 3, Train/Test Score: 0.900/0.857
k: 5, Train/Test Score: 0.887/0.860
k: 7, Train/Test Score: 0.883/0.862
k: 9, Train/Test Score: 0.879/0.864
k: 11, Train/Test Score: 0.877/0.870
k: 13, Train/Test Score: 0.877/0.870
k: 15, Train/Test Score: 0.876/0.870
k: 17, Train/Test Score: 0.874/0.868
k: 19, Train/Test Score: 0.873/0.865
k: 21, Train/Test Score: 0.873/0.867
k: 23, Train/Test Score: 0.871/0.868
k: 25, Train/Test Score: 0.871/0.868
k: 27, Train/Test Score: 0.872/0.868
k: 29, Train/Test Score: 0.872/0.870
k: 31, Train/Test Score: 0.870/0.872


In [76]:
knn = KNeighborsClassifier(n_neighbors=31)
knn.fit(X_train_scaled, y_train)
print('k=31 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

k=31 Test Acc: 0.872


In [77]:
count = 0
for x in knn.predict(X_test_scaled)[:20]:
    print(count, x)
    count += 1

0 CONFIRMED
1 FALSE POSITIVE
2 FALSE POSITIVE
3 FALSE POSITIVE
4 FALSE POSITIVE
5 CONFIRMED
6 CONFIRMED
7 CONFIRMED
8 CANDIDATE
9 FALSE POSITIVE
10 CONFIRMED
11 CANDIDATE
12 CONFIRMED
13 CONFIRMED
14 CANDIDATE
15 FALSE POSITIVE
16 FALSE POSITIVE
17 CANDIDATE
18 FALSE POSITIVE
19 FALSE POSITIVE


In [78]:
df['koi_disposition'].head(20)

0          CONFIRMED
1     FALSE POSITIVE
2     FALSE POSITIVE
3          CONFIRMED
4          CONFIRMED
5          CONFIRMED
6          CONFIRMED
7          CONFIRMED
8          CONFIRMED
9          CONFIRMED
10         CONFIRMED
11    FALSE POSITIVE
12    FALSE POSITIVE
13    FALSE POSITIVE
14         CONFIRMED
15         CONFIRMED
16    FALSE POSITIVE
17         CONFIRMED
18    FALSE POSITIVE
19         CONFIRMED
Name: koi_disposition, dtype: object

## Tune the model with GridSearchCV

In [79]:
# https://medium.com/@erikgreenj/k-neighbors-classifier-with-gridsearchcv-basics-3c445ddeb657
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': [3,9,31,51],
             'weights': ['uniform', 'distance'],
             'metric': ['euclidean', 'manhattan']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=1)

In [80]:
gs = grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.2s finished


In [81]:
gs.best_score_

0.7980195606138889

In [82]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='distance')

In [83]:
gs.best_params_

{'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}

In [84]:
count = 0
for x in gs.predict(X_test_scaled)[:20]:
    print(count, x)

0 CANDIDATE
0 CANDIDATE
0 FALSE POSITIVE
0 CANDIDATE
0 FALSE POSITIVE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 FALSE POSITIVE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 FALSE POSITIVE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE
0 CANDIDATE


In [85]:
df['koi_disposition'].head(20)

0          CONFIRMED
1     FALSE POSITIVE
2     FALSE POSITIVE
3          CONFIRMED
4          CONFIRMED
5          CONFIRMED
6          CONFIRMED
7          CONFIRMED
8          CONFIRMED
9          CONFIRMED
10         CONFIRMED
11    FALSE POSITIVE
12    FALSE POSITIVE
13    FALSE POSITIVE
14         CONFIRMED
15         CONFIRMED
16    FALSE POSITIVE
17         CONFIRMED
18    FALSE POSITIVE
19         CONFIRMED
Name: koi_disposition, dtype: object