In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1315 sha256=d95524959c4660eebbae8688c0710536b5b0757a2ed7f63e58a20f8b67d3460b
  Stored in directory: /Users/saurin/Library/Caches/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder 
from sklearn.metrics import accuracy_score

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
koi_disposition      6991 non-null object
koi_fpflag_nt        6991 non-null int64
koi_fpflag_ss        6991 non-null int64
koi_fpflag_co        6991 non-null int64
koi_fpflag_ec        6991 non-null int64
koi_period           6991 non-null float64
koi_period_err1      6991 non-null float64
koi_period_err2      6991 non-null float64
koi_time0bk          6991 non-null float64
koi_time0bk_err1     6991 non-null float64
koi_time0bk_err2     6991 non-null float64
koi_impact           6991 non-null float64
koi_impact_err1      6991 non-null float64
koi_impact_err2      6991 non-null float64
koi_duration         6991 non-null float64
koi_duration_err1    6991 non-null float64
koi_duration_err2    6991 non-null float64
koi_depth            6991 non-null float64
koi_depth_err1       6991 non-null float64
koi_depth_err2       6991 non-null float64
koi_prad             6991 non-null float64

In [5]:
# Select your features (columns)

In [6]:
# Set features. This will also be used as your x values.
X = df.drop('koi_disposition', axis=1)
X.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [7]:
# Create a Train Test Split

# Use `koi_disposition` for the y values
y = df['koi_disposition'].values.reshape(-1,1)
y

array([['CONFIRMED'],
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE'],
       ...,
       ['CANDIDATE'],
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE']], dtype=object)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 42)

In [26]:
X_test

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4982,1,0,0,0,88.523517,2.337000e-03,-2.337000e-03,189.275600,0.023200,-0.023200,...,-75,4.782,0.060,-0.070,0.462,0.050,-0.070,295.57629,48.897861,15.841
4866,0,0,0,0,102.493119,1.322000e-03,-1.322000e-03,166.009400,0.010500,-0.010500,...,-140,4.583,0.024,-0.117,0.791,0.132,-0.055,297.66437,46.944820,15.142
2934,0,0,1,0,1.181165,3.890000e-06,-3.890000e-06,132.425220,0.002760,-0.002760,...,-154,4.582,0.048,-0.078,0.747,0.098,-0.066,291.34879,51.256119,15.971
5007,0,1,0,0,8.412009,2.360000e-07,-2.360000e-07,139.648520,0.000024,-0.000024,...,-206,4.225,0.185,-0.185,1.314,0.373,-0.305,291.84970,37.738621,13.551
3869,0,1,1,1,1.195515,5.730000e-07,-5.730000e-07,132.303851,0.000410,-0.000410,...,-182,4.271,0.220,-0.180,1.155,0.331,-0.271,297.54337,41.947979,13.998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4253,0,0,0,0,41.889700,9.430000e-05,-9.430000e-05,172.001020,0.001650,-0.001650,...,-195,4.519,0.050,-0.212,0.907,0.273,-0.091,295.46463,47.908070,14.740
387,0,0,0,0,11.515172,1.570000e-05,-1.570000e-05,174.858550,0.001060,-0.001060,...,-155,4.590,0.024,-0.102,0.798,0.108,-0.054,284.68887,46.172249,15.386
2807,0,0,0,0,9.478508,5.270000e-05,-5.270000e-05,140.165260,0.004300,-0.004300,...,-217,4.428,0.054,-0.202,1.076,0.335,-0.112,286.97128,44.708302,14.456
1032,0,0,0,0,63.073224,2.866000e-04,-2.866000e-04,191.253200,0.003540,-0.003540,...,-74,3.209,0.458,-0.122,6.354,1.590,-3.180,298.75018,45.466400,13.089


In [27]:
y_test

array([['FALSE POSITIVE'],
       ['CANDIDATE'],
       ['FALSE POSITIVE'],
       ...,
       ['CONFIRMED'],
       ['CANDIDATE'],
       ['FALSE POSITIVE']], dtype=object)

In [9]:
X_train

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3875,1,0,0,0,11.377528,1.111000e-04,-1.111000e-04,132.584800,0.008330,-0.008330,...,-305,4.162,0.128,-0.192,1.660,0.528,-0.352,300.19409,45.145741,13.930
1768,0,0,0,0,2.215713,7.630000e-06,-7.630000e-06,131.821070,0.002980,-0.002980,...,-169,4.447,0.108,-0.162,0.901,0.195,-0.130,297.02008,43.432549,15.392
3250,0,0,0,0,7.785911,2.034000e-04,-2.034000e-04,137.873400,0.027900,-0.027900,...,-180,4.479,0.054,-0.216,0.954,0.305,-0.102,290.42307,51.388729,13.515
6574,1,0,1,1,2.404557,3.730000e-06,-3.730000e-06,131.676160,0.001440,-0.001440,...,-175,4.013,0.259,-0.130,1.775,0.411,-0.503,293.21356,46.175129,13.474
2815,0,0,0,0,110.461746,2.985000e-03,-2.985000e-03,147.546500,0.023800,-0.023800,...,-206,4.434,0.054,-0.216,1.058,0.349,-0.116,287.03952,46.481701,15.092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,0,0,8.268081,6.340000e-07,-6.340000e-07,135.056330,0.000064,-0.000064,...,-190,4.502,0.050,-0.200,0.922,0.273,-0.091,292.53125,46.728699,15.768
5191,0,0,0,0,11.161938,1.677000e-04,-1.677000e-04,133.553800,0.013000,-0.013000,...,-124,4.072,0.188,-0.101,1.640,0.281,-0.343,295.21268,49.562180,13.374
5226,0,1,0,0,6.150251,7.000000e-07,-7.000000e-07,134.422825,0.000088,-0.000088,...,-458,3.896,0.270,-0.180,2.867,0.988,-1.087,297.18176,45.988441,10.622
5390,1,0,0,0,3.343285,4.380000e-05,-4.380000e-05,134.845100,0.011200,-0.011200,...,-197,3.773,0.293,-0.098,2.652,0.433,-0.939,296.86258,41.147419,13.276


In [10]:
y_train

array([['FALSE POSITIVE'],
       ['CONFIRMED'],
       ['CANDIDATE'],
       ...,
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE']], dtype=object)

In [11]:
# Pre-processing

In [12]:
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_e = label_encoder.transform(y_train).reshape(-1,1)
y_test_e = label_encoder.transform(y_test)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
enc = OneHotEncoder(handle_unknown='ignore')

In [15]:
y_enc = enc.fit_transform(y_train_e).toarray()

In [24]:
y_test_enc = enc.transform(y_test).toarray()
y_test_enc.shape

(1399, 3)

# Train the Model



In [17]:
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_enc)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
y_pred = rf.predict(X_test_scaled)
y_pred

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [19]:
pred_df = pd.DataFrame(y_pred, y_test_e)
pred_df.head()

Unnamed: 0,0,1,2
2,0.0,0.0,1.0
0,1.0,0.0,0.0
2,0.0,0.0,1.0
2,0.0,0.0,1.0
2,0.0,0.0,1.0


In [20]:
inv_pred = enc.inverse_transform(y_pred)
inv_pred

array([[2],
       [0],
       [2],
       ...,
       [1],
       [0],
       [2]], dtype=object)

In [25]:
y_test_e = y_test_e.reshape(-1,1)
y_test_e

array([[2],
       [0],
       [2],
       ...,
       [1],
       [0],
       [2]])

In [28]:
accuracy = rf.score(y_test_e, y_pred)

ValueError: Number of features of the model must match the input. Model n_features is 40 and input n_features is 1 

In [23]:
# accuracy = rf.score(y_test_e, inv_pred)
print("Accuracy Score: {}".format(accuracy_score(y_test_enc, y_pred)))

Accuracy Score: 0.035739814152966405


In [48]:
inv_pred_1 = label_encoder.inverse_transform(inv_pred.astype(int).ravel())

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)