In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
df['koi_disposition'].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

# Select your features (columns)

In [6]:
# Set features. This will also be used as your x values.
data = df.drop("koi_disposition",axis=1)
target = df['koi_disposition']
selected_features = data.columns

In [7]:
target.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [8]:
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


Will be dropping:

koi_impact
koi_prad
koi_insol

All have 0 values - I don't think they will make a difference to the model. I also don't want it to interfere with accuracy

In [9]:
# # visualize data
# %matplotlib inline
# import matplotlib.pyplot as plt
# data_transit.hist(bins=50, figsize=(20,15))
# plt.show()

## Create train/test split

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data,target, random_state=42)

### Pre-processing

In [11]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
# from tensorflow.keras.utils import to_categorical

# scale the data
X_scaler = StandardScaler().fit(X_train)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [12]:
# transform X values
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Select and train a model

In [38]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
model_rfc = rf.fit(X_train_scaled, encoded_y_train)

In [39]:
print(f"Training Data Score: {model_rfc.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {model_rfc.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8975972540045767


In [40]:
# Calculate classification report
from sklearn.metrics import classification_report

predictions = model_rfc.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                           target_names =["confirmed", "false positive", "candidate"]))

                precision    recall  f1-score   support

     confirmed       0.82      0.76      0.79       411
false positive       0.83      0.84      0.84       484
     candidate       0.97      1.00      0.98       853

      accuracy                           0.90      1748
     macro avg       0.87      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



# Tuning

In [24]:
# finding important features and dropping those not needed
importances = rf.feature_importances_
sorted(zip(importances, list(data)), reverse=True)

[(0.1057948561435557, 'koi_fpflag_nt'),
 (0.10008157934280916, 'koi_fpflag_co'),
 (0.07609431777890929, 'koi_fpflag_ss'),
 (0.05352790123559077, 'koi_model_snr'),
 (0.04530144121897327, 'koi_prad'),
 (0.039558716327717354, 'koi_fpflag_ec'),
 (0.03818206316683265, 'koi_duration_err1'),
 (0.032461960309466596, 'koi_steff_err1'),
 (0.032340725498036654, 'koi_prad_err2'),
 (0.0304553928727608, 'koi_prad_err1'),
 (0.02961804562400317, 'koi_duration_err2'),
 (0.02791700769763188, 'koi_steff_err2'),
 (0.023055645215791464, 'koi_time0bk_err2'),
 (0.02230938927444922, 'koi_period'),
 (0.022201952430263905, 'koi_depth'),
 (0.021235463554549593, 'koi_duration'),
 (0.019833386235010353, 'koi_time0bk_err1'),
 (0.018811363258892214, 'koi_period_err2'),
 (0.018550475059047555, 'koi_impact'),
 (0.018160318249184224, 'koi_period_err1'),
 (0.017192128202101552, 'koi_insol_err1'),
 (0.01619981637469335, 'koi_insol_err2'),
 (0.015112813419421335, 'koi_insol'),
 (0.014773844318084702, 'koi_teq'),
 (0.01294

In [28]:
# Build New X Data - drop all features with less than 1% importance.
new_data = data.drop(["koi_tce_plnt_num","koi_slogg_err1",'koi_srad_err2','koi_srad','koi_slogg','koi_steff'],axis=1)

In [29]:
new_data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_steff_err1,koi_steff_err2,koi_slogg_err2,koi_srad_err1,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,2.87,-1.62,25.8,81,-81,-0.096,0.105,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,31.04,-10.49,76.3,158,-176,-0.176,0.233,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,668.95,-230.35,505.6,157,-174,-0.168,0.201,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,874.33,-314.24,40.9,169,-211,-0.21,0.334,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,420.33,-136.7,40.2,189,-232,-0.229,0.315,296.28613,48.22467,15.714


### Re-Preprocess

In [31]:
from sklearn.model_selection import train_test_split
X_train_new,X_test_new,y_train,y_test = train_test_split(new_data,target, random_state=42)

In [34]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler

# scale the data
X_scaler_new = StandardScaler().fit(X_train_new)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [35]:
# transform X values
X_train_scaled_new = X_scaler_new.transform(X_train_new)
X_test_scaled_new = X_scaler_new.transform(X_test_new)

## Refit model with new_data

In [36]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 200)
model_rfc_new = rf.fit(X_train_scaled_new, encoded_y_train)

In [37]:
print(f"Training Data Score: {model_rfc_new.score(X_train_scaled_new, encoded_y_train)}")
print(f"Testing Data Score: {model_rfc_new.score(X_test_scaled_new, encoded_y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9004576659038902


In [42]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'RandomForestClassifier.sav'
joblib.dump(model_rfc_new, filename)

['RandomForestClassifier.sav']