In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the raw table, discard columns that hold no data at all, and then
# discard every row that still has at least one missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

### Select your features (columns)

In [3]:
# Columns kept as model inputs; variables that are highly correlated
# were dropped from this list.
feature_columns = [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_period_err1',
    'koi_time0bk', 'koi_time0bk_err1',
    'koi_impact', 'koi_impact_err1',
    'koi_duration', 'koi_duration_err1',
    'koi_depth', 'koi_depth_err1',
    'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num',
    'koi_steff', 'koi_steff_err1',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'ra', 'dec', 'koi_kepmag',
]

# Feature matrix (one column per entry above, in that order).
X = df.loc[:, feature_columns]

# Target: the disposition label the classifier learns to predict.
y = df.loc[:, 'koi_disposition']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

### Random Forest Model using GridSearch

In [5]:
# Import GridSearchCV and define the hyper-parameter grid it will search

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest search:
# 18 leaf-node limits x 8 depths x 3 leaf sizes x 1 forest size = 432 combinations.
params = {
    'max_leaf_nodes': [*range(2, 20)],
    'max_depth': [*range(2, 10)],
    'min_samples_leaf': [200, 250, 300],
    'n_estimators': [200],
}

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over `params` with 3-fold cross-validation.
# The search consists of many independent fits (one per candidate per fold),
# so n_jobs=-1 distributes them over all available CPU cores instead of
# running them one after another. Parallelism only changes wall-clock time:
# the forest seed is fixed, so the selected parameters and scores are the same.
rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=101),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

rfc.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Show the winning hyper-parameter combination, then its score, one per line.
print(rfc.best_params_, rfc.best_score_, sep="\n")

### Prediction and Evaluation of Model

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Predict a label for every row of the held-out test features.
predictions = rfc.predict(X_test)

In [None]:
# Confusion matrix: true test labels against the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
# Per-class text report comparing the true test labels with the predictions.
print(classification_report(y_true=y_test, y_pred=predictions))

### Save the Model

In [None]:
import joblib

# Destination file for the serialized, fitted grid-search object.
filename = 'RandomForest_model.sav'

# Persist the whole fitted search object (not only the best estimator).
joblib.dump(value=rfc, filename=filename)

In [None]:
# One-column frames for the predicted and the actual test labels. Both get a
# fresh 0..n-1 index, so rows line up by position.
pred = pd.DataFrame({'Predicted': predictions})
Resp = pd.DataFrame({'Response': y_test.tolist()})

# Place actual and predicted labels side by side (joined on the shared index)
# and export the comparison table to an Excel workbook.
Output = Resp.join(pred)
Output.to_excel('RF_Predictions.xlsx')