In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read the CSV and Perform Basic Data Cleaning

In [34]:
# Load the raw table and clean it in a single chain:
#   1. drop columns in which every value is null,
#   2. drop any remaining row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

### Select your features (columns)

In [35]:
# Target: the disposition label column.
y = df['koi_disposition']

# Features: every other column of the cleaned frame.
X = df.drop(columns='koi_disposition')

### Drop input variables that are highly correlated

In [36]:
# Absolute pairwise correlation above which the later column of a pair is dropped.
CORR_THRESHOLD = 0.7

# Absolute pairwise correlation between all feature columns.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is examined once and a column is never compared with itself.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any column
# that appears before it; masked (NaN) cells compare as False.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORR_THRESHOLD).any()
]

# Reassign instead of mutating in place so that re-running the cell is safe.
X = X.drop(columns=to_drop)

### Create a Train Test Split

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

### Decision Tree Model Without Tuning

In [49]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyperparameters.
# The seed matches the one used for the split and for the tuned model, so the
# fitted tree (and every metric computed from it) is the same on each run;
# an unseeded tree can differ between runs.
dtree = DecisionTreeClassifier(random_state=101)

dtree.fit(X_train, y_train)

DecisionTreeClassifier()

### Prediction and Evaluation of dtree Model

In [50]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

# Labels predicted by the untuned baseline tree for the held-out rows.
predictions = dtree.predict(X_test)

In [51]:
# Confusion matrix of the baseline tree: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[ 369  133   13]
 [ 141  376    8]
 [   8   13 1037]]


In [41]:
# Per-class precision / recall / F1 of the baseline tree on the held-out rows.
print(classification_report(y_true=y_test, y_pred=predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.71      0.71      0.71       515
     CONFIRMED       0.71      0.72      0.71       525
FALSE POSITIVE       0.98      0.98      0.98      1058

      accuracy                           0.85      2098
     macro avg       0.80      0.80      0.80      2098
  weighted avg       0.85      0.85      0.85      2098



### Decision Tree model with Hyperparameter Tuning

In [42]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV

# Search space for the grid search: 18 * 8 * 3 = 432 combinations.
params = {
    'max_leaf_nodes': list(range(2, 20)),
    'max_depth': list(range(2, 10)),
    'min_samples_leaf': [200, 250, 300],
}

In [43]:
# Exhaustive 3-fold cross-validated search over `params`, using a seeded tree
# as the base estimator.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=101),
    param_grid=params,
    cv=3,
    verbose=1,
)

clf.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed:   37.6s finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=101),
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                         'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19],
                         'min_samples_leaf': [200, 250, 300]},
             verbose=1)

### Prediction and Evaluation of the Tuned (GridSearchCV) Model

In [44]:
# Labels predicted by the grid-searched model for the held-out rows.
predictions2 = clf.predict(X_test)

# Confusion matrix of the tuned model: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=predictions2))

[[ 335  170   10]
 [  65  449   11]
 [  10   13 1035]]


In [45]:
# Per-class precision / recall / F1 of the tuned model on the held-out rows.
print(classification_report(y_true=y_test, y_pred=predictions2))

                precision    recall  f1-score   support

     CANDIDATE       0.82      0.65      0.72       515
     CONFIRMED       0.71      0.86      0.78       525
FALSE POSITIVE       0.98      0.98      0.98      1058

      accuracy                           0.87      2098
     macro avg       0.84      0.83      0.83      2098
  weighted avg       0.87      0.87      0.87      2098



In [46]:
# Winning hyperparameter combination, then its mean cross-validated score,
# each on its own line.
print(clf.best_params_, clf.best_score_, sep='\n')

{'max_depth': 4, 'max_leaf_nodes': 5, 'min_samples_leaf': 200}
0.8604128346617618


### Save the Model

In [47]:
import joblib

# Persist the fitted GridSearchCV object (the whole search, not only the
# best estimator) to disk.
filename = 'DecisionTree_model.sav'
joblib.dump(value=clf, filename=filename)

['DecisionTree_model.sav']

In [48]:
# True labels and tuned-model predictions as single-column frames; both get a
# fresh 0..n-1 index, so they line up row by row.
Resp = pd.DataFrame({'Response': list(y_test)})
pred = pd.DataFrame({'Predicted': predictions2})

# Side-by-side table (Response, Predicted), aligned on that positional index.
Output = Resp.merge(pred, left_index=True, right_index=True)

Output.to_excel('DT_Predictions.xlsx')

In [56]:
# Reload the persisted search object from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
result = joblib.load(filename='DecisionTree_model.sav')