In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\shweta shukla\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import numpy as np

In [4]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [6]:
# Set features. This will also be used as your x values.Here all of the columns except "koi_disposition" are selected as features. 
selected_features = df.drop("koi_disposition", axis=1)
selected_features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [7]:
X = selected_features
y = df["koi_disposition"]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify =y)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,0.0003463,-0.0003463,219.33483,0.0023,-0.0023,...,-148,4.777,0.04,-0.027,0.492,0.026,-0.027,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,9e-08,-9e-08,131.654831,0.000124,-0.000124,...,-146,4.664,0.056,-0.032,0.591,0.045,-0.045,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,5.36e-06,-5.36e-06,137.447816,0.000445,-0.000445,...,-176,4.338,0.153,-0.187,1.096,0.309,-0.206,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,3.1e-05,-3.1e-05,218.225235,0.000127,-0.000127,...,-134,4.346,0.084,-0.126,1.148,0.202,-0.124,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,5.6e-05,-5.6e-05,138.678725,0.000987,-0.000987,...,-68,4.347,0.03,-0.03,1.044,0.057,-0.042,285.67938,50.241299,10.961


In [10]:
y.head()

0         CONFIRMED
1    FALSE POSITIVE
2    FALSE POSITIVE
3         CONFIRMED
4         CONFIRMED
Name: koi_disposition, dtype: object

In [11]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5243, 40)
(5243,)
(1748, 40)
(1748,)


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
# Scale your data
X_train_scaled = MinMaxScaler().fit(X_train)
X_test_scaled = X_train_scaled.transform(X_test)
X_train_scaled = X_train_scaled.transform(X_train)

# Train the Model



Support Vector Machine
-----------------------------------------

In [13]:
# SVM without GridSearch
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')
svc_model.fit(X_train_scaled, y_train)
SVM_Predictions = svc_model.predict(X_test_scaled)

In [14]:
# SVM Accuracy without GridSearch
print(f"Training Data Score: {svc_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svc_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8439824527942018
Testing Data Score: 0.8415331807780321


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
paramgrid = {
                "C": [0.1,1,10,50,100],
                "gamma" : [0.0001, 0.001, 0.01, 0.1]
            }
SVM_GridSearch_Model = GridSearchCV(svc_model, paramgrid)

In [17]:
# Train the model with GridSearch
SVM_GridSearch_Model.fit(X_train_scaled, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 50, 100],
                         'gamma': [0.0001, 0.001, 0.01, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
print(SVM_GridSearch_Model.best_params_)
print(SVM_GridSearch_Model.best_score_)

{'C': 100, 'gamma': 0.0001}
0.8838451268357811


In [19]:
# SVM Accuracy with GridSearch
print(f"Training Data Score: {SVM_GridSearch_Model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {SVM_GridSearch_Model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8901392332633988
Testing Data Score: 0.8861556064073226


In [20]:
SVM_GridSearch_Predictions = SVM_GridSearch_Model.predict(X_test_scaled)

In [21]:
# SVM Classification Report with GridSearch
from sklearn.metrics import classification_report
print(classification_report(y_test, SVM_GridSearch_Predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.69      0.76       422
     CONFIRMED       0.75      0.85      0.80       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.89      1748
     macro avg       0.86      0.85      0.85      1748
  weighted avg       0.89      0.89      0.88      1748



# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Support_Vector_Machine.sav'
joblib.dump(SVM_GridSearch_Model, filename)

['Support_Vector_Machine.sav']

Random Forest Classifier Method
---------------------------------------------------------------

In [23]:
# Random Forest Classifier Method without GridSearch
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier = classifier.fit(X_train_scaled, y_train)

In [24]:
# Random Forest Classifier Accuracy without GridSearch
print(f"Training Data Score with Random Forest Classifier: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score with Random Forest Classifier: {classifier.score(X_test_scaled, y_test)}")


Training Data Score with Random Forest Classifier: 0.9946595460614153
Testing Data Score with Random Forest Classifier: 0.8752860411899314


In [25]:
# GridSearch
paramgrid1 = {"n_estimators": [50, 150, 250],
             "max_depth":[100, 200, 300]}

In [26]:
# Create the GridSearchCV model
RandomForest_GridSearch_Model = GridSearchCV(classifier, paramgrid1)

In [27]:
# Train the model with GridSearch
RandomForest_GridSearch_Model.fit(X_train_scaled, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [28]:
print(RandomForest_GridSearch_Model.best_score_)
print(RandomForest_GridSearch_Model.best_params_)

0.8966240701888232
{'max_depth': 100, 'n_estimators': 150}


In [29]:
# Random Forest Classifier Method with GridSearch
print(f"Training Data Score with Random Forest Classifier with GridSearch: {RandomForest_GridSearch_Model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score with Random Forest Classifier with GridSearch: {RandomForest_GridSearch_Model.score(X_test_scaled, y_test)}")

Training Data Score with Random Forest Classifier with GridSearch: 1.0
Testing Data Score with Random Forest Classifier with GridSearch: 0.8941647597254004


In [30]:
RandomForest_Prediction = RandomForest_GridSearch_Model.predict(X_test_scaled)

In [31]:
# Random Forest Classifier Accuracy Report with GridSearch
print(classification_report(y_test, RandomForest_Prediction))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.73      0.78       422
     CONFIRMED       0.80      0.84      0.82       450
FALSE POSITIVE       0.97      1.00      0.98       876

      accuracy                           0.89      1748
     macro avg       0.87      0.86      0.86      1748
  weighted avg       0.89      0.89      0.89      1748



In [32]:
# save model with model variable
filename = 'Random_Forest_Model.sav'
joblib.dump(RandomForest_GridSearch_Model, filename)

['Random_Forest_Model.sav']

Logistic Regression Method
-------------------------------------------------

In [33]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [34]:
LogisticRegression = LogisticRegression()

In [35]:
LogisticRegression.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
# Logistic Regression Model Accuracy
print(f"Training Data Score with Logistic Regression Classifier: {LogisticRegression.score(X_train_scaled, y_train)}")
print(f"Testing Data Score with Logistic Regression Classifier: {LogisticRegression.score(X_test_scaled, y_test)}")

Training Data Score with Logistic Regression Classifier: 0.8411214953271028
Testing Data Score with Logistic Regression Classifier: 0.8409610983981693


In [37]:
predictions = LogisticRegression.predict(X_test_scaled)
df_predictions = pd.DataFrame({"Actual":y_test,"Predicted":predictions})
df_predictions.head()

Unnamed: 0,Actual,Predicted
1981,CANDIDATE,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE


In [38]:
from sklearn.model_selection import GridSearchCV

# Create the GridSearchCV model
param_grid = {'C':[1, 5, 10],
             'penalty':['l1','l2']}

grid = GridSearchCV(LogisticRegression, param_grid, verbose=3)

In [39]:
# Train the model with GridSearch
LogisticRegression_Model = grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... C=1, penalty=l1, score=0.875, total=   0.4s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ..................... C=1, penalty=l1, score=0.862, total=   0.3s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ..................... C=1, penalty=l1, score=0.856, total=   0.4s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.848, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.834, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.818, total=   0.1s
[CV] C=5, penalty=l1 .................................................
[CV] ..................... C=5, penalty=l1, score=0.884, total=   0.8s
[CV] C=5, penalty=l1 .................................................
[CV] ..................... C=5, penalty=l1, score=0.873, total=   0.9s
[CV] C=5, penalty=l1 .................................................
[CV] ..................... C=5, penalty=l1, score=0.883, total=   1.1s
[CV] C=5, penalty=l2 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   13.0s finished


In [40]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'penalty': 'l1'}
0.8798397863818425


In [41]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Logistic_Regression.sav'
joblib.dump(grid, filename)

['Logistic_Regression.sav']

K Nearest Neighbors[KNN] Method
---------------------------------------------------

In [42]:
from sklearn.neighbors import KNeighborsClassifier

In [43]:
KNN = KNeighborsClassifier(n_neighbors=5)

In [44]:
KNN.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [45]:
print(f"Training Data Score with KNN Method: {KNN.score(X_train_scaled, y_train)}")
print(f"Testing Data Score with KNN Method: {KNN.score(X_test_scaled, y_test)}")

Training Data Score with KNN Method: 0.8725920274651917
Testing Data Score with KNN Method: 0.8249427917620137


In [46]:
paramgrid_KNN = {"n_neighbors": [5, 10, 20]}

In [47]:
grid_KNN = GridSearchCV(KNN, paramgrid_KNN)

In [48]:
grid_KNN.fit(X_train_scaled, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': [5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [49]:
grid_KNN.best_params_

{'n_neighbors': 20}

In [50]:
# KNN Method Accuracy
print(f"Training Data Score with KNN Method with Gridsearch: {KNN.score(X_train_scaled, y_train)}")
print(f"Testing Data Score with KNN Method with Gridsearch: {KNN.score(X_test_scaled, y_test)}")

Training Data Score with KNN Method with Gridsearch: 0.8725920274651917
Testing Data Score with KNN Method with Gridsearch: 0.8249427917620137


In [51]:
import joblib
filename = 'knn.sav'
joblib.dump(grid_KNN, filename)

['knn.sav']

Conclusion
---------------------
Random Forest Classifier Method fits the model best and scores the best accuracy results!
-----------------------------------------------------------------------------------------------------------------------------------
