In [1]:
## Update sklearn to prevent version mismatches:
!pip install sklearn --upgrade



In [2]:
## Install joblib (to save model): 
## *Restart your kernel after installing 
!pip install joblib



In [3]:
## Dependencies
import numpy as np
import pandas as pd

## Read CSV & Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("../a_Resources/Data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
print(f"Shape: {df.shape}")
df.head(10)

Shape: (6991, 41)


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
5,CONFIRMED,0,0,0,0,2.566589,1.78e-05,-1.78e-05,179.55437,0.00461,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
6,CONFIRMED,0,0,0,0,16.068647,1.09e-05,-1.09e-05,173.621937,0.000517,...,-83,4.485,0.083,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841
7,CONFIRMED,0,0,0,0,2.470613,2.7e-08,-2.7e-08,122.763305,9e-06,...,-78,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
8,CONFIRMED,0,1,0,0,2.204735,4.3e-08,-4.3e-08,121.358542,1.6e-05,...,-89,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463
9,CONFIRMED,0,0,0,0,3.522498,1.98e-07,-1.98e-07,121.119423,4.7e-05,...,-137,4.169,0.055,-0.045,1.451,0.11,-0.11,281.28812,42.45108,13.563


In [5]:
## DataFrame Exploration
# df.iloc[:,0:10].head(10)

## Set Target & Features

In [6]:
target = df['koi_disposition']
target.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [7]:
features = df.drop(columns=['koi_disposition'])
print(f"Features: {len(features.columns)}")

Features: 40


## Train / Test Split

In [8]:
## Train / Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

In [9]:
## Check split
print(f"Train: {round(len(y_train)/len(target)*100, 2)}%")
print(f"Test: {round(len(y_test)/len(target)*100, 2)}%")

Train: 75.0%
Test: 25.0%


## Pre-processing

In [10]:
## Encode Target
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

## Label encoding
label_encoder = LabelEncoder()
label_encoder.fit(target)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [11]:
## Target names
target_names = label_encoder.inverse_transform([0, 1, 2])
target_names

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [12]:
## Scale Features
from sklearn.preprocessing import MinMaxScaler

## Create MinMaxScaler model & Fit to training data
X_scaler = MinMaxScaler().fit(X_train)

## Transform training & testing data using X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train Preliminar Model

In [13]:
## Define model: Linear Support Vector Classification
from sklearn.svm import SVC 
model = SVC(kernel='linear')

In [14]:
## Fit model
model.fit(X_train_scaled, y_train_encoded)

SVC(kernel='linear')

In [15]:
print(f"Training Data Score: {round(model.score(X_train_scaled, y_train_encoded)*100,4)}%")
print(f"Testing Data Score: {round(model.score(X_test_scaled, y_test_encoded)*100,4)}%")

Training Data Score: 84.5508%
Testing Data Score: 84.1533%


In [16]:
## Classification Report
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test_encoded, predictions, target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.70      0.62      0.66       411
     CONFIRMED       0.71      0.76      0.73       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.84      1748
     macro avg       0.80      0.79      0.79      1748
  weighted avg       0.84      0.84      0.84      1748



In [17]:
# ## ROC AUC Score
# from sklearn import metrics

# y_score = model.predict_proba(X_test_scaled)
# print(f"ROC_AUC Score: {round(metrics.roc_auc_score(y_test_encoded, y_score, multi_class='ovr')*100, 4)}%")

## Feature Selection

In [18]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE

selector = RFE(model, n_features_to_select=15, step=1)
selector = selector.fit(X_train_scaled, y_train_encoded)

In [19]:
# selector.support_
selector.ranking_

array([ 1,  1,  1,  1,  1,  7,  6,  5,  1,  1, 16,  1, 18,  1,  1,  1, 11,
       21, 20, 25, 23, 26,  1, 22, 19, 24,  1,  9,  3,  1,  1,  2, 14,  4,
       13,  8, 15, 17, 10, 12])

In [20]:
selected_features = features.loc[:,selector.support_]
print(f"Shape: {selected_features.shape}")
selected_features.columns

Shape: (6991, 15)


Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact_err1',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_teq',
       'koi_model_snr', 'koi_steff_err1', 'koi_steff_err2'],
      dtype='object')

In [21]:
print(f"Selected Features: {len(selected_features.columns)}")

Selected Features: 15


## Train Selected Features Model

In [22]:
## Train / Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

## Create MinMaxScaler model & Fit to training data
X_scaler = MinMaxScaler().fit(X_train)

## Transform training & testing data using X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Filtered model
filtered_model = SVC(kernel='linear')

## Fit model
filtered_model.fit(X_train_scaled, y_train_encoded)

SVC(kernel='linear')

In [23]:
## Model Train / Test Score
print(f"Training Data Score: {round(filtered_model.score(X_train_scaled, y_train_encoded)*100,4)}%")
print(f"Testing Data Score: {round(filtered_model.score(X_test_scaled, y_test_encoded)*100,4)}%")

Training Data Score: 84.7416%
Testing Data Score: 84.4966%


In [24]:
## Classification Report
from sklearn.metrics import classification_report
predictions = filtered_model.predict(X_test_scaled)
print(classification_report(y_test_encoded, predictions, target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.71      0.61      0.66       411
     CONFIRMED       0.71      0.77      0.74       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.84      1748
     macro avg       0.80      0.79      0.80      1748
  weighted avg       0.84      0.84      0.84      1748



In [25]:
## ROC AUC Score
# from sklearn import metrics

# y_score = filtered_model.predict_proba(X_test_scaled)
# print(f"ROC_AUC Score: {round(metrics.roc_auc_score(y_test_encoded, y_score, multi_class='ovr')*100, 4)}%")

## Model Tuning (Hyperparameters)

In [26]:
## Get list of available parameters
filtered_model.get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [27]:
## Create GridSearchCV model
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}

grid = GridSearchCV(filtered_model, param_grid, verbose=3)

In [28]:
## Train model using GridSearch
grid.fit(X_train_scaled, y_train_encoded)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............................C=1, gamma=0.0001; total time=   0.1s
[CV 2/5] END ..............................C=1, gamma=0.0001; total time=   0.1s
[CV 3/5] END ..............................C=1, gamma=0.0001; total time=   0.1s
[CV 4/5] END ..............................C=1, gamma=0.0001; total time=   0.1s
[CV 5/5] END ..............................C=1, gamma=0.0001; total time=   0.1s
[CV 1/5] END ..............................C=1, gamma=0.0005; total time=   0.1s
[CV 2/5] END ..............................C=1, gamma=0.0005; total time=   0.1s
[CV 3/5] END ..............................C=1, gamma=0.0005; total time=   0.1s
[CV 4/5] END ..............................C=1, gamma=0.0005; total time=   0.1s
[CV 5/5] END ..............................C=1, gamma=0.0005; total time=   0.1s
[CV 1/5] END ...............................C=1, gamma=0.001; total time=   0.1s
[CV 2/5] END ...............................C=1,

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=4)

In [29]:
print(f"Tuned Parameters: {grid.best_params_}")
print(f"Tuned Score: {round((grid.best_score_)*100,4)}%")

Tuned Parameters: {'C': 50, 'gamma': 0.0001}
Tuned Score: 88.346%


## Train Tuned Model

In [30]:
## Tuned model
tuned_model = SVC(**grid.best_params_, kernel='linear')

## Fit model
tuned_model.fit(X_train_scaled, y_train_encoded)

SVC(C=50, gamma=0.0001, kernel='linear')

In [31]:
## Model Train / Test Score
print(f"Training Data Score: {round(tuned_model.score(X_train_scaled, y_train_encoded)*100,4)}%")
print(f"Testing Data Score: {round(tuned_model.score(X_test_scaled, y_test_encoded)*100,4)}%")

Training Data Score: 88.3082%
Testing Data Score: 87.643%


In [32]:
## Classification Report
from sklearn.metrics import classification_report

predictions = tuned_model.predict(X_test_scaled)
print(classification_report(y_test_encoded, predictions, target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.81      0.65      0.72       411
     CONFIRMED       0.75      0.85      0.80       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.88      1748
     macro avg       0.85      0.83      0.84      1748
  weighted avg       0.88      0.88      0.87      1748



In [33]:
## ROC AUC Score
# from sklearn import metrics

# y_score = tuned_model.predict_proba(X_test_scaled)
# print(f"ROC_AUC Score: {round(metrics.roc_auc_score(y_test_encoded, y_score, multi_class='ovr')*100, 4)}%")

## Save Tuned Model

In [34]:
import joblib
filename = 'linear_svc.sav'
joblib.dump(tuned_model, filename)

['linear_svc.sav']