# Machine Learning Challenge
#### Rob Gauer
#### August 11, 2020
#### File Name:  RobGauer_model_1.ipynb
#### -----------------------

In [1]:
# Update sklearn to prevent version mismatches.

# !pip install sklearn --upgrade

In [2]:
# Install joblib. This will be used to save your model. 
# Restart your kernel after installing. 

# !pip install joblib

In [3]:
# Setup and Dependancies.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# total dataset is 9564 records
df = pd.read_csv("cumulative.csv")   
df.shape

(9564, 50)

In [5]:
# show column names
print(df.columns)  

Index(['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition',
       'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1',
       'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1',
       'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth',
       'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1',
       'koi_prad_err2', 'koi_teq', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')


# Select your features (columns)

In [6]:
# Set features. This will also be used as your x values.
#  Remove features that create noise and result in improved model
df = df.drop(columns=['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_score','koi_tce_delivname'])
# Drop the null columns where all values are nullkepoi_name
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,...,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0,8744.0
mean,0.157136,0.241194,0.201167,0.125114,56.080618,0.001839523,-0.001839523,164.563271,0.0093,-0.0093,...,-161.354758,4.310223,0.120733,-0.140411,1.704566,0.352884,-0.379933,292.075061,43.828259,14.273969
std,0.36395,0.427832,0.400895,0.330867,117.38528,0.007276504,0.007276504,66.476457,0.021662,0.021662,...,72.986448,0.431557,0.132813,0.082936,5.682429,0.906364,1.810943,4.772918,3.599786,1.343509
min,0.0,0.0,0.0,0.0,0.25982,1.1e-08,-0.1568,120.515914,9e-06,-0.569,...,-1762.0,0.047,0.0,-1.207,0.109,0.0,-103.825,279.85272,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.667824,5.28675e-06,-0.000245625,132.729408,0.0012,-0.01,...,-197.0,4.21575,0.043,-0.195,0.826,0.12575,-0.247,288.670237,40.805911,13.474
50%,0.0,0.0,0.0,0.0,8.970985,3.323e-05,-3.323e-05,136.910235,0.00402,-0.00402,...,-159.0,4.439,0.07,-0.127,0.997,0.246,-0.111,292.285005,43.703989,14.534
75%,0.0,0.0,0.0,0.0,34.190033,0.000245625,-5.28675e-06,169.975942,0.01,-0.0012,...,-112.0,4.544,0.149,-0.087,1.34625,0.356,-0.069,295.90051,46.722135,15.31825
max,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,...,0.0,5.364,1.472,0.0,180.013,33.091,0.0,301.72076,52.33601,19.065


In [7]:
# still 8700 records without nulls
df.shape 

(8744, 41)

In [8]:
# first column is the label, koi_pdisposition
df.head(5) 

Unnamed: 0,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CANDIDATE,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CANDIDATE,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CANDIDATE,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [9]:
# check remaining feature's names
print(df.columns) 

Index(['koi_pdisposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')


# Create a Train Test Split
#### Use koi_dispostion for the y values.

In [10]:
# Separate data into train (75%) and test (25%)
from sklearn.model_selection import train_test_split 

In [11]:
 # define label
y = df["koi_pdisposition"] 

# drop label from features
X = df.drop(columns=["koi_pdisposition"])  

# random sample of label and features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [12]:
X_train.tail(5)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
1848,0,0,0,0,10.627887,8.221e-05,-8.221e-05,137.81843,0.00608,-0.00608,...,-210.0,4.445,0.072,-0.217,1.014,0.322,-0.115,292.54752,41.64296,15.401
3114,0,0,0,0,10.423677,7.173e-06,-7.173e-06,138.009041,0.000584,-0.000584,...,-135.0,4.234,0.143,-0.117,1.331,0.243,-0.199,297.33307,41.891121,12.205
870,0,0,0,0,1.9228,4.693e-06,-4.693e-06,134.50174,0.00199,-0.00199,...,-83.0,4.559,0.022,-0.072,0.815,0.067,-0.033,285.61746,39.379532,14.829
9138,0,1,0,0,1.072934,1.45e-07,-1.45e-07,132.293066,0.000109,-0.000109,...,-338.0,4.222,0.105,-0.21,1.523,0.526,-0.263,297.74243,47.891682,13.96
1580,0,0,0,0,31.782112,0.0001769,-0.0001769,134.8645,0.00478,-0.00478,...,-110.0,4.491,0.06,-0.09,0.88,0.1,-0.067,288.81195,39.77066,13.828


In [13]:
y_train.shape  

(6558,)

# Pre-Processing
#### Scale the data using the MinMaxScaler and perform some feature selection.

In [14]:
# normalize features between  1 and 0
from sklearn.preprocessing import MinMaxScaler 

In [15]:
# Scale your data
# feature selection
X_scale = MinMaxScaler().fit(X_train)  

In [16]:
scaled_X_train = X_scale.transform(X_train)
scaled_X_test = X_scale.transform(X_test)

In [17]:
# Sample testing
scaled_X_test

array([[1.        , 0.        , 0.        , ..., 0.58909669, 0.80928728,
        0.55789735],
       [0.        , 0.        , 0.        , ..., 0.52956598, 0.32528598,
        0.54062319],
       [1.        , 0.        , 0.        , ..., 0.52391115, 0.38322744,
        0.69873543],
       ...,
       [0.        , 0.        , 0.        , ..., 0.76285163, 0.8381738 ,
        0.57062567],
       [0.        , 0.        , 1.        , ..., 0.61831056, 0.32029931,
        0.52822547],
       [0.        , 1.        , 0.        , ..., 0.87532765, 0.64948715,
        0.51243904]])

# Create and Train the Logistic Regression Model for Classification

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(scaled_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Display Train the Model

In [19]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")  

#not very good result at 0.50

Training Data Score: 0.503354681305276
Testing Data Score: 0.5036596523330283


# Hyperparameter Tuning 
#### Use  GridSearchCV to tune the model's parameters.

In [20]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(model, {'C': [1, 10], 'penalty': ["l1", "l2"]}, verbose=3)

In [21]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ....................... C=1, penalty=l1, score=nan, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.780, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ..................... C=1, penalty=l2, score=0.748, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.775, total=   0.1s
[CV] C=1, penalty=l2 .................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[CV] ..................... C=1, penalty=l2, score=0.789, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l1 ................................................
[CV] ...................... C=10, penalty=l1, score=nan, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.780, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[CV] .................... C=10, penalty=l2, score=0.750, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.774, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.789, total=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.7s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [22]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l2'}
0.774476172536325


In [23]:
grid

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

# Save the Model

In [24]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'RobGauer_model_1.sav'
joblib.dump(model, filename)  

# model at 0.77447

['RobGauer_model_1.sav']

In [25]:
# EOF