In [1]:
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Sklearn Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, precision_score, confusion_matrix, accuracy_score

# Visualizes all the columns
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Import dataset
df = pd.read_csv('cumulative.csv')

# Print the shape of the dataset
print(df.shape)

# Select top of the dataset
df.head()

(9564, 50)


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65,35.8,1.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62,25.8,2.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,,,39.3,31.04,-10.49,76.3,1.0,q1_q17_dr25_tce,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395.0,,,891.96,668.95,-230.35,505.6,1.0,q1_q17_dr25_tce,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,,,926.16,874.33,-314.24,40.9,1.0,q1_q17_dr25_tce,6031.0,169.0,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              9564 non-null   int64  
 1   kepid              9564 non-null   int64  
 2   kepoi_name         9564 non-null   object 
 3   kepler_name        2294 non-null   object 
 4   koi_disposition    9564 non-null   object 
 5   koi_pdisposition   9564 non-null   object 
 6   koi_score          8054 non-null   float64
 7   koi_fpflag_nt      9564 non-null   int64  
 8   koi_fpflag_ss      9564 non-null   int64  
 9   koi_fpflag_co      9564 non-null   int64  
 10  koi_fpflag_ec      9564 non-null   int64  
 11  koi_period         9564 non-null   float64
 12  koi_period_err1    9110 non-null   float64
 13  koi_period_err2    9110 non-null   float64
 14  koi_time0bk        9564 non-null   float64
 15  koi_time0bk_err1   9110 non-null   float64
 16  koi_time0bk_err2   9110 

In [4]:
df['ExoplanetCandidate'] = df['koi_pdisposition'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
df['ExoplanetConfirmed'] = df['koi_disposition'].apply(lambda x: 2 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0 )

In [5]:
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate,ExoplanetConfirmed
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65,35.8,1.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,2
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62,25.8,2.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,2
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,,,39.3,31.04,-10.49,76.3,1.0,q1_q17_dr25_tce,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,0,0
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395.0,,,891.96,668.95,-230.35,505.6,1.0,q1_q17_dr25_tce,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0,0
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,,,926.16,874.33,-314.24,40.9,1.0,q1_q17_dr25_tce,6031.0,169.0,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1,2


In [6]:
df.drop(columns=['kepler_name','kepoi_name','koi_teq_err1',
                 'kepid','koi_disposition','koi_pdisposition',
                 'koi_fpflag_nt','koi_fpflag_ss','koi_fpflag_co',
                 'koi_fpflag_ec','koi_tce_delivname',
                 'koi_teq_err2'], inplace=True)

In [7]:
df.shape

(9564, 40)

In [25]:
df.corr()

Unnamed: 0,rowid,koi_score,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate,ExoplanetConfirmed
rowid,1.0,-0.451683,0.199884,0.215965,-0.215965,0.012979,0.201011,-0.201011,0.023865,-0.030344,0.0681,0.178232,0.233903,-0.233903,0.267861,0.132844,-0.132844,0.035806,0.02427,-0.026888,0.160361,0.029402,0.039943,-0.027372,0.223401,-0.0845,0.145112,0.219717,-0.216116,-0.157263,0.152672,-0.161262,0.069576,0.090963,-0.07204,0.042842,0.013394,-0.097242,-0.418908,-0.50646
koi_score,-0.451683,1.0,-0.075041,-0.094433,0.094433,-0.012889,-0.027061,0.027061,-0.22613,-0.119604,-0.076584,-0.176026,-0.097346,0.097346,-0.300011,-0.155539,0.155539,-0.074786,-0.054819,0.050079,-0.305312,-0.029496,-0.041655,0.024613,-0.274289,0.213595,-0.191833,-0.372409,0.333482,0.152163,-0.164208,0.227586,-0.073932,-0.116433,0.082708,-0.167913,0.0924,0.051773,0.973443,0.896079
koi_period,0.199884,-0.075041,1.0,0.616152,-0.616152,0.612582,0.204087,-0.204087,-0.03343,0.044773,-0.025808,0.361854,0.334624,-0.334624,-0.043823,0.014083,-0.014083,-0.011244,-0.010112,0.009857,-0.353252,-0.01894,-0.02643,0.017695,-0.032547,0.026838,0.024614,-0.014388,0.002149,-0.050179,0.014662,-0.009083,0.017817,-0.000191,-0.007542,-0.041113,0.021024,-0.027582,-0.04261,-0.062738
koi_period_err1,0.215965,-0.094433,0.616152,1.0,-1.0,0.404329,0.416233,-0.416233,-0.041111,0.014236,-0.024984,0.25257,0.562123,-0.562123,-0.055765,0.024903,-0.024903,-0.010904,-0.008444,0.007663,-0.174397,-0.008345,-0.011607,0.007761,-0.062418,0.046367,0.025086,0.040911,-0.034257,-0.020292,0.019265,-0.038289,0.022177,0.010102,-0.012838,-0.025482,0.021739,-0.027146,-0.06295,-0.08682
koi_period_err2,-0.215965,0.094433,-0.616152,-1.0,1.0,-0.404329,-0.416233,0.416233,0.041111,-0.014236,0.024984,-0.25257,-0.562123,0.562123,0.055765,-0.024903,0.024903,0.010904,0.008444,-0.007663,0.174397,0.008345,0.011607,-0.007761,0.062418,-0.046367,-0.025086,-0.040911,0.034257,0.020292,-0.019265,0.038289,-0.022177,-0.010102,0.012838,0.025482,-0.021739,0.027146,0.06295,0.08682
koi_time0bk,0.012979,-0.012889,0.612582,0.404329,-0.404329,1.0,0.100966,-0.100966,0.010829,0.067862,-0.041159,0.211222,0.18594,-0.18594,-0.039903,0.000328,-0.000328,-0.004745,-0.00462,0.009242,-0.275652,-0.018255,-0.0238,0.017301,-0.030879,0.034356,-0.003365,0.002706,-0.000238,0.008873,-0.032291,-0.005309,-0.000969,-0.010436,0.008659,-0.026781,0.011746,0.031895,0.007567,0.001207
koi_time0bk_err1,0.201011,-0.027061,0.204087,0.416233,-0.416233,0.100966,1.0,-1.0,-0.057877,-0.009591,-0.03366,0.169384,0.526963,-0.526963,-0.103541,0.086031,-0.086031,-0.01968,-0.014642,0.012355,-0.068658,-0.000381,-0.002478,-0.002578,-0.11882,0.058923,0.038153,0.037726,-0.036973,-0.038219,0.030638,-0.056433,0.044187,0.03915,-0.039436,-0.009332,-0.001006,-0.016957,-0.013306,-0.064603
koi_time0bk_err2,-0.201011,0.027061,-0.204087,-0.416233,0.416233,-0.100966,-1.0,1.0,0.057877,0.009591,0.03366,-0.169384,-0.526963,0.526963,0.103541,-0.086031,0.086031,0.01968,0.014642,-0.012355,0.068658,0.000381,0.002478,0.002578,0.11882,-0.058923,-0.038153,-0.037726,0.036973,0.038219,-0.030638,0.056433,-0.044187,-0.03915,0.039436,0.009332,0.001006,0.016957,0.013306,0.064603
koi_impact,0.023865,-0.22613,-0.03343,-0.041111,0.041111,0.010829,-0.057877,0.057877,1.0,0.303838,-0.433475,0.059642,-0.034425,0.034425,0.037165,0.070898,-0.070898,0.528965,0.504228,-0.452056,0.059536,-0.008591,3.7e-05,0.006073,0.042958,-0.048657,0.078103,0.125611,-0.110065,-0.035274,0.072347,-0.103036,0.000924,0.032944,-0.013267,0.055489,-0.020904,0.006337,-0.223501,-0.220064
koi_impact_err1,-0.030344,-0.119604,0.044773,0.014236,-0.014236,0.067862,-0.009591,0.009591,0.303838,1.0,-0.426508,0.00644,0.020472,-0.020472,-0.049437,0.039011,-0.039011,0.060221,0.044024,-0.029002,-0.074304,-0.007035,-0.008646,0.006591,-0.058655,-0.018385,-0.008821,0.054831,-0.042181,0.003632,0.008827,-0.046415,0.006863,0.011954,-0.006654,0.026201,-0.031373,0.045766,-0.120234,-0.133705


In [8]:
df.isna().any()

rowid                 False
koi_score              True
koi_period            False
koi_period_err1        True
koi_period_err2        True
koi_time0bk           False
koi_time0bk_err1       True
koi_time0bk_err2       True
koi_impact             True
koi_impact_err1        True
koi_impact_err2        True
koi_duration          False
koi_duration_err1      True
koi_duration_err2      True
koi_depth              True
koi_depth_err1         True
koi_depth_err2         True
koi_prad               True
koi_prad_err1          True
koi_prad_err2          True
koi_teq                True
koi_insol              True
koi_insol_err1         True
koi_insol_err2         True
koi_model_snr          True
koi_tce_plnt_num       True
koi_steff              True
koi_steff_err1         True
koi_steff_err2         True
koi_slogg              True
koi_slogg_err1         True
koi_slogg_err2         True
koi_srad               True
koi_srad_err1          True
koi_srad_err2          True
ra                  

In [9]:
df.dropna(inplace=True)

In [10]:
df.shape

(7803, 40)

In [11]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(df)

Unnamed: 0,rowid,koi_score,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate,ExoplanetConfirmed
0,1.0,1.000,9.488036,2.775000e-05,-2.775000e-05,170.538750,0.002160,-0.002160,0.146,0.318,-0.146,2.95750,0.08190,-0.08190,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,93.59,29.45,-16.65,35.8,1.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1.0,2.0
1,2.0,0.969,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,0.586,0.059,-0.443,4.50700,0.11600,-0.11600,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,9.11,2.87,-1.62,25.8,2.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1.0,2.0
2,3.0,0.000,19.899140,1.494000e-05,-1.494000e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.78220,0.03410,-0.03410,10829.0,171.0,-171.0,14.60,3.92,-1.31,638.0,39.30,31.04,-10.49,76.3,1.0,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,0.0,0.0
3,4.0,0.000,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.50,-2.83,1395.0,891.96,668.95,-230.35,505.6,1.0,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597,0.0,0.0
4,5.0,1.000,2.525592,3.761000e-06,-3.761000e-06,171.595550,0.001130,-0.001130,0.701,0.235,-0.478,1.65450,0.04200,-0.04200,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,926.16,874.33,-314.24,40.9,1.0,6031.0,169.0,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560.0,0.000,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,-0.015700,0.765,0.023,-0.541,4.80600,0.63400,-0.63400,87.7,13.0,-13.0,1.11,0.32,-0.23,929.0,176.40,152.77,-77.60,8.4,1.0,5638.0,169.0,-152.0,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478,0.0,0.0
9560,9561.0,0.000,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,1.252,0.051,-0.049,3.22210,0.01740,-0.01740,1579.2,4.6,-4.6,29.35,7.70,-2.57,2088.0,4500.53,3406.38,-1175.26,453.3,1.0,5638.0,139.0,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082,0.0,0.0
9561,9562.0,0.497,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,0.043,0.423,-0.043,3.11400,0.22900,-0.22900,48.5,5.4,-5.4,0.72,0.24,-0.08,1608.0,1585.81,1537.86,-502.22,10.6,1.0,6119.0,165.0,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757,1.0,1.0
9562,9563.0,0.021,0.681402,2.434000e-06,-2.434000e-06,132.181750,0.002850,-0.002850,0.147,0.309,-0.147,0.86500,0.16200,-0.16200,103.6,14.7,-14.7,1.07,0.36,-0.11,2218.0,5713.41,5675.74,-1836.94,12.3,1.0,6173.0,193.0,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385,0.0,0.0


In [12]:
def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_test, y_pred)))
    print('Recall: ' + str(metrics.recall_score(y_test, y_pred)))
    print('F1 Score: ' + str(metrics.f1_score(y_test, y_pred)))
    print('Precision: ' + str(metrics.precision_score(y_test, y_pred)))
    
# Print Confusion Matrix
    print('\nConfusion Matrix:')
    print(' TN,  FP, FN, TP')
    print(confusion_matrix(y_true, y_pred).ravel())
    
# Function Prints best parameters for GridSearchCV
def print_results(results):
    print('Best Parameters: {}\n'.format(results.best_params_)) 

In [13]:
features = df.drop(columns=['ExoplanetCandidate','ExoplanetConfirmed'])
target = df.ExoplanetCandidate

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1, test_size=.40)

In [15]:
for dataset in [y_train, y_test]:
    print(round(len(dataset)/len(target), 2))

0.6
0.4


In [16]:
lr = LogisticRegression(C=100, max_iter=200, class_weight='balanced')

# Fitting Model to the train set
lr.fit(X_train, y_train)

# Predicting on the test set
y_pred = lr.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.8074951953875721
Recall: 0.8545918367346939
F1 Score: 0.8168241389820177
Precision: 0.7822533566841798

Confusion Matrix:
 TN,  FP, FN, TP
[1181  373  228 1340]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
knn = KNeighborsClassifier(leaf_size=8, metric='manhattan',weights='uniform')

# Fitting Model to the train set
knn.fit(X_train, y_train)

# Predicting on the test set
y_pred = knn.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.7953235105701474
Recall: 0.8278061224489796
F1 Score: 0.8024729520865533
Precision: 0.7786442711457708

Confusion Matrix:
 TN,  FP, FN, TP
[1185  369  270 1298]


In [18]:
tree = DecisionTreeClassifier()

# Fitting Model to the train set
tree.fit(X_train, y_train)

# Predicting on the test set
y_pred = tree.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.9814221652786675
Recall: 0.9795918367346939
F1 Score: 0.9814696485623003
Precision: 0.9833546734955185

Confusion Matrix:
 TN,  FP, FN, TP
[1528   26   32 1536]


In [19]:
# Instantiate model
forest = RandomForestClassifier(n_estimators=100, criterion='gini')
# Fitting Model to the train set
forest.fit(X_train, y_train)
# Predicting on the test set
y_pred = forest.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.9881486226777707
Recall: 0.9840561224489796
F1 Score: 0.9881524175472302
Precision: 0.992282958199357

Confusion Matrix:
 TN,  FP, FN, TP
[1542   12   25 1543]


In [20]:

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [21]:
#svm = SVC(kernel='linear')
# Fitting Model to the train set
#svm.fit(X_train, y_train)
# Predicting on the test set
#y_pred = svm.predict(X_test)

# Evaluating model
#evaluation(y_test, y_pred)

In [22]:
# GridSearch Parameters
parameters = {
    'n_estimators': [5, 50, 100, 150, 200],
    'max_depth': list(range(1, 11)),
    'criterion':['gini','entropy'],
    'max_features': list(range(1,20)),
    'oob_score':[False,True],
}

In [23]:
# This cell might take over an hour if you run it.

# grid = GridSearchCV(forest, parameters, cv=5, verbose=1, n_jobs=-1)

# grid.fit(X_train, y_train)
# y_test_grid = grid.predict(X_test)

# # Evaluating model
# evaluation(y_test, y_pred)