## Import libraries

In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib
!pip install joblib



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV

## Read data


In [4]:
# Load the exoplanet table and clean it in a single chain:
#   1. discard columns in which every value is null,
#   2. discard any row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


## Data analysis


### Data info

In [5]:
# Print the frame's index range, column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

### Statistical summary

In [6]:
# Per-column summary statistics: count, mean, std, min, quartiles, and max.
df.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,...,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0
mean,0.157059,0.244743,0.202975,0.125018,56.191248,0.001851122,-0.001851122,164.48882,0.00934,-0.00934,...,-161.20698,4.305049,0.121091,-0.14048,1.740749,0.35271,-0.388568,292.082406,43.812143,14.271508
std,0.363882,0.429966,0.402243,0.330763,117.570962,0.007184503,0.007184503,67.020475,0.021989,0.021989,...,71.448481,0.439238,0.132048,0.08199,5.903415,0.839017,1.907797,4.762908,3.606167,1.350802
min,0.0,0.0,0.0,0.0,0.25982,1.1e-08,-0.1568,120.515914,9e-06,-0.569,...,-1733.0,0.047,0.0,-1.007,0.109,0.0,-103.825,279.85608,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.620126,5.005e-06,-0.0002401,132.683917,0.001145,-0.01,...,-197.0,4.209,0.044,-0.195,0.829,0.128,-0.252,288.70473,40.79776,13.455
50%,0.0,0.0,0.0,0.0,8.947426,3.3e-05,-3.3e-05,136.73923,0.00399,-0.00399,...,-159.0,4.436,0.07,-0.127,0.999,0.248,-0.111,292.31476,43.679661,14.534
75%,0.0,0.0,0.0,0.0,34.282605,0.0002401,-5.005e-06,169.937005,0.01,-0.001145,...,-112.0,4.543,0.149,-0.088,1.357,0.357,-0.069,295.88855,46.693659,15.322
max,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,...,0.0,5.364,1.472,0.0,180.013,25.956,0.0,301.72076,52.33601,19.065


### Target variable distribution

In [7]:
# Number of rows for each value of the target column `koi_disposition`.
df['koi_disposition'].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

### Correlation

In [8]:
# Pairwise Pearson correlation between the numeric features.
# `koi_disposition` is a string (object) column, so restrict the frame to
# numeric dtypes explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr and would raise on the string values instead.
df.select_dtypes(include='number').corr(method='pearson')

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
koi_fpflag_nt,1.0,-0.241148,0.003062,0.05079,0.408724,0.353231,-0.353231,0.249381,0.250697,-0.250697,...,-0.149565,-0.128576,0.053214,-0.140248,0.045535,0.0981,-0.062283,0.03092,-0.04651,-0.101643
koi_fpflag_ss,-0.241148,1.0,0.144516,0.101695,-0.155964,-0.134208,0.134208,-0.111469,-0.177043,0.177043,...,-0.185036,-0.053394,0.1189,-0.0618,0.032638,0.030783,-0.034283,0.08801,-0.053471,-0.006009
koi_fpflag_co,0.003062,0.144516,1.0,0.53828,-0.164792,-0.084351,0.084351,-0.121452,-0.023107,0.023107,...,-0.121247,-0.012101,-0.019921,-0.057406,0.032129,0.029239,-0.016611,0.159988,-0.094027,0.067023
koi_fpflag_ec,0.05079,0.101695,0.53828,1.0,-0.138946,-0.079903,0.079903,-0.09976,0.013635,-0.013635,...,-0.083607,0.003066,-0.006111,-0.052307,0.022411,0.030085,-0.01197,0.069693,-0.045495,0.047932
koi_period,0.408724,-0.155964,-0.164792,-0.138946,1.0,0.674842,-0.674842,0.655186,0.272529,-0.272529,...,-0.036036,-0.05541,-0.006147,-0.168086,0.01759,0.041289,-0.008776,-0.030675,0.011226,-0.045959
koi_period_err1,0.353231,-0.134208,-0.084351,-0.079903,0.674842,1.0,-1.0,0.469613,0.472034,-0.472034,...,-0.055134,-0.014447,0.005737,-0.140277,0.0151,0.033152,-0.00889,-0.005353,0.007145,-0.034097
koi_period_err2,-0.353231,0.134208,0.084351,0.079903,-0.674842,-1.0,1.0,-0.469613,-0.472034,0.472034,...,0.055134,0.014447,-0.005737,0.140277,-0.0151,-0.033152,0.00889,0.005353,-0.007145,0.034097
koi_time0bk,0.249381,-0.111469,-0.121452,-0.09976,0.655186,0.469613,-0.469613,1.0,0.162845,-0.162845,...,-0.030101,-0.001929,-0.03189,-0.107467,-0.004831,0.006444,0.011434,-0.016981,0.010483,0.021898
koi_time0bk_err1,0.250697,-0.177043,-0.023107,0.013635,0.272529,0.472034,-0.472034,0.162845,1.0,-1.0,...,-0.056111,-0.046285,0.035131,-0.11673,0.047142,0.068065,-0.042652,-0.003804,-0.009188,-0.041119
koi_time0bk_err2,-0.250697,0.177043,0.023107,-0.013635,-0.272529,-0.472034,0.472034,-0.162845,-1.0,1.0,...,0.056111,0.046285,-0.035131,0.11673,-0.047142,-0.068065,0.042652,0.003804,0.009188,0.041119


## Data cleaning

### Removing null values

In [9]:
# Count the missing values in each column.
df.isna().sum()

koi_disposition      0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_tce_plnt_num     0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
koi_slogg_err2       0
koi_srad             0
koi_srad_err1        0
koi_srad_err2        0
ra                   0
dec                  0
koi_kepmag           0
dtype: int64

In [10]:
# Drop every row that contains at least one missing value, then show how many
# rows remain.
df = df.dropna(axis='index', how='any')
df.shape[0]

6991

## Select features (columns) for model
