In [208]:
# Update scikit-learn to prevent version mismatches
# (the PyPI package is named `scikit-learn`; `sklearn` is only the import name)
# %pip install --upgrade scikit-learn

In [209]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [210]:
import pandas as pd
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

# Read the CSV and Perform Basic Data Cleaning

In [211]:
# Load the raw table, discard the columns in which every value is null, and
# then discard every row that still contains at least one null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
# Show how many rows remain under each disposition label.
df["koi_disposition"].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [212]:
# Split the cleaned table by label: df_drop holds the rows whose disposition is
# not CANDIDATE and is used for modelling; df_pred holds only the CANDIDATE
# rows (presumably the rows to predict on later -- confirm against later cells).
# .copy() makes both frames independent of df, so adding or replacing columns
# on them afterwards does not raise SettingWithCopyWarning.
df_drop = df.loc[df["koi_disposition"] != "CANDIDATE"].copy()
df_pred = df.loc[df["koi_disposition"] == "CANDIDATE"].copy()

# Feature matrix: every column except the label. The axis is spelled out with
# `columns=` because the positional form drop(label, 1) was removed in pandas 2.0.
X = df_drop.drop(columns="koi_disposition")

# Target variable: dummy-encode the label and drop the first category, which
# leaves a one-column DataFrame flagging the remaining category.
y = pd.get_dummies(df_drop[["koi_disposition"]], drop_first=True)

In [213]:
# Replace the text label in df_drop with its dummy encoding so the label can
# take part in the correlation analysis. With drop_first=True a single
# indicator column remains (0 for CONFIRMED, 1 for FALSE POSITIVE in the
# preview below); .iloc[:, 0] extracts it as a Series.
# .assign() builds a new frame rather than writing into the existing one, so
# this cell works without SettingWithCopyWarning whether or not df_drop is an
# independent copy of df.
df_drop = df_drop.assign(
    koi_disposition=pd.get_dummies(
        df_drop[["koi_disposition"]], drop_first=True
    ).iloc[:, 0]
)
df_drop.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,1,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [214]:
# Pairwise correlation matrix of df_drop's columns (Pearson is the pandas default).
cor = df_drop.corr()
# Absolute correlation of every column with the encoded output variable.
cor_target = cor.loc[:, "koi_disposition"].abs()

In [215]:
# Keep only the columns whose absolute correlation with the target exceeds 0.3.
relevant_features = cor_target.loc[cor_target.gt(0.3)]
relevant_features

koi_disposition    1.000000
koi_fpflag_nt      0.349495
koi_fpflag_ss      0.471960
koi_fpflag_co      0.427765
koi_fpflag_ec      0.317279
koi_steff_err1     0.442792
koi_steff_err2     0.385086
Name: koi_disposition, dtype: float64

In [216]:
# Check how strongly the selected features correlate with one another so that
# redundant ones can be left out and the model load reduced. Drop koi_steff_err2.
# NOTE(review): these correlations are computed on df (CANDIDATE rows included),
# whereas the feature selection above used df_drop -- confirm this is intended.
print(
    *(
        df[list(pair)].corr()
        for pair in (
            ("koi_fpflag_nt", "koi_fpflag_ss"),
            ("koi_fpflag_co", "koi_fpflag_ec"),
            ("koi_steff_err1", "koi_steff_err2"),
        )
    ),
    sep="\n",
)

               koi_fpflag_nt  koi_fpflag_ss
koi_fpflag_nt       1.000000      -0.241148
koi_fpflag_ss      -0.241148       1.000000
               koi_fpflag_co  koi_fpflag_ec
koi_fpflag_co        1.00000        0.53828
koi_fpflag_ec        0.53828        1.00000
                koi_steff_err1  koi_steff_err2
koi_steff_err1        1.000000       -0.836126
koi_steff_err2       -0.836126        1.000000


In [217]:
# Restrict the feature matrix to the selected columns, then use sklearn's
# `train_test_split` to split the data into training and testing sets
# (random_state is fixed so the split is repeatable).
X = X.loc[
    :,
    [
        "koi_fpflag_nt",
        "koi_fpflag_ss",
        "koi_fpflag_co",
        "koi_fpflag_ec",
        "koi_steff_err1",
    ],
]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a linear regression on the training split and predict the test split.
model = LinearRegression().fit(X_train, y_train)
predicted = model.predict(X_test)


In [218]:
# Evaluate the held-out predictions: mean squared error and R-squared.
mse = mean_squared_error(y_true=y_test, y_pred=predicted)
r2 = r2_score(y_true=y_test, y_pred=predicted)

In [219]:
# Report both metrics, one per line, then display the model's own score on the
# test split (in the output below it equals the R-squared printed above).
print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2 ): {r2}",
    sep="\n",
)
model.score(X_test, y_test)

Mean Squared Error (MSE): 0.06342212740075791
R-squared (R2 ): 0.7174188039843015


0.7174188039843015