In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# StandardScaler is imported from its public module, sklearn.preprocessing,
# rather than from sklearn.discriminant_analysis.
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# read in the Kepler Objects of Interest (KOI) dataset
# the first 53 lines of the file are description, so skip them
df = pd.read_csv('../data/koi.csv', skiprows=53)
df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


We assume here that rows with a "CANDIDATE" disposition don't tell us very much, so we leave them out.
We want to train a model to predict whether a row is "CONFIRMED" or a "FALSE POSITIVE".

In [7]:
# keep only the rows whose disposition is something other than "CANDIDATE"
is_candidate = df['koi_disposition'] == 'CANDIDATE'
df = df[~is_candidate]

We want to use the measurements we have for each object to predict the **koi_disposition** field.

In [8]:
# feature columns fed to the models
feature_columns = [
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'koi_kepmag',
]
X = df[feature_columns]

# fill each missing value with the mean of its column
column_means = X.mean()
X = X.fillna(column_means)

# boolean target: True where the disposition is CONFIRMED
y = df['koi_disposition'] == 'CONFIRMED'


Feature Engineering

In [9]:
# Split the data into training and test sets BEFORE scaling, so that the
# held-out rows never influence the scaler's mean and variance. Fitting the
# scaler on the full matrix would leak test-set information into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only, then apply that same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [28]:
# Fit an ordinary least-squares model on the boolean target.
lr_model = LinearRegression().fit(X_train, y_train)

# Threshold the continuous output at 0.5 to turn it into a class label.
lr_test_scores = lr_model.predict(X_test)
lr_test_predictions = lr_test_scores > 0.5

lr_accuracy = accuracy_score(y_test, lr_test_predictions)
print(f'Linear Regression accuracy: {100 * lr_accuracy:.2f}%')


Linear Regression accuracy: 79.48%


## Linear Regression with Polynomial Features

In [None]:
# Expand the features with all degree-2 terms (squares and pairwise products).
# include_bias=False drops the constant column PolynomialFeatures would
# otherwise add: LinearRegression already fits an intercept, so that column
# is redundant.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)

lr_wp_model = LinearRegression()
lr_wp_model.fit(X_train_poly, y_train)

# Apply the expansion fitted on the training split to the test split.
X_test_poly = poly_features.transform(X_test)

# Same strict 0.5 threshold as the plain linear-regression baseline.
lr_wp_test_predictions = lr_wp_model.predict(X_test_poly) > 0.5
print(f'Linear Regression with Polynomial Features accuracy: {100 * accuracy_score(y_test, lr_wp_test_predictions):.2f}%')

Linear Regression with Polynomial Features accuracy: 83.92%


array([ 3.38772287e-13, -3.22466487e-01,  5.62381771e-02, -4.42421347e-01,
       -1.06429741e-01, -1.85631378e-01, -1.57412477e+00,  1.85081659e-01,
       -9.40901172e+00, -7.29650392e-02, -8.62278554e-02, -1.94631070e-01,
       -2.48852958e-01, -1.81715114e-02,  6.89848696e-02, -2.09153666e-03,
        2.64629691e-01,  2.97375572e-02,  3.66556415e-02, -3.75965002e+00,
        1.19527221e+00, -1.91243076e+01,  3.15416247e-02, -1.97731416e-02,
        4.68358474e-02, -3.17506085e-02,  2.96961578e-02, -4.49409798e-03,
        1.77939131e-02, -1.19211126e-03,  7.30334564e-03,  6.36550249e-02,
        1.24186864e-02,  6.77549524e-02, -2.78979660e-03, -7.93224115e-03,
        1.29523270e-02, -5.46955639e-04, -2.27786867e-02,  2.20880232e-02,
        3.30985344e-03,  2.08449255e-01,  1.86149503e-01,  2.39167876e-01,
        8.62707520e-02, -4.35192293e-02, -1.47047586e-01,  1.60355384e-01,
        2.49164564e-02,  7.91735145e-02,  3.74311500e-03,  4.56068655e-02,
        9.97857837e-02, -

In [32]:

# Gradient-boosted tree classifier, evaluated with log-loss during training.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Compute the accuracy once and reuse it; the earlier bare accuracy_score(...)
# call discarded its result because it was not the cell's last expression.
xgb_accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost accuracy: {100 * xgb_accuracy:.2f}%')


XGBoost accuracy: 92.71%
