In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.discriminant_analysis import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# read in the Kessler Objects of Interest (KOI) dataset
# the first 53 lines are description
df = pd.read_csv('../data/koi.csv', skiprows=53)
df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


Making an assumption here that "CANDIDATE" doesn't tell us very much.
We want to train a model to predict whether a row will be "CONFIRMED" or a "FALSE POSITIVE"

In [99]:
# remove the "CANDIDATE" rows
df = df[df['koi_disposition'] != 'CANDIDATE']

We want to try and use the information we have about the exoplanet to predict the **koi_pdisposition** field.

In [100]:
# these are the columns we are interested in using
X = df[['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad', 'koi_kepmag']]

# replace missing values with the mean of the column
X = X.fillna(X.mean())

# we want to predict if the disposition is CONFIRMED
y = df['koi_disposition'] == 'CONFIRMED'


Feature Engineering

In [101]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [102]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Linear Regression

In [103]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test) > 0.5
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Linear Regression accuracy: {100 * lr_accuracy:.2f}%')


Linear Regression accuracy: 79.48%


## Linear Regression with Polynomial Features

In [104]:
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)

lr_wp_model = LinearRegression()
lr_wp_model.fit(X_train_poly, y_train)

X_test_poly = poly_features.transform(X_test)
y_pred = lr_wp_model.predict(X_test_poly) >= 0.5
lr_wp_accuracy = accuracy_score(y_test, y_pred)
print(f'Linear Regression with Polynomial Features accuracy: {100 * lr_wp_accuracy:.2f}%')

Linear Regression with Polynomial Features accuracy: 83.92%


## XGBoost

In [105]:
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost accuracy: {100 * xgb_accuracy:.2f}%')


XGBoost accuracy: 92.71%


## XGBoost with Polynomial Features

In [106]:
xgb_model_poly = xgb.XGBClassifier(eval_metric='logloss')
xgb_model_poly.fit(X_train_poly, y_train)

y_pred = xgb_model_poly.predict(X_test_poly)
xgb_wp_accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost accuracy: {100 * xgb_wp_accuracy:.2f}%')

XGBoost accuracy: 92.71%


## Neural Network

In [117]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

nn_model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

nn_model.fit(X_train, y_train, epochs=100, validation_split=0.2)
nn_loss, nn_accuracy = nn_model.evaluate(X_test, y_test)

print(f"TensorFlow Neural Network accuracy: {100 * nn_accuracy:.2f}%")

Epoch 1/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8194 - loss: 0.3937 - val_accuracy: 0.8465 - val_loss: 0.3413
Epoch 2/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8660 - loss: 0.3205 - val_accuracy: 0.8795 - val_loss: 0.2900
Epoch 3/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8653 - loss: 0.3028 - val_accuracy: 0.8795 - val_loss: 0.2838
Epoch 4/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8792 - loss: 0.2918 - val_accuracy: 0.8945 - val_loss: 0.2686
Epoch 5/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8740 - loss: 0.2915 - val_accuracy: 0.9040 - val_loss: 0.2560
Epoch 6/100
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8820 - loss: 0.2787 - val_accuracy: 0.8936 - val_loss: 0.2639
Epoch 7/100
[1m133/13

In [108]:
pd.DataFrame([
    ["LinearRegression:", lr_accuracy],
    ["LinearRegression with Polynomial Features:", lr_wp_accuracy],
    ["XGBoost:", xgb_accuracy],
    ["XGBoost with Polynomial Features:", xgb_wp_accuracy],
    ["Neural Network:", nn_accuracy]
])

Unnamed: 0,0,1
0,LinearRegression:,0.794815
1,LinearRegression with Polynomial Features:,0.839192
2,XGBoost:,0.927065
3,XGBoost with Polynomial Features:,0.927065
4,Neural Network:,0.884007
