In [5]:
# Pin NumPy's global random state so NumPy-driven randomness in this notebook
# repeats from run to run.
# NOTE(review): only NumPy is seeded here; TensorFlow/Keras keep their own
# random state -- confirm whether that needs seeding too for full reproducibility.
from numpy.random import seed
seed(seed=1)

# Data Pre-Processing


In [6]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

In [7]:
import numpy as np
import pandas as pd

In [8]:
# Load the raw table and clean it in a single chain:
#   1. drop the listed columns, which are not used as model inputs
#      (presumably identifiers and disposition-derived fields -- verify
#      against the dataset documentation),
#   2. drop columns in which every value is null,
#   3. drop any remaining row that contains a null.
df = (
    pd.read_csv("cumulative.csv")
    .drop(
        columns=[
            "rowid",
            "kepid",
            "kepoi_name",
            "kepler_name",
            "koi_score",
            "koi_pdisposition",
            "koi_tce_delivname",
        ]
    )
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [9]:
# (rows, columns) of the cleaned frame
df.shape

(8744, 41)

In [10]:
# Rewrite the "FALSE POSITIVE" label as "False_Positive" (no space) in place,
# then display the target column to confirm the change.
false_positive_rows = df["koi_disposition"].eq("FALSE POSITIVE")
df.loc[false_positive_rows, "koi_disposition"] = "False_Positive"
df["koi_disposition"]

0            CONFIRMED
1            CONFIRMED
2       False_Positive
3       False_Positive
4            CONFIRMED
5            CONFIRMED
6            CONFIRMED
7            CONFIRMED
8       False_Positive
9            CONFIRMED
10           CONFIRMED
11           CONFIRMED
12           CONFIRMED
13           CONFIRMED
14      False_Positive
15      False_Positive
16      False_Positive
17      False_Positive
18           CONFIRMED
19           CONFIRMED
20      False_Positive
21           CONFIRMED
22           CONFIRMED
23           CONFIRMED
24      False_Positive
25           CONFIRMED
26           CONFIRMED
27           CONFIRMED
28      False_Positive
29      False_Positive
             ...      
9532    False_Positive
9533         CANDIDATE
9535    False_Positive
9536    False_Positive
9537    False_Positive
9538    False_Positive
9539         CANDIDATE
9540         CONFIRMED
9541    False_Positive
9542         CANDIDATE
9543    False_Positive
9544         CANDIDATE
9545    Fal

In [11]:
# Sanity check: relabelling only changes values, so the shape should be unchanged
df.shape

(8744, 41)

In [12]:
# Separate the target from the features, then hold out a test set.
# Stratifying on y keeps the class proportions the same in both splits;
# no test_size is given, so scikit-learn's default split is used.
from sklearn.model_selection import train_test_split

X = df.drop(columns=["koi_disposition"])
y = df["koi_disposition"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [13]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same scaling to the test split so no test information leaks into the fit.
# Note: despite the `_selected` suffix, no feature selection happens here --
# these are simply the scaled feature matrices.
X_scaler = MinMaxScaler()
X_train_selected = X_scaler.fit_transform(X_train)
X_test_selected = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


In [23]:
# Encode the string dispositions as integer class ids. The encoder is fit on
# the training labels only and then applied to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot encode the class ids for categorical_crossentropy. The class count
# is passed explicitly: without it, to_categorical infers the width from the
# largest id present in each array, so the train and test matrices could end
# up with different widths if a split lacked the highest-numbered class.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

In [24]:
# (training rows, number of classes) of the one-hot encoded training targets
y_train_categorical.shape

(6558, 3)

# Create and Train DL Model using Keras

In [25]:
# Create an empty sequential model. The layer cells below append to it with
# `model.add`, so re-run this cell first whenever those cells are re-run;
# otherwise the layers are stacked onto the existing model a second time.
model = Sequential()

In [31]:
# Add the first hidden layer (30 ReLU units). Its input dimension is the
# number of feature columns in the scaled training data (40 for this dataset),
# read from the array itself so it stays correct if the feature set changes.
model.add(Dense(units=30, activation='relu', input_dim=X_train_selected.shape[1]))

In [32]:
# Second hidden layer: 100 ReLU units
model.add(Dense(100, activation="relu"))
# Output layer: one softmax unit per disposition class (3 classes)
model.add(Dense(3, activation="softmax"))

In [33]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# targets are one-hot encoded), and accuracy reported alongside the loss
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [35]:
# Train on the scaled training features and one-hot targets for 60 epochs,
# with shuffling enabled and one summary line logged per epoch (verbose=2)
model.fit(
    x=X_train_selected,
    y=y_train_categorical,
    epochs=60,
    verbose=2,
    shuffle=True,
)

Epoch 1/60
 - 2s - loss: 0.3987 - acc: 0.7658
Epoch 2/60
 - 2s - loss: 0.3930 - acc: 0.7807
Epoch 3/60
 - 2s - loss: 0.3737 - acc: 0.8009
Epoch 4/60
 - 2s - loss: 0.3654 - acc: 0.8103
Epoch 5/60
 - 2s - loss: 0.3617 - acc: 0.8167
Epoch 6/60
 - 2s - loss: 0.3532 - acc: 0.8216
Epoch 7/60
 - 2s - loss: 0.3544 - acc: 0.8243
Epoch 8/60
 - 2s - loss: 0.3478 - acc: 0.8265
Epoch 9/60
 - 2s - loss: 0.3455 - acc: 0.8300
Epoch 10/60
 - 2s - loss: 0.3396 - acc: 0.8350
Epoch 11/60
 - 2s - loss: 0.3319 - acc: 0.8481
Epoch 12/60
 - 2s - loss: 0.3237 - acc: 0.8503
Epoch 13/60
 - 2s - loss: 0.3292 - acc: 0.8484
Epoch 14/60
 - 2s - loss: 0.3181 - acc: 0.8564
Epoch 15/60
 - 2s - loss: 0.3156 - acc: 0.8570
Epoch 16/60
 - 2s - loss: 0.3059 - acc: 0.8658
Epoch 17/60
 - 2s - loss: 0.3025 - acc: 0.8678
Epoch 18/60
 - 2s - loss: 0.3066 - acc: 0.8611
Epoch 19/60
 - 2s - loss: 0.2975 - acc: 0.8716
Epoch 20/60
 - 2s - loss: 0.2914 - acc: 0.8734
Epoch 21/60
 - 2s - loss: 0.2910 - acc: 0.8742
Epoch 22/60
 - 2s - lo

<tensorflow.python.keras.callbacks.History at 0x1a43880588>

# Save the Trained Model

In [36]:
# Persist the trained model to an HDF5 file in the working directory
model.save(filepath="kepler_trained.h5")

# Evaluate the Model

In [37]:
# Reload the model from the saved file so the evaluation below runs against
# the persisted artifact rather than the in-memory object
from tensorflow.keras.models import load_model

model = load_model(filepath="kepler_trained.h5")

In [38]:
# Evaluate the model on the held-out test data (scaled features, one-hot targets)
model_loss, model_accuracy = model.evaluate(
    x=X_test_selected,
    y=y_test_categorical,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 1s - loss: 0.2824 - acc: 0.8783
Loss: 0.28236591123378507, Accuracy: 0.8783165812492371


In [39]:
# Take the first scaled test row and add a leading batch axis, giving the
# (1, n_features) shape that the model expects for a single sample
test = X_test_selected[0][np.newaxis, :]
test.shape

(1, 40)

In [40]:
# Make a prediction for the single test row. The true label of this row is
# "False_Positive", which the label encoder maps to class id 2.
# `Sequential.predict_classes` is not available in newer TensorFlow releases,
# so take the argmax over the softmax probabilities instead; for a
# multi-class softmax output this yields the same class ids.
print(f"Predicted class: {np.argmax(model.predict(test), axis=-1)}")

Predicted class: [2]


In [42]:
# Predict class ids for the first five test rows and map them back to their
# string labels. The argmax over the softmax probabilities replaces
# `Sequential.predict_classes`, which is not available in newer TensorFlow
# releases; it yields the same class ids for a multi-class softmax output.
encoded_predictions = np.argmax(model.predict(X_test_selected[:5]), axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [44]:
# Show the predicted labels next to the true labels of the same five test rows
print(f"Predicted Classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test.iloc[:5])}")

Predicted Classes: ['False_Positive' 'CONFIRMED' 'CONFIRMED' 'CANDIDATE' 'CONFIRMED']
Actual Labels: ['False_Positive', 'CANDIDATE', 'CONFIRMED', 'CANDIDATE', 'CANDIDATE']
