In [2]:
import pandas as pd
import tensorflow as tf

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("sources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [8]:
#check available columns

print(list(df))

# Set features. This will also be used as your x values.
X = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_kepmag']]

['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']


# Create a Train Test Split

Use `koi_disposition` for the y values

In [10]:
y = df['koi_disposition']

# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_kepmag
6122,0,0,0,0,6.768901,7.38e-05,-7.38e-05,133.07724,0.00844,-0.00844,...,0.34,-0.23,1017,253.3,204.89,-103.87,10.8,1,5737,14.725
6370,0,1,0,1,0.733726,6.06e-06,-6.06e-06,132.02005,0.00795,-0.00795,...,0.23,-0.06,1867,2891.64,2253.61,-677.78,13.8,1,5855,15.77
2879,1,0,0,0,7.652707,6.54e-05,-6.54e-05,134.46038,0.00619,-0.00619,...,0.97,-0.32,989,226.81,195.16,-64.34,254.3,1,6328,13.099
107,0,0,0,0,7.953547,1.91e-05,-1.91e-05,174.66224,0.00182,-0.00182,...,0.07,-0.14,696,55.37,7.15,-10.12,38.4,1,4768,15.66
29,0,0,0,0,4.959319,5.15e-07,-5.15e-07,172.258529,8.3e-05,-8.3e-05,...,1.96,-1.46,1103,349.4,146.52,-93.21,696.5,1,5712,15.263


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
# Scale your data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)
# Transform the training and testing data using the X_scaler

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#create label encoder and y  encoded train and test
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

#convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=50, activation='relu', input_dim=30),
    tf.keras.layers.Dense(units=3, activation='softmax')
])

# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 50)                1550      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 153       
Total params: 1,703
Trainable params: 1,703
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

model_loss, model_accuracy = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(
    f"Train Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/60
5243/5243 - 1s - loss: 0.6781 - acc: 0.7269
Epoch 2/60
5243/5243 - 0s - loss: 0.3949 - acc: 0.8566
Epoch 3/60
5243/5243 - 1s - loss: 0.3245 - acc: 0.8743
Epoch 4/60
5243/5243 - 1s - loss: 0.2981 - acc: 0.8772
Epoch 5/60
5243/5243 - 0s - loss: 0.2855 - acc: 0.8802
Epoch 6/60
5243/5243 - 0s - loss: 0.2777 - acc: 0.8837
Epoch 7/60
5243/5243 - 0s - loss: 0.2711 - acc: 0.8858
Epoch 8/60
5243/5243 - 0s - loss: 0.2664 - acc: 0.8861
Epoch 9/60
5243/5243 - 0s - loss: 0.2613 - acc: 0.8888
Epoch 10/60
5243/5243 - 0s - loss: 0.2579 - acc: 0.8899
Epoch 11/60
5243/5243 - 0s - loss: 0.2547 - acc: 0.8898
Epoch 12/60
5243/5243 - 0s - loss: 0.2512 - acc: 0.8899
Epoch 13/60
5243/5243 - 1s - loss: 0.2479 - acc: 0.8907
Epoch 14/60
5243/5243 - 1s - loss: 0.2452 - acc: 0.8924
Epoch 15/60
5243/5243 - 1s - loss: 0.2420 - acc: 0.8940
Epoch 16/60
5243/5243 - 1s - loss: 0.2400 - acc: 0.8953
Epoch 17/60
5243/5243 - 0s - loss: 0.2381 - acc: 0.8957
Epoch 18/60
5243/5243 - 0s - loss: 0.2360 - acc: 0.8962
E

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [17]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    # create model
    model = Sequential()
    model.add(Dense(units=100, activation='relu', input_dim=30))
    model.add(Dense(units=100, activation='relu'))
    model.add(Dense(units=3, activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# create model
model = KerasClassifier(build_fn=create_model, verbose=2)

# Create the GridSearchCV model
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train_categorical)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50
5243/5243 - 2s - loss: 0.2478 - acc: 0.8936
Epoch 2/50
5243/5243 - 2s - loss: 0.1905 - acc: 0.9187
Epoch 3/50
5243/5243 - 2s - loss: 0.1770 - acc: 0.9259
Epoch 4/50
5243/5243 - 2s - loss: 0.1687 - acc: 0.9286
Epoch 5/50
5243/5243 - 2s - loss: 0.1624 - acc: 0.9283
Epoch 6/50
5243/5243 - 2s - loss: 0.1603 - acc: 0.9314
Epoch 7/50
5243/5243 - 1s - loss: 0.1584 - acc: 0.9332
Epoch 8/50
5243/5243 - 2s - loss: 0.1545 - acc: 0.9320
Epoch 9/50
5243/5243 - 1s - loss: 0.1517 - acc: 0.9337
Epoch 10/50
5243/5243 - 1s - loss: 0.1563 - acc: 0.9337
Epoch 11/50
5243/5243 - 2s - loss: 0.1481 - acc: 0.9363
Epoch 12/50
5243/5243 - 2s - loss: 0.1458 - acc: 0.9336
Epoch 13/50
5243/5243 - 2s - loss: 0.1519 - acc: 0.9338
Epoch 14/50
5243/5243 - 1s - loss: 0.1506 - acc: 0.9338
Epoch 15/50
5243/5243 - 1s - loss: 0.1499 - acc: 0.9362
Epoch 16/50
5243/5243 - 1s - loss: 0.1454 - acc: 0.9383
Epoch 17/50
5243/52

In [18]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.928603 using {'batch_size': 20, 'epochs': 50}
0.928285 (0.000951) with: {'batch_size': 10, 'epochs': 10}
0.924852 (0.001502) with: {'batch_size': 10, 'epochs': 50}
0.918812 (0.001730) with: {'batch_size': 10, 'epochs': 100}
0.923136 (0.001969) with: {'batch_size': 20, 'epochs': 10}
0.928603 (0.002502) with: {'batch_size': 20, 'epochs': 50}
0.918558 (0.003572) with: {'batch_size': 20, 'epochs': 100}
0.923708 (0.002988) with: {'batch_size': 40, 'epochs': 10}
0.926060 (0.002257) with: {'batch_size': 40, 'epochs': 50}
0.923072 (0.001809) with: {'batch_size': 40, 'epochs': 100}
0.923644 (0.001052) with: {'batch_size': 60, 'epochs': 10}
0.927077 (0.004122) with: {'batch_size': 60, 'epochs': 50}
0.922881 (0.005475) with: {'batch_size': 60, 'epochs': 100}
0.923899 (0.004558) with: {'batch_size': 80, 'epochs': 10}
0.927205 (0.000481) with: {'batch_size': 80, 'epochs': 50}
0.926378 (0.002168) with: {'batch_size': 80, 'epochs': 100}
0.926823 (0.000804) with: {'batch_size': 100, 'epochs': 

In [19]:
print(grid.best_params_)
print(grid.best_score_)

{'batch_size': 20, 'epochs': 50}
0.9286032242559418


In [20]:
#model comparisons

print(f"First model -> Accuracy: {model_accuracy}")
print(f"Grid model -> Accuracy: {grid.best_score_}")

First model -> Accuracy: 0.9132176041603088
Grid model -> Accuracy: 0.9286032242559418


# Save the Model

In [18]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'saved_models/.sav'
joblib.dump(model, filename)

['saved_models/DNN.sav']