In [1]:
import pandas as pd
import tensorflow as tf

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("sources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
#check available columns

print(list(df))

# Set features. This will also be used as your x values.
X = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_duration', 'koi_depth']]

['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
y = df['koi_disposition']

# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_duration,koi_depth
6122,0,0,0,0,6.768901,3.616,123.1
6370,0,1,0,1,0.733726,2.309,114.6
2879,1,0,0,0,7.652707,79.8969,641.1
107,0,0,0,0,7.953547,2.6312,875.4
29,0,0,0,0,4.959319,2.22739,9802.0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale your data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)
# Transform the training and testing data using the X_scaler

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#create label encoder and y  encoded train and test
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

#convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=50, activation='relu', input_dim=7),
    tf.keras.layers.Dense(units=3, activation='softmax')
])

# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 50)                400       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 153       
Total params: 553
Trainable params: 553
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

model_loss, model_accuracy = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(
    f"Train Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/60
5243/5243 - 1s - loss: 0.7045 - acc: 0.6821
Epoch 2/60
5243/5243 - 0s - loss: 0.4185 - acc: 0.7828
Epoch 3/60
5243/5243 - 0s - loss: 0.3830 - acc: 0.7887
Epoch 4/60
5243/5243 - 0s - loss: 0.3749 - acc: 0.7927
Epoch 5/60
5243/5243 - 0s - loss: 0.3714 - acc: 0.7961
Epoch 6/60
5243/5243 - 0s - loss: 0.3688 - acc: 0.7994
Epoch 7/60
5243/5243 - 0s - loss: 0.3673 - acc: 0.7967
Epoch 8/60
5243/5243 - 0s - loss: 0.3652 - acc: 0.7997
Epoch 9/60
5243/5243 - 0s - loss: 0.3637 - acc: 0.8047
Epoch 10/60
5243/5243 - 0s - loss: 0.3632 - acc: 0.8005
Epoch 11/60
5243/5243 - 0s - loss: 0.3623 - acc: 0.8011
Epoch 12/60
5243/5243 - 0s - loss: 0.3610 - acc: 0.8058
Epoch 13/60
5243/5243 - 0s - loss: 0.3603 - acc: 0.8020
Epoch 14/60
5243/5243 - 0s - loss: 0.3591 - acc: 0.8013
Epoch 15/60
5243/5243 - 0s - loss: 0.3580 - acc: 0.8022
Epoch 16/60
5243/5243 - 0s - loss: 0.3588 - acc: 0.7992
Epoch 17/60
5243/5243 - 0s - loss: 0.3576 - acc: 0.8062
Epoch 18/60
5243/5243 - 0s - loss: 0.3568 - acc: 0.8064
E

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [12]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    # create model
    model = Sequential()
    model.add(Dense(units=100, activation='relu', input_dim=7))
    model.add(Dense(units=100, activation='relu'))
    model.add(Dense(units=3, activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# create model
model = KerasClassifier(build_fn=create_model, verbose=2)

# Create the GridSearchCV model
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train_categorical)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
5243/5243 - 1s - loss: 0.3675 - acc: 0.8140
Epoch 2/100
5243/5243 - 0s - loss: 0.2509 - acc: 0.8571
Epoch 3/100
5243/5243 - 0s - loss: 0.2446 - acc: 0.8665
Epoch 4/100
5243/5243 - 0s - loss: 0.2431 - acc: 0.8620
Epoch 5/100
5243/5243 - 0s - loss: 0.2408 - acc: 0.8658
Epoch 6/100
5243/5243 - 0s - loss: 0.2396 - acc: 0.8690
Epoch 7/100
5243/5243 - 0s - loss: 0.2390 - acc: 0.8652
Epoch 8/100
5243/5243 - 0s - loss: 0.2391 - acc: 0.8678
Epoch 9/100
5243/5243 - 0s - loss: 0.2371 - acc: 0.8710
Epoch 10/100
5243/5243 - 0s - loss: 0.2369 - acc: 0.8680
Epoch 11/100
5243/5243 - 0s - loss: 0.2362 - acc: 0.8688
Epoch 12/100
5243/5243 - 0s - loss: 0.2363 - acc: 0.8699
Epoch 13/100
5243/5243 - 0s - loss: 0.2360 - acc: 0.8670
Epoch 14/100
5243/5243 - 0s - loss: 0.2346 - acc: 0.8725
Epoch 15/100
5243/5243 - 0s - loss: 0.2342 - acc: 0.8718
Epoch 16/100
5243/5243 - 0s - loss: 0.2345 - acc: 0.8716
Epo

In [13]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.871956 using {'batch_size': 100, 'epochs': 100}
0.864454 (0.004112) with: {'batch_size': 10, 'epochs': 10}
0.869540 (0.003862) with: {'batch_size': 10, 'epochs': 50}
0.864391 (0.006521) with: {'batch_size': 10, 'epochs': 100}
0.868714 (0.001166) with: {'batch_size': 20, 'epochs': 10}
0.871193 (0.001880) with: {'batch_size': 20, 'epochs': 50}
0.866107 (0.008165) with: {'batch_size': 20, 'epochs': 100}
0.862483 (0.006255) with: {'batch_size': 40, 'epochs': 10}
0.864581 (0.009234) with: {'batch_size': 40, 'epochs': 50}
0.868777 (0.002605) with: {'batch_size': 40, 'epochs': 100}
0.861021 (0.005304) with: {'batch_size': 60, 'epochs': 10}
0.868968 (0.001489) with: {'batch_size': 60, 'epochs': 50}
0.868460 (0.003925) with: {'batch_size': 60, 'epochs': 100}
0.865980 (0.000936) with: {'batch_size': 80, 'epochs': 10}
0.870558 (0.002766) with: {'batch_size': 80, 'epochs': 50}
0.870875 (0.005676) with: {'batch_size': 80, 'epochs': 100}
0.867697 (0.003165) with: {'batch_size': 100, 'epochs'

In [15]:
print(grid.best_params_)
print(grid.best_score_)

{'batch_size': 100, 'epochs': 100}
0.8719562558347479


In [16]:
#model comparisons

print(f"First model -> Accuracy: {model_accuracy}")
print(f"Grid model -> Accuracy: {grid.best_score_}")

First model -> Accuracy: 0.8060270547866821
Grid model -> Accuracy: 0.8719562558347479


# Save the Model

In [18]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'saved_models/.sav'
joblib.dump(model, filename)

['saved_models/DNN.sav']