In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the raw table, then discard columns that are entirely null
# followed by every row that still contains a null value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


# Select your features (columns)

In [3]:
# Feature columns carried over from the previous random forest classifier preprocessing
feature_columns = [
    "koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec",
    "koi_duration_err1", "koi_duration_err2",
    "koi_prad", "koi_prad_err1", "koi_prad_err2",
    "koi_model_snr",
    "koi_steff_err1", "koi_steff_err2",
]
selected_features = df[feature_columns]
feature_names = selected_features.columns

# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
# assign X (data) and y (target): the selected feature columns and the
# koi_disposition label column, respectively
data = selected_features
target = df["koi_disposition"]
# print both shapes so the row counts can be compared
print(data.shape, target.shape)

(6991, 12) (6991,)


In [5]:
# split the data into training and testing
from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run;
# no test_size is passed, so scikit-learn's default proportions apply
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
# preview the training features (the row index shows the rows were shuffled by the split)
X_train

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_duration_err1,koi_duration_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_model_snr,koi_steff_err1,koi_steff_err2
6122,0,0,0,0,0.30600,-0.30600,1.24,0.34,-0.23,10.8,154,-171
6370,0,1,0,1,0.28200,-0.28200,0.86,0.23,-0.06,13.8,158,-175
2879,1,0,0,0,0.00000,0.00000,3.21,0.97,-0.32,254.3,151,-189
107,0,0,0,0,0.05950,-0.05950,2.25,0.07,-0.14,38.4,76,-85
29,0,0,0,0,0.00750,-0.00750,12.21,1.96,-1.46,696.5,77,-77
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0,1,0,0,0.00784,-0.00784,50.31,14.89,-4.97,1320.5,156,-190
5191,0,0,0,0,0.40700,-0.40700,1.97,0.34,-0.41,13.4,124,-124
5226,0,1,0,0,0.00306,-0.00306,100.03,34.46,-37.93,471.0,246,-458
5390,1,0,0,0,4.60000,-4.60000,64.00,10.45,-22.66,79.5,177,-197


# Pre-processing

Scale the data using the MinMaxScaler, perform some feature selection, label-encode the target classes, and one-hot encode the encoded labels

In [7]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same
# fitted transform to both the training and the testing features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Label-encode the target: map each class name to an integer label
from sklearn.preprocessing import LabelEncoder

# The encoder is fitted on the training labels and reused for the test labels
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [9]:
# one-hot encode the target labels, turning each row's integer class value into a one-hot array
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)
# display the encoded training targets
y_train_categorical

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

# Train the Model

Define model architecture (layers) to build neural network, compile and train model

In [10]:
# create an empty sequential model; layers are added to it in the following cells
from tensorflow.keras.models import Sequential
model = Sequential()

In [11]:
# add first hidden layer, number of inputs and number of nodes specified
from tensorflow.keras.layers import Dense

# Take the input width from the scaled training matrix instead of hard-coding it,
# so the layer stays in sync if the list of selected features changes.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 36
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

In [12]:
# add the output layer: softmax activation for classification, one unit per class (label) to predict
# The class count is read from the one-hot encoded targets instead of being hard-coded,
# so the layer width always matches the shape of the training targets.
number_classes = y_train_categorical.shape[1]
model.add(Dense(units=number_classes, activation='softmax'))

In [13]:
# model summary: prints each layer with its output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 36)                468       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 111       
Total params: 579
Trainable params: 579
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Compile the model with a loss function (categorical cross-entropy, since the
# targets are one-hot encoded), an optimizer, and accuracy as an additional metric.
# The output layer in this instance uses softmax (i.e. categorical output).
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [15]:
# fit (train) the model using the training data
# in this instance, we chose 1000 epochs, shuffle the data, and use verbose=2 to print one line per epoch
model.fit(X_train_scaled, y_train_categorical,
         epochs=1000, shuffle=True, verbose=2)

Epoch 1/1000
164/164 - 0s - loss: 0.8318 - accuracy: 0.5588
Epoch 2/1000
164/164 - 0s - loss: 0.5264 - accuracy: 0.7685
Epoch 3/1000
164/164 - 0s - loss: 0.4266 - accuracy: 0.7753
Epoch 4/1000
164/164 - 0s - loss: 0.3990 - accuracy: 0.8070
Epoch 5/1000
164/164 - 0s - loss: 0.3875 - accuracy: 0.8032
Epoch 6/1000
164/164 - 0s - loss: 0.3803 - accuracy: 0.8190
Epoch 7/1000
164/164 - 0s - loss: 0.3742 - accuracy: 0.8234
Epoch 8/1000
164/164 - 0s - loss: 0.3691 - accuracy: 0.8293
Epoch 9/1000
164/164 - 0s - loss: 0.3643 - accuracy: 0.8276
Epoch 10/1000
164/164 - 0s - loss: 0.3592 - accuracy: 0.8299
Epoch 11/1000
164/164 - 0s - loss: 0.3546 - accuracy: 0.8308
Epoch 12/1000
164/164 - 0s - loss: 0.3502 - accuracy: 0.8350
Epoch 13/1000
164/164 - 0s - loss: 0.3467 - accuracy: 0.8386
Epoch 14/1000
164/164 - 0s - loss: 0.3444 - accuracy: 0.8398
Epoch 15/1000
164/164 - 0s - loss: 0.3411 - accuracy: 0.8386
Epoch 16/1000
164/164 - 0s - loss: 0.3386 - accuracy: 0.8407
Epoch 17/1000
164/164 - 0s - loss

Epoch 135/1000
164/164 - 0s - loss: 0.3033 - accuracy: 0.8606
Epoch 136/1000
164/164 - 0s - loss: 0.3032 - accuracy: 0.8623
Epoch 137/1000
164/164 - 0s - loss: 0.3023 - accuracy: 0.8611
Epoch 138/1000
164/164 - 0s - loss: 0.3035 - accuracy: 0.8619
Epoch 139/1000
164/164 - 0s - loss: 0.3031 - accuracy: 0.8625
Epoch 140/1000
164/164 - 0s - loss: 0.3032 - accuracy: 0.8613
Epoch 141/1000
164/164 - 0s - loss: 0.3037 - accuracy: 0.8594
Epoch 142/1000
164/164 - 0s - loss: 0.3041 - accuracy: 0.8611
Epoch 143/1000
164/164 - 0s - loss: 0.3046 - accuracy: 0.8596
Epoch 144/1000
164/164 - 0s - loss: 0.3030 - accuracy: 0.8608
Epoch 145/1000
164/164 - 0s - loss: 0.3052 - accuracy: 0.8623
Epoch 146/1000
164/164 - 0s - loss: 0.3025 - accuracy: 0.8619
Epoch 147/1000
164/164 - 0s - loss: 0.3029 - accuracy: 0.8583
Epoch 148/1000
164/164 - 0s - loss: 0.3037 - accuracy: 0.8604
Epoch 149/1000
164/164 - 0s - loss: 0.3023 - accuracy: 0.8625
Epoch 150/1000
164/164 - 0s - loss: 0.3017 - accuracy: 0.8596
Epoch 15

Epoch 268/1000
164/164 - 0s - loss: 0.2997 - accuracy: 0.8625
Epoch 269/1000
164/164 - 0s - loss: 0.2992 - accuracy: 0.8623
Epoch 270/1000
164/164 - 0s - loss: 0.2992 - accuracy: 0.8631
Epoch 271/1000
164/164 - 0s - loss: 0.2987 - accuracy: 0.8615
Epoch 272/1000
164/164 - 0s - loss: 0.3002 - accuracy: 0.8617
Epoch 273/1000
164/164 - 0s - loss: 0.2992 - accuracy: 0.8611
Epoch 274/1000
164/164 - 0s - loss: 0.2988 - accuracy: 0.8589
Epoch 275/1000
164/164 - 0s - loss: 0.3009 - accuracy: 0.8604
Epoch 276/1000
164/164 - 0s - loss: 0.2990 - accuracy: 0.8610
Epoch 277/1000
164/164 - 0s - loss: 0.2994 - accuracy: 0.8589
Epoch 278/1000
164/164 - 0s - loss: 0.2988 - accuracy: 0.8638
Epoch 279/1000
164/164 - 0s - loss: 0.3019 - accuracy: 0.8606
Epoch 280/1000
164/164 - 0s - loss: 0.2988 - accuracy: 0.8625
Epoch 281/1000
164/164 - 0s - loss: 0.2999 - accuracy: 0.8617
Epoch 282/1000
164/164 - 0s - loss: 0.2995 - accuracy: 0.8598
Epoch 283/1000
164/164 - 0s - loss: 0.2995 - accuracy: 0.8585
Epoch 28

164/164 - 0s - loss: 0.2960 - accuracy: 0.8638
Epoch 401/1000
164/164 - 0s - loss: 0.2959 - accuracy: 0.8636
Epoch 402/1000
164/164 - 0s - loss: 0.2954 - accuracy: 0.8629
Epoch 403/1000
164/164 - 0s - loss: 0.2954 - accuracy: 0.8648
Epoch 404/1000
164/164 - 0s - loss: 0.2965 - accuracy: 0.8642
Epoch 405/1000
164/164 - 0s - loss: 0.2961 - accuracy: 0.8627
Epoch 406/1000
164/164 - 0s - loss: 0.2964 - accuracy: 0.8598
Epoch 407/1000
164/164 - 0s - loss: 0.2964 - accuracy: 0.8631
Epoch 408/1000
164/164 - 0s - loss: 0.2967 - accuracy: 0.8627
Epoch 409/1000
164/164 - 0s - loss: 0.2977 - accuracy: 0.8619
Epoch 410/1000
164/164 - 0s - loss: 0.2958 - accuracy: 0.8600
Epoch 411/1000
164/164 - 0s - loss: 0.2951 - accuracy: 0.8655
Epoch 412/1000
164/164 - 0s - loss: 0.2956 - accuracy: 0.8652
Epoch 413/1000
164/164 - 0s - loss: 0.2961 - accuracy: 0.8621
Epoch 414/1000
164/164 - 0s - loss: 0.2949 - accuracy: 0.8621
Epoch 415/1000
164/164 - 0s - loss: 0.2972 - accuracy: 0.8636
Epoch 416/1000
164/164 

Epoch 533/1000
164/164 - 0s - loss: 0.2931 - accuracy: 0.8636
Epoch 534/1000
164/164 - 0s - loss: 0.2934 - accuracy: 0.8652
Epoch 535/1000
164/164 - 0s - loss: 0.2925 - accuracy: 0.8650
Epoch 536/1000
164/164 - 0s - loss: 0.2943 - accuracy: 0.8636
Epoch 537/1000
164/164 - 0s - loss: 0.2935 - accuracy: 0.8640
Epoch 538/1000
164/164 - 0s - loss: 0.2947 - accuracy: 0.8638
Epoch 539/1000
164/164 - 0s - loss: 0.2928 - accuracy: 0.8623
Epoch 540/1000
164/164 - 0s - loss: 0.2931 - accuracy: 0.8638
Epoch 541/1000
164/164 - 0s - loss: 0.2934 - accuracy: 0.8646
Epoch 542/1000
164/164 - 0s - loss: 0.2940 - accuracy: 0.8644
Epoch 543/1000
164/164 - 0s - loss: 0.2927 - accuracy: 0.8621
Epoch 544/1000
164/164 - 0s - loss: 0.2935 - accuracy: 0.8653
Epoch 545/1000
164/164 - 0s - loss: 0.2936 - accuracy: 0.8636
Epoch 546/1000
164/164 - 0s - loss: 0.2929 - accuracy: 0.8638
Epoch 547/1000
164/164 - 0s - loss: 0.2935 - accuracy: 0.8661
Epoch 548/1000
164/164 - 0s - loss: 0.2934 - accuracy: 0.8638
Epoch 54

164/164 - 0s - loss: 0.2904 - accuracy: 0.8646
Epoch 666/1000
164/164 - 0s - loss: 0.2911 - accuracy: 0.8661
Epoch 667/1000
164/164 - 0s - loss: 0.2909 - accuracy: 0.8652
Epoch 668/1000
164/164 - 0s - loss: 0.2905 - accuracy: 0.8642
Epoch 669/1000
164/164 - 0s - loss: 0.2902 - accuracy: 0.8663
Epoch 670/1000
164/164 - 0s - loss: 0.2907 - accuracy: 0.8638
Epoch 671/1000
164/164 - 0s - loss: 0.2907 - accuracy: 0.8650
Epoch 672/1000
164/164 - 0s - loss: 0.2918 - accuracy: 0.8625
Epoch 673/1000
164/164 - 0s - loss: 0.2908 - accuracy: 0.8667
Epoch 674/1000
164/164 - 0s - loss: 0.2903 - accuracy: 0.8659
Epoch 675/1000
164/164 - 0s - loss: 0.2907 - accuracy: 0.8663
Epoch 676/1000
164/164 - 0s - loss: 0.2922 - accuracy: 0.8636
Epoch 677/1000
164/164 - 0s - loss: 0.2913 - accuracy: 0.8657
Epoch 678/1000
164/164 - 0s - loss: 0.2908 - accuracy: 0.8652
Epoch 679/1000
164/164 - 0s - loss: 0.2896 - accuracy: 0.8653
Epoch 680/1000
164/164 - 0s - loss: 0.2910 - accuracy: 0.8653
Epoch 681/1000
164/164 

Epoch 798/1000
164/164 - 0s - loss: 0.2886 - accuracy: 0.8669
Epoch 799/1000
164/164 - 0s - loss: 0.2869 - accuracy: 0.8705
Epoch 800/1000
164/164 - 0s - loss: 0.2880 - accuracy: 0.8665
Epoch 801/1000
164/164 - 0s - loss: 0.2881 - accuracy: 0.8686
Epoch 802/1000
164/164 - 0s - loss: 0.2870 - accuracy: 0.8667
Epoch 803/1000
164/164 - 0s - loss: 0.2881 - accuracy: 0.8673
Epoch 804/1000
164/164 - 0s - loss: 0.2876 - accuracy: 0.8665
Epoch 805/1000
164/164 - 0s - loss: 0.2869 - accuracy: 0.8676
Epoch 806/1000
164/164 - 0s - loss: 0.2877 - accuracy: 0.8673
Epoch 807/1000
164/164 - 0s - loss: 0.2880 - accuracy: 0.8705
Epoch 808/1000
164/164 - 0s - loss: 0.2867 - accuracy: 0.8713
Epoch 809/1000
164/164 - 0s - loss: 0.2886 - accuracy: 0.8665
Epoch 810/1000
164/164 - 0s - loss: 0.2885 - accuracy: 0.8682
Epoch 811/1000
164/164 - 0s - loss: 0.2873 - accuracy: 0.8680
Epoch 812/1000
164/164 - 0s - loss: 0.2863 - accuracy: 0.8693
Epoch 813/1000
164/164 - 0s - loss: 0.2869 - accuracy: 0.8699
Epoch 81

Epoch 931/1000
164/164 - 0s - loss: 0.2827 - accuracy: 0.8735
Epoch 932/1000
164/164 - 0s - loss: 0.2836 - accuracy: 0.8735
Epoch 933/1000
164/164 - 0s - loss: 0.2829 - accuracy: 0.8737
Epoch 934/1000
164/164 - 0s - loss: 0.2816 - accuracy: 0.8739
Epoch 935/1000
164/164 - 0s - loss: 0.2826 - accuracy: 0.8703
Epoch 936/1000
164/164 - 0s - loss: 0.2829 - accuracy: 0.8692
Epoch 937/1000
164/164 - 0s - loss: 0.2825 - accuracy: 0.8745
Epoch 938/1000
164/164 - 0s - loss: 0.2835 - accuracy: 0.8705
Epoch 939/1000
164/164 - 0s - loss: 0.2824 - accuracy: 0.8716
Epoch 940/1000
164/164 - 0s - loss: 0.2827 - accuracy: 0.8692
Epoch 941/1000
164/164 - 0s - loss: 0.2817 - accuracy: 0.8741
Epoch 942/1000
164/164 - 0s - loss: 0.2827 - accuracy: 0.8709
Epoch 943/1000
164/164 - 0s - loss: 0.2833 - accuracy: 0.8709
Epoch 944/1000
164/164 - 0s - loss: 0.2833 - accuracy: 0.8716
Epoch 945/1000
164/164 - 0s - loss: 0.2820 - accuracy: 0.8713
Epoch 946/1000
164/164 - 0s - loss: 0.2827 - accuracy: 0.8741
Epoch 94

<tensorflow.python.keras.callbacks.History at 0x27e4a87d130>

In [16]:
# test ability to predict new data points (validity) by using the testing data to evaluate the model
# evaluate returns the loss followed by the compiled metric (accuracy)
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.3007 - accuracy: 0.8673
Loss: 0.3007114827632904, Accuracy: 0.8672769069671631


In [17]:
# use trained model to make predictions on the first ten test rows
# Sequential.predict_classes was deprecated and later removed from TensorFlow,
# so take the argmax over the predicted class probabilities instead; for a
# softmax output this yields the same integer class labels.
import numpy as np

class_probabilities = model.predict(X_test_scaled[:10])
encoded_predictions = np.argmax(class_probabilities, axis=-1)
# map the integer labels back to the original class names
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [18]:
# compare the predicted class names with the true labels of the same ten test rows
print(f"Predicted Classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted Classes: ['FALSE POSITIVE' 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'CONFIRMED' 'CONFIRMED' 'CANDIDATE' 'CANDIDATE'
 'FALSE POSITIVE']
Actual Labels: ['FALSE POSITIVE', 'CANDIDATE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CANDIDATE', 'CANDIDATE', 'CANDIDATE', 'FALSE POSITIVE']


In [19]:
# Side-by-side table of the first ten predictions and their true labels
y_tlist = y_test[:10].tolist()
comparison_columns = {"Predictions": prediction_labels, "Actual": y_tlist}
pred_df = pd.DataFrame(comparison_columns)
pred_df.head(10)

Unnamed: 0,Predictions,Actual
0,FALSE POSITIVE,FALSE POSITIVE
1,CANDIDATE,CANDIDATE
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,FALSE POSITIVE,FALSE POSITIVE
5,CONFIRMED,CONFIRMED
6,CONFIRMED,CANDIDATE
7,CANDIDATE,CANDIDATE
8,CANDIDATE,CANDIDATE
9,FALSE POSITIVE,FALSE POSITIVE


# Save the Model

In [20]:
# re-evaluate the trained model on the test data before saving it
# (this cell only reports loss/accuracy; it does not save or select a model)
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.3007 - accuracy: 0.8673
Loss: 0.3007114827632904, Accuracy: 0.8672769069671631


In [21]:
# save the model using the HDF5 binary format with the extension .h5
# NOTE(review): the path is relative to the working directory; confirm the models/ folder exists before running
model.save("models/exoplanets_test3_NeuralNetwork.h5")

In [22]:
# load a previously saved model from disk for comparison
# NOTE(review): this file is not created anywhere in this notebook; presumably saved by an earlier run
from tensorflow.keras.models import load_model
exo_model1 = load_model("models/exoplanets_NeuralNetwork_best.h5")

In [23]:
# evaluate the loaded model on the same test data (overwrites model_loss / model_accuracy)
model_loss, model_accuracy = exo_model1.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.2906 - accuracy: 0.8867
Loss: 0.2906394600868225, Accuracy: 0.8867276906967163


In [24]:
# reload the model saved earlier in this notebook from its .h5 file
from tensorflow.keras.models import load_model
exo_model2 = load_model("models/exoplanets_test3_NeuralNetwork.h5")

In [25]:
# evaluate the reloaded model on the same test data (overwrites model_loss / model_accuracy)
model_loss, model_accuracy = exo_model2.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.3007 - accuracy: 0.8673
Loss: 0.3007114827632904, Accuracy: 0.8672769069671631
