# Library Imports


In [2]:
#import all the necessary libraries to read the data and build neural network
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Reading data

In [3]:
#read the data
planet_info = pd.read_csv('planetdata.csv') 

In [4]:
planet_info.shape

(1522, 39)

# list the last two rows of the data frame to have glimpse of the *data*

In [5]:
planet_info.tail(2)

Unnamed: 0,hostnames,fpl_letter,fst_mass,fst_masserr1,fst_masserr2,fst_age,fst_ageerr1,fst_ageerr2,fst_met,fst_meterr1,...,fpl_orbpererr2,fpl_eccen,fpl_eccenerr1,fpl_eccenerr2,fpl_rade,fpl_radeerr1,fpl_radeerr2,fpl_dens,fpl_denserr1,fpl_denserr2
1520,ups And,d,1.3,-,-,5.0,-,-,0.04,0.03,...,-0.57,0.2987,0.0072,-0.0072,13.0,-,-,3.28,-,-
1521,xi Aql,b,2.2,-,-,7.1,3.6,-3.6,-0.205,0.039,...,-0.25,0.0,-,-,13.2,-,-,2.13,-,-


In [6]:
#seems that fpl_letter won't have any significance to the p value
planet_info = planet_info.drop(['fpl_letter','hostnames'], axis = 1)

In [7]:
#convert colums containing Y and N into numeric 1 and 0 to make it
#suitable for feeding into the neural network
def binary_text_to_integer_class(value):
  if value == "Y":
    return 1
  else:
    return 0


In [8]:
#change HJflag and insamp into numeric using above function
planet_info['HJflag'] = planet_info['HJflag'].apply(binary_text_to_integer_class)
planet_info['insamp'] = planet_info['insamp'].apply(binary_text_to_integer_class)

In [9]:
planet_info.head(2)

Unnamed: 0,fst_mass,fst_masserr1,fst_masserr2,fst_age,fst_ageerr1,fst_ageerr2,fst_met,fst_meterr1,fst_meterr2,fst_dist,...,fpl_orbpererr2,fpl_eccen,fpl_eccenerr1,fpl_eccenerr2,fpl_rade,fpl_radeerr1,fpl_radeerr2,fpl_dens,fpl_denserr1,fpl_denserr2
0,2.7,0.3,-0.3,-,-,-,-0.35,0.09,-0.09,93.37,...,-0.32,0.231,0.005,-0.005,12.1,-,-,19.1,-,-
1,2.78,0.69,-0.69,1.56,0.54,-0.54,-0.02,-,-,125.72,...,-3.2,0.08,0.03,-0.03,12.3,-,-,13.8,-,-


In [10]:
planet_info = planet_info.replace(to_replace ="-",
                            value =0.0)


In [11]:
#split the data into training data and outcome
#X_train holds features 
#Y_train holds outcome
X_train = planet_info.loc[:, planet_info.columns != 'Phigh']
Y_train = planet_info['Phigh']

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

In [13]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

In [14]:
X_train = np.asarray(X_train).astype('float32')


In [15]:
y_train = np.asarray(y_train).astype('float32')


In [16]:
# define base model with 2 layers 
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=36, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [17]:
model = baseline_model()
model.fit(X_train,y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1df5a80c88>

In [None]:
#build keras regressor model and train it for 25 epochs with batch size 32 
#and use kfold cross validation
estimator = KerasRegressor(build_fn=baseline_model, epochs=25, batch_size=32, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Baseline: -5.46 (14.14) MSE


In [None]:
results  

array([ -6.32577038,  -0.63954699,  -0.28776535,  -0.35346672,
        -0.49080363,  -2.22090602, -92.65856171,  -3.01690292,
        -0.25577083,  -0.36280063])