# Importing libraries

In [2]:
#import all the necessary libraries to read the data and build neural network
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Reading Data

In [3]:
#read the data
planet_info = pd.read_csv('planetdata.csv') 

In [4]:
planet_info.shape

(1522, 39)

In [5]:
# list the last two rows of the data frame to have glimpse of the *data*
planet_info.tail(2)

Unnamed: 0,hostnames,fpl_letter,fst_mass,fst_masserr1,fst_masserr2,fst_age,fst_ageerr1,fst_ageerr2,fst_met,fst_meterr1,...,fpl_orbpererr2,fpl_eccen,fpl_eccenerr1,fpl_eccenerr2,fpl_rade,fpl_radeerr1,fpl_radeerr2,fpl_dens,fpl_denserr1,fpl_denserr2
1520,ups And,d,1.3,-,-,5.0,-,-,0.04,0.03,...,-0.57,0.2987,0.0072,-0.0072,13.0,-,-,3.28,-,-
1521,xi Aql,b,2.2,-,-,7.1,3.6,-3.6,-0.205,0.039,...,-0.25,0.0,-,-,13.2,-,-,2.13,-,-


# Data Preprocessing

In [6]:
#seems that fpl_letter won't have any significance to the p value
planet_info = planet_info.drop(['fpl_letter','hostnames'], axis = 1)

In [7]:
#convert colums containing Y and N into numeric 1 and 0 to make it
#suitable for feeding into the neural network
def binary_text_to_integer_class(value):
  if value == "Y":
    return 1
  else:
    return 0


In [8]:
#change HJflag and insamp into numeric using above function
planet_info['HJflag'] = planet_info['HJflag'].apply(binary_text_to_integer_class)
planet_info['insamp'] = planet_info['insamp'].apply(binary_text_to_integer_class)

In [9]:
planet_info.head(2)

Unnamed: 0,fst_mass,fst_masserr1,fst_masserr2,fst_age,fst_ageerr1,fst_ageerr2,fst_met,fst_meterr1,fst_meterr2,fst_dist,...,fpl_orbpererr2,fpl_eccen,fpl_eccenerr1,fpl_eccenerr2,fpl_rade,fpl_radeerr1,fpl_radeerr2,fpl_dens,fpl_denserr1,fpl_denserr2
0,2.7,0.3,-0.3,-,-,-,-0.35,0.09,-0.09,93.37,...,-0.32,0.231,0.005,-0.005,12.1,-,-,19.1,-,-
1,2.78,0.69,-0.69,1.56,0.54,-0.54,-0.02,-,-,125.72,...,-3.2,0.08,0.03,-0.03,12.3,-,-,13.8,-,-


# Handling missing values

In [10]:
value = planet_info['fst_ageerr2'][0]

In [11]:
planet_info.shape

(1522, 37)

In [12]:
for col in planet_info.columns:
    print("Column name {} : \t Number of missing values -> {} \t".format(col,len(planet_info[planet_info[col] == value])))

Column name fst_mass : 	 Number of missing values -> 2 	
Column name fst_masserr1 : 	 Number of missing values -> 180 	
Column name fst_masserr2 : 	 Number of missing values -> 192 	
Column name fst_age : 	 Number of missing values -> 349 	
Column name fst_ageerr1 : 	 Number of missing values -> 477 	
Column name fst_ageerr2 : 	 Number of missing values -> 477 	
Column name fst_met : 	 Number of missing values -> 64 	
Column name fst_meterr1 : 	 Number of missing values -> 424 	
Column name fst_meterr2 : 	 Number of missing values -> 424 	
Column name fst_dist : 	 Number of missing values -> 0 	
Column name fst_disterr1 : 	 Number of missing values -> 36 	
Column name fst_disterr2 : 	 Number of missing values -> 36 	
Column name Nss : 	 Number of missing values -> 0 	
Column name Pnull : 	 Number of missing values -> 0 	
Column name BIC1 : 	 Number of missing values -> 0 	
Column name rhoN : 	 Number of missing values -> 0 	
Column name Phigh : 	 Number of missing values -> 0 	
Column 

In [13]:
planet_info.dtypes

fst_mass           object
fst_masserr1       object
fst_masserr2       object
fst_age            object
fst_ageerr1        object
fst_ageerr2        object
fst_met            object
fst_meterr1        object
fst_meterr2        object
fst_dist          float64
fst_disterr1       object
fst_disterr2       object
Nss                 int64
Pnull             float64
BIC1              float64
rhoN              float64
Phigh             float64
HJflag              int64
insamp              int64
fpl_bmasse         object
fpl_bmasseerr1     object
fpl_bmasseerr2     object
fpl_smax           object
fpl_smaxerr1       object
fpl_smaxerr2       object
fpl_orbper         object
fpl_orbpererr1     object
fpl_orbpererr2     object
fpl_eccen          object
fpl_eccenerr1      object
fpl_eccenerr2      object
fpl_rade           object
fpl_radeerr1       object
fpl_radeerr2       object
fpl_dens           object
fpl_denserr1       object
fpl_denserr2       object
dtype: object

In [14]:
planet_info = planet_info.replace(to_replace ="-",
                            value =0.0)


In [15]:
planet_info = planet_info.apply(pd.to_numeric)

In [16]:
mean_value = planet_info['Phigh'].mean()

In [17]:
mean_value

0.6215297609721049

# Seperating X's and Y's

In [18]:
#split the data into training data and outcome
#X_train holds features 
#Y_train holds outcome
X_train = planet_info.loc[:, planet_info.columns != 'Phigh']
Y_train = planet_info['Phigh']

# Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_train = np.asarray(X_train).astype('float32')
y_train = np.asarray(y_train).astype('float32')
X_test = np.asarray(X_test).astype('float32')
y_test = np.asarray(y_test).astype('float32')

# Building model

In [20]:
# define base model with 2 layers 
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=36, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

# Training Model

In [21]:
model = baseline_model()
history = model.fit(X_train,y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
history.history

{'loss': [2723.95947265625,
  166.8605499267578,
  172.99090576171875,
  16.481586456298828,
  4.990438938140869,
  2.916015625,
  2.8835690021514893,
  2.3328676223754883,
  1.7286638021469116,
  1.1784977912902832]}

# Testing Model

In [23]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(X_test[:3])
print("predictions shape:", predictions.shape)

Evaluate on test data
test loss, test acc: 4.913088798522949
Generate predictions for 3 samples
predictions shape: (3, 1)


In [24]:
predictions

array([[0.34447014],
       [0.8739035 ],
       [0.7760011 ]], dtype=float32)

# Cross Validation

In [25]:
#build keras regressor model and train it for 25 epochs with batch size 32 
#and use kfold cross validation
estimator = KerasRegressor(build_fn=baseline_model, epochs=25, batch_size=32, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Baseline: -13.17 (36.87) MSE


In [26]:
results  

array([  -2.21823144,   -0.4620406 ,   -0.20647293,   -1.31697178,
         -0.6218223 ,   -1.10014558,   -0.58182871, -123.76695251,
         -0.2712402 ,   -1.18901873])