In [7]:
#Importing necessary Libraries

import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [8]:
import os

## Data reading and preprocessing

In [9]:
data = pd.read_csv("data_full.csv")

#Dropping unnecessary columns
delete = ["kepid","tce_plnt_num","tce_rogue_flag","tce_insol","tce_impact","tce_insol_err","tce_period_err","tce_time0bk_err","tce_impact_err","tce_duration_err","tce_depth_err","tce_prad_err","tce_eqt_err","tce_eqt_err","tce_steff_err","tce_slogg_err","tce_sradius_err"]
data_new = data.drop(delete,axis=1)

#Deleting rows having unknown labels
data_new = data_new[data_new['av_training_set'] != 'UNK']

#Deleting rows having no values
data_new.dropna(subset=['tce_period', 'tce_time0bk', 'tce_duration', 'tce_depth',
       'tce_model_snr', 'tce_prad', 'tce_eqt', 'tce_steff', 'tce_slogg',
       'tce_sradius', 'av_training_set'], inplace=True) 

print(data_new.columns)

Index(['tce_period', 'tce_time0bk', 'tce_duration', 'tce_depth',
       'tce_model_snr', 'tce_prad', 'tce_eqt', 'tce_steff', 'tce_slogg',
       'tce_sradius', 'av_training_set'],
      dtype='object')


In [10]:
a = data['av_training_set'].unique()
print(a)

['AFP' 'UNK' 'PC' 'NTP' nan]


In [11]:
#Train and test split
Y = data_new.iloc[:,-1]
X = data_new.iloc[:,:-1]

train_X, test_X, train_Y, test_Y = train_test_split(X,Y,test_size = 0.15)


## Random Forest Classifier

In [12]:
model = RFC(n_estimators=13000)
model.fit(train_X,train_Y)

RandomForestClassifier(n_estimators=13000)

In [13]:
model.score(train_X,train_Y)

1.0

In [14]:
model.score(test_X,test_Y)

0.7815384615384615

In [15]:
pred_Y = model.predict(test_X)

In [16]:
cm = confusion_matrix(test_Y, pred_Y)
cm

array([[891,  16,  74],
       [197, 106,   3],
       [ 64,   1, 273]])

In [17]:
classification_report(test_Y, pred_Y)

'              precision    recall  f1-score   support\n\n         AFP       0.77      0.91      0.84       981\n         NTP       0.86      0.35      0.49       306\n          PC       0.78      0.81      0.79       338\n\n    accuracy                           0.78      1625\n   macro avg       0.81      0.69      0.71      1625\nweighted avg       0.79      0.78      0.76      1625\n'

In [18]:
import pickle
filename = "rfc.pkl"
with open(filename,"wb") as f:
  pickle.dump(model,f)

## Neural Network

In [19]:
#Normalising train set
scaler = StandardScaler().fit(train_X)


In [20]:
neural = MLPClassifier(hidden_layer_sizes=(64,64,64))
neural.fit(train_X,train_Y)

MLPClassifier(hidden_layer_sizes=(64, 64, 64))

In [21]:
neural.score(train_X,train_Y)


0.7401976756815466

In [22]:
neural.score(test_X,test_Y)

0.7169230769230769

In [23]:
nue_pred = neural.predict(test_X)

In [24]:
cm_n = confusion_matrix(test_Y, nue_pred)
cm_n

array([[895,  13,  73],
       [228,  72,   6],
       [134,   6, 198]])

In [25]:
print(classification_report(test_Y, nue_pred))

              precision    recall  f1-score   support

         AFP       0.71      0.91      0.80       981
         NTP       0.79      0.24      0.36       306
          PC       0.71      0.59      0.64       338

    accuracy                           0.72      1625
   macro avg       0.74      0.58      0.60      1625
weighted avg       0.73      0.72      0.69      1625



In [26]:
import pickle
filename = "nnc.pkl"
with open(filename,"wb") as f:
  pickle.dump(model,f)