In [1]:
#Importing necessary Libraries

import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
import os

## Data reading and preprocessing

In [3]:
data = pd.read_csv("data_full.csv")

#Dropping unnecessary columns
delete = ["kepid","tce_plnt_num","tce_rogue_flag","tce_insol","tce_impact","tce_insol_err","tce_period_err","tce_time0bk_err","tce_impact_err","tce_duration_err","tce_depth_err","tce_prad_err","tce_eqt_err","tce_eqt_err","tce_steff_err","tce_slogg_err","tce_sradius_err"]
data_new = data.drop(delete,axis=1)

#Deleting rows having unknown labels
data_new = data_new[data_new['av_training_set'] != 'UNK']

#Deleting rows having no values
data_new.dropna(subset=['tce_period', 'tce_time0bk', 'tce_duration', 'tce_depth',
       'tce_model_snr', 'tce_prad', 'tce_eqt', 'tce_steff', 'tce_slogg',
       'tce_sradius', 'av_training_set'], inplace=True) 

print(data_new.columns)

Index(['tce_period', 'tce_time0bk', 'tce_duration', 'tce_depth',
       'tce_model_snr', 'tce_prad', 'tce_eqt', 'tce_steff', 'tce_slogg',
       'tce_sradius', 'av_training_set'],
      dtype='object')


In [4]:
a = data['av_training_set'].unique()
print(a)

['AFP' 'UNK' 'PC' 'NTP']


In [5]:
#Train and test split
Y = data_new.iloc[:,-1]
X = data_new.iloc[:,:-1]

train_X, test_X, train_Y, test_Y = train_test_split(X,Y,test_size = 0.15)


## Random Forest Classifier

In [6]:
model = RFC(n_estimators=13000)
model.fit(train_X,train_Y)

In [7]:
model.score(train_X,train_Y)

1.0

In [8]:
model.score(test_X,test_Y)

0.8026260059296908

In [9]:
pred_Y = model.predict(test_X)

In [10]:
cm = confusion_matrix(test_Y, pred_Y)
cm

array([[1304,   17,   96],
       [ 260,  133,    3],
       [  89,    1,  458]])

In [11]:
classification_report(test_Y, pred_Y)

'              precision    recall  f1-score   support\n\n         AFP       0.79      0.92      0.85      1417\n         NTP       0.88      0.34      0.49       396\n          PC       0.82      0.84      0.83       548\n\n    accuracy                           0.80      2361\n   macro avg       0.83      0.70      0.72      2361\nweighted avg       0.81      0.80      0.78      2361\n'

In [12]:
import pickle
filename = "rfc.pkl"
with open(filename,"wb") as f:
  pickle.dump(model,f)

## Neural Network

In [None]:
#Normalising train set
scaler = StandardScaler().fit(train_X)


In [None]:
neural = MLPClassifier(hidden_layer_sizes=(64,64,64))
neural.fit(train_X,train_Y)

MLPClassifier(hidden_layer_sizes=(64, 64, 64))

In [None]:
neural.score(train_X,train_Y)


0.7716058612440191

In [None]:
neural.score(test_X,test_Y)

0.7657772130453198

In [None]:
nue_pred = neural.predict(test_X)

In [None]:
cm_n = confusion_matrix(test_Y, nue_pred)
cm_n

array([[1262,   12,  176],
       [ 292,   88,   13],
       [  58,    2,  458]])

In [None]:
print(classification_report(test_Y, nue_pred))

              precision    recall  f1-score   support

         AFP       0.78      0.87      0.82      1450
         NTP       0.86      0.22      0.36       393
          PC       0.71      0.88      0.79       518

    accuracy                           0.77      2361
   macro avg       0.78      0.66      0.66      2361
weighted avg       0.78      0.77      0.74      2361



In [None]:
import pickle
filename = "nnc.pkl"
with open(filename,"wb") as f:
  pickle.dump(model,f)

array([[1247,    6,  171],
       [ 298,   57,   11],
       [  94,    1,  476]])