# Artifical Neural Network

Bringing everything from previous tutorials together and some new packages, in this notebook I built my first neural network!

Author: Ridha Fathima Mohideen Malik

In [100]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
import keras.optimizers

import warnings
warnings.filterwarnings('ignore', message='FutureWarning')

In [63]:
# importing data

lc = pd.read_csv('D:/comp-astro/comp-astro/exoplanet-data/star_exo_lc.csv', sep=',')
lc['LABEL_BIN'] = np.where(lc['LABEL']==2, 1, 0) ## replacing labels for binary value: 1 - has exoplanet, 0 - no exoplanet

targets = lc['LABEL_BIN']
lc_ft = lc.drop(labels=['LABEL', 'LABEL_BIN'], axis=1)
targets.value_counts()

0    5615
1      42
Name: LABEL_BIN, dtype: int64

In [65]:
# dimensionality reduction based on pca-decision/decision-trees.ipynb

kpca= KernelPCA(n_components=20, kernel = 'cosine', random_state=42, n_jobs=8)

lc_kpca = kpca.fit_transform(lc_ft)
lc_kpca = pd.DataFrame(lc_kpca)

In [66]:
# scaling

scaler = StandardScaler()
lc_scaled = scaler.fit_transform(lc_kpca)

In [70]:
# Chauhan's NN

def build_classifier():
    classifier = Sequential() # initialize neural network
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train_res.shape[1])) ## input layer
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) ## output layer
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

classifier = KerasClassifier(build_fn = build_classifier, epochs = 40)

# k-fold cross validation for imbalanced dataset

cv = KFold(shuffle = True, n_splits = 5, random_state=42)

for train, test in cv.split(lc_kpca, targets):
    print('train -  {}   |   test -  {}'.format(
    np.bincount(targets.loc[train]), np.bincount(targets.loc[test])))

accuracy = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'accuracy', return_train_score = True, n_jobs = 8)
acc_df = pd.DataFrame(accuracy)
            
# checking recall

recall = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'recall', return_train_score = True, n_jobs = 8)
rec_df = pd.DataFrame(recall)

print("Accuracy:\n", acc_df)
print("Recall:\n", rec_df)

train -  [4492   33]   |   test -  [1123    9]
train -  [4491   34]   |   test -  [1124    8]
train -  [4489   37]   |   test -  [1126    5]
train -  [4497   29]   |   test -  [1118   13]
train -  [4491   35]   |   test -  [1124    7]
Accuracy:
    fit_time  score_time  test_score  train_score
0  6.755582    0.145449    0.992049     0.992707
1  6.802542    0.132873    0.992933     0.992486
2  6.841251    0.136831    0.995579     0.991825
3  7.017888    0.135614    0.988506     0.993593
4  6.783923    0.136002    0.993811     0.992267
Recall:
    fit_time  score_time  test_score  train_score
0  6.603257    0.145348         0.0          0.0
1  6.573338    0.150827         0.0          0.0
2  6.717478    0.139906         0.0          0.0
3  6.736955    0.143421         0.0          0.0
4  6.527054    0.144220         0.0          0.0


In [89]:
# regularised NN

def build_classifier():
    classifier = Sequential() # initialize neural network
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train_res.shape[1])) ## input layer
    classifier.add(Dropout(0.2)) ## dropout fraction - prevents overfitting
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dropout(0.2)) ## dropout fraction
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) ## output layer
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['Recall'])
    return classifier

classifier = KerasClassifier(build_fn = build_classifier, epochs = 40) 

# k-fold cross validation for imbalanced dataset

cv = KFold(shuffle = True, n_splits = 5, random_state=42)

for train, test in cv.split(lc_kpca, targets):
    print('train -  {}   |   test -  {}'.format(
    np.bincount(targets.loc[train]), np.bincount(targets.loc[test])))

accuracy = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'accuracy', return_train_score = True, n_jobs = 8)
acc_df = pd.DataFrame(accuracy)
            
# checking recall

recall = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'recall', return_train_score = True, n_jobs = 8)
rec_df = pd.DataFrame(recall)

print("Accuracy:\n", acc_df)
print("Recall:\n", rec_df)

train -  [4492   33]   |   test -  [1123    9]
train -  [4491   34]   |   test -  [1124    8]
train -  [4489   37]   |   test -  [1126    5]
train -  [4497   29]   |   test -  [1118   13]
train -  [4491   35]   |   test -  [1124    7]
Accuracy:
    fit_time  score_time  test_score  train_score
0  7.007162    0.141775    0.992049     0.992707
1  7.063291    0.139215    0.992933     0.992486
2  7.016701    0.143136    0.995579     0.991825
3  6.981329    0.139193    0.988506     0.993593
4  7.102070    0.136056    0.993811     0.992267
Recall:
    fit_time  score_time  test_score  train_score
0  6.765758    0.141199         0.0          0.0
1  6.986077    0.153202         0.0          0.0
2  6.785583    0.137176         0.0          0.0
3  6.726448    0.419088         0.0          0.0
4  6.767375    0.139506         0.0          0.0


In [90]:
# regularised NN, taxing misclassifications with Hinge

def build_classifier():
    classifier = Sequential() # initialize neural network
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train_res.shape[1])) ## input layer
    classifier.add(Dropout(0.2)) ## dropout fraction - prevents overfitting
    classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dropout(0.2)) ## dropout fraction
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) ## output layer
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(optimizer = optimizer, loss = 'Hinge', metrics = ['Recall'])
    return classifier

classifier = KerasClassifier(build_fn = build_classifier, epochs = 40) 

# k-fold cross validation for imbalanced dataset

cv = KFold(shuffle = True, n_splits = 5, random_state=42)

for train, test in cv.split(lc_kpca, targets):
    print('train -  {}   |   test -  {}'.format(
    np.bincount(targets.loc[train]), np.bincount(targets.loc[test])))

accuracy = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'accuracy', return_train_score = True, n_jobs = 8)
acc_df = pd.DataFrame(accuracy)
            
# checking recall

recall = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'recall', return_train_score = True, n_jobs = 8)
rec_df = pd.DataFrame(recall)

print("Accuracy:\n", acc_df)
print("Recall:\n", rec_df)

train -  [4492   33]   |   test -  [1123    9]
train -  [4491   34]   |   test -  [1124    8]
train -  [4489   37]   |   test -  [1126    5]
train -  [4497   29]   |   test -  [1118   13]
train -  [4491   35]   |   test -  [1124    7]
Accuracy:
    fit_time  score_time  test_score  train_score
0  7.468027    0.132435    0.992049     0.992707
1  7.468249    0.141819    0.992933     0.992486
2  7.455082    0.146210    0.995579     0.991825
3  7.541857    0.131155    0.988506     0.993593
4  7.471662    0.127000    0.993811     0.992267
Recall:
    fit_time  score_time  test_score  train_score
0  7.293148    0.138387         0.0          0.0
1  7.240536    0.151146         0.0          0.0
2  7.275402    0.125198         0.0          0.0
3  7.276563    0.134093         0.0          0.0
4  7.296754    0.140500         0.0          0.0


In [111]:
# regularised NN and softmax as output layer

from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced',
                                                 classes = np.unique(targets),
                                                 y = targets)


def build_classifier():
    classifier = Sequential() # initialize neural network
    classifier.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train_res.shape[1])) ## input layer
    classifier.add(Dropout(0.2)) ## dropout fraction - prevents overfitting
    classifier.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dropout(0.2)) ## dropout fraction
    classifier.add(Dense(units = 2, kernel_initializer = 'uniform', activation = 'softmax')) ## output layer
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics = ['Recall'])
    return classifier

classifier = KerasClassifier(build_fn = build_classifier, epochs = 40) 

# k-fold cross validation for imbalanced dataset

cv = KFold(shuffle = True, n_splits = 5, random_state=42)

for train, test in cv.split(lc_kpca, targets):
    print('train -  {}   |   test -  {}'.format(
    np.bincount(targets.loc[train]), np.bincount(targets.loc[test])))

accuracy = cross_validate(estimator = classifier, X = lc_kpca, y = targets,
            cv = cv, scoring = 'accuracy', return_train_score = True, n_jobs = 8)
acc_df = pd.DataFrame(accuracy)
            
# checking recall

recall = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'recall', return_train_score = True, n_jobs = 8)
rec_df = pd.DataFrame(recall)

print("Accuracy:\n", acc_df)
print("Recall:\n", rec_df)

train -  [4492   33]   |   test -  [1123    9]
train -  [4491   34]   |   test -  [1124    8]
train -  [4489   37]   |   test -  [1126    5]
train -  [4497   29]   |   test -  [1118   13]
train -  [4491   35]   |   test -  [1124    7]
Accuracy:
    fit_time  score_time  test_score  train_score
0  7.702149    0.181643    0.992049     0.992707
1  7.700049    0.184134    0.992933     0.992486
2  7.755182    0.178935    0.995579     0.991825
3  7.700748    0.196398    0.988506     0.993593
4  7.704636    0.183822    0.993811     0.992267
Recall:
    fit_time  score_time  test_score  train_score
0  7.484771    0.231644         0.0          0.0
1  7.634970    0.224294         0.0          0.0
2  7.466532    0.217277         0.0          0.0
3  7.318944    0.248976         0.0          0.0
4  7.364322    0.226299         0.0          0.0


Now the accuracy is down. Softmax is preferred for multi-class classifications.

In [115]:
# regularised NN adding more layers

def build_classifier():
    classifier = Sequential() # initialize neural network
    classifier.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train_res.shape[1])) ## input layer
    classifier.add(BatchNormalization(synchronized=True))
    classifier.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dropout(0.2)) ## dropout fraction
    classifier.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu')) ## hidden layer
    classifier.add(Dropout(0.2)) ## dropout fraction
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) ## output layer
    optimizer = keras.optimizers.Adam(learning_rate=0.005)
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['Recall'])
    return classifier

classifier = KerasClassifier(build_fn = build_classifier, epochs = 100) 

# k-fold cross validation for imbalanced dataset

cv = KFold(shuffle = True, n_splits = 5, random_state=42)

for train, test in cv.split(lc_kpca, targets):
    print('train -  {}   |   test -  {}'.format(
    np.bincount(targets.loc[train]), np.bincount(targets.loc[test])))

accuracy = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'accuracy', return_train_score = True, n_jobs = 8)
acc_df = pd.DataFrame(accuracy)
            
# checking recall

recall = cross_validate(estimator = classifier, X = lc_kpca, y = targets, 
            cv = cv, scoring = 'recall', return_train_score = True, n_jobs = 8)
rec_df = pd.DataFrame(recall)

print("Accuracy:\n", acc_df)
print("Recall:\n", rec_df)

train -  [4492   33]   |   test -  [1123    9]
train -  [4491   34]   |   test -  [1124    8]
train -  [4489   37]   |   test -  [1126    5]
train -  [4497   29]   |   test -  [1118   13]
train -  [4491   35]   |   test -  [1124    7]
Accuracy:
     fit_time  score_time  test_score  train_score
0  22.088658    0.278243    0.992049     0.992707
1  22.848060    0.177087    0.992933     0.992486
2  23.225072    0.139158    0.995579     0.991825
3  22.610922    0.177539    0.988506     0.993593
4  22.519054    0.446784    0.993811     0.992267
Recall:
     fit_time  score_time  test_score  train_score
0  22.098625    0.180219         0.0          0.0
1  22.620404    0.364664         0.0          0.0
2  22.073492    0.181749         0.0          0.0
3  22.080716    0.190374         0.0          0.0
4  21.900666    0.189934         0.0          0.0


<img src="https://pbs.twimg.com/media/FFZpaMwXsAEzE5f?format=jpg&name=small">