In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    precision_score,
    recall_score,
    roc_curve)
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

import numpy as np
import os
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
import tensorflow as tf
from tensorflow import keras

In [4]:
# Load the pre-split feature and label tables from CSV files one directory above the notebook.
# NOTE(review): the labels are read as DataFrames; later cells flatten them with .values.ravel().
X_train = pd.read_csv("../x_train_all.csv")
X_test = pd.read_csv("../x_test_all.csv")
y_train = pd.read_csv("../y_train_all.csv")
y_test = pd.read_csv("../y_test_all.csv")

# Feature selection

In [5]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
def selectFTest(top_features, range_arg, x_train_arg, y_train_arg, x_test_arg, y_test_arg):
    """Pick the best columns per class with a one-vs-rest ANOVA F-test.

    For every class label in ``range(range_arg)`` a ``SelectKBest`` selector
    (``f_classif`` scoring, ``k=top_features``) is fitted on ``x_train_arg``
    against a binary target that is 1 where ``y_train_arg`` equals that label.
    The chosen column positions of all classes are concatenated in class order
    without de-duplication, so a column chosen for several classes appears
    several times in the result.

    Parameters:
        top_features: number of columns kept per class.
        range_arg: number of class labels to iterate over (labels 0..range_arg-1).
        x_train_arg: training features (DataFrame); used to fit the selectors.
        y_train_arg: training labels (DataFrame); compared against each label.
        x_test_arg: test features (DataFrame); only sliced, never used for fitting.
        y_test_arg: accepted for signature symmetry; not used.

    Returns:
        (train_subset, test_subset): both frames restricted to the same
        positional columns, in the same order.
    """
    column_positions = []

    for class_label in range(range_arg):
        # Binary one-vs-rest target for the current class.
        is_current_class = (y_train_arg == class_label).astype(int).values.ravel()
        fitted_selector = SelectKBest(score_func=f_classif, k=top_features).fit(
            x_train_arg, is_current_class
        )
        column_positions += list(fitted_selector.get_support(indices=True))

    train_subset = x_train_arg.iloc[:, column_positions]
    test_subset = x_test_arg.iloc[:, column_positions]
    return train_subset, test_subset

# Build three feature subsets: top_features per class over 10 class labels (0-9).
# selectFTest does not de-duplicate, so a column chosen for several classes is repeated
# and the column counts noted on each line always hold.
X_train_50, X_test_50 = selectFTest(5, 10, X_train, y_train, X_test, y_test) # 50 features
X_train_100, X_test_100 = selectFTest(10, 10, X_train, y_train, X_test, y_test) # 100 features
X_train_200, X_test_200 = selectFTest(20, 10, X_train, y_train, X_test, y_test) # 200 features

In [6]:
# Display the 50-column training subset to inspect which pixel columns were selected.
X_train_50

Unnamed: 0,2213,2260,2261,2262,2263,1072,1073,1074,1120,1121,...,1666,1714,1715,1743,1761,1086,1134,1215,1216,1263
0,73.0,73.0,72.0,71.0,68.0,184.0,174.0,163.0,207.0,197.0,...,102.0,99.0,95.0,118.0,108.0,123.0,127.0,220.0,231.0,226.0
1,89.0,91.0,85.0,76.0,70.0,169.0,192.0,195.0,183.0,203.0,...,99.0,118.0,111.0,98.0,133.0,185.0,193.0,124.0,199.0,123.0
2,92.0,90.0,81.0,71.0,66.0,196.0,191.0,179.0,211.0,207.0,...,101.0,122.0,114.0,112.0,136.0,161.0,167.0,196.0,225.0,208.0
3,100.0,86.0,92.0,81.0,72.0,213.0,204.0,207.0,227.0,218.0,...,113.0,124.0,116.0,105.0,134.0,177.0,179.0,206.0,230.0,191.0
4,138.0,123.0,126.0,127.0,128.0,163.0,162.0,150.0,178.0,170.0,...,104.0,87.0,81.0,92.0,102.0,139.0,186.0,199.0,202.0,209.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9685,42.0,40.0,41.0,42.0,42.0,18.0,18.0,47.0,18.0,20.0,...,99.0,104.0,80.0,94.0,106.0,18.0,21.0,19.0,22.0,19.0
9686,36.0,36.0,36.0,35.0,32.0,17.0,18.0,78.0,18.0,33.0,...,98.0,101.0,82.0,82.0,104.0,18.0,24.0,19.0,30.0,20.0
9687,30.0,33.0,29.0,31.0,30.0,17.0,19.0,83.0,17.0,45.0,...,98.0,99.0,95.0,84.0,102.0,19.0,25.0,18.0,36.0,18.0
9688,36.0,39.0,36.0,37.0,32.0,16.0,16.0,77.0,16.0,31.0,...,93.0,92.0,88.0,93.0,93.0,19.0,24.0,16.0,25.0,17.0


In [7]:
# Display the 100-column training subset.
X_train_100

Unnamed: 0,2165,2212,2213,2214,2215,2260,2261,2262,2263,2264,...,1086,1134,1167,1168,1215,1216,1263,1561,1562,1610
0,77.0,77.0,73.0,72.0,72.0,73.0,72.0,71.0,68.0,75.0,...,123.0,127.0,211.0,224.0,220.0,231.0,226.0,225.0,225.0,236.0
1,98.0,94.0,89.0,85.0,81.0,91.0,85.0,76.0,70.0,71.0,...,185.0,193.0,122.0,198.0,124.0,199.0,123.0,231.0,232.0,237.0
2,108.0,99.0,92.0,86.0,81.0,90.0,81.0,71.0,66.0,68.0,...,161.0,167.0,189.0,220.0,196.0,225.0,208.0,234.0,235.0,233.0
3,111.0,96.0,100.0,87.0,80.0,86.0,92.0,81.0,72.0,63.0,...,177.0,179.0,202.0,233.0,206.0,230.0,191.0,237.0,237.0,217.0
4,144.0,136.0,138.0,131.0,121.0,123.0,126.0,127.0,128.0,129.0,...,139.0,186.0,186.0,180.0,199.0,202.0,209.0,152.0,169.0,214.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9685,41.0,40.0,42.0,43.0,45.0,40.0,41.0,42.0,42.0,40.0,...,18.0,21.0,20.0,19.0,19.0,22.0,19.0,50.0,44.0,36.0
9686,34.0,38.0,36.0,36.0,33.0,36.0,36.0,35.0,32.0,30.0,...,18.0,24.0,18.0,19.0,19.0,30.0,20.0,41.0,40.0,37.0
9687,32.0,35.0,30.0,32.0,29.0,33.0,29.0,31.0,30.0,27.0,...,19.0,25.0,17.0,18.0,18.0,36.0,18.0,48.0,40.0,33.0
9688,45.0,46.0,36.0,40.0,34.0,39.0,36.0,37.0,32.0,32.0,...,19.0,24.0,16.0,16.0,16.0,25.0,17.0,36.0,31.0,23.0


In [8]:
# Display the 200-column training subset.
X_train_200

Unnamed: 0,1979,2025,2026,2027,2028,2164,2165,2166,2211,2212,...,1214,1215,1216,1262,1263,1264,1311,1561,1562,1610
0,130.0,155.0,150.0,146.0,143.0,85.0,77.0,76.0,87.0,77.0,...,168.0,220.0,231.0,176.0,226.0,234.0,213.0,225.0,225.0,236.0
1,139.0,117.0,139.0,144.0,144.0,100.0,98.0,99.0,95.0,94.0,...,95.0,124.0,199.0,100.0,123.0,194.0,113.0,231.0,232.0,237.0
2,132.0,127.0,147.0,138.0,131.0,110.0,108.0,105.0,100.0,99.0,...,147.0,196.0,225.0,153.0,208.0,227.0,180.0,234.0,235.0,233.0
3,146.0,121.0,140.0,144.0,143.0,108.0,111.0,97.0,92.0,96.0,...,167.0,206.0,230.0,142.0,191.0,225.0,170.0,237.0,237.0,217.0
4,141.0,89.0,125.0,142.0,137.0,142.0,144.0,132.0,136.0,136.0,...,162.0,199.0,202.0,158.0,209.0,224.0,216.0,152.0,169.0,214.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9685,18.0,19.0,20.0,20.0,21.0,42.0,41.0,44.0,38.0,40.0,...,21.0,19.0,22.0,20.0,19.0,38.0,22.0,50.0,44.0,36.0
9686,18.0,18.0,18.0,18.0,18.0,39.0,34.0,36.0,35.0,38.0,...,19.0,19.0,30.0,19.0,20.0,60.0,26.0,41.0,40.0,37.0
9687,17.0,18.0,18.0,18.0,19.0,37.0,32.0,30.0,32.0,35.0,...,17.0,18.0,36.0,17.0,18.0,80.0,36.0,48.0,40.0,33.0
9688,16.0,17.0,17.0,17.0,17.0,39.0,45.0,38.0,49.0,46.0,...,16.0,16.0,25.0,16.0,17.0,58.0,21.0,36.0,31.0,23.0


# Evaluating using a linear classifier, i.e. perceptron

In [9]:
# Shared Perceptron instance with default hyper-parameters; the two helper functions
# below read this notebook-level name directly.
perceptron = Perceptron()

In [10]:
def print_perceptron_cv_accuracy(X_train, X_test, y_train, y_test):
    """Print the mean 10-fold cross-validation accuracy of the shared perceptron.

    Only ``X_train`` and ``y_train`` take part in the cross-validation;
    ``X_test`` and ``y_test`` are accepted so the signature matches the
    non-CV helper but are not used. The notebook-level ``perceptron``
    estimator is the model being scored.
    """
    flat_labels = y_train.values.ravel()
    cv_mean_accuracy = cross_val_score(perceptron, X_train, flat_labels, cv=10).mean()
    print(f'Mean accuracy with cross-validation for a perceptron with {len(X_train.columns)} selected features is: {cv_mean_accuracy:.4f}')
    
# Cross-validated perceptron accuracy on each of the three feature subsets.
print_perceptron_cv_accuracy(X_train_50, X_test_50, y_train, y_test)
print_perceptron_cv_accuracy(X_train_100, X_test_100, y_train, y_test)
print_perceptron_cv_accuracy(X_train_200, X_test_200, y_train, y_test)

Mean accuracy with cross-validation for a perceptron with 50 selected features is: 0.6954
Mean accuracy with cross-validation for a perceptron with 100 selected features is: 0.7535
Mean accuracy with cross-validation for a perceptron with 200 selected features is: 0.7674


In [11]:
def print_perceptron_noncv_accuracy(X_train, X_test, y_train, y_test):
    """Fit the shared perceptron on the training split and print its test accuracy.

    The notebook-level ``perceptron`` is refitted in place on ``X_train`` /
    ``y_train`` and then scored with ``accuracy_score`` against ``y_test``.
    """
    fitted = perceptron.fit(X_train, y_train.values.ravel())
    accuracy = accuracy_score(y_test, fitted.predict(X_test))

    print(f'Accuracy without cross-validation for a perceptron with {len(X_train.columns)} selected features is: {accuracy:.4f}')
    
# Single train/test-split perceptron accuracy on each of the three feature subsets.
print_perceptron_noncv_accuracy(X_train_50, X_test_50, y_train, y_test)
print_perceptron_noncv_accuracy(X_train_100, X_test_100, y_train, y_test)
print_perceptron_noncv_accuracy(X_train_200, X_test_200, y_train, y_test)

Accuracy without cross-validation for a perceptron with 50 selected features is: 0.6647
Accuracy without cross-validation for a perceptron with 100 selected features is: 0.7848
Accuracy without cross-validation for a perceptron with 200 selected features is: 0.7828


At first glance, the performance of the perceptron under 10-fold cross-validation and on the separate training/testing split is quite similar. However, an accuracy of roughly 66-78% provides little conclusive information on whether the data is truly linearly separable. We can still guess that the data is most likely not linearly separable, as the results appear to be affected by some additional noise.

# Evaluating using a multilayer perceptron from sklearn

We use the MLPClassifier model provided by sklearn as it seems to suit most of our needs. To establish a baseline for comparison, we first run a mostly default model that uses the "adam" solver, a single hidden layer of 5 neurons (`hidden_layer_sizes=(5,)`), and a maximum of 100 training iterations. For every test from this point on, we pass the three datasets containing 50, 100 and 200 selected features respectively to the models, to allow for a larger frame of reference when comparing model results.

In [11]:
def print_mlp_accuracy(model, X_train, X_test, y_train, y_test, iterations = 3):
    """Refit ``model`` several times and print its average test accuracy.

    On each of the ``iterations`` runs the model is fitted on ``X_train`` /
    ``y_train`` and scored with ``accuracy_score`` on ``X_test`` / ``y_test``;
    the printed figure is the arithmetic mean of those scores.
    """
    flat_labels = y_train.values.ravel()
    run_scores = []

    for _ in range(iterations):
        model.fit(X_train, flat_labels)
        run_scores.append(accuracy_score(y_test, model.predict(X_test)))

    accuracy = sum(run_scores) / iterations
    print(f'Average accuracy using an MLP over {iterations} runs with {len(X_train.columns)} selected features is: {accuracy:.4f}')

In [26]:
# Baseline: one hidden layer of 5 units, training capped at max_iter=100.
# No random_state is passed, so accuracies can differ between runs.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(5,), max_iter=100)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)



Average accuracy using an MLP over 3 runs with 50 selected features is: 0.4124




Average accuracy using an MLP over 3 runs with 100 selected features is: 0.2472




Average accuracy using an MLP over 3 runs with 200 selected features is: 0.2447


On noticing the "hasn't converged yet" warning, we now try with a larger amount of iterations

In [27]:
# Same 5-unit hidden layer as the baseline, with the iteration cap raised to 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(5,), max_iter=500)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.5152
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.2947
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.2434


Keeping the hidden layer size the same and increasing the number of iterations allowed, we see that the 50-selected-features dataset reaches a higher accuracy (0.41 to 0.52), the 100-selected-features dataset improves only slightly (0.25 to 0.29), and the 200-selected-features dataset stays at a similar value (0.24).

# Checking the effects of increasing the number of iterations allowed on an MLP classifier

In [28]:
# Iteration sweep, step 1: one hidden layer of 10 units, iteration cap 50.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=50)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)



Average accuracy using an MLP over 3 runs with 50 selected features is: 0.3953




Average accuracy using an MLP over 3 runs with 100 selected features is: 0.3523




Average accuracy using an MLP over 3 runs with 200 selected features is: 0.3275




In [29]:
# Iteration sweep, step 2: same 10-unit hidden layer, iteration cap 100.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=100) 

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)



Average accuracy using an MLP over 3 runs with 50 selected features is: 0.6475




Average accuracy using an MLP over 3 runs with 100 selected features is: 0.3904




Average accuracy using an MLP over 3 runs with 200 selected features is: 0.3454




In [30]:
# Iteration sweep, step 3: same 10-unit hidden layer, iteration cap 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=500) 

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.7648
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.7886
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.5858


In [31]:
# Iteration sweep, step 4: same 10-unit hidden layer, iteration cap 1000.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000) 

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.7840
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.6248
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.5360


In [32]:
# Iteration sweep, step 5: same 10-unit hidden layer, iteration cap 2000.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=2000) 

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.7781
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8147
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.6661


For max_iter values of 50 and 100, we see that the MLP does not converge and also notice low accuracy, except for the 50 selected features with 100 iterations. We also see that beyond roughly 500 maximum iterations there are diminishing returns in accuracy on average. However, we cannot conclude that 500 maximum iterations is the optimal number for this dataset, as the optimization may behave differently with other hidden layer sizes.

# Checking the effects of increasing the hidden layer size of an MLP

In [33]:
# Layer-size sweep, step 1: one hidden layer of 20 units, iteration cap fixed at 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(20,), max_iter=500)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.8204
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8447
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.6697


In [34]:
# Layer-size sweep, step 2: one hidden layer of 50 units, iteration cap fixed at 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.8440
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8961
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.8873


In [35]:
# Layer-size sweep, step 3: one hidden layer of 70 units, iteration cap fixed at 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(70,), max_iter=500)

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.8467
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8764
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.8931


We notice that performance increases up to a hidden layer size of about 50, after which the gain in performance seems to be minimal.

# Testing different activation functions

In [36]:
# Activation test: logistic (sigmoid) hidden units, one hidden layer of 15 units, iteration cap 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(15,), max_iter=500, activation='logistic')

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.8247
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8434
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.8300


In [37]:
# Activation test: logistic (sigmoid) hidden units, one hidden layer of 30 units, iteration cap 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(30,), max_iter=500, activation='logistic')

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.8436
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8585
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.8383


In [38]:
# Activation test: tanh hidden units, one hidden layer of 15 units, iteration cap 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(15,), max_iter=500, activation='tanh')

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.7466
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.7817
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.7254


In [39]:
# Activation test: tanh hidden units, one hidden layer of 30 units, iteration cap 500.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(30,), max_iter=500, activation='tanh')

print_mlp_accuracy(mlp_classifier, X_train_50, X_test_50, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_100, X_test_100, y_train, y_test)
print_mlp_accuracy(mlp_classifier, X_train_200, X_test_200, y_train, y_test)

Average accuracy using an MLP over 3 runs with 50 selected features is: 0.7970
Average accuracy using an MLP over 3 runs with 100 selected features is: 0.8145
Average accuracy using an MLP over 3 runs with 200 selected features is: 0.8138


# Testing using a keras MLP

Now that we are testing with the keras model, we have finer control over the number of neurons per layer as well as the activation functions used. Again, as a baseline we use 10 neurons per hidden layer and train for 10 epochs.

In [83]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test, epoch):
    """Compile, train and score a Keras-style classifier, then print its test accuracy.

    Parameters:
        model: object exposing ``compile``, ``fit`` and ``evaluate`` (in this
            notebook, a ``keras.models.Sequential`` network). It is compiled
            and trained in place.
        X_train, y_train: training features and labels passed to ``model.fit``.
            The ``sparse_categorical_crossentropy`` loss expects integer class ids.
        X_test, y_test: held-out features and labels passed to ``model.evaluate``.
        epoch: number of training epochs.

    Returns:
        None. The accuracy reported by ``model.evaluate`` is printed.
    """
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epoch)

    # evaluate() already runs the model over the test set, so the former
    # model.predict(X_test) call (whose result was never used) has been dropped.
    # The leading underscore marks the loss as intentionally unused.
    _loss, accuracy = model.evaluate(X_test, y_test)

    print(f'Accuracy of MLP trained over {epoch} epochs is: {accuracy:.4f}')

In [84]:
# Baseline Keras MLP: 50 inputs -> Dense(10, relu) -> Dense(10, relu) -> 10-way softmax; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.2421


Here, we see a similar result to that of the sklearn MLP model, where a low number of neurons per layer resulted in accuracy values of 0.30-0.50. We will now test whether the accuracy does indeed increase with a larger number of neurons per layer.

In [65]:
# Wider network: two ReLU hidden layers of 50 units each, 10-way softmax output; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.8217


In [63]:
# Deeper, tapering network: ReLU hidden layers of 100, 80, 70 and 50 units; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(80, activation="relu"),
    keras.layers.Dense(70, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.8343


In [64]:
# Deeper, uniform network: four ReLU hidden layers of 100 units each; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.8495


While increasing only the layer sizes and the number of layers, we notice a general improvement in the accuracy of the model, suggesting that the available data is indeed not linearly separable, as multiple hidden layers are required to classify the data more accurately.

# Experimenting with activation functions 

We now shall try experimenting with different combinations of layer numbers, neuron numbers as well as activation functions while keeping the other parameters the same.

In [66]:
# tanh variant of the two-layer, 50-unit network; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(50, activation="tanh"),
    keras.layers.Dense(50, activation="tanh"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.7188


In [67]:
# tanh network with hidden layers of 100, 100 and 50 units; 10 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(50, activation="tanh"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of MLP trained over 10 epochs is: 0.7061


In [68]:
# Same tanh 100/100/50 architecture as the previous cell, trained for 50 epochs instead of 10.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(50, activation="tanh"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy of MLP trained over 50 epochs is: 0.7579


The tanh activation function has difficulty reaching the same level of final accuracy and loss as the relu activation function. Having noticed this with a small number of epochs, we tried again with a larger number of epochs. In order to check whether this is the result of a vanishing gradient, we try the same architecture with relu while keeping the other parameters the same.

In [69]:
# ReLU control for the tanh experiment: identical 100/100/50 layout and 50 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy of MLP trained over 50 epochs is: 0.8832


The results of this show that no vanishing gradient problem is present, as there is a similar drop in improvement when using only the relu activation function. We also try this with the sigmoid function.

In [70]:
# Sigmoid variant of the same 100/100/50 layout; 50 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="sigmoid"),
    keras.layers.Dense(100, activation="sigmoid"),
    keras.layers.Dense(50, activation="sigmoid"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy of MLP trained over 50 epochs is: 0.7854


Although the sigmoid function seems to be performing slightly better than the tanh function, it still seems to suffer from a similar problem where the results seem to not improve as much during the later epochs. 

Trying a variety of activation functions on the same network:

In [71]:
# Mixed activations, widening layers: tanh(100) -> relu(200) -> sigmoid(500); 30 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(200, activation="relu"),
    keras.layers.Dense(500, activation="sigmoid"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy of MLP trained over 30 epochs is: 0.7832


In [72]:
# Mixed activations: sigmoid(100) -> relu(50) -> relu(50) -> tanh(100); 30 epochs.
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(50,)),
    keras.layers.Dense(100, activation="sigmoid"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
    keras.layers.Dense(100, activation="tanh"),
    keras.layers.Dense(10, activation="softmax")
])

train_and_evaluate(model, X_train_50, X_test_50, y_train, y_test, 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy of MLP trained over 30 epochs is: 0.8165
