### Exercise 3.1
#### Python Challenge  

- Try training the FNN model with two selected classification datasets from the UCI machine learning repository. Evaluate the effect of the number of hidden neurons and of the learning rate [0.1, 0.2, ..., 1]. Run 10 experiments, using a different random state for the data split (60/40 train/test) or different initial weights in each run; report the mean and standard deviation for each experiment and plot your results. 

Optional (this will require more time and is not part of the course assessment)

- Try to add another hidden layer and update the forward and backward pass. 

- Can you generalise to any user-selected number of hidden layers? Discuss how you will do this and implement it if possible. 

Resources

Example datasets  http://archive.ics.uci.edu/ml/datasets

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import random
from numpy import *
from sklearn import datasets 
from sklearn import metrics
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split 
from sklearn.neural_network import MLPClassifier


def read_data(run_num):
    """Load the Pima Indians diabetes CSV and split it 60/40 into train/test.

    Source of the data:
    https://www.kaggle.com/kumargh/pimaindiansdiabetescsv

    Args:
        run_num: Seed handed to ``train_test_split`` as ``random_state``;
            the same value always yields the same split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    table = np.genfromtxt("datasets/pima.csv", delimiter=",")

    # Columns 0-7 are the input features; the last column is the class target.
    # The features are used as loaded (no scaling). If scaling is wanted,
    # sklearn.preprocessing.Normalizer could be applied to the features here:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
    features = table[:, :8]
    target = table[:, -1]

    # test_size=0.4 gives the 60/40 train/test split.
    split = train_test_split(features, target, test_size=0.4, random_state=run_num)
    return tuple(split)
 
    
def scipy_nn(X_train, X_test, y_train, y_test, type_model, hidden, learn_rate, run_num):
    """Train a scikit-learn ``MLPClassifier`` and return its test accuracy.

    Reference:
    https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        type_model: Which network/optimiser to build:
            0 - SGD, one hidden layer;
            1 - Adam, one hidden layer;
            2 - SGD with a constant learning rate, two hidden layers.
        hidden: Number of neurons in each hidden layer.
        learn_rate: Passed as ``learning_rate_init`` (the initial step size
            for weight updates; only used by the 'sgd' and 'adam' solvers).
        run_num: Passed as ``random_state``; it seeds weight/bias
            initialisation and batch sampling, so an int gives reproducible
            results across calls.

    Returns:
        Classification accuracy on the test set.

    Raises:
        ValueError: If ``type_model`` is not 0, 1 or 2.
    """
    # Note: Adam adapts its step sizes internally, so it does not need
    # momentum or a constant learning-rate schedule.
    # Use hidden_layer_sizes=(hidden, hidden, hidden) for three hidden layers.
    model_options = {
        0: {'hidden_layer_sizes': (hidden,), 'solver': 'sgd'},
        1: {'hidden_layer_sizes': (hidden,), 'solver': 'adam'},
        2: {'hidden_layer_sizes': (hidden, hidden), 'solver': 'sgd',
            'learning_rate': 'constant'},
    }

    # Fail early with a clear message; previously an unknown model type only
    # printed 'no model' and then crashed on the undefined network at fit().
    if type_model not in model_options:
        raise ValueError(
            'no model for type_model=%r; expected 0 (SGD), 1 (Adam) or '
            '2 (SGD, two hidden layers)' % (type_model,))

    nn = MLPClassifier(random_state=run_num, max_iter=1000,
                       learning_rate_init=learn_rate,
                       **model_options[type_model])

    # Train the model using the training set
    nn.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred_test = nn.predict(X_test)

    # sklearn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but keeping the documented order avoids transposed results
    # if a confusion matrix or AUC is added here later.
    acc_test = accuracy_score(y_test, y_pred_test)
    return acc_test


def main():
    """Compare SGD, Adam and two-hidden-layer SGD over several hidden sizes.

    For each hidden-layer size (6, 8, 10) the three models are trained
    ``max_expruns`` times. The train/test split is fixed (seed 0); only the
    seed passed to the network (``run_num``) changes between runs. The
    per-run test accuracies and their mean and standard deviation are printed
    for every model.
    """
    max_expruns = 5
    max_hidden = 12
    learn_rate = 0.01

    # The split does not depend on the loop variables, so load it once
    # instead of re-reading the CSV on every run.
    x_train, x_test, y_train, y_test = read_data(0)

    # To study the learning rate as well, wrap the loops below in e.g.
    # ``for learn_rate in np.arange(0.1, 1.1, 0.1):``
    # (the built-in range() does not accept float steps).
    for hidden in range(6, max_hidden, 2):
        SGD_all = np.zeros(max_expruns)
        Adam_all = np.zeros(max_expruns)
        SGD2_all = np.zeros(max_expruns)   # two hidden layers

        for run_num in range(max_expruns):
            SGD_all[run_num] = scipy_nn(x_train, x_test, y_train, y_test,
                                        0, hidden, learn_rate, run_num)
            Adam_all[run_num] = scipy_nn(x_train, x_test, y_train, y_test,
                                         1, hidden, learn_rate, run_num)
            SGD2_all[run_num] = scipy_nn(x_train, x_test, y_train, y_test,
                                         2, hidden, learn_rate, run_num)

        # Report every model the same way: raw scores, mean, std. The Adam
        # mean/std lines were previously labelled like the raw scores, and
        # the SGD2 mean/std were not printed at all.
        results = (('SGD_all', SGD_all),
                   ('Adam_all', Adam_all),
                   ('SGD2_all', SGD2_all))
        for label, scores in results:
            print(label + ': ', scores, hidden)
            print('Mean ' + label + ': ', np.mean(scores), hidden)
            print('std ' + label + ': ', np.std(scores), hidden)

    # Next: write a paragraph describing the results and discussing which
    # models are better to use, then repeat for another dataset.
    # The results can also be saved to a file.

# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()

SGD_all:  [0.66233766 0.66233766 0.66558442 0.66233766 0.66233766] 6
Mean SGD_all:  0.662987012987013 6
std SGD_all:  0.0012987012987013102 6
Adam_all:  [0.76623377 0.68831169 0.66558442 0.64935065 0.66558442] 6
Adam_all:  0.6870129870129871 6
Adam_all:  0.04150768003765426 6
SGD2_all:  [0.65909091 0.66233766 0.66558442 0.66233766 0.71103896] 6
SGD_all:  [0.66558442 0.66558442 0.66233766 0.66558442 0.66233766] 8
Mean SGD_all:  0.6642857142857144 8
std SGD_all:  0.001590577755054026 8
Adam_all:  [0.71428571 0.66883117 0.73376623 0.66883117 0.72727273] 8
Adam_all:  0.7025974025974027 8
Adam_all:  0.028274728645554883 8
SGD2_all:  [0.66558442 0.66558442 0.66233766 0.69805195 0.66558442] 8
SGD_all:  [0.66558442 0.66233766 0.66558442 0.66558442 0.66233766] 10
Mean SGD_all:  0.6642857142857144 10
std SGD_all:  0.001590577755054026 10
Adam_all:  [0.65909091 0.69480519 0.75974026 0.72402597 0.7012987 ] 10
Adam_all:  0.7077922077922079 10
Adam_all:  0.033300990662228795 10
SGD2_all:  [0.6850649