### Exercise 4.3
#### Use either R or Python, with Keras and see the effect of Adam vs SGD for any Classification and Regression problem selected from UCI ML repository. 

- Then, apply dropouts and compare the generalisation performance. 

- Compare the performance of dropouts with weight decay (L2 Regularization or Ridge Regression). 

- Discuss the major similarities and differences between, weight decay and L1/L2 regularisation. 

In [4]:
import random
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
from sklearn import datasets 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
#keras
from keras.layers import Dense
from keras.models import Sequential 
from keras.layers import Dropout
from keras.regularizers import l2
 

def read_data(run_num):
    # Source: Pima-Indian diabetes dataset: https://www.kaggle.com/kumargh/pimaindiansdiabetescsv
    data = genfromtxt("datasets/pima.csv", delimiter=",")
    data_X = data[:, 0:8]     # all features 0, 1, 2, 3, 4, 5, 6, 7 

    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
    # sklearn.preprocessing.Normalizer
    #transformer = Normalizer().fit(data_X)  # fit does nothing
    #data_X = transformer.transform(data_X)
    
    data_y = data[:, -1]      # this is target - so that last col is selected from data

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.40, random_state=run_num)

    return X_train, X_test, y_train, y_test

    
def keras_nn(X_train, X_test, y_train, y_test, type_model, hidden, dropout_rate, learn_rate, run_num):
 
    # https://keras.io/api/models/model_training_apis/
    # note that keras model on own ensures that every run begins with different initial 
    # weights so run_num is not needed 

    if type_model==0:    # SGD
        #nn = MLPClassifier(hidden_layer_sizes=(hidden,), random_state=run_num, 
                           #max_iter=100,solver='sgd', learning_rate_init=learn_rate )
        model = Sequential()
        model.add(Dense(hidden, input_dim=X_train.shape[1], activation='relu'))
        model.add(Dropout(dropout_rate))   # create a dropout layer with a 40% chance of setting inputs to zero
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='sgd',  metrics=['accuracy'])
    
    elif type_model==1:   # Adam
        #nn = MLPClassifier(hidden_layer_sizes=(hidden,), random_state=run_num, 
                           #max_iter=100,solver='adam', learning_rate_init=learn_rate)
        model = Sequential()
        model.add(Dense(hidden, input_dim=X_train.shape[1], activation='sigmoid'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    elif type_model==2:    # Adam with l2 regularisation
        model = Sequential()
        model.add(Dense(hidden, input_dim=X_train.shape[1], activation='sigmoid', kernel_regularizer=l2(0.001)))
        #model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    else:
        print('no model')
    
    # Fit model
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=500, batch_size=10, verbose=0)

    # Evaluate the model
    # https://keras.io/api/models/model_training_apis/
    _, acc_train = model.evaluate(X_train, y_train, verbose=0)
    _, acc_test = model.evaluate(X_test, y_test, verbose=0)
    #print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

    # Plot history
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.savefig('figures/'+str(type_model)+'nodp.png') 
    plt.clf()
   
    #auc = roc_auc_score(y_pred, y_test, average=None) 
    return acc_test #,acc_train


def main(): 

    max_expruns = 2

    SGD_all = np.zeros(max_expruns) 
    Adam_all = np.zeros(max_expruns) 
    
    Adam2_all = np.zeros(max_expruns)  
    max_hidden = 10 

    learn_rate = 0.01

    #for learn_rate in range(0.1, 1, 0.2):
    
    for hidden in range(6, max_hidden, 2):     # only cover 6 hidden neurons for now
 
        for run_num in range(0, max_expruns): 
    
            X_train, X_test, y_train, y_test = read_data(run_num)   
            
            acc_sgd = keras_nn(X_train, X_test, y_train, y_test, 0, hidden, 0.4, learn_rate, run_num) # SGD with dropout
            acc_adam = keras_nn(X_train, X_test, y_train, y_test, 1, hidden, 0.4, learn_rate, run_num) # Adam with dropout
            acc_adam2 = keras_nn(X_train, X_test, y_train, y_test, 2, hidden, 0.4, learn_rate, run_num) # Adam with l2
           
            SGD_all[run_num] = acc_sgd
            Adam_all[run_num] = acc_adam

            Adam2_all[run_num] = acc_adam2   # two hidden layers
        
        print(SGD_all, hidden,' SGD_all')
        print(np.mean(SGD_all), hidden, ' mean SGD_all')
        print(np.std(SGD_all), hidden, ' std SGD_all')

        print(Adam_all, hidden,' Adam_all')
        print(np.mean(Adam_all), hidden, ' Adam _all')
        print(np.std(Adam_all), hidden, ' Adam _all')

        print(Adam2_all, hidden,' SGD2_all')

        # you can also print  for Adam 
 
    # next try a paragraph to describe your results and discuss which models are better to use
    # repeat for another dataset
    # you can save results to a file as well
    
    
if __name__ == '__main__':
     main()

[0.66233766 0.64935064] 6  SGD_all
0.6558441519737244 6  mean SGD_all
0.006493508815765381 6  std SGD_all
[0.72077924 0.63961041] 6  Adam_all
0.6801948249340057 6  Adam _all
0.040584415197372437 6  Adam _all
[0.6785714  0.66558444] 6  SGD2_all
[0.66558444 0.64285713] 8  SGD_all
0.6542207896709442 8  mean SGD_all
0.01136365532875061 8  std SGD_all
[0.6785714  0.62012988] 8  Adam_all
0.649350643157959 8  Adam _all
0.029220759868621826 8  Adam _all
[0.66883117 0.64610392] 8  SGD2_all


<Figure size 432x288 with 0 Axes>