# Deep Learning Assignment 1
Due on Thursday, Feb 8, 11:59pm

This assignment can be done in groups of at most 2 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Ryan Brand, rmb2208

Member 2: Saaya Yasuda, sy2569

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

# for data augmentation
from scipy.ndimage.interpolation import shift

In [13]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0, momentum=0.0, decay_rate=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.momentum = momentum
        self.decay_rate = decay_rate
        
        # init parameters
        for l in range(1, self.num_layers):
            n_in = layer_dimensions[l-1]
            n_out = layer_dimensions[l]
            xavier = 2.0/np.float(n_in + n_out)
            
            W = xavier*np.random.randn(n_out, n_in)
            b = np.zeros((n_out, 1))
            
            self.parameters["W" + str(l)] = W
            self.parameters["b" + str(l)] = b
            
            if self.momentum > 0 and self.decay_rate > 0:
                vW = np.zeros((n_out, n_in))
                vb = np.zeros((n_out, 1))
                sW = np.zeros((n_out, n_in))
                sb = np.zeros((n_out, 1))
                
                self.parameters["vW" + str(l)] = vW
                self.parameters["vb" + str(l)] = vb
                self.parameters["sW" + str(l)] = sW
                self.parameters["sb" + str(l)] = sb
            
            elif self.momentum > 0:
                vW = np.zeros((n_out, n_in))
                vb = np.zeros((n_out, 1))
                
                self.parameters["vW" + str(l)] = vW
                self.parameters["vb" + str(l)] = vb
                
            elif self.decay_rate > 0:
                sW = np.zeros((n_out, n_in))
                sb = np.zeros((n_out, 1))
                
                self.parameters["sW" + str(l)] = sW
                self.parameters["sb" + str(l)] = sb
        
        ##############################
        self.debug = False
        ##############################
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.dot(W, A) + b
        cache_layer = (A, W, b, Z)
        
        return Z, cache_layer
    

    def activationForward(self, Z, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        
        return eval("self." + activation)(Z)
    

    def relu(self, X):
        
        return X*(X>0)
    
    
    def softmax(self, X):
        
        X_exp = np.exp(X - np.max(X))

        return X_exp / np.sum(X_exp, axis=0, keepdims=True)
    
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0], A.shape[1])
        M = (M > prob) * 1.0
        M /= (1 - prob)
        A *= M
        return A, M
    

    def forwardPropagation(self, X, training=True):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        cache = {}
        
        A = X
        L = self.num_layers-1
        for l in range(1, L):
            W = self.parameters["W" + str(l)]
            b = self.parameters["b" + str(l)]
            Z, cache_layer = self.affineForward(A, W, b) # cache_layer is (A, W, b, Z)
            A = self.activationForward(Z)
            
            # dropout
            M = np.zeros(A.shape)
            if self.drop_prob > 0 and training:
                A, M = self.dropout(A, self.drop_prob)
            cache[str(l)] = cache_layer + (M,) # cache_layer is (A, W, b, Z, M)
                
        
        WL = self.parameters["W" + str(L)]
        bL = self.parameters["b" + str(L)]
        ZL, cache_layer = self.affineForward(A, WL, bL) # cache_layer is (A, W, b, Z)
        
        # dropout - adding a fake M to keep the len(cache[l]) the same.
        M = np.zeros(A.shape)
        cache[str(L)] = cache_layer + (M,) # cache_layer is (A, W, b, Z, M)
        
        AL = self.activationForward(ZL, activation="softmax")            

        return AL, cache
    
    def costFunction(self, AL, y, reg_type = None):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param reg_type: regularization type. L1 or L2.
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        S = np.float(AL.shape[1])
        Y = one_hot(y)
        cost = -np.sum(np.multiply(Y, np.log(AL))) / S
        
        if self.debug:
            print "########### cost pre-regularization: ", cost
        
        # Reference for formula: 
        # L1: http://neuralnetworksanddeeplearning.com/chap3.html#other_techniques_for_regularization
        # L2: http://neuralnetworksanddeeplearning.com/chap3.html#regularization
        
        if self.reg_lambda > 0 and reg_type is not None:
            
            weight_sum = 0
            
            if reg_type == "L1":
                for l in range(1, self.num_layers):
                    w_l = self.parameters["W" + str(l)]
                    weight_sum += np.abs(w_l).sum() / w_l.shape[1]

            elif reg_type == "L2":
                for l in range(1, self.num_layers):
                    w_l = self.parameters["W" + str(l)]
                    weight_sum += np.sum(w_l ** 2) / (2 * w_l.shape[1])                

            cost += weight_sum * self.reg_lambda
            
            if self.debug:
                print "########### cost POST-regularization: ", cost
        
        # gradient of cost
        dZL = np.subtract(AL, Y) #10x100
        
        return cost, dZL

    def affineBackward(self, dZ, cache_layer):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :param reg_type: regularization type. L1 or L2.
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        A_prev, W, b, Z, M = cache_layer
        S = np.float(A_prev.shape[1])
        
        dA_prev = np.dot(W.transpose(), dZ)
        dW = np.dot(dZ, A_prev.transpose())
        db = np.sum(dZ, axis=1, keepdims=True)

        return dA_prev, dW, db
    

    def activationBackward(self, dA, cache_layer, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        A_prev, W, b, Z, M = cache_layer
        dZ = np.multiply(dA, self.relu_derivative(Z))
        
        return dZ

        
    def relu_derivative(self, X):
        
        return 1.0*(X>0)
    

    def dropout_backward(self, dZ, cache_layer):
        
        # Reference:
        # https://stats.stackexchange.com/questions/219236/dropout-forward-prop-vs-back-prop-in-machine-learning-neural-network
        # https://stats.stackexchange.com/questions/207481/dropout-backpropagation-implementation
        
        _, _, _, _, M = cache_layer
        dZ = dZ * M

        return dZ
    

    def backPropagation(self, dZL, Y, cache, reg_type = None):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :param reg_type: regularization type. L1 or L2.
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        
        L = self.num_layers-1
        cache_layer = cache[str(L)]
        dA, dWL, dbL = self.affineBackward(dZL, cache_layer)
        
        # L1/L2
        if self.reg_lambda > 0 and reg_type is not None:
            w_l= cache_layer[1]
            if reg_type == "L1":
                dWL += np.sign(w_l) * (self.reg_lambda / w_l.shape[1])
            elif reg_type == "L2":
                dWL += w_l * (self.reg_lambda / w_l.shape[1])
                
        gradients["dW" + str(L)] = dWL
        gradients["db" + str(L)] = dbL

        
        for l in reversed(range(1, L)):
            cache_layer = cache[str(l)]
            dZ = self.activationBackward(dA, cache_layer)
            
            # dropout
            if self.drop_prob > 0:
                dZ = self.dropout_backward(dZ, cache_layer)
                
            dA, dW, db = self.affineBackward(dZ, cache_layer)
            
            # L1/L2
            if self.reg_lambda > 0 and reg_type is not None:
                w_l= cache_layer[1]
                if reg_type == "L1":
                    dW += np.sign(w_l) * (self.reg_lambda / w_l.shape[1])
                elif reg_type == "L2":
                    dW += w_l * (self.reg_lambda / w_l.shape[1])
            
            gradients["dW" + str(l)] = dW
            gradients["db" + str(l)] = db
           
        
        return gradients


    def updateParameters(self, gradients, alpha, reg_type=None):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent
        :param reg_type: regularization type. L1 or L2.
        """
        delta=10e-6
               
        for l in range(1, self.num_layers):
            
            if self.momentum > 0 and self.decay_rate > 0:
                self.parameters["vb" + str(l)] = self.momentum*self.parameters["vb" + str(l)] + (1-self.momentum)*gradients["db" + str(l)]           
                self.parameters["vW" + str(l)] = self.momentum*self.parameters["vW" + str(l)] + (1-self.momentum)*gradients["dW" + str(l)]
                
                vb_prev = self.parameters["vb" + str(l)]
                sb_prev = self.parameters["sb" + str(l)]
                db_prev = gradients["db" + str(l)]
                self.parameters["sb" + str(l)] = self.decay_rate*sb_prev + (1-self.decay_rate)*np.square(db_prev)
                
                vb_hat = self.parameters["vb" + str(l)]/(1-self.momentum)
                sb_hat = self.parameters["sb" + str(l)]/(1-self.decay_rate)
                self.parameters["b" + str(l)] -= alpha*np.divide(vb_hat, np.sqrt(delta + sb_hat))
                
                vW_prev = self.parameters["vW" + str(l)]
                sW_prev = self.parameters["sW" + str(l)]
                dW_prev = gradients["dW" + str(l)]
                self.parameters["sW" + str(l)] = self.decay_rate*sW_prev + (1-self.decay_rate)*np.square(dW_prev)
                
                vW_hat = self.parameters["vW" + str(l)]/(1-self.momentum)
                sW_hat = self.parameters["sW" + str(l)]/(1-self.decay_rate)
                self.parameters["W" + str(l)] -= alpha*np.divide(vW_hat, np.sqrt(delta + sW_hat))
                
            elif self.momentum > 0:
                self.parameters["vb" + str(l)] = self.momentum*self.parameters["vb" + str(l)] + (1-self.momentum)*gradients["db" + str(l)]
                self.parameters["b" + str(l)] -= alpha*self.parameters["vb" + str(l)]
                
                self.parameters["vW" + str(l)] = self.momentum*self.parameters["vW" + str(l)] + (1-self.momentum)*gradients["dW" + str(l)]
                self.parameters["W" + str(l)] -= alpha*self.parameters["vW" + str(l)]
                
            elif self.decay_rate > 0:
                sb_prev = self.parameters["sb" + str(l)]
                db_prev = gradients["db" + str(l)]
                self.parameters["sb" + str(l)] = self.decay_rate*sb_prev + (1-self.decay_rate)*np.square(db_prev)
                self.parameters["b" + str(l)] -= alpha*np.divide(db_prev, np.sqrt(delta + self.parameters["sb" + str(l)]))
                
                sW_prev = self.parameters["sW" + str(l)]
                dW_prev = gradients["dW" + str(l)]
                self.parameters["sW" + str(l)] = self.decay_rate*sW_prev + (1-self.decay_rate)*np.square(dW_prev)
                self.parameters["W" + str(l)] -= alpha*np.divide(dW_prev, np.sqrt(delta + self.parameters["sW" + str(l)]))
                
            else:
                self.parameters["b" + str(l)] -= alpha*gradients["db" + str(l)]
                self.parameters["W" + str(l)] -= alpha*gradients["dW" + str(l)]

                
            # L1/L2
            if self.reg_lambda > 0:
                w_l = self.parameters["W" + str(l)]
                if reg_type == "L1":
                    self.parameters["W" + str(l)] -= alpha * np.sign(w_l) * (self.reg_lambda / w_l.shape[1])
                elif reg_type == "L2":
                    self.parameters["W" + str(l)] -= alpha * w_l * (self.reg_lambda / w_l.shape[1])


    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100, 
              reg_type = None, CV=False, augment_data=False):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        :param reg_type: regularization type. L1 or L2.
        """
        assert(y.shape[0]==X.shape[1])
        
        if augment_data==True:
            X, y = self.augment_data(X, y)
        
        indices = np.random.permutation(y.shape[0])
        cutoff = np.int(np.floor(0.9*y.shape[0]))
        train_idx, val_idx = indices[:cutoff], indices[cutoff:]
        
        X_train = X[:, train_idx]
        y_train = y[train_idx]
        
        X_val = X[:, val_idx]
        y_val = y[val_idx]
        
        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = self.get_batch(X_train, y_train, batch_size)
            
            # forward prop
            AL_batch, cache = self.forwardPropagation(X_batch)

            # compute loss
            cost_batch, dZL_batch = self.costFunction(AL_batch, y_batch, reg_type=reg_type)

            # compute gradients
            gradients_batch = self.backPropagation(dZL_batch, y_batch, cache, reg_type=reg_type)

            # update weights and biases based on gradient
            self.updateParameters(gradients_batch, alpha, reg_type=reg_type)

            if i % print_every == 0:
                # print cost, train and validation set accuracies
                y_pred = self.predict(X_batch)
                print("Training Batch Accuracy: " + str(np.sum(y_pred==y_batch)/np.float(batch_size)))
                print("Training Batch Cost: " + str(cost_batch))
                print("--------------------------")
                
            
        # forward prop
        AL_val, cache_val = self.forwardPropagation(X_val)

        # compute loss
        cost_val, _ = self.costFunction(AL_val, y_val)
        
        y_pred_val = self.predict(X_val)
        print("##########################")
        print("--------------------------")
        print("Validation Accuracy: " + str(np.sum(y_pred_val==y_val)/np.float(y_val.shape[0])))
        print("Validation Cost: " + str(cost_val))
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        AL, _ = self.forwardPropagation(X, training=False)
        y_pred = np.argmax(AL, axis=0)

        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """  
        idx = np.random.randint(y.size, size=batch_size)
        X_batch = X[:, idx]
        y_batch = y[idx]

        return X_batch, y_batch
    
    def augment_data(self, X, y):
        
        indices = np.random.permutation(y.shape[0])
        cutoff = np.int(np.floor(0.2*y.shape[0]))
        
        X_shifted = X[:, indices[:cutoff]]
        y_shifted = y[indices[:cutoff]]
        
        for col in range(X_shifted.shape[1]):
            X_shifted[:, col] = shift(X_shifted[:, col], np.int(np.floor(X_shifted.shape[0])), mode='nearest')
            
        X_aug = np.hstack((X, X_shifted))
        y_aug = np.concatenate((y, y_shifted))
        
        return X_aug, y_aug

In [4]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [14]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [6]:
# Load the data
data_root_path = '../cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'horse': 7, 'automobile': 1, 'deer': 4, 'dog': 5, 'frog': 6, 'cat': 3, 'truck': 9, 'ship': 8, 'airplane': 0, 'bird': 2}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [22]:
layer_dimensions = [X_train.shape[0], 500, 100, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500)

Training Batch Accuracy: 0.132
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.344
Training Batch Cost: 1.87534155674
--------------------------
Training Batch Accuracy: 0.372
Training Batch Cost: 1.72467492437
--------------------------
Training Batch Accuracy: 0.382
Training Batch Cost: 1.65695733881
--------------------------
Training Batch Accuracy: 0.504
Training Batch Cost: 1.60521459363
--------------------------
Training Batch Accuracy: 0.482
Training Batch Cost: 1.49667822839
--------------------------
Training Batch Accuracy: 0.512
Training Batch Cost: 1.37639570008
--------------------------
Training Batch Accuracy: 0.586
Training Batch Cost: 1.27288725937
--------------------------
Training Batch Accuracy: 0.604
Training Batch Cost: 1.252074392
--------------------------
Training Batch Accuracy: 0.612
Training Batch Cost: 1.19341284537
--------------------------
Training Batch Accuracy: 0.644
Training Batch Cost: 1.19372781157
------

In [23]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-sy2569', y_predicted)

In [24]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-sy2569.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 8, 0, 4, 5, 8, 8, 2, 8, 1])

## Part 2: Improving the performance

1. Regularizers: 
L1 regularization (Lasso), 
L2 regularization (Ridge), 
Dropout, 
Data Augmentation. 

2. Optimizers: 
SGD with momentum, 
rmsprop, 
adam.

####  L1/L2 regularization
* References: 
* L1: http://neuralnetworksanddeeplearning.com/chap3.html#other_techniques_for_regularization
* L2: http://neuralnetworksanddeeplearning.com/chap3.html#regularization

In [25]:
#helper functions for trying different params
def get_NNdict(drop_prob_list=[0], reg_lambda_list=[0], momentum_list = [0],decay_rate_list = [0]):
    NN_dict = {}
    for drop_prob in drop_prob_list:
        for reg_lambda in reg_lambda_list:
            for momentum in momentum_list:
                for decay_rate in decay_rate_list:
                    NN_val = NeuralNetwork(layer_dimensions, drop_prob=drop_prob, reg_lambda=reg_lambda, 
                                           momentum=momentum,decay_rate=decay_rate)
                    NN_dict[(drop_prob, reg_lambda, momentum, decay_rate)] = NN_val
    return NN_dict


def train_multipleNN(NN_dict, iters_list=[5000], alpha_list=[0.0005], 
                     batch_size_list =[500], reg_type_list=[None], augment_data_list=[False]):
    for iters in iters_list:
        for alpha in alpha_list:
            for batch_size in batch_size_list:
                for augment_data in augment_data_list: 
                    for param, nn in NN_dict.iteritems():
                        print "#####################################"
                        print "iters:", iters
                        print "alpha: ", alpha
                        print "batch_size: ", batch_size
                        print "augment_data: ", augment_data
                        print "drop_prob, reg_lambda, momentum, decay_rate: ", param

                        if param[1] > 0: # if reg_lambda > 0
                            for reg_type in reg_type_list:
                                print "reg_type:", reg_type
                                print "#####################################"
                                nn.train(X_train, y_train, iters=iters, alpha=alpha, 
                                     batch_size=batch_size, print_every=500, reg_type=reg_type,
                                         augment_data=augment_data)
                        else: # if reg_lambda == 0
                            print "#####################################"
                            nn.train(X_train, y_train, iters=iters, alpha=alpha, 
                                     batch_size=batch_size, print_every=500, augment_data=augment_data)
                            


In [15]:
### L1 Only
reg_lambda_list = [0.05,0.1,0.2,0.3]

NN_dict = get_NNdict(reg_lambda_list=reg_lambda_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                     batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.3, 0, 0)
reg_type: L1
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.49309865705
--------------------------
Training Batch Accuracy: 0.328
Training Batch Cost: 2.55485201363
--------------------------
Training Batch Accuracy: 0.356
Training Batch Cost: 2.78796608053
--------------------------
Training Batch Accuracy: 0.418
Training Batch Cost: 2.88013677916
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 3.15976799618
--------------------------
Training Batch Accuracy: 0.452
Training Batch Cost: 3.25829872552
--------------------------
Training Batch Accuracy: 0.524
Training Batch Cost: 3.43365580609
--------------------------
Training Batch Accuracy: 0.588
Training Batch Cost: 3.57275107328
--------------------------
Training Batch Accuracy: 0.622
Training B

In [26]:
### L1 Only with the best param 0.2 with 10,000 iterations
reg_lambda_list = [0.2]

NN_dict = get_NNdict(reg_lambda_list=reg_lambda_list)

iters_list=[10000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                     batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 10000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.2, 0, 0)
reg_type: L1
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.33
Training Batch Cost: 2.39995950189
--------------------------
Training Batch Accuracy: 0.374
Training Batch Cost: 2.46852385268
--------------------------
Training Batch Accuracy: 0.416
Training Batch Cost: 2.488543511
--------------------------
Training Batch Accuracy: 0.506
Training Batch Cost: 2.62846418549
--------------------------
Training Batch Accuracy: 0.474
Training Batch Cost: 2.67475789706
--------------------------
Training Batch Accuracy: 0.536
Training Batch Cost: 2.72428243987
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 2.77285690356
--------------------------
Training Batch Accuracy: 0.59
Training Batch 

In [27]:
### L2 Only
reg_lambda_list = [0.1,0.2,0.5]

NN_dict = get_NNdict(reg_lambda_list=reg_lambda_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L2"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                     batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.5, 0, 0)
reg_type: L2
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.30372470854
--------------------------
Training Batch Accuracy: 0.342
Training Batch Cost: 1.93787594364
--------------------------
Training Batch Accuracy: 0.382
Training Batch Cost: 1.78883221319
--------------------------
Training Batch Accuracy: 0.392
Training Batch Cost: 1.6735282525
--------------------------
Training Batch Accuracy: 0.498
Training Batch Cost: 1.644808124
--------------------------
Training Batch Accuracy: 0.492
Training Batch Cost: 1.5967884718
--------------------------
Training Batch Accuracy: 0.558
Training Batch Cost: 1.49013555679
--------------------------
Training Batch Accuracy: 0.578
Training Batch Cost: 1.440475051
--------------------------
Training Batch Accuracy: 0.584
Training Batch C

In [20]:
### Dropout Only
drop_prob_list = [0.1,0.2,0.3]

NN_dict = get_NNdict(drop_prob_list=drop_prob_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0.1, 0, 0, 0)
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.30255668218
--------------------------
Training Batch Accuracy: 0.324
Training Batch Cost: 1.85582917039
--------------------------
Training Batch Accuracy: 0.39
Training Batch Cost: 1.71304807972
--------------------------
Training Batch Accuracy: 0.45
Training Batch Cost: 1.57018949093
--------------------------
Training Batch Accuracy: 0.488
Training Batch Cost: 1.54944068152
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 1.46955981939
--------------------------
Training Batch Accuracy: 0.514
Training Batch Cost: 1.4449458248
--------------------------
Training Batch Accuracy: 0.554
Training Batch Cost: 1.39847885148
--------------------------
Training Batch Accuracy: 0.558
Training Batch Cost: 1.459

In [21]:
### Data Aug Only
NN_dict = get_NNdict()

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
augment_data_list=[True]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, augment_data_list=augment_data_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  True
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0, 0)
#####################################
Training Batch Accuracy: 0.122
Training Batch Cost: 2.30258520503
--------------------------
Training Batch Accuracy: 0.296
Training Batch Cost: 1.93273325836
--------------------------
Training Batch Accuracy: 0.344
Training Batch Cost: 1.84657892951
--------------------------
Training Batch Accuracy: 0.368
Training Batch Cost: 1.79857179604
--------------------------
Training Batch Accuracy: 0.38
Training Batch Cost: 1.79746819308
--------------------------
Training Batch Accuracy: 0.44
Training Batch Cost: 1.64388228906
--------------------------
Training Batch Accuracy: 0.436
Training Batch Cost: 1.65070178642
--------------------------
Training Batch Accuracy: 0.422
Training Batch Cost: 1.69012723863
--------------------------
Training Batch Accuracy: 0.522
Training Batch Cost: 1.47323

In [28]:
### Momentum Only
momentum_list = [0.9]

NN_dict = get_NNdict(momentum_list = momentum_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0.9, 0)
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.356
Training Batch Cost: 1.75162571114
--------------------------
Training Batch Accuracy: 0.472
Training Batch Cost: 1.55207203588
--------------------------
Training Batch Accuracy: 0.546
Training Batch Cost: 1.4265186387
--------------------------
Training Batch Accuracy: 0.534
Training Batch Cost: 1.31766765481
--------------------------
Training Batch Accuracy: 0.572
Training Batch Cost: 1.25365865113
--------------------------
Training Batch Accuracy: 0.598
Training Batch Cost: 1.15116804342
--------------------------
Training Batch Accuracy: 0.632
Training Batch Cost: 1.03822685205
--------------------------
Training Batch Accuracy: 0.658
Training Batch Cost: 1.0

In [29]:
### RMSprop Only
decay_rate_list = [0.9, 0.97]

NN_dict = get_NNdict(decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0, 0.9)
#####################################
Training Batch Accuracy: 0.164
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.372
Training Batch Cost: 1.79618202295
--------------------------
Training Batch Accuracy: 0.4
Training Batch Cost: 1.67347136983
--------------------------
Training Batch Accuracy: 0.478
Training Batch Cost: 1.62311930166
--------------------------
Training Batch Accuracy: 0.506
Training Batch Cost: 1.4626379222
--------------------------
Training Batch Accuracy: 0.488
Training Batch Cost: 1.53218247658
--------------------------
Training Batch Accuracy: 0.556
Training Batch Cost: 1.33299735767
--------------------------
Training Batch Accuracy: 0.59
Training Batch Cost: 1.27841202331
--------------------------
Training Batch Accuracy: 0.604
Training Batch Cost: 1.2895

In [30]:
### Adam Only
momentum_list = [0.9]
decay_rate_list = [0.97,0.99]

NN_dict = get_NNdict(momentum_list=momentum_list, decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0.9, 0.97)
#####################################
Training Batch Accuracy: 0.19
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.422
Training Batch Cost: 1.59536979082
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 1.45233579815
--------------------------
Training Batch Accuracy: 0.582
Training Batch Cost: 1.31261943578
--------------------------
Training Batch Accuracy: 0.578
Training Batch Cost: 1.23480210994
--------------------------
Training Batch Accuracy: 0.594
Training Batch Cost: 1.22335926902
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 1.11990540935
--------------------------
Training Batch Accuracy: 0.63
Training Batch Cost: 1.05839372219
--------------------------
Training Batch Accuracy: 0.674
Training Batch Cost: 0.9

In [16]:
### Momentum with L1
reg_lambda_list = [0.2]
momentum_list = [0.9]

NN_dict = get_NNdict(reg_lambda_list=reg_lambda_list, momentum_list = momentum_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.2, 0.9, 0)
reg_type: L1
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.366
Training Batch Cost: 2.18580912398
--------------------------
Training Batch Accuracy: 0.466
Training Batch Cost: 2.26255491041
--------------------------
Training Batch Accuracy: 0.55
Training Batch Cost: 2.25179259149
--------------------------
Training Batch Accuracy: 0.548
Training Batch Cost: 2.40824584826
--------------------------
Training Batch Accuracy: 0.556
Training Batch Cost: 2.53545772043
--------------------------
Training Batch Accuracy: 0.646
Training Batch Cost: 2.5187530317
--------------------------
Training Batch Accuracy: 0.638
Training Batch Cost: 2.60625452664
--------------------------
Training Batch Accuracy: 0.682
Training B

In [31]:
### Momentum with L2
reg_lambda_list = [0.5,0.1]
momentum_list = [0.9]

NN_dict = get_NNdict(reg_lambda_list=reg_lambda_list, momentum_list = momentum_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L2"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.1, 0.9, 0)
reg_type: L2
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.30278989053
--------------------------
Training Batch Accuracy: 0.36
Training Batch Cost: 1.75324544519
--------------------------
Training Batch Accuracy: 0.462
Training Batch Cost: 1.58047768398
--------------------------
Training Batch Accuracy: 0.564
Training Batch Cost: 1.41012966936
--------------------------
Training Batch Accuracy: 0.562
Training Batch Cost: 1.35197887256
--------------------------
Training Batch Accuracy: 0.59
Training Batch Cost: 1.2792670906
--------------------------
Training Batch Accuracy: 0.612
Training Batch Cost: 1.17207507779
--------------------------
Training Batch Accuracy: 0.648
Training Batch Cost: 1.0806613816
--------------------------
Training Batch Accuracy: 0.664
Training Bat

In [32]:
### Momentum with Dropout
drop_prob_list = [0.2] # pick the best one from L1 only
momentum_list = [0.9] # pick the best one from momentum only

NN_dict = get_NNdict(drop_prob_list=drop_prob_list, momentum_list = momentum_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0.2, 0, 0.9, 0)
#####################################
Training Batch Accuracy: 0.132
Training Batch Cost: 2.3025557013
--------------------------
Training Batch Accuracy: 0.358
Training Batch Cost: 1.78465714735
--------------------------
Training Batch Accuracy: 0.472
Training Batch Cost: 1.60039898465
--------------------------
Training Batch Accuracy: 0.526
Training Batch Cost: 1.51765677675
--------------------------
Training Batch Accuracy: 0.518
Training Batch Cost: 1.38501915797
--------------------------
Training Batch Accuracy: 0.512
Training Batch Cost: 1.44631166844
--------------------------
Training Batch Accuracy: 0.558
Training Batch Cost: 1.32753060897
--------------------------
Training Batch Accuracy: 0.528
Training Batch Cost: 1.36172232124
--------------------------
Training Batch Accuracy: 0.596
Training Batch Cost: 1

In [33]:
### Momentum with Data Aug
momentum_list = [0.9]

NN_dict = get_NNdict(momentum_list=momentum_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
augment_data_list=[True]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, augment_data_list=augment_data_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  True
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0.9, 0)
#####################################
Training Batch Accuracy: 0.09
Training Batch Cost: 2.30258520503
--------------------------
Training Batch Accuracy: 0.338
Training Batch Cost: 1.8671357464
--------------------------
Training Batch Accuracy: 0.346
Training Batch Cost: 1.73702742171
--------------------------
Training Batch Accuracy: 0.436
Training Batch Cost: 1.56826451832
--------------------------
Training Batch Accuracy: 0.426
Training Batch Cost: 1.50245062804
--------------------------
Training Batch Accuracy: 0.466
Training Batch Cost: 1.50120668859
--------------------------
Training Batch Accuracy: 0.494
Training Batch Cost: 1.39433884727
--------------------------
Training Batch Accuracy: 0.49
Training Batch Cost: 1.397273094
--------------------------
Training Batch Accuracy: 0.56
Training Batch Cost: 1.3107429

In [17]:
### RMSprop with L1
reg_lambda_list = [0.2]
decay_rate_list = [0.9]

NN_dict = get_NNdict(reg_lambda_list = reg_lambda_list, decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.2, 0, 0.9)
reg_type: L1
#####################################
Training Batch Accuracy: 0.16
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.352
Training Batch Cost: 2.32767468076
--------------------------
Training Batch Accuracy: 0.424
Training Batch Cost: 2.34079223735
--------------------------
Training Batch Accuracy: 0.474
Training Batch Cost: 2.39759447126
--------------------------
Training Batch Accuracy: 0.486
Training Batch Cost: 2.41201669912
--------------------------
Training Batch Accuracy: 0.49
Training Batch Cost: 2.55599219774
--------------------------
Training Batch Accuracy: 0.55
Training Batch Cost: 2.49780739427
--------------------------
Training Batch Accuracy: 0.54
Training Batch Cost: 2.51133769811
--------------------------
Training Batch Accuracy: 0.598
Training Bat

In [34]:
### RMSprop with L2
reg_lambda_list = [0.1]
decay_rate_list = [0.9]

NN_dict = get_NNdict(reg_lambda_list = reg_lambda_list, decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L2"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.1, 0, 0.9)
reg_type: L2
#####################################
Training Batch Accuracy: 0.164
Training Batch Cost: 2.30278989053
--------------------------
Training Batch Accuracy: 0.364
Training Batch Cost: 1.81893833758
--------------------------
Training Batch Accuracy: 0.412
Training Batch Cost: 1.74767086268
--------------------------
Training Batch Accuracy: 0.484
Training Batch Cost: 1.65260778591
--------------------------
Training Batch Accuracy: 0.522
Training Batch Cost: 1.4760423858
--------------------------
Training Batch Accuracy: 0.508
Training Batch Cost: 1.46279430898
--------------------------
Training Batch Accuracy: 0.542
Training Batch Cost: 1.37523610887
--------------------------
Training Batch Accuracy: 0.552
Training Batch Cost: 1.31100921869
--------------------------
Training Batch Accuracy: 0.632
Training 

In [35]:
### RMSprop with Dropout
drop_prob_list = [0.2]
decay_rate_list = [0.9]

NN_dict = get_NNdict(drop_prob_list = drop_prob_list, decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0.2, 0, 0, 0.9)
#####################################
Training Batch Accuracy: 0.158
Training Batch Cost: 2.3025557013
--------------------------
Training Batch Accuracy: 0.366
Training Batch Cost: 1.8005487181
--------------------------
Training Batch Accuracy: 0.418
Training Batch Cost: 1.69096379485
--------------------------
Training Batch Accuracy: 0.472
Training Batch Cost: 1.60162678262
--------------------------
Training Batch Accuracy: 0.516
Training Batch Cost: 1.49848005299
--------------------------
Training Batch Accuracy: 0.484
Training Batch Cost: 1.53076299613
--------------------------
Training Batch Accuracy: 0.54
Training Batch Cost: 1.42452757653
--------------------------
Training Batch Accuracy: 0.544
Training Batch Cost: 1.42773541393
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.4

In [36]:
### RMSprop with Data Aug
decay_rate_list = [0.9]

NN_dict = get_NNdict(decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
augment_data_list=[True]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, augment_data_list=augment_data_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  True
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0, 0.9)
#####################################
Training Batch Accuracy: 0.15
Training Batch Cost: 2.30258520503
--------------------------
Training Batch Accuracy: 0.354
Training Batch Cost: 1.87295706897
--------------------------
Training Batch Accuracy: 0.36
Training Batch Cost: 1.80337542238
--------------------------
Training Batch Accuracy: 0.408
Training Batch Cost: 1.67272401492
--------------------------
Training Batch Accuracy: 0.418
Training Batch Cost: 1.63479622153
--------------------------
Training Batch Accuracy: 0.45
Training Batch Cost: 1.60485765873
--------------------------
Training Batch Accuracy: 0.464
Training Batch Cost: 1.47957938026
--------------------------
Training Batch Accuracy: 0.398
Training Batch Cost: 1.66952751647
--------------------------
Training Batch Accuracy: 0.534
Training Batch Cost: 1.3907

In [18]:
### Adam with L1
reg_lambda_list = [0.2]
momentum_list = [0.9]
decay_rate_list = [0.9, 0.95]

NN_dict = get_NNdict(reg_lambda_list = reg_lambda_list, momentum_list=momentum_list, 
                     decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.2, 0.9, 0.9)
reg_type: L1
#####################################
Training Batch Accuracy: 0.188
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.444
Training Batch Cost: 2.72023132812
--------------------------
Training Batch Accuracy: 0.54
Training Batch Cost: 2.68122447556
--------------------------
Training Batch Accuracy: 0.62
Training Batch Cost: 2.68995265733
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 2.80159431802
--------------------------
Training Batch Accuracy: 0.62
Training Batch Cost: 2.87933071888
--------------------------
Training Batch Accuracy: 0.656
Training Batch Cost: 3.0279917995
--------------------------
Training Batch Accuracy: 0.648
Training Batch Cost: 3.0507189975
--------------------------
Training Batch Accuracy: 0.694
Training Batc

In [19]:
### Adam with L1 - more combinations
reg_lambda_list = [0.2]
momentum_list = [0.9]
decay_rate_list = [0.97, 0.98]

NN_dict = get_NNdict(reg_lambda_list = reg_lambda_list, momentum_list=momentum_list, 
                     decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L1"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.2, 0.9, 0.97)
reg_type: L1
#####################################
Training Batch Accuracy: 0.188
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.44
Training Batch Cost: 2.54324081285
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 2.54858897593
--------------------------
Training Batch Accuracy: 0.596
Training Batch Cost: 2.52944759766
--------------------------
Training Batch Accuracy: 0.572
Training Batch Cost: 2.65190908441
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 2.73903002166
--------------------------
Training Batch Accuracy: 0.602
Training Batch Cost: 2.8746374933
--------------------------
Training Batch Accuracy: 0.64
Training Batch Cost: 2.86144533351
--------------------------
Training Batch Accuracy: 0.69
Training Ba

In [37]:
### Adam with L2
reg_lambda_list = [0.1]
momentum_list = [0.9]
decay_rate_list = [0.9,0.97]

NN_dict = get_NNdict(reg_lambda_list = reg_lambda_list, momentum_list=momentum_list, 
                     decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
reg_type_list=["L2"]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, reg_type_list=reg_type_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0.1, 0.9, 0.97)
reg_type: L2
#####################################
Training Batch Accuracy: 0.19
Training Batch Cost: 2.30278989053
--------------------------
Training Batch Accuracy: 0.424
Training Batch Cost: 1.61421651017
--------------------------
Training Batch Accuracy: 0.5
Training Batch Cost: 1.47612278069
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.33036313326
--------------------------
Training Batch Accuracy: 0.562
Training Batch Cost: 1.29687357607
--------------------------
Training Batch Accuracy: 0.564
Training Batch Cost: 1.2457862823
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 1.19125179438
--------------------------
Training Batch Accuracy: 0.658
Training Batch Cost: 1.08215806221
--------------------------
Training Batch Accuracy: 0.656
Training Ba

In [38]:
### Adam with Dropout
drop_prob_list = [0.2]
momentum_list = [0.9]
decay_rate_list = [0.9, 0.97]

NN_dict = get_NNdict(drop_prob_list = drop_prob_list, momentum_list=momentum_list, 
                     decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, batch_size_list =batch_size_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  False
drop_prob, reg_lambda, momentum, decay_rate:  (0.2, 0, 0.9, 0.97)
#####################################
Training Batch Accuracy: 0.186
Training Batch Cost: 2.3025557013
--------------------------
Training Batch Accuracy: 0.398
Training Batch Cost: 1.70482198051
--------------------------
Training Batch Accuracy: 0.518
Training Batch Cost: 1.45866627706
--------------------------
Training Batch Accuracy: 0.544
Training Batch Cost: 1.44526188238
--------------------------
Training Batch Accuracy: 0.546
Training Batch Cost: 1.3310626344
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.40267300701
--------------------------
Training Batch Accuracy: 0.582
Training Batch Cost: 1.33552731451
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.3111122936
--------------------------
Training Batch Accuracy: 0.62
Training Batch Cost: 1

In [39]:
### Adam with Data Aug
momentum_list = [0.9]
decay_rate_list = [0.97]

NN_dict = get_NNdict(momentum_list=momentum_list, decay_rate_list = decay_rate_list)

iters_list=[5000]
alpha_list=[0.0005] 
batch_size_list =[500]
augment_data_list=[True]

train_multipleNN(NN_dict,iters_list=iters_list, alpha_list=alpha_list, 
                 batch_size_list =batch_size_list, augment_data_list=augment_data_list)

#####################################
iters: 5000
alpha:  0.0005
batch_size:  500
augment_data:  True
drop_prob, reg_lambda, momentum, decay_rate:  (0, 0, 0.9, 0.97)
#####################################
Training Batch Accuracy: 0.122
Training Batch Cost: 2.30258520503
--------------------------
Training Batch Accuracy: 0.37
Training Batch Cost: 1.76238859811
--------------------------
Training Batch Accuracy: 0.376
Training Batch Cost: 1.68021782792
--------------------------
Training Batch Accuracy: 0.428
Training Batch Cost: 1.5818893374
--------------------------
Training Batch Accuracy: 0.446
Training Batch Cost: 1.51982548963
--------------------------
Training Batch Accuracy: 0.454
Training Batch Cost: 1.45919978936
--------------------------
Training Batch Accuracy: 0.512
Training Batch Cost: 1.40721908807
--------------------------
Training Batch Accuracy: 0.48
Training Batch Cost: 1.4150400162
--------------------------
Training Batch Accuracy: 0.552
Training Batch Cost: 1.28

### Try the best combinations from 5000 iters with 10,000 iters:

In [40]:
# Momentum with momentum=0.9
NN_mom = NeuralNetwork(layer_dimensions, momentum=0.9)
NN_mom.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500)

Training Batch Accuracy: 0.132
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.356
Training Batch Cost: 1.75162571114
--------------------------
Training Batch Accuracy: 0.472
Training Batch Cost: 1.55207203588
--------------------------
Training Batch Accuracy: 0.546
Training Batch Cost: 1.4265186387
--------------------------
Training Batch Accuracy: 0.534
Training Batch Cost: 1.31766765481
--------------------------
Training Batch Accuracy: 0.572
Training Batch Cost: 1.25365865113
--------------------------
Training Batch Accuracy: 0.598
Training Batch Cost: 1.15116804342
--------------------------
Training Batch Accuracy: 0.632
Training Batch Cost: 1.03822685205
--------------------------
Training Batch Accuracy: 0.658
Training Batch Cost: 1.01430964353
--------------------------
Training Batch Accuracy: 0.712
Training Batch Cost: 0.879469139309
--------------------------
Training Batch Accuracy: 0.68
Training Batch Cost: 0.872802181887
----

In [41]:
# Adam with momentum=0.9 & decay_rate=0.97
NN_adam = NeuralNetwork(layer_dimensions, momentum=0.9, decay_rate=0.97)
NN_adam.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500)

Training Batch Accuracy: 0.19
Training Batch Cost: 2.30255618603
--------------------------
Training Batch Accuracy: 0.422
Training Batch Cost: 1.59536979082
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 1.45233579815
--------------------------
Training Batch Accuracy: 0.582
Training Batch Cost: 1.31261943578
--------------------------
Training Batch Accuracy: 0.578
Training Batch Cost: 1.23480210994
--------------------------
Training Batch Accuracy: 0.594
Training Batch Cost: 1.22335926902
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 1.11990540935
--------------------------
Training Batch Accuracy: 0.63
Training Batch Cost: 1.05839372219
--------------------------
Training Batch Accuracy: 0.674
Training Batch Cost: 0.988834719043
--------------------------
Training Batch Accuracy: 0.714
Training Batch Cost: 0.875626641816
--------------------------
Training Batch Accuracy: 0.718
Training Batch Cost: 0.853290324065
-----

In [42]:
# Momentum with L1, reg_lambda=0.2, and momentum=0.9
NN_moml1 = NeuralNetwork(layer_dimensions, momentum=0.9, reg_lambda=0.2)
NN_moml1.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500, reg_type="L1")

Training Batch Accuracy: 0.132
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.366
Training Batch Cost: 2.18580912398
--------------------------
Training Batch Accuracy: 0.466
Training Batch Cost: 2.26255491041
--------------------------
Training Batch Accuracy: 0.55
Training Batch Cost: 2.25179259149
--------------------------
Training Batch Accuracy: 0.548
Training Batch Cost: 2.40824584826
--------------------------
Training Batch Accuracy: 0.556
Training Batch Cost: 2.53545772043
--------------------------
Training Batch Accuracy: 0.646
Training Batch Cost: 2.5187530317
--------------------------
Training Batch Accuracy: 0.638
Training Batch Cost: 2.60625452664
--------------------------
Training Batch Accuracy: 0.682
Training Batch Cost: 2.72753416719
--------------------------
Training Batch Accuracy: 0.674
Training Batch Cost: 2.82456547228
--------------------------
Training Batch Accuracy: 0.7
Training Batch Cost: 2.91085393037
--------

In [43]:
# Momentum with L2, reg_lambda=0.1, and momentum=0.9
NN_moml2 = NeuralNetwork(layer_dimensions, momentum=0.9, reg_lambda=0.1)
NN_moml2.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500, reg_type="L2")

Training Batch Accuracy: 0.132
Training Batch Cost: 2.30278989053
--------------------------
Training Batch Accuracy: 0.36
Training Batch Cost: 1.75324544519
--------------------------
Training Batch Accuracy: 0.462
Training Batch Cost: 1.58047768398
--------------------------
Training Batch Accuracy: 0.564
Training Batch Cost: 1.41012966936
--------------------------
Training Batch Accuracy: 0.562
Training Batch Cost: 1.35197887256
--------------------------
Training Batch Accuracy: 0.59
Training Batch Cost: 1.2792670906
--------------------------
Training Batch Accuracy: 0.612
Training Batch Cost: 1.17207507779
--------------------------
Training Batch Accuracy: 0.648
Training Batch Cost: 1.0806613816
--------------------------
Training Batch Accuracy: 0.664
Training Batch Cost: 1.02320867627
--------------------------
Training Batch Accuracy: 0.698
Training Batch Cost: 1.00386141328
--------------------------
Training Batch Accuracy: 0.704
Training Batch Cost: 0.870547086957
-------

In [44]:
# Adam with L1, reg_lambda=0.2, momentum=0.9, decay_rate=0.97
NN_adaml1 = NeuralNetwork(layer_dimensions, reg_lambda=0.2, momentum=0.9, decay_rate=0.97)
NN_adaml1.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500, reg_type="L1")

Training Batch Accuracy: 0.188
Training Batch Cost: 2.42958450004
--------------------------
Training Batch Accuracy: 0.44
Training Batch Cost: 2.54324081285
--------------------------
Training Batch Accuracy: 0.496
Training Batch Cost: 2.54858897593
--------------------------
Training Batch Accuracy: 0.596
Training Batch Cost: 2.52944759766
--------------------------
Training Batch Accuracy: 0.572
Training Batch Cost: 2.65190908441
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 2.73903002166
--------------------------
Training Batch Accuracy: 0.602
Training Batch Cost: 2.8746374933
--------------------------
Training Batch Accuracy: 0.64
Training Batch Cost: 2.86144533351
--------------------------
Training Batch Accuracy: 0.69
Training Batch Cost: 2.96919615016
--------------------------
Training Batch Accuracy: 0.696
Training Batch Cost: 2.96810679099
--------------------------
Training Batch Accuracy: 0.718
Training Batch Cost: 3.03276242599
----------

In [45]:
# Adam with L2, reg_lambda=0.1, momentum=0.9, decay_rate=0.97
NN_adaml2 = NeuralNetwork(layer_dimensions, reg_lambda=0.1, momentum=0.9, decay_rate=0.97)
NN_adaml2.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500, reg_type="L2")

Training Batch Accuracy: 0.19
Training Batch Cost: 2.30278989053
--------------------------
Training Batch Accuracy: 0.424
Training Batch Cost: 1.61421651017
--------------------------
Training Batch Accuracy: 0.5
Training Batch Cost: 1.47612278069
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.33036313326
--------------------------
Training Batch Accuracy: 0.562
Training Batch Cost: 1.29687357607
--------------------------
Training Batch Accuracy: 0.564
Training Batch Cost: 1.2457862823
--------------------------
Training Batch Accuracy: 0.6
Training Batch Cost: 1.19125179438
--------------------------
Training Batch Accuracy: 0.658
Training Batch Cost: 1.08215806221
--------------------------
Training Batch Accuracy: 0.656
Training Batch Cost: 1.06609491996
--------------------------
Training Batch Accuracy: 0.72
Training Batch Cost: 0.916491221956
--------------------------
Training Batch Accuracy: 0.724
Training Batch Cost: 0.919657235272
---------

In [46]:
# Adam with Dropout, drop_prob=0.2, momentum=0.9, decay_rate=0.97
NN_adamdrop = NeuralNetwork(layer_dimensions, drop_prob=0.2, momentum=0.9, decay_rate=0.97)
NN_adamdrop.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500)

Training Batch Accuracy: 0.186
Training Batch Cost: 2.3025557013
--------------------------
Training Batch Accuracy: 0.398
Training Batch Cost: 1.70482198051
--------------------------
Training Batch Accuracy: 0.518
Training Batch Cost: 1.45866627706
--------------------------
Training Batch Accuracy: 0.544
Training Batch Cost: 1.44526188238
--------------------------
Training Batch Accuracy: 0.546
Training Batch Cost: 1.3310626344
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.40267300701
--------------------------
Training Batch Accuracy: 0.582
Training Batch Cost: 1.33552731451
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.3111122936
--------------------------
Training Batch Accuracy: 0.62
Training Batch Cost: 1.25392215271
--------------------------
Training Batch Accuracy: 0.616
Training Batch Cost: 1.23852719543
--------------------------
Training Batch Accuracy: 0.634
Training Batch Cost: 1.17965877799
--------

### Part 2: Summary

### Best version is: Adam with Dropout, drop_prob=0.2, momentum=0.9, and decay_rate=0.97
### Accuracy on a validation set: 0.5536

In [47]:
### Best version:
# Adam with Dropout, drop_prob=0.2, momentum=0.9, decay_rate=0.97
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.2, momentum=0.9, decay_rate=0.97)
NN2.train(X_train, y_train, iters=10000, alpha=0.0005, batch_size=500, print_every=500)

Training Batch Accuracy: 0.186
Training Batch Cost: 2.3025557013
--------------------------
Training Batch Accuracy: 0.398
Training Batch Cost: 1.70482198051
--------------------------
Training Batch Accuracy: 0.518
Training Batch Cost: 1.45866627706
--------------------------
Training Batch Accuracy: 0.544
Training Batch Cost: 1.44526188238
--------------------------
Training Batch Accuracy: 0.546
Training Batch Cost: 1.3310626344
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.40267300701
--------------------------
Training Batch Accuracy: 0.582
Training Batch Cost: 1.33552731451
--------------------------
Training Batch Accuracy: 0.566
Training Batch Cost: 1.3111122936
--------------------------
Training Batch Accuracy: 0.62
Training Batch Cost: 1.25392215271
--------------------------
Training Batch Accuracy: 0.616
Training Batch Cost: 1.23852719543
--------------------------
Training Batch Accuracy: 0.634
Training Batch Cost: 1.17965877799
--------

In [49]:
y_predicted2 = NN2.predict(X_test)
save_predictions('ans2-sy2569', y_predicted2)

# check if it's saved:
loaded_y = np.load('ans2-sy2569.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 8, 8, 4, 5, 1, 8, 4, 8, 1])

#### Results:
|                                           |              |          |          | NN Init   |            |          |            | train |        |            |             |          |              |
|-------------------------------------------|--------------|----------|----------|-----------|------------|----------|------------|-------|--------|------------|-------------|----------|--------------|
| Note                                      | Accuracy     | Opt      | Reg      | drop_prob | reg_lambda | momentum | decay_rate | iters | alpha  | batch_size | print_every | reg_type | augment_data |
| Part 1                                    | 0.5114       | N/A      | N/A      | 0         | 0          | 0        | 0          | 10000 | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 1                                    | 0.494        | N/A      | N/A      | 0         | 0          | 0        | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5002       | N/A      | L1       | 0         | 0.05       | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.4796       | N/A      | L1       | 0         | 0.1        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5102       | N/A      | L1       | 0         | 0.2        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.4922       | N/A      | L1       | 0         | 0.3        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5156       | N/A      | L1       | 0         | 0.2        | 0        | 0          | 10000 | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5082       | N/A      | L2       | 0         | 0.1        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.486        | N/A      | L2       | 0         | 0.2        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5198       | N/A      | L2       | 0         | 0.5        | 0        | 0          | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5076       | N/A      | Dropout  | 0.1       | 0          | 0        | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5172       | N/A      | Dropout  | 0.2       | 0          | 0        | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.514        | N/A      | Dropout  | 0.3       | 0          | 0        | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.4023333333 | N/A      | Data Aug | 0         | 0          | 0        | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | TRUE         |
| Part 2                                    | 0.549        | Momentum | N/A      | 0         | 0          | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5026       | RMS      | N/A      | 0         | 0          | 0        | 0.9        | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.492        | RMS      | N/A      | 0         | 0          | 0        | 0.97       | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5392       | Adam     | N/A      | 0         | 0          | 0.9      | 0.97       | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5302       | Adam     | N/A      | 0         | 0          | 0.9      | 0.99       | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5362       | Momentum | L1       | 0         | 0.2        | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5434       | Momentum | L2       | 0         | 0.1        | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5284       | Momentum | L2       | 0         | 0.5        | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5288       | Momentum | Dropout  | 0.2       | 0          | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.455        | Momentum | Data Aug | 0         | 0          | 0.9      | 0          | 5000  | 0.0005 | 500        | 500         | N/A      | TRUE         |
| Part 2                                    | 0.5056       | RMS      | L1       | 0         | 0.2        | 0        | 0.9        | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5104       | RMS      | L2       | 0         | 0.1        | 0        | 0.9        | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5018       | RMS      | Dropout  | 0.2       | 0          | 0        | 0.9        | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.4366666667 | RMS      | Data Aug | 0         | 0          | 0        | 0.9        | 5000  | 0.0005 | 500        | 500         | N/A      | TRUE         |
| Part 2                                    | 0.5284       | Adam     | L1       | 0         | 0.2        | 0.9      | 0.9        | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5354       | Adam     | L1       | 0         | 0.2        | 0.9      | 0.95       | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5402       | Adam     | L1       | 0         | 0.2        | 0.9      | 0.97       | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5382       | Adam     | L1       | 0         | 0.2        | 0.9      | 0.98       | 5000  | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.517        | Adam     | L2       | 0         | 0.1        | 0.9      | 0.9        | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5332       | Adam     | L2       | 0         | 0.1        | 0.9      | 0.97       | 5000  | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5238       | Adam     | Dropout  | 0.2       | 0          | 0.9      | 0.9        | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5316       | Adam     | Dropout  | 0.2       | 0          | 0.9      | 0.97       | 5000  | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.4636666667 | Adam     | Data Aug | 0         | 0          | 0.9      | 0.97       | 5000  | 0.0005 | 500        | 500         | N/A      | TRUE         |
| Try 10,000 iters for the best ones above: |              |          |          |           |            |          |            |       |        |            |             |          |              |
| Part 2                                    | 0.5392       | Momentum | N/A      | 0         | 0          | 0.9      | 0          | 10000 | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5262       | Adam     | N/A      | 0         | 0          | 0.9      | 0.97       | 10000 | 0.0005 | 500        | 500         | N/A      | FALSE        |
| Part 2                                    | 0.5384       | Momentum | L1       | 0         | 0.2        | 0.9      | 0          | 10000 | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.543        | Momentum | L2       | 0         | 0.1        | 0.9      | 0          | 10000 | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5196       | Adam     | L1       | 0         | 0.2        | 0.9      | 0.97       | 10000 | 0.0005 | 500        | 500         | L1       | FALSE        |
| Part 2                                    | 0.5206       | Adam     | L2       | 0         | 0.1        | 0.9      | 0.97       | 10000 | 0.0005 | 500        | 500         | L2       | FALSE        |
| Part 2                                    | 0.5536       | Adam     | Dropout  | 0.2       | 0          | 0.9      | 0.97       | 10000 | 0.0005 | 500        | 500         | N/A      | FALSE        |