# Entree Task: Implementing Your Own Neural Networks from Scratch
## By Vaani Gupta and Ryan Yong

In [1]:
import math
import numpy as np

## Task 1: Linear Layer 
Implement the forward and backward functions for a linear layer. Please read the requirement details for Task 1 in the code comment and in the pdf document.

In [2]:
class LinearLayer:
    def __init__(self, _m, _n):
        '''
        :param _m: _m is the input X hidden size
        :param _n: _n is the output Y hidden size
        '''
        # "Kaiming initialization" is important for neural network to converge. The NN will not converge without it!
        self.W = (np.random.uniform(low=-10000.0, high=10000.0, size = (_m, _n)))/10000.0*np.sqrt(6.0/ _m)
        self.stored_X = None
        self.W_grad = None #record the gradient of the weight

    def forward(self, X):
        '''
        :param X: shape(X)[0] is batch size and shape(X)[1] is the #features
         (1) Store the input X in stored_data for Backward.
         (2) :return: X * weights
        '''
        
        ########## Code start  ##########
        # Stores the input X in stored_data for Backward
        self.stored_X = X
        # Calculates and returns the forward operation of X * weights
        return X @ self.W
        
        ##########  Code end   ##########
    
    def backward(self, Y_grad):
        '''
        /* shape(output_grad)[0] is batch size and shape(output_grad)[1] is the # output features (shape(weight)[1])
         * 1) Calculate the gradient of the output (the result of the Forward method) w.r.t. the **W** and store the product of the gradient and Y_grad in W_grad
         * 2) Calculate the gradient of the output (the result of the Forward method) w.r.t. the **X** and return the product of the gradient and Y_grad
         */
        '''
        
        ########## Code start  ##########
        # Calculates W gradient of output from Forward method and stores product of gradient and Y_grad in W_grad
        self.W_grad = self.stored_X.T @ Y_grad
        # Calculates X gradient of output from Forward method and returns product of gradient and Y_grad
        return Y_grad @ self.W.T
        ##########  Code end   ##########

## Checkpoint 1: Linear Layer
Check your linear forward and backward function implementations with numerical derivatives.


In [3]:
#gradient check
import copy
#Random test
n = 3
m = 6
Y_grad = np.random.rand(1, m)
test_vector = np.random.rand(1, n)
DELTA = 1e-6
test_layer = LinearLayer(n, m)

test_layer_1 = copy.deepcopy(test_layer)
test_layer_2 = copy.deepcopy(test_layer)

test_layer.forward(test_vector)
Your_backward = test_layer.backward(Y_grad)

cal_gradient = np.zeros((np.shape(test_vector)[0], np.shape(test_vector)[1]))
for t_p in range(np.shape(test_vector)[0]):
    for i in range(np.shape(test_vector)[1]):
        test_vector_1 = copy.deepcopy(test_vector)
        test_vector_2 = copy.deepcopy(test_vector)
        test_vector_1[t_p][i] = test_vector_1[t_p][i] + DELTA
        test_vector_2[t_p][i] = test_vector_2[t_p][i] - DELTA

        cal_gradient[t_p][i] = np.sum(
            (np.dot(Y_grad, np.transpose(test_layer_1.forward(test_vector_1) - test_layer_2.forward(test_vector_2))/(2*DELTA))))


print('Your gradient: ',Your_backward)
print('Numerical gradient:',cal_gradient)
print('Error: ',abs(np.sum(Your_backward - cal_gradient)))
if abs(np.sum(Your_backward - cal_gradient)) < 1e-4:
    print('Correct backward. Congratulations!')
else:
    print('Wrong backawrd. Please check your implementation again.')

Your gradient:  [[-0.80899972 -0.21857053 -0.25933788]]
Numerical gradient: [[-0.80899972 -0.21857053 -0.25933788]]
Error:  9.110101562015416e-11
Correct backward. Congratulations!


## Task 2: Non-Linear Activation
Implement the forward and backward functions for a nonlinear layer. Please read the requirement details for Task 2 in the code comment and in the pdf document.

In [4]:
class ReLU:
    #sigmoid layer
    def __init__(self):
        self.stored_X = None # Here we should store the input matrix X for Backward

    def forward(self, X):
        '''
        /*
         *  The input X matrix has the dimension [#samples, #features].
         *  The output Y matrix has the same dimension as the input X.
         *  You need to perform ReLU on each element of the input matrix to calculate the output matrix.
         *  TODO: 1) Create an output matrix by going through each element in input and calculate relu=max(0,x) and
         *  TODO: 2) Store the input X in self.stored_X for Backward.
         */
        '''
        
        ########## Code start  ##########
        self.stored_X = X
        Y = np.zeros(X.shape)
        
        # Loops through each element of the input X matrix and stores the max of the current element in X and 
        # 0 in the output Y matrix
        for i in range(X.shape[0]):
            for j in range(X.shape[1]):
                Y[i][j] = max(0, X[i][j])
        
        return Y
                
        ##########  Code end   ##########

    def backward(self, Y_grad):
        '''
         /*  grad_relu(x)=1 if relu(x)=x
         *  grad_relu(x)=0 if relu(x)=0
         *
         *  The input matrix has the name "output_grad." The name is confusing (it is actually the input of the function). But the name follows the convension in PyTorch.
         *  The output matrix has the same dimension as input.
         *  The output matrix is calculated as grad_relu(stored_X)*Y_grad.
         *  TODO: returns the output matrix calculated above
         */
        '''
        
        ########## Code start  ##########
        grad = np.zeros(Y_grad.shape)
        
        # Loops through all elements of the input X matrix and stores the value of the element in Y_grad only
        # if its value is greater than 0 in the output grad matrix
        for i in range(self.stored_X.shape[0]):
            for j in range(self.stored_X.shape[1]):
                if (self.stored_X[i][j] > 0):
                    grad[i][j] = Y_grad[i][j]
                else:
                    grad[i][j] = 0  
        
        return grad

        ##########  Code end   ##########

## Checkpoint 2: ReLU 
Check your ReLU forward and backward functions 

In [5]:
#gradient check
import copy
#Random test
n = 3
Y_grad = np.random.rand(1, n)
test_vector = np.random.rand(1, n)
DELTA = 1e-6
test_layer = ReLU()

test_layer_1 = copy.deepcopy(test_layer)
test_layer_2 = copy.deepcopy(test_layer)

test_layer.forward(test_vector)
Your_backward = test_layer.backward(Y_grad)

cal_gradient = np.zeros((np.shape(test_vector)[0], np.shape(test_vector)[1]))
for t_p in range(np.shape(test_vector)[0]):
    for i in range(np.shape(test_vector)[1]):
        test_vector_1 = copy.deepcopy(test_vector)
        test_vector_2 = copy.deepcopy(test_vector)
        test_vector_1[t_p][i] = test_vector_1[t_p][i] + DELTA
        test_vector_2[t_p][i] = test_vector_2[t_p][i] - DELTA

        cal_gradient[t_p][i] = np.sum(
            (np.dot(Y_grad, np.transpose(test_layer_1.forward(test_vector_1) - test_layer_2.forward(test_vector_2))/(2*DELTA))))


print('Your gradient: ',Your_backward)
print('Numerical gradient:',cal_gradient)
print('Error: ',abs(np.sum(Your_backward - cal_gradient)))
if abs(np.sum(Your_backward - cal_gradient)) < 1e-4:
    print('Correct backward. Congratulations!')
else:
    print('Wrong backawrd. Please check your implementation again.')

Your gradient:  [[0.27021522 0.86796804 0.12691432]]
Numerical gradient: [[0.27021522 0.86796804 0.12691432]]
Error:  1.785624426098309e-11
Correct backward. Congratulations!


## Task 3: Loss Function
Implement the MSE loss function and its backward derivative. Please read the requirement details for Task 3 in the code comment and in the pdf document. 

In [6]:
class MSELoss:
    # cross entropy loss
    # return the mse loss mean(y_j-y_pred_i)^2
    
    def __init__(self):
        self.stored_diff = None
    def forward(self, prediction, groundtruth):
        '''
        /*  TODO: 1) Calculate stored_data=pred-truth
         *  TODO: 2) Calculate the MSE loss as the squared sum of all the elements in the stored_data divided by the number of elements, i.e., MSE(pred, truth) = ||pred-truth||^2 / N, with N as the total number of elements in the matrix
         */
        '''
        
        ########## Code start  ##########
        # Stores the difference between the prediction and ground truth 
        self.stored_diff = prediction - groundtruth
        
        # Finds sum of squares of difference between prediction and ground truth
        sumSquare = np.sum(np.square(prediction - groundtruth))
        
        # Divide sum by number of samples 
        return sumSquare/prediction.shape[0]
        ##########  Code end   ##########

    # return the gradient of the input data
    def backward(self):
        '''
        /* TODO: return the gradient matrix of the MSE loss
         * The output matrix has the same dimension as the stored_data (make sure you have stored the (pred-truth) in stored_data in your forward function!)
         * Each element (i,j) of the output matrix is calculated as grad(i,j)=2(pred(i,j)-truth(i,j))/N
         */
        '''
        
        ########## Code start  ##########
        # Returns 2 times the difference between the prediction and ground truth divided by number of samples
        return (2/self.stored_diff.shape[0]) * self.stored_diff
        ##########  Code end   ##########

## Task 4: Network Architecture
Implement your own neural network architecture. Please read the requirement for Task 4 in the pdf document.

In [7]:
class Network:
    def __init__(self, layers_arch):
        '''
        /*  TODO: 1) Initialize the array for input layers with the proper feature sizes specified in the input vector.
         * For the linear layer, in each pair (in_size, out_size), the in_size is the feature size of the previous layer and the out_size is the feature size of the output (that goes to the next layer)
         * In the linear layer, the weight should have the shape (in_size, out_size).
         
         *  For example, if layers_arch = [['Linear', (256, 128)], ['ReLU'], ['Linear', (128, 64)], ['ReLU'], ['Linear', (64, 32)]],
       * 							 then there are three linear layers whose weights are with shapes (256, 128), (128, 64), (64, 32),
       * 							 and there are two non-linear layers.
         *  Attention: * The output feature size of the linear layer i should always equal to the input feature size of the linear layer i+1.
       */
        '''
       
        ########## Code start  ##########
        self.layers = []

        # Loops through all layers in the network architecture and stores each layer as a linear 
        # or ReLU layer in the Network's layers array
        for layer in layers_arch:
            if layer[0] == 'Linear':
                self.layers.append(LinearLayer(layer[1][0], layer[1][1]))
            else:
                self.layers.append(ReLU())
            
        ##########  Code end   ##########
        
    def forward(self, X):
        '''
        /*
         * TODO: propagate the input data for the first linear layer throught all the layers in the network and return the output of the last linear layer.
         * For implementation, you need to write a for-loop to propagate the input from the first layer to the last layer (before the loss function) by going through the forward functions of all the layers.
         * For example, for a network with k linear layers and k-1 activation layers, the data flow is:
         * linear[0] -> activation[0] -> linear[1] ->activation[1] -> ... -> linear[k-2] -> activation[k-2] -> linear[k-1]
         */
        '''
        
        ########## Code start  ##########
        
        curr_x = X
        
        # Loops through all layers and performs the forward method on them
        for i in range(len(self.layers)):
            curr_x = self.layers[i].forward(curr_x)
        
        return curr_x

        ##########  Code end   ##########

    def backward(self, Y_grad):
        '''
        /* Propagate the gradient from the last layer to the first layer by going through the backward functions of all the layers.
         * TODO: propagate the gradient of the output (we got from the Forward method) back throught the network and return the gradient of the first layer.

         * Notice: We should use the chain rule for the backward.
         * Notice: The order is opposite to the forward.
         */
        '''
        
        ########## Code start  ##########
        curr_y_grad = Y_grad
        
        # Loops through all layers and performs the backward method on them
        for i in range(len(self.layers)-1, -1, -1):
            curr_y_grad = self.layers[i].backward(curr_y_grad)
        
        return curr_y_grad
            
        ##########  Code end   ##########

## Checkpoint 3: Regression Network
Check your network implementation with a simple regression task. Here we also provide you a sample implementation for the gradient descent algorithm, which you will find useful for your own Classifier implementation.  

In [8]:
class Regressor:
    #Classifier
    def __init__(self, layers_arch, data_function, learning_rate = 1e-3, batch_size = 32, max_epoch = 200):

        input_feature_size = 2
        output_feature_size = 2

        self.train_data = []
        self.train_label = []
        self.test_data = []
        self.test_label = []

        self.data_function = data_function
        
        self.layers_arch = layers_arch
        self.net = Network(layers_arch)
        self.loss_function = MSELoss()

        self.max_epoch = max_epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate

    def dataloader(self):
        
        '''
        We randomly generate the mapping: (x)->(x^3+x^2 + 1)
        '''
        self.train_data = np.zeros((1000,1))
        self.train_label = np.zeros((1000, 1))

        for i in range(1000):
            self.train_data[i][0] = np.random.uniform(low=0.0, high=10000.0)/10000.0
            self.train_label[i][0] = self.data_function(self.train_data[i][0])

        self.test_data = np.zeros((200, 1))
        self.test_label = np.zeros((200, 1))

        for i in range(200):
            self.test_data[i][0] = np.random.uniform(low=-0.0, high=10000.0) / 10000.0
            self.test_label[i][0] = self.data_function(self.test_data[i][0])



    def Train_One_Epoch(self):
        '''
        Here we train the network using gradient descent
        '''
        loss = 0
        n_loop = int(math.ceil(len(self.train_data)/self.batch_size))
    

        for i in range(n_loop):
            batch_data = self.train_data[i * self.batch_size : (i+1)*self.batch_size]
            batch_label = self.train_label[i * self.batch_size : (i+1)*self.batch_size]
            
            '''
            /*  Forward the data to the network.
             *  Forward the result to the loss function.
             *  Backward.
             *  Update the weights with weight gradients.
             *  Do not forget the learning rate!
             */
            '''
            
            ########## Sample code  ##########
            prediction = self.net.forward(batch_data)
            loss += self.loss_function.forward(prediction, batch_label)

            pred_grad = self.loss_function.backward()
            self.net.backward(pred_grad)
            
            
            for i in range(len(self.layers_arch)):
                if self.layers_arch[i][0] == 'Linear':
                    self.net.layers[i].W -= self.net.layers[i].W_grad * self.learning_rate
            ##########  Sample code ##########
            
        return loss/n_loop

    def Test(self):
        prediction = self.net.forward(self.test_data)
        loss = self.loss_function.forward(prediction, self.test_label)
        return loss

    def Train(self):
        self.dataloader()
        for i in range(self.max_epoch):
            train_loss = self.Train_One_Epoch()
            test_loss = self.Test()
            print("Epoch: ", str(i+1), "/", str(self.max_epoch), " | Train loss: ", train_loss, " | Test loss : ", test_loss)


In [9]:
train_data_path = './MNIST_Sub/train_data.txt'
train_labels_path = './MNIST_Sub/train_labels.txt'
test_data_path = './MNIST_Sub/test_data.txt'
test_labels_path = './MNIST_Sub/test_labels.txt'

# regressor
regressor_layers_arch = [['Linear', (1, 16)], ['ReLU'], ['Linear', (16, 16)], ['ReLU'], ['Linear', (16, 1)]]
def data_function(x):
    return np.power(x,3) + pow(x,2) + 1
regressor = Regressor(regressor_layers_arch, data_function, learning_rate = 1e-4, batch_size = 32, max_epoch = 200)
regressor.Train()

regressor.Test()

Epoch:  1 / 200  | Train loss:  1.523359006708903  | Test loss :  1.2215882192986363
Epoch:  2 / 200  | Train loss:  1.2015056555672097  | Test loss :  0.9684808481817379
Epoch:  3 / 200  | Train loss:  0.9520166529069569  | Test loss :  0.772296936829498
Epoch:  4 / 200  | Train loss:  0.7589163241839182  | Test loss :  0.6204913878534434
Epoch:  5 / 200  | Train loss:  0.6097503702825209  | Test loss :  0.5032582754276054
Epoch:  6 / 200  | Train loss:  0.4947730870569282  | Test loss :  0.4129151477065187
Epoch:  7 / 200  | Train loss:  0.406348960058301  | Test loss :  0.343442021278679
Epoch:  8 / 200  | Train loss:  0.33849909309312287  | Test loss :  0.290127457942416
Epoch:  9 / 200  | Train loss:  0.28654993338277795  | Test loss :  0.2492920566881641
Epoch:  10 / 200  | Train loss:  0.2468573112115636  | Test loss :  0.2180700879495136
Epoch:  11 / 200  | Train loss:  0.2165878748052055  | Test loss :  0.19423610898568533
Epoch:  12 / 200  | Train loss:  0.19354538583049755  

Epoch:  95 / 200  | Train loss:  0.12156990942678593  | Test loss :  0.11830223639411319
Epoch:  96 / 200  | Train loss:  0.12156990935610197  | Test loss :  0.11830223433638676
Epoch:  97 / 200  | Train loss:  0.12156990929496429  | Test loss :  0.11830223255372677
Epoch:  98 / 200  | Train loss:  0.12156990924207199  | Test loss :  0.11830223100935475
Epoch:  99 / 200  | Train loss:  0.12156990919630446  | Test loss :  0.118302229671412
Epoch:  100 / 200  | Train loss:  0.12156990915669537  | Test loss :  0.11830222851230097
Epoch:  101 / 200  | Train loss:  0.12156990912241106  | Test loss :  0.11830222750811487
Epoch:  102 / 200  | Train loss:  0.12156990909273208  | Test loss :  0.11830222663814378
Epoch:  103 / 200  | Train loss:  0.12156990906703696  | Test loss :  0.11830222588444694
Epoch:  104 / 200  | Train loss:  0.1215699090447888  | Test loss :  0.11830222523148227
Epoch:  105 / 200  | Train loss:  0.12156990902552366  | Test loss :  0.11830222466578565
Epoch:  106 / 200 

Epoch:  190 / 200  | Train loss:  0.12156990890065733  | Test loss :  0.1183022209983772
Epoch:  191 / 200  | Train loss:  0.1215699089006547  | Test loss :  0.11830222099837176
Epoch:  192 / 200  | Train loss:  0.12156990890065208  | Test loss :  0.11830222099836661
Epoch:  193 / 200  | Train loss:  0.1215699089006495  | Test loss :  0.11830222099836178
Epoch:  194 / 200  | Train loss:  0.1215699089006469  | Test loss :  0.11830222099835719
Epoch:  195 / 200  | Train loss:  0.1215699089006443  | Test loss :  0.11830222099835282
Epoch:  196 / 200  | Train loss:  0.12156990890064165  | Test loss :  0.11830222099834863
Epoch:  197 / 200  | Train loss:  0.12156990890063911  | Test loss :  0.11830222099834461
Epoch:  198 / 200  | Train loss:  0.12156990890063651  | Test loss :  0.11830222099834073
Epoch:  199 / 200  | Train loss:  0.12156990890063392  | Test loss :  0.11830222099833698
Epoch:  200 / 200  | Train loss:  0.12156990890063134  | Test loss :  0.1183022209983333


0.1183022209983333

## Task 5: Classfication Network
Implement your own classifier with gradient descent. Please read the requirement for Task 5 in the pdf document.

In [10]:
def One_Hot_Encode(labels, classes = 10):
    '''
    /*  Make the labels one-hot.
     *  For example, if there are 5 classes {0, 1, 2, 3, 4} then
     *  [0, 2, 4] -> [[1, 0, 0, 0, 0],
     * 								[0, 0, 1, 0, 0],
     * 								[0, 0, 0, 0, 1]]
     */
    '''
    
    ########## Code start  ##########
    # Creates the classifier matrix that is number of labels by number of classes
    class_matrix = np.zeros((len(labels), classes))
    
    # Loops through each label and makes the current index in the classifier matrix
    # a 1 to properly classify each label
    for i in range(len(labels)):
        class_matrix[i][labels[i]] = 1
    
    return class_matrix
    ##########  Code end   ##########

In [11]:
class Classifier:
    #Classifier
    def __init__(self, train_data_path, train_labels_path, test_data_path, test_labels_path, layers_arch, learning_rate = 1e-3, batch_size = 32, max_epoch = 200, classes = 10):
        self.classes = classes

        self.train_data_path = train_data_path
        self.train_labels_path = train_labels_path
        self.test_data_path = test_data_path
        self.test_labels_path = test_labels_path


        self.train_data = [] #The shape of train data should be (n_samples,28^2)
        self.train_labels = []
        self.test_data = []
        self.test_labels = []
        
        self.layers_arch = layers_arch
        self.net = Network(layers_arch)
        self.loss_function = MSELoss()

        self.max_epoch = max_epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate


    def dataloader(self):

        with open(self.train_data_path, "r") as f:
            for line in f:
                self.train_data.append(np.array(line.strip().split()).astype(np.float64)/255.0)
        self.train_data = np.array(self.train_data)

        with open(self.train_labels_path, "r") as f:
            for line in f:
                self.train_labels.append(int(line.strip()))
        self.train_labels = np.array(self.train_labels)

        with open(self.test_data_path, "r") as f:
            for line in f:
                self.test_data.append(np.array(line.strip().split()).astype(np.float64)/255.0)
        self.test_data = np.array(self.test_data)

        with open(self.test_labels_path, "r") as f:
            for line in f:
                self.test_labels.append(int(line.strip()))
        self.test_labels = np.array(self.test_labels)


    def Train_One_Epoch(self):
        '''
        Here we train the network using gradient descent
        '''
        loss = 0
        n_loop = int(math.ceil(len(self.train_data) / self.batch_size))
        for i in range(n_loop):
            batch_data = self.train_data[i * self.batch_size: (i + 1) * self.batch_size]
            batch_label = self.train_labels[i * self.batch_size: (i + 1) * self.batch_size]
            batch_one_hot_label = One_Hot_Encode(batch_label, classes = self.classes)
            
            '''
             /*  Forward the data to the network.
             *  Forward the result to the loss function.
             *  Backward.
             *  Update the weights with weight gradients.
             *  Do not forget the learning rate!
             */
            '''
            
            ########## Code start  ##########
            # Based off of the sample code provided in Checkpoint 3
            # Calculates the predicted value from the inputted batch of data with the forward method
            prediction = self.net.forward(batch_data)
            # Calculates and adds the loss from the prediction and true values to the loss accumulator
            loss += self.loss_function.forward(prediction, batch_one_hot_label)

            # Calculates the predicted gradient from the inputted batch of data with the backward method
            pred_grad = self.loss_function.backward()
            # Performs the backward method on the network with the predicted gradient
            self.net.backward(pred_grad)
            
            # Loops through each layer in the network's architecture and calculates the parameter
            # weights for each linear layer
            for i in range(len(self.layers_arch)):
                if self.layers_arch[i][0] == 'Linear':
                    self.net.layers[i].W -= self.net.layers[i].W_grad * self.learning_rate
            ##########  Code end   ##########
        
        return loss / n_loop

    def Test(self):
        '''
        the class with max score is our predicted label
        '''
        score = self.net.forward(self.test_data)
        accuracy = 0
        for i in range(np.shape(score)[0]):
            one_label_list = score[i].tolist()
            label_pred = one_label_list.index(max(one_label_list))
            if label_pred == self.test_labels[i]:
                accuracy = accuracy +1

        accuracy = accuracy/np.shape(score)[0]
        return accuracy

    def Train(self):
        self.dataloader()
        for i in range(self.max_epoch):
            loss = self.Train_One_Epoch()
            accuray = self.Test()
            print("Epoch: ", str(i+1), "/", str(self.max_epoch), " | Train loss: ", loss, " | Test Accuracy : ", accuray)



## Evaluation
That's it! Congratulations on finishing everything. Now try your network on MNIST!

In [12]:
train_data_path = './MNIST_Sub/train_data.txt'
train_labels_path = './MNIST_Sub/train_labels.txt'
test_data_path = './MNIST_Sub/test_data.txt'
test_labels_path = './MNIST_Sub/test_labels.txt'


#classifier
classifier_layers_arch = [['Linear', (28*28, 256)], ['ReLU'], ['Linear', (256, 10)]]
cls = Classifier(train_data_path, train_labels_path, test_data_path, test_labels_path, layers_arch = classifier_layers_arch, learning_rate = 0.01, batch_size = 32, max_epoch = 200)
cls.Train()
cls.Test()

Epoch:  1 / 200  | Train loss:  1.2142269194468032  | Test Accuracy :  0.46
Epoch:  2 / 200  | Train loss:  0.7613363660672452  | Test Accuracy :  0.57
Epoch:  3 / 200  | Train loss:  0.6293032932778765  | Test Accuracy :  0.665
Epoch:  4 / 200  | Train loss:  0.5549495099811881  | Test Accuracy :  0.715
Epoch:  5 / 200  | Train loss:  0.504091596786214  | Test Accuracy :  0.715
Epoch:  6 / 200  | Train loss:  0.4659019889015074  | Test Accuracy :  0.715
Epoch:  7 / 200  | Train loss:  0.43562967110705497  | Test Accuracy :  0.74
Epoch:  8 / 200  | Train loss:  0.41066264912035266  | Test Accuracy :  0.75
Epoch:  9 / 200  | Train loss:  0.3893637875099155  | Test Accuracy :  0.75
Epoch:  10 / 200  | Train loss:  0.3709858754287907  | Test Accuracy :  0.75
Epoch:  11 / 200  | Train loss:  0.35492586212334987  | Test Accuracy :  0.75
Epoch:  12 / 200  | Train loss:  0.34055643223733123  | Test Accuracy :  0.76
Epoch:  13 / 200  | Train loss:  0.327625544885862  | Test Accuracy :  0.76
Ep

Epoch:  106 / 200  | Train loss:  0.0713612091737407  | Test Accuracy :  0.865
Epoch:  107 / 200  | Train loss:  0.07061873238746992  | Test Accuracy :  0.865
Epoch:  108 / 200  | Train loss:  0.0699006816315759  | Test Accuracy :  0.865
Epoch:  109 / 200  | Train loss:  0.06920041746107887  | Test Accuracy :  0.865
Epoch:  110 / 200  | Train loss:  0.0685021539918214  | Test Accuracy :  0.865
Epoch:  111 / 200  | Train loss:  0.06781387854288813  | Test Accuracy :  0.865
Epoch:  112 / 200  | Train loss:  0.06714989616110159  | Test Accuracy :  0.865
Epoch:  113 / 200  | Train loss:  0.06648130319824001  | Test Accuracy :  0.865
Epoch:  114 / 200  | Train loss:  0.06581996920012713  | Test Accuracy :  0.865
Epoch:  115 / 200  | Train loss:  0.06517106664550901  | Test Accuracy :  0.865
Epoch:  116 / 200  | Train loss:  0.06453693892886735  | Test Accuracy :  0.865
Epoch:  117 / 200  | Train loss:  0.06390981505677566  | Test Accuracy :  0.86
Epoch:  118 / 200  | Train loss:  0.06328781

0.86