# Neural Networks

Table of contents

✔ Chapter 1.  Fully connected layer

To install further python libraries, type

`!pip install --target=$my_path [LIBRARY_NAME]`

# Chapter 1-1. Implement from scratch and train/test against fake data


In [None]:
# Base class
class Layer:
    """Common interface shared by every layer of the network.

    A layer remembers the last ``input`` it received and the ``output`` it
    produced; concrete layers override the two methods below.
    """

    def __init__(self):
        # Nothing has flowed through the layer yet.
        self.input = self.output = None

    def forward_pass(self, input):
        """Return the output Y of the layer for the input X (subclass hook)."""
        raise NotImplementedError

    def backward_propagation(self, output_error, learning_rate):
        """Propagate ``output_error`` backwards and update parameters (subclass hook)."""
        raise NotImplementedError

In [None]:
import numpy as np

# inherit from base class Layer
class FCLayer(Layer):
    """Fully connected layer computing ``Y = X . W + b``.

    ``weights`` has shape ``(input_size, output_size)`` and ``bias`` has shape
    ``(1, output_size)``; both start as uniform random values in [-0.5, 0.5).
    """

    # input_size = number of input neurons
    # output_size = number of output neurons
    def __init__(self, input_size, output_size):
        # Create the input/output caches declared by the base class.
        super().__init__()
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward_pass(self, input_data):
        """Cache ``input_data`` and return ``input_data . weights + bias``."""
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward_propagation(self, loss_d, learning_rate):
        """Apply one gradient-descent step and return the gradient for the previous layer.

        Args:
            loss_d: dJ/dz, the loss gradient with respect to this layer's
                output, with one row per sample passed to ``forward_pass``.
            learning_rate: step size of the parameter update.

        Returns:
            dJ/dx = ``loss_d . weights.T``, computed with the weights as they
            were before this update.
        """
        # Must be computed before the weights are modified below.
        next_input = np.dot(loss_d, self.weights.T)

        # dJ/dW; the matrix product sums the contributions of all rows.
        weights_error = np.dot(self.input.T, loss_d)
        # dJ/db; sum over rows so the result keeps the (1, output_size) bias
        # shape. For a single-row loss_d this is loss_d itself.
        bias_error = np.sum(loss_d, axis=0, keepdims=True)

        # update parameters
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return next_input

In [None]:
# inherit from base class Layer
class ActivationLayer(Layer):
    """Parameter-free layer that applies an activation function to its input."""

    def __init__(self, activation, activation_prime):
        # The activation function and its derivative, both supplied by the caller.
        self.activation = activation
        self.activation_prime = activation_prime

    def forward_pass(self, input_data):
        """Cache ``input_data`` and return ``activation(input_data)``."""
        self.input = input_data
        activated = self.activation(input_data)
        self.output = activated
        return activated

    def backward_propagation(self, loss_d, learning_rate):
        """Return ``activation_prime(input) * loss_d`` for the previous layer.

        ``learning_rate`` is accepted for interface compatibility only; this
        layer has no parameters to update.
        """
        local_gradient = self.activation_prime(self.input)
        return local_gradient * loss_d

In [None]:
import numpy as np

# activation function and its derivative
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    ``log(1 + exp(-x))`` is obtained from ``np.logaddexp(0, -x)``, which stays
    finite for any ``x``. The naive ``np.exp(-x)`` overflows (and emits a
    RuntimeWarning) for strongly negative inputs.
    """
    return np.exp(-np.logaddexp(0, -x))


def sigmoid_prime(x):
    """Derivative of the logistic function: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    s = sigmoid(x)  # evaluate the sigmoid once instead of twice
    return s * (1 - s)

def tanh(x):
    """Hyperbolic tangent activation (delegates to ``np.tanh``)."""
    activated = np.tanh(x)
    return activated

def tanh_prime(x):
    """Derivative of tanh: ``1 - tanh(x)**2``."""
    t = np.tanh(x)
    return 1 - t ** 2

In [None]:
import numpy as np

# loss function and its derivative
def mse(y_true, y_pred):
    """Mean squared error: the mean of ``(y_true - y_pred)**2`` over all elements."""
    residual = y_true - y_pred
    squared = np.power(residual, 2)
    return np.mean(squared)

def mse_prime(y_true, y_pred):
    """Gradient of ``mse`` with respect to ``y_pred``: ``2 * (y_pred - y_true) / N``.

    ``N`` is ``y_true.size``, so ``y_true`` must be a NumPy array.
    """
    n_elements = y_true.size
    return 2 * (y_pred - y_true) / n_elements

In [None]:
class Network:
    """Sequential stack of layers trained one sample at a time."""

    def __init__(self):
        self.layers = []
        # Loss function and its derivative; both are registered by compute_loss().
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def compute_loss(self, loss, loss_prime):
        """Register the loss function and its derivative used by fit()."""
        self.loss, self.loss_prime = loss, loss_prime

    def _forward(self, sample):
        """Feed one sample through every layer in order and return the final output."""
        activation = sample
        for layer in self.layers:
            activation = layer.forward_pass(activation)
        return activation

    def predict(self, input_data):
        """Return a list holding the network output for each entry of ``input_data``."""
        return [self._forward(input_data[idx]) for idx in range(len(input_data))]

    def fit(self, x_train, y_train, epochs, learning_rate):
        """Train for ``epochs`` passes over ``x_train``, printing the mean loss per pass.

        The layers are updated after every single sample: the loss derivative
        with respect to the network output is pushed backwards through the
        layers, each of which returns the gradient for the layer before it.
        """
        n_samples = len(x_train)

        for epoch in range(1, epochs + 1):
            total_loss = 0
            for idx in range(n_samples):
                target = y_train[idx]
                prediction = self._forward(x_train[idx])

                # Accumulated for display purposes only.
                total_loss += self.loss(target, prediction)

                # Gradient of the loss with respect to the last layer's output.
                gradient = self.loss_prime(target, prediction)
                for layer in reversed(self.layers):
                    gradient = layer.backward_propagation(gradient, learning_rate)

            # Mean loss over all samples of this pass.
            total_loss /= n_samples
            print('epoch %d/%d   error=%f' % (epoch, epochs, total_loss))

In [None]:
# XOR truth table: four samples, each stored as a (1, 2) row vector,
# with the matching (1, 1) targets.
x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 1, 2)
y_train = np.array([0, 1, 1, 0]).reshape(4, 1, 1)
print(x_train.shape, y_train.shape, sep='\n')


(4, 1, 2)
(4, 1, 1)


In [None]:
# 2-3-1 network: every dense layer is followed by an element-wise sigmoid.
net = Network()
layer_sizes = (2, 3, 1)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))  # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(sigmoid, sigmoid_prime))

# train with the MSE loss
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

# test on the training inputs
out = net.predict(x_train)
print(out)

epoch 1/1000   error=0.282530
epoch 2/1000   error=0.279044
epoch 3/1000   error=0.275926
epoch 4/1000   error=0.273159
epoch 5/1000   error=0.270718
epoch 6/1000   error=0.268577
epoch 7/1000   error=0.266710
epoch 8/1000   error=0.265088
epoch 9/1000   error=0.263686
epoch 10/1000   error=0.262477
epoch 11/1000   error=0.261438
epoch 12/1000   error=0.260547
epoch 13/1000   error=0.259786
epoch 14/1000   error=0.259136
epoch 15/1000   error=0.258582
epoch 16/1000   error=0.258110
epoch 17/1000   error=0.257709
epoch 18/1000   error=0.257369
epoch 19/1000   error=0.257080
epoch 20/1000   error=0.256834
epoch 21/1000   error=0.256627
epoch 22/1000   error=0.256451
epoch 23/1000   error=0.256301
epoch 24/1000   error=0.256175
epoch 25/1000   error=0.256068
epoch 26/1000   error=0.255977
epoch 27/1000   error=0.255901
epoch 28/1000   error=0.255836
epoch 29/1000   error=0.255781
epoch 30/1000   error=0.255734
epoch 31/1000   error=0.255694
epoch 32/1000   error=0.255661
epoch 33/1000   e

The error barely decreases with the sigmoid activation, so let's change the activation function to tanh.

In [None]:
# Same 2-3-1 network as before, with tanh replacing sigmoid as the activation.
net = Network()
layer_sizes = (2, 3, 1)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))  # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(tanh, tanh_prime))

# train with the MSE loss
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

# test on the training inputs
out = net.predict(x_train)
print(out)

epoch 1/1000   error=0.376126
epoch 2/1000   error=0.310075
epoch 3/1000   error=0.298690
epoch 4/1000   error=0.295070
epoch 5/1000   error=0.293468
epoch 6/1000   error=0.292574
epoch 7/1000   error=0.291978
epoch 8/1000   error=0.291526
epoch 9/1000   error=0.291149
epoch 10/1000   error=0.290816
epoch 11/1000   error=0.290511
epoch 12/1000   error=0.290224
epoch 13/1000   error=0.289952
epoch 14/1000   error=0.289692
epoch 15/1000   error=0.289442
epoch 16/1000   error=0.289201
epoch 17/1000   error=0.288969
epoch 18/1000   error=0.288746
epoch 19/1000   error=0.288531
epoch 20/1000   error=0.288323
epoch 21/1000   error=0.288122
epoch 22/1000   error=0.287929
epoch 23/1000   error=0.287742
epoch 24/1000   error=0.287563
epoch 25/1000   error=0.287389
epoch 26/1000   error=0.287222
epoch 27/1000   error=0.287061
epoch 28/1000   error=0.286906
epoch 29/1000   error=0.286756
epoch 30/1000   error=0.286611
epoch 31/1000   error=0.286472
epoch 32/1000   error=0.286338
epoch 33/1000   e

Now it works out!

# Chapter 1-2. train/test against MNIST data

Use MNIST dataset


*   Source: keras



In [None]:
from keras.datasets import mnist

try:
    from keras.utils import np_utils
except ImportError:
    # Recent Keras releases no longer provide ``keras.utils.np_utils``;
    # ``to_categorical`` is exposed on ``keras.utils`` itself. Alias the
    # package under the old name so ``np_utils.to_categorical(...)`` keeps working.
    import keras.utils as np_utils

# load MNIST as (images, labels) pairs for the train and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# Training data (60000 samples): flatten every 28x28 image into a (1, 784)
# float32 row vector.
x_train = x_train.reshape(len(x_train), 1, 28 * 28).astype('float32')
print(x_train[0])  # raw pixel values
x_train /= 255     # scale the pixel values in place
print(x_train[0])  # scaled pixel values
# One-hot encoding: turn each label in [0, 9] into a vector of size 10
# (e.g. 3 = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]).
print(y_train[0])
y_train = np_utils.to_categorical(y_train)
print(y_train[0])

[[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   3.  18.
   18.  18. 126. 136. 175.  26. 166. 255. 247. 127.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  30.  36.  94. 154. 170. 253.
  253. 253. 253. 253. 225. 172. 253. 242. 195.  64.   0.   0.   

In [None]:
# Test data: flatten to (1, 784) float32 row vectors, divide by 255,
# and one-hot encode the labels.
x_test = x_test.reshape(len(x_test), 1, 28 * 28).astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)

In [None]:
class Network:
    """Sequential stack of layers trained on a random subset of samples per epoch."""

    def __init__(self):
        self.layers = []
        # Loss function and its derivative; both are registered by compute_loss().
        self.loss = None
        self.loss_prime = None

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def compute_loss(self, loss, loss_prime):
        """Register the loss function and its derivative used by fit()."""
        self.loss = loss
        self.loss_prime = loss_prime

    def _forward(self, sample):
        """Feed one sample through every layer in order and return the final output."""
        output = sample
        for layer in self.layers:
            output = layer.forward_pass(output)
        return output

    def predict(self, input_data):
        """Return a list holding the network output for each sample.

        ``input_data`` is iterated along its first (sample) dimension.
        """
        return [self._forward(sample) for sample in input_data]

    def fit(self, x_train, y_train, epochs, learning_rate, batch_size):
        """Train the network and print the mean loss of every epoch.

        Each epoch shuffles the sample indices and visits the first
        ``batch_size`` of them (fewer when the data set is smaller). The
        layers are updated after every visited sample, not once per batch.

        Args:
            x_train: training inputs, indexed along the first dimension.
            y_train: training targets, indexed like ``x_train``.
            epochs: number of epochs to run.
            learning_rate: step size handed to every layer's update.
            batch_size: maximum number of samples visited per epoch.
        """
        indices = np.arange(len(x_train))

        for epoch in range(epochs):
            err = 0
            # Draw this epoch's random subset of samples.
            np.random.shuffle(indices)
            batch = indices[:batch_size]

            for j in batch:
                # forward pass; `a` is the output of the last layer
                a = self._forward(x_train[j])

                # compute loss (for display purpose only)
                err += self.loss(y_train[j], a)

                # backward propagation: start from the derivative of the loss
                # with respect to the network output, then let every layer
                # turn it into the gradient for the layer before it
                loss_d = self.loss_prime(y_train[j], a)
                for layer in reversed(self.layers):
                    loss_d = layer.backward_propagation(loss_d, learning_rate)

            # Average over the samples actually visited. Dividing by
            # batch_size would understate the error when batch_size exceeds
            # the data-set size and would divide by zero when it is 0.
            err /= max(len(batch), 1)
            print('epoch %d/%d   error=%f' % (epoch + 1, epochs, err))

In [None]:
# 784-100-50-10 network: every dense layer is followed by tanh.
net = Network()
layer_sizes = (28 * 28, 100, 50, 10)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))               # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(tanh, tanh_prime))

# Baseline run: MSE loss, 500 epochs, learning rate 0.1.
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=500, learning_rate=0.1, batch_size=256)


epoch 1/500   error=0.409538
epoch 2/500   error=0.187746
epoch 3/500   error=0.140968
epoch 4/500   error=0.132480
epoch 5/500   error=0.129564
epoch 6/500   error=0.113788
epoch 7/500   error=0.107492
epoch 8/500   error=0.103560
epoch 9/500   error=0.092148
epoch 10/500   error=0.103268
epoch 11/500   error=0.085633
epoch 12/500   error=0.084692
epoch 13/500   error=0.081322
epoch 14/500   error=0.074442
epoch 15/500   error=0.072270
epoch 16/500   error=0.079649
epoch 17/500   error=0.072777
epoch 18/500   error=0.075048
epoch 19/500   error=0.077887
epoch 20/500   error=0.065717
epoch 21/500   error=0.076405
epoch 22/500   error=0.077066
epoch 23/500   error=0.068500
epoch 24/500   error=0.060106
epoch 25/500   error=0.063347
epoch 26/500   error=0.061975
epoch 27/500   error=0.064382
epoch 28/500   error=0.061273
epoch 29/500   error=0.062822
epoch 30/500   error=0.061999
epoch 31/500   error=0.060352
epoch 32/500   error=0.070359
epoch 33/500   error=0.054681
epoch 34/500   erro

In [None]:
# Accuracy on the test set: a prediction is correct when its arg-max class
# matches the arg-max of the one-hot label.
out = net.predict(x_test)
tp = 0
for pred, true in zip(out, y_test):
    tp += int(np.argmax(pred) == np.argmax(true))
print(tp / len(x_test))

0.9283


# [Extra Credit] Could you improve the performance? 
Any strategy is acceptable, such as using a different activation function, learning rate, or loss function, adding more layers and neurons, or changing the number of epochs and the batch size.
**(Deadline:12/16 FRI 23:59)**



### Submitters are as follows.
 *Note that hands-on assignments can be done collaboratively (**up to 2 students**)*

    Name: Sohyun Doh
    Student ID: 2076120
    

TO-DO: Using the Network class provided above, improve the initial performance we got. Note that the model obtained 92.1% accuracy on average using five experiments (91.2%, 90.3%, 93.47%, 92.65%, 92.82%).
- Feel free to add and modify all the codes provided above
- Run the five experiments and report the average accuracy
- Discuss what changes/additions you made to improve the performance of the neural network model   

# Trial 1: Improving the model with more epochs
Fitting the model with 1000 epochs improves the accuracy.

In [None]:
# 784-100-50-10 network: every dense layer is followed by tanh.
net = Network()
layer_sizes = (28 * 28, 100, 50, 10)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))               # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(tanh, tanh_prime))

# Trial 1: MSE loss, 1000 epochs, learning rate 0.1.
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1, batch_size=256)

epoch 1/1000   error=0.428175
epoch 2/1000   error=0.185952
epoch 3/1000   error=0.143784
epoch 4/1000   error=0.117513
epoch 5/1000   error=0.115367
epoch 6/1000   error=0.108520
epoch 7/1000   error=0.098891
epoch 8/1000   error=0.101980
epoch 9/1000   error=0.101318
epoch 10/1000   error=0.089660
epoch 11/1000   error=0.089062
epoch 12/1000   error=0.097709
epoch 13/1000   error=0.091211
epoch 14/1000   error=0.081800
epoch 15/1000   error=0.081513
epoch 16/1000   error=0.076465
epoch 17/1000   error=0.083021
epoch 18/1000   error=0.072423
epoch 19/1000   error=0.073730
epoch 20/1000   error=0.076869
epoch 21/1000   error=0.075844
epoch 22/1000   error=0.061372
epoch 23/1000   error=0.066383
epoch 24/1000   error=0.071594
epoch 25/1000   error=0.069095
epoch 26/1000   error=0.058309
epoch 27/1000   error=0.061926
epoch 28/1000   error=0.062925
epoch 29/1000   error=0.061375
epoch 30/1000   error=0.054224
epoch 31/1000   error=0.063546
epoch 32/1000   error=0.058415
epoch 33/1000   e

In [None]:
# Accuracy on the test set: a prediction is correct when its arg-max class
# matches the arg-max of the one-hot label.
out = net.predict(x_test)
tp = 0
for pred, true in zip(out, y_test):
    tp += int(np.argmax(pred) == np.argmax(true))
print(tp / len(x_test))

0.9433


# Trial 2: Improving the model with higher learning rate
Fitting the model with 1000 epochs and a learning rate of 0.2 improves the accuracy.

In [None]:
# 784-100-50-10 network: every dense layer is followed by tanh.
net = Network()
layer_sizes = (28 * 28, 100, 50, 10)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))               # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(tanh, tanh_prime))

# Trial 2: MSE loss, 1000 epochs, learning rate 0.2.
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.2, batch_size=256)

epoch 1/1000   error=0.416550
epoch 2/1000   error=0.285837
epoch 3/1000   error=0.221027
epoch 4/1000   error=0.215281
epoch 5/1000   error=0.202393
epoch 6/1000   error=0.177308
epoch 7/1000   error=0.169388
epoch 8/1000   error=0.174669
epoch 9/1000   error=0.150390
epoch 10/1000   error=0.152699
epoch 11/1000   error=0.140484
epoch 12/1000   error=0.137074
epoch 13/1000   error=0.140108
epoch 14/1000   error=0.124974
epoch 15/1000   error=0.126611
epoch 16/1000   error=0.129643
epoch 17/1000   error=0.126919
epoch 18/1000   error=0.119810
epoch 19/1000   error=0.102446
epoch 20/1000   error=0.113340
epoch 21/1000   error=0.118381
epoch 22/1000   error=0.100412
epoch 23/1000   error=0.106275
epoch 24/1000   error=0.117212
epoch 25/1000   error=0.114046
epoch 26/1000   error=0.121646
epoch 27/1000   error=0.096014
epoch 28/1000   error=0.115796
epoch 29/1000   error=0.102292
epoch 30/1000   error=0.085605
epoch 31/1000   error=0.098602
epoch 32/1000   error=0.095363
epoch 33/1000   e

In [None]:
# Accuracy on the test set: a prediction is correct when its arg-max class
# matches the arg-max of the one-hot label.
out = net.predict(x_test)
tp = 0
for pred, true in zip(out, y_test):
    tp += int(np.argmax(pred) == np.argmax(true))
print(tp / len(x_test))

0.9459


# Trial 3: Adding one more layer
Fit the model with 1000 epochs, a learning rate of 0.2, and one more layer to try to improve the accuracy.

## Discussion
I think overfitting occurs, so the score is lower than in the previous experiment (Trial 2).

In [None]:
# 784-200-100-50-10 network (one more dense layer than the earlier trials):
# every dense layer is followed by tanh.
net = Network()
layer_sizes = (28 * 28, 200, 100, 50, 10)
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    net.add(FCLayer(n_in, n_out))               # input (1, n_in), output (1, n_out)
    net.add(ActivationLayer(tanh, tanh_prime))

# Trial 3: MSE loss, 1000 epochs, learning rate 0.2.
net.compute_loss(mse, mse_prime)
net.fit(x_train, y_train, epochs=1000, learning_rate=0.2, batch_size=256)

epoch 1/1000   error=0.438660
epoch 2/1000   error=0.302680
epoch 3/1000   error=0.256481
epoch 4/1000   error=0.261980
epoch 5/1000   error=0.213331
epoch 6/1000   error=0.221133
epoch 7/1000   error=0.203850
epoch 8/1000   error=0.187652
epoch 9/1000   error=0.184678
epoch 10/1000   error=0.178766
epoch 11/1000   error=0.189099
epoch 12/1000   error=0.193911
epoch 13/1000   error=0.185683
epoch 14/1000   error=0.161869
epoch 15/1000   error=0.159003
epoch 16/1000   error=0.165341
epoch 17/1000   error=0.139382
epoch 18/1000   error=0.137835
epoch 19/1000   error=0.151826
epoch 20/1000   error=0.134837
epoch 21/1000   error=0.127329
epoch 22/1000   error=0.140768
epoch 23/1000   error=0.136875
epoch 24/1000   error=0.116936
epoch 25/1000   error=0.109895
epoch 26/1000   error=0.138236
epoch 27/1000   error=0.122435
epoch 28/1000   error=0.113350
epoch 29/1000   error=0.124318
epoch 30/1000   error=0.123441
epoch 31/1000   error=0.108161
epoch 32/1000   error=0.098229
epoch 33/1000   e

In [None]:
# Accuracy on the test set: a prediction is correct when its arg-max class
# matches the arg-max of the one-hot label.
out = net.predict(x_test)
tp = 0
for pred, true in zip(out, y_test):
    tp += int(np.argmax(pred) == np.argmax(true))
print(tp / len(x_test))

0.9349


# Report
I learned from these experiments that the learning rate and the number of training epochs both contribute to improving the accuracy of the model.

I was also surprised that the accuracy went down when I added one more layer, which seems to be due to overfitting.


However, the accuracy varied between repeated runs, so I don't think it's because of overfitting.

With the second trial's settings, the accuracy was **94.72** in the first run and **93.56** in the second run. When I ran the model with more neurons again, I got an accuracy of **93.49**.

Therefore, I decided to report the results of the second model, which scored higher than the third model.


## Finally

The average accuracy over the five experiments is 94.562%.

This is a clear improvement over the baseline trained with 500 epochs and a learning rate of 0.1.

Experiment 1: 94.72

Experiment 2: 93.56

Experiment 3: 95.04

Experiment 4: 94.9

Experiment 5: 94.59