In [95]:
import numpy as np
import matplotlib.pyplot as plt
import autograd
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix


In [102]:
def ReLU(data):
    return np.maximum(data,0)

def sigmoid(data):
    return (1/(1+np.exp(-data)))

def softmax(data):
    return np.exp(data)/(np.exp(data).sum())

def loss_function(predicted, ground_truth):
    return ((predicted - ground_truth)**2).sum()/len(predicted)

def linear(data):
    return data

def ReLU_Deriv(data):
    new_data = data[data>0]=1
    new_data = new_data[data<0]=0
    return new_data

class neural_network():
    
    def __init__(self, n_layers, activation_list, hidden_units, data, loss, rseed=12):
        
        self.n_layers = n_layers
        self.activation_list = activation_list
        self.loss = loss
        self.bias = {}
        self.weights = {}
        self.data = data 
        self.hidden_units = hidden_units
        np.random.seed(rseed)
        self.derivative = {'weight':{},'bias':{}}
        
        for i in range(self.n_layers+1):
            self.bias[i] = np.random.randn(self.hidden_units[i])
            if i == 0:
                self.weights[i] = np.random.randn(len(self.data[0]), self.hidden_units[i])
            else:
                self.weights[i] = np.random.randn(self.hidden_units[i-1], self.hidden_units[i])
        
    def forward(self, data):
        
        for i in range(self.n_layers+1):
            if i==0:
                temp = np.dot(data,self.weights[i])
                if self.activation_list[i] == 'ReLU':
                    temp = ReLU(temp)
                elif self.activation_list[i]=='sigmoid':
                    temp = sigmoid(temp)
                else:
                    temp = linear(temp)
            else:
                temp = np.dot(temp, self.weights[i])
                if self.activation_list[i] =='ReLU':
                    temp = ReLU(temp)
                elif self.activation_list[i]=='sigmoid':
                    temp = sigmoid(temp)
                else:
                    temp = linear(temp)
        return temp
    
    def backpropagate(self, loss, y):
        derivative_store = []
        
        if self.activation_list[-1]=='sigmoid':
            fixed = [2/len(y)*(forward(self.data)-y)*forward(self.data)*(1-forward(self.data))]
        if self.activation_list[-1]=='linear':
            fixed = [2/len(y)*(forward(self.data)-y)]
        
        j=0
        for i in range(self.n_layers+1):
            index = self.n_layers-i
            curr_activation = activation_list[index]
            if curr_activation=='sigmoid':
                temp1 = forward(self.data,till_layer= index)*(1-forward(self.data,till_layer=index))
                self.derivative['weight'][index] = fixed[j]*forward(self.data, till_layer=index-1)
                self.derivative['bias'][index] = fixed[j]
                fixed.append(temp1*weights[index+1])
                
#                 temp1 = 2/len(y)*(y - forward(self.data,till_layer=index))
#                 temp2 = forward(self.data)*(1-forward(self.data,till_layer=index))
#                 temp3 = forward(self.data,till_layer=index-1)
#                 self.derivative['weight'][index] = temp1*temp2*temp3
            
#             if curr_activation=='linear':
#                 pass
            j+=1
                    
                    
            

In [103]:
nn = neural_network(2, {0:ReLU, 1:sigmoid, 2:linear}, [4,2,1], np.array([[1,1,1],[2,2,2]]), loss_function)

In [104]:
nn.forward(np.array([[2,2,2],[1,1,1]]))

array([[-2.31366411],
       [-1.15683205]])

In [105]:
loss_function(np.array([1,1,1]),np.array([1,2,2]))

0.6666666666666666

## Q1 (c)

Code for this part was majorly referred from:
* The GitHub Repository [here](https://github.com/dennybritz/nn-from-scratch) and [here](https://github.com/pangolulu/neural-network-from-scratch).
* The blogpost [here](https://jonathanweisbaerg.org/post/A%20Neural%20Network%20from%20Scratch%20-%20Part%201/).


In [106]:
def multi_class_loss_function(y, predicted):
    sum_loss = np.sum(np.multiply(y, np.log(predicted)))
    return sum_loss/len(y)

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
X, y = mnist["data"], mnist["target"]
X = X/255 


In [108]:

digits = 10
number_of_examples = y.shape[0]
y = y.reshape(1, number_of_examples)
Y_new = np.eye(digits)[y.astype('int64')]
Y_new = Y_new.T.reshape(digits, examples)
## Setting the number of train samples
m = 35000
m_test = X.shape[0] - m
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:,:m], Y_new[:,m:]

shuffle_index = np.random.permutation(m)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]
a = 10

In [109]:
n_x = X_train.shape[0]
n_h = 64
learning_rate = 1

W1 = np.random.randn(n_h, n_x)
b1 = np.zeros((n_h, 1))
W2 = np.random.randn(digits, n_h)
b2 = np.zeros((digits, 1))

X = X_train
Y = Y_train
cost = 0
for i in range(500):
    if (i%10)==0 and i!=0:
        print("Cost after ",i, " epochs", cost)
    
    Z1 = np.matmul(W1,X) + b1           ## FIRST LAYER OUTPUT
    A1 = sigmoid(Z1)                    ## FIRST LAYER ACTIVATION
    Z2 = np.matmul(W2,A1) + b2          ## SECOND LAYER INPUT
    A2 = np.exp(Z2) / np.sum(np.exp(Z2), axis=0) ## SECOND LAYER ACTIVATION

    cost = multi_class_loss_function(Y, A2)   ## THE COST FUNCTION

    dZ2 = A2-Y
    dW2 = (1./m) * np.matmul(dZ2, A1.T)   ## COMPUTING THE DERIVATIVES
    db2 = (1./m) * np.sum(dZ2, axis=1, keepdims=True)
    dA1 = np.matmul(W2.T, dZ2)            ## 
    dZ1 = dA1 * sigmoid(Z1) * (1 - sigmoid(Z1))
    dW1 = (1./m) * np.matmul(dZ1, X.T)
    db1 = (1./m) * np.sum(dZ1, axis=1, keepdims=True)
    W2 = W2 - learning_rate * dW2          ## Gradient Descent Update
    b2 = b2 - learning_rate * db2         ## Gradient Descent Update
    W1 = W1 - learning_rate * dW1         ## Gradient Descent Update
    b1 = b1 - learning_rate * db1          ## Gradient Descent Updates is here

Cost after  10  epochs -5644.2179493046915
Cost after  20  epochs -3412.152710987341
Cost after  30  epochs -2510.055502054724
Cost after  40  epochs -2114.4536530916475
Cost after  50  epochs -1871.2717479895175
Cost after  60  epochs -1701.501553642613
Cost after  70  epochs -1574.7095422877774
Cost after  80  epochs -1475.4416970874363
Cost after  90  epochs -1395.033959076589
Cost after  100  epochs -1328.244175377284
Cost after  110  epochs -1271.6728491126564
Cost after  120  epochs -1222.9892642885548
Cost after  130  epochs -1180.5307379866222
Cost after  140  epochs -1143.0748528807126
Cost after  150  epochs -1109.7013896108915
Cost after  160  epochs -1079.7047467036932
Cost after  170  epochs -1052.5356263916967
Cost after  180  epochs -1027.7605493090593
Cost after  190  epochs -1005.0330223831716
Cost after  200  epochs -984.0727521644812
Cost after  210  epochs -964.6505428159614
Cost after  220  epochs -946.5772072020964
Cost after  230  epochs -929.69528095693
Cost aft

## Below we can see the confusion matrix

In [110]:
Z1 = np.matmul(W1, X_test) + b1
A1 = sigmoid(Z1)
Z2 = np.matmul(W2, A1) + b2
A2 = np.exp(Z2) / np.sum(np.exp(Z2), axis=0)
predictions = np.argmax(A2, axis=0)
labels = np.argmax(Y_test, axis=0)
pd.DataFrame(confusion_matrix(predictions, labels))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,953,0,15,6,4,41,746,728,170,197
1,1,1116,9,10,3,11,145,492,631,106
2,4,8,942,28,15,41,2684,1461,1405,177
3,6,7,30,919,5,140,91,1380,1566,593
4,3,2,20,8,948,27,2370,2731,682,5429
5,13,2,16,39,7,1649,840,501,2371,456
6,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0


## The Test Error is below

In [111]:
print(classification_report(predictions, labels))

             precision    recall  f1-score   support

          0       0.97      0.33      0.50      2860
          1       0.98      0.44      0.61      2524
          2       0.91      0.14      0.24      6765
          3       0.91      0.19      0.32      4737
          4       0.97      0.08      0.14     12220
          5       0.86      0.28      0.42      5894
          6       0.00      0.00      0.00         0
          7       0.00      0.00      0.00         0
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0

avg / total       0.93      0.19      0.30     35000



  'recall', 'true', average, warn_for)


### We achieve 30 % accuracy using this neural network.