In [38]:
import numpy as np
import pandas as pd

In [39]:
# Load the MNIST training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/mnist-in-csv/mnist_train.csv')
data.head()

In [40]:
# Work on a plain ndarray; m is the number of rows, n the number of columns.
data = np.array(data)
m, n = data.shape

# Split the rows into two subsets. After transposing, each sample is a column:
# row 0 of the transpose is taken as the label, the remaining rows as pixels.
# NOTE(review): the first 1000 rows are the ones named "train" and all
# remaining rows are named "test" -- confirm this naming is intended.
train_data = data[:1000]
y_train = train_data.T[:1]
x_train = train_data.T[1:] / 255  # rescale pixel values to the 0-1 range

test_data = data[1000:]
y_test = test_data.T[:1]
x_test = test_data.T[1:] / 255  # rescale pixel values to the 0-1 range


***Activation functions***

In [41]:
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified

def deriv_relu(z):
    """Derivative of ReLU as a boolean mask: True where ``z`` is strictly positive."""
    return 0 < z

def soft_max(z):
    """Column-wise softmax of ``z``.

    Parameters
    ----------
    z : numpy.ndarray
        Scores. Normalisation runs along axis 0, so for a 2-D array every
        column is turned into a probability distribution.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``z`` whose entries along axis 0 sum to 1.
    """
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and then inf/inf = nan)
    # when the scores are large.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

***Network functions: parameter initialisation, forward/backward propagation and parameter updates***


In [56]:
def param_init():
    """Create random starting weights and biases for the two-layer network.

    Reads the module-level ``n`` (column count of ``data``); the first layer
    takes ``n - 1`` inputs. Both layers have 10 units.

    Returns
    -------
    tuple
        ``(w1, b1, w2, b2)`` with shapes ``(10, n-1)``, ``(10, 1)``,
        ``(10, 10)`` and ``(10, 1)``.
    """
    def centred_uniform(rows, cols):
        # np.random.rand draws from [0, 1); shifting by 0.5 centres it on zero.
        return np.random.rand(rows, cols) - 0.5

    n_inputs = n - 1
    # Draw order (w1, b1, w2, b2) is kept so a seeded run yields the same values.
    w1 = centred_uniform(10, n_inputs)
    b1 = centred_uniform(10, 1)
    w2 = centred_uniform(10, 10)
    b2 = centred_uniform(10, 1)
    return w1, b1, w2, b2

def forward_prop(w1,b1,w2,b2,X):
    """Run one forward pass through the two-layer network.

    The hidden layer applies ``relu`` to ``w1 . X + b1``; the output layer
    applies ``soft_max`` to ``w2 . a1 + b2``.

    Returns
    -------
    tuple
        ``(z1, a1, z2, a2)``: pre-activation and activation of the hidden
        layer, followed by pre-activation and activation of the output layer.
    """
    hidden_pre = np.dot(w1, X) + b1
    hidden_act = relu(hidden_pre)

    output_pre = np.dot(w2, hidden_act) + b2
    output_act = soft_max(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act

def one_hot(y, num_classes=10):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class labels. Any shape is accepted (for example ``(1, N)`` or
        ``(N,)``); the labels are read in flattened order.
    num_classes : int, optional
        Number of classes, i.e. rows of the result. Defaults to 10.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_classes, N)``; column ``i`` holds a 1 in
        row ``y[i]`` and 0 elsewhere.

    Raises
    ------
    IndexError
        If a label is not a valid index into ``num_classes`` rows.
    """
    labels = np.asarray(y).reshape(-1)
    encoded = np.zeros((num_classes, labels.size))
    # One vectorised assignment instead of a Python loop over every sample.
    encoded[labels, np.arange(labels.size)] = 1
    return encoded
    

def backward_prop(X,Y,a1,a2,w1,w2,z1,z2):
    """Compute parameter gradients for the two-layer network on one batch.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs with one sample per column.
    Y : numpy.ndarray
        Integer labels, passed to ``one_hot``.
    a1, a2 : numpy.ndarray
        Hidden and output activations from the forward pass.
    w1, w2 : numpy.ndarray
        Layer weights. Only ``w2`` is used; ``w1`` is kept for interface
        compatibility.
    z1, z2 : numpy.ndarray
        Layer pre-activations. Only ``z1`` is used; ``z2`` is kept for
        interface compatibility.

    Returns
    -------
    tuple
        ``(dw1, db1, dw2, db2)``; each gradient has the same shape as the
        parameter it belongs to.
    """
    # Average over the samples actually present in this batch rather than
    # over a module-level row count of the whole dataset.
    n_samples = X.shape[1]

    y_actual = one_hot(Y)
    # For a softmax output with cross-entropy loss, dL/dz2 is a2 - y.
    dz2 = a2 - y_actual
    dw2 = np.dot(dz2, a1.T) / n_samples
    # Reduce across samples (axis=1) and keep the column shape so the bias
    # gradient matches the (units, 1) bias; the builtin sum() would instead
    # add up the rows, i.e. reduce across units.
    db2 = np.sum(dz2, axis=1, keepdims=True) / n_samples

    dz1 = np.dot(w2.T, dz2) * deriv_relu(z1)
    dw1 = np.dot(dz1, X.T) / n_samples
    db1 = np.sum(dz1, axis=1, keepdims=True) / n_samples

    return dw1, db1, dw2, db2


def update_param(w1,dw1,w2,dw2,b1,db1,b2,db2,alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter ``p`` with gradient ``dp`` becomes ``p - alpha*dp``.
    The inputs are not modified; new values are returned.

    Returns
    -------
    tuple
        The updated ``(w1, b1, w2, b2)``.
    """
    def stepped(param, grad):
        return param - alpha*grad

    return (
        stepped(w1, dw1),
        stepped(b1, db1),
        stepped(w2, dw2),
        stepped(b2, db2),
    )
    

***Gradient Descent***

In [61]:
def get_predictions(a2):
    """Return, for each column of ``a2``, the row index of its largest entry."""
    predicted_classes = np.argmax(a2, axis=0)
    return predicted_classes

def get_accuracy(predictions, Y):
    """Fraction of predictions that equal the corresponding label.

    Parameters
    ----------
    predictions : array-like
        Predicted class per sample.
    Y : array-like
        True class per sample. Any shape holding the same number of elements
        as ``predictions`` is accepted (e.g. ``(1, N)``, ``(N,)``, ``(N, 1)``).

    Returns
    -------
    float
        Number of matching elements divided by the number of labels.

    Raises
    ------
    ValueError
        If ``predictions`` and ``Y`` hold a different number of elements.
    """
    # Flatten both sides so the comparison is strictly element-by-element;
    # comparing an (N,) array with an (N, 1) array would otherwise broadcast
    # to (N, N) and miscount the matches.
    flat_predictions = np.asarray(predictions).reshape(-1)
    flat_labels = np.asarray(Y).reshape(-1)
    if flat_predictions.size != flat_labels.size:
        raise ValueError(
            "predictions and Y must contain the same number of elements"
        )
    return np.sum(flat_predictions == flat_labels) / flat_labels.size

def gradient_descent(X,Y,alpha,epochs):
    """Train the two-layer network with full-batch gradient descent.

    Starts from ``param_init()`` and repeats forward pass, backward pass and
    parameter update ``epochs`` times with learning rate ``alpha``. On every
    10th iteration (starting at 0) the iteration number and the accuracy on
    ``(X, Y)`` are printed; that accuracy comes from the activations computed
    before the iteration's update was applied.

    Returns
    -------
    tuple
        The final ``(w1, b1, w2, b2)``.
    """
    params = param_init()
    for step in range(epochs):
        w1, b1, w2, b2 = params
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        dw1, db1, dw2, db2 = backward_prop(X, Y, a1, a2, w1, w2, z1, z2)
        params = update_param(w1, dw1, w2, dw2, b1, db1, b2, db2, alpha)

        if step % 10 != 0:
            continue
        print("Iteration: ", step)
        print(get_accuracy(get_predictions(a2), Y))

    w1, b1, w2, b2 = params
    return w1, b1, w2, b2

In [62]:
# Train for 500 iterations with a learning rate of 0.10.
# NOTE(review): the arrays passed here are x_test / y_test (rows 1000 onward
# of the CSV), not x_train / y_train -- confirm this is intended.
W1, b1, W2, b2 = gradient_descent(X=x_test, Y=y_test, alpha=0.10, epochs=500)