In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist

In [2]:
def relu(Z): #relu func for normalization at the hidden layers
    return np.maximum(0, Z)

def softmax(Z): #softmax func for normalization at the final step
    return np.exp(Z) / np.sum(np.exp(Z))

In [3]:
def init_params(input_size, hidden_size1, hidden_size2, output_size):
    w1 = np.random.randn(input_size, hidden_size1) * 0.01
    b1 = np.zeros((1, hidden_size1))
    w2 = np.random.randn(hidden_size1,hidden_size2) * 0.01
    b2 = np.zeros((1, hidden_size2))
    w3 = np.random.randn(hidden_size2, output_size) * 0.01
    b3 = np.zeros((1,output_size))
    return w1,b1,w2,b2,w3,b3


In [4]:
def foward_prop(x,w1, b1, w2, b2, w3, b3):
    z1 = np.dot(x, w1) + b1
    a1 = relu(z1)
    z2 = np.dot(z1, w2) + b2
    a2 = relu(z2)
    z3 = np.dot(z2, w3) + b3
    a3 = softmax(z3)
    return a1,a2,a3,z1,z2,z3

In [5]:
def deriv_reLu(y):
    return y > 0

def back_prop(x,y,a1,a2,a3,w2,w3,z1,z2):
    m = y.shape[0]
    dC3 = a3 - y
    dW3 = a2.T.dot(dC3) / m
    dB3 = 1 / m * np.sum(dC3, axis=0, keepdims=True)
    
    dC2 = dC3.dot(w3.T) * deriv_reLu(z2)
    dW2 = 1 / m * a1.T.dot(dC2)
    dB2 = 1 / m * np.sum(dC2, axis=0, keepdims=True)

    dC1 = dC2.dot(w2.T) * deriv_reLu(z1)
    dW1 = 1 / m * x.T.dot(dC1)
    dB1 = 1 / m * np.sum(dC1, axis=0, keepdims=True)
    return dW3, dB3, dW2, dB2, dW1, dB1

In [6]:
def update_params(w1,b1,w2,b2,w3,b3,dW3, dB3, dW2, dB2, dW1, dB1, learning_rate):
    w1 -= learning_rate * dW1
    b1 -= learning_rate * dB1
    w2 -= learning_rate * dW2
    b2 -= learning_rate * dB2
    w3 -= learning_rate * dW3
    b3 -= learning_rate * dB3
    return (w1,b2,w2,b2,w3,b3)

def predict(predictions):
    return np.argmax(predictions)

In [7]:
def batch_gradient_descent(x, y, iterations, learning_rate):
    w1,b1,w2,b2,w3,b3 = init_params(784, 10, 10, 10)
    for i in range(iterations):
        a1,a2,a3,z1,z2,z3 = foward_prop(x,w1, b1, w2, b2, w3, b3)
        
        dW3, dB3, dW2, dB2, dW1, dB1 = back_prop(x,y,a1,a2,a3,w2,w3,z1,z2)
        w1,b2,w2,b2,w3,b3 = update_params(w1,b1,w2,b2,w3,b3,dW3,dB3,dW2,dB2,dW1,dB1, learning_rate)
        if( i % 10 == 0):
            print("Iteration", i)
            print("Accuracy: ", np.sum(predict(a3) == y ) / y.size )

    return w1,b1,w2,b2,w3,b3

In [8]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], 784) / 255.0  # Flatten and normalize
x_test = x_test.reshape(x_test.shape[0], 784) / 255.0
y_train = np.eye(10)[y_train]  # One-hot encoding
y_test = np.eye(10)[y_test]

w1,b1,w2,b2,w3,b3 = batch_gradient_descent(x_train,y_train,500,0.1)

Iteration 0
Accuracy:  0.0
Iteration 10
Accuracy:  0.0
Iteration 20
Accuracy:  0.0
Iteration 30
Accuracy:  0.0
Iteration 40
Accuracy:  0.0
Iteration 50
Accuracy:  0.0
Iteration 60
Accuracy:  0.0
Iteration 70
Accuracy:  0.0
Iteration 80
Accuracy:  0.0


KeyboardInterrupt: 