In [1]:
import numpy as np
np.random.seed(1)

In [2]:
def sigmoid(Z : np.ndarray) -> np.ndarray:
    a = 1/(1+np.exp(-Z))
    return a

def relu(Z : np.ndarray) -> np.array:
    a = np.maximum(0,Z)
    return a

def tanh(Z : np.ndarray) -> np.ndarray:
    a = np.tanh(Z)
    return a

def softmax(Z : np.ndarray) -> np.ndarray:
    Z_shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(Z_shifted)
    softmax_output = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    return softmax_output

def d_sigmoid(Z : np.ndarray) -> np.ndarray:
    d_g = np.multiply(sigmoid(Z),(1-sigmoid(Z)))
    return d_g

def d_relu(Z : np.ndarray) -> np.ndarray:
    d_g = Z[Z<=0] = 0
    return d_g

def d_tanh(Z : np.ndarray) -> np.ndarray:
    d_g = 1-np.power(tanh(Z))
    return d_g


In [3]:
def initialize_parameters(layer_dims : list[int]) -> dict[str,np.ndarray]:
    parameters = {}
    L = len(layer_dims)
    for l in range(1,L):
        parameters[f"W{l}"] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters[f"b{l}"] = np.zeros((layer_dims[l],1))

    return parameters
    

In [4]:
def fwd_layer_computation(W : np.ndarray,b : np.ndarray, A_prev : np.ndarray, activation : str)-> (np.ndarray, (np.ndarray,np.ndarray,np.ndarray)):
    l = W.shape[0]
    m = W.shape[1]

    Z = np.dot(W,A_prev) + b

    if activation == "sigmoid":
        A = sigmoid(Z)
    if activation == "relu":
        A = relu(Z)
    if activation == "tanh":
        A = tanh(Z)
    if activation == "softmax":
        A = softmax(Z)

    linear_cache = (W,b,A_prev)
    Activation_cache = Z

    cache = (linear_cache,Activation_cache)
    
    return A,cache    

In [5]:
def forward_propogation(parameters : dict[str,np.ndarray], X : np.ndarray) -> (np.ndarray,((dict[str,np.ndarray]),np.ndarray)):
    L = len(parameters)//2
    A = X
    caches = []
    for l in range(1,L):
        W = parameters[f"W{l}"]
        b = parameters[f"b{l}"]
        
        A,cache = fwd_layer_computation(W,b,A,"relu")
        caches.append(cache)

    W = parameters[f"W{L}"]
    b = parameters[f"b{L}"]

    AL,cache = fwd_layer_computation(W,b,A,"softmax")
    epsilon = 1e-15
    AL = np.clip(AL, epsilon, 1 - epsilon)
    caches.append(cache)

    return AL,caches
    
    

In [6]:
def compute_cost(AL : np.ndarray,Y : np.ndarray) -> float:
    m = Y.shape[1]
    log_probs = -np.log(AL[Y,np.arange(m)])
    J = np.sum(log_probs)/m

    J = np.squeeze(J)

    return J

In [7]:
def bkwd_layer_propogation(AL : np.ndarray, Y : np.ndarray, dA : np.ndarray, Z : np.ndarray, W : np.ndarray, A_prev : np.ndarray, activation : str) -> (dict[str,np.ndarray], np.ndarray):
    grad = {}
    m = dA.shape[1]
    if activation == "sigmoid":
        dZ = np.multiply(dA,d_sigmoid(Z))
    if activation == "relu":
        dZ = np.multiply(dA,d_relu(Z))
    if activation == "tanh":
        dZ = np.multiply(dA,d_tanh(Z))
    if activation == "softmax":
        dZ = AL-Y

    dW = 1/m * np.dot(dZ,A_prev.T)
    db = 1/m * np.sum(dZ,axis=1,keepdims = True)
    dA = np.dot(W.T,dZ)
    grad["dW"] = dW
    grad["db"] = db

    return grad, dA    

In [8]:
def backpropogation(caches, Y, AL):
    # print(f"W1 : {caches[0][0][0].shape}")
    # print(f"b1 : {caches[0][0][1].shape}")
    # print(f"A0 : {caches[0][0][2].shape}")
    # print(f"Z1 : {caches[0][1].shape}")
    # print(f"W2 : {caches[1][0][0].shape}")
    # print(f"b2 : {caches[1][0][1].shape}")
    # print(f"A1 : {caches[1][0][2].shape}")
    # print(f"Z2 : {caches[1][1].shape}")
    # print(f"W3 : {caches[2][0][0].shape}")
    # print(f"b3 : {caches[2][0][1].shape}")
    # print(f"A2 : {caches[2][0][2].shape}")
    # print(f"Z3 : {caches[2][1].shape}")
    L = len(caches)
    linear_cache = caches[L-1][0]
    activation_cache = caches[L-1][1]
    Z = activation_cache
    W = linear_cache[0]
    b = linear_cache[1]
    A_prev = linear_cache[2]
    
    dA = - Y / AL
    grads = {}
    
    grad,dA = bkwd_layer_propogation(AL,Y,dA,Z,W,A_prev,"softmax")
    grads[f"dW{L}"] = grad["dW"]
    grads[f"db{L}"] = grad["db"]
    for l in range(L-2,-1,-1):
        linear_cache = caches[l][0]
        activation_cache = caches[l][1]

        Z = activation_cache
        W = linear_cache[0]
        b = linear_cache[1]
        A_prev = linear_cache[2]

        grad,dA = bkwd_layer_propogation(AL,Y,dA,Z,W,A_prev,"relu")
        grads[f"dW{l+1}"] = grad["dW"]
        grads[f"db{l+1}"] = grad["db"]



    return grads

In [9]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters)//2

    for l in range(1,L+1):
        W = parameters[f"W{l}"]
        b = parameters[f"b{l}"]
        m = W.shape[0]
        # print(f"layer {l}")
        # print(f"W : {W.shape}")
        # print(f"b : {b.shape}")
        
        dW = grads[f"dW{l}"]
        db = grads[f"db{l}"]
        
        W = W - learning_rate*dW
        b = b - learning_rate*db

        parameters[f"W{l}"] = W
        parameters[f"b{l}"] = b        

    return parameters

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import random
from load_MNIST import MnistDataloader
from os.path import join
%matplotlib inline

input_path = 'datasets/MNIST_dataset/'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

#
# Helper function to show a list of images with their relating titles
#
def show_images(images, title_texts):
    cols = 5
    rows = int(len(images)/cols) + 1
    plt.figure(figsize=(30,20))
    index = 1    
    for x in zip(images, title_texts):        
        image = x[0]        
        title_text = x[1]
        plt.subplot(rows, cols, index)        
        plt.imshow(image, cmap=plt.cm.gray)
        if (title_text != ''):
            plt.title(title_text, fontsize = 15);        
        index += 1

#
# Load MINST dataset
#
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

#
# Show some random training and test images 
#
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    r = random.randint(1, 60000)
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))    

for i in range(0, 5):
    r = random.randint(1, 10000)
    images_2_show.append(x_test[r])        
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))    

# show_images(images_2_show, titles_2_show)

m_train = len(x_train)
m_test = len(x_test)
num_pixel = len(x_train[0])

# print(f"Number of training examples in training set : {m_train}")
# print(f"Number of training examples in test set : {m_test}")
# print(f"size of each image : {num_pixel}*{num_pixel}")

In [11]:
#unrolling the training data
for i in range(m_train):
    x_train[i] = np.array(x_train[i]).reshape((-1,1))

x_train = np.array(x_train)
x_train = x_train.reshape((num_pixel*num_pixel,m_train))

#unrolling the test data
for i in range(m_test):
    x_test[i] = np.array(x_test[i]).reshape((-1,1))

x_test = np.array(x_test)
x_test = x_test.reshape((num_pixel*num_pixel,m_test))
y_train = np.array(y_train)
y_test = np.array(y_test)

y_train = y_train.reshape((60000,1)).T
y_test = y_test.reshape((10000,1)).T

In [12]:
def model_train(layer_dims, learning_rate, X, Y, num_iterations):

    parameters = initialize_parameters(layer_dims)
    for i in range(num_iterations):
        AL,caches = forward_propogation(parameters,X)
        J = compute_cost(AL, Y)
        grads = backpropogation(caches,Y,AL)
        parameters = update_parameters(parameters, grads, learning_rate)

        if i%100==0:
            print(f"Cost over iteration {i} : {J}")

    return parameters

In [13]:
layer_dims = [x_train.shape[0],25,15,10]
parameters = model_train(layer_dims, 0.3, x_train, y_train,1001)

Cost over iteration 0 : 2.302724968249625
Cost over iteration 100 : 2.302636857696377
Cost over iteration 200 : 2.302616364768769
Cost over iteration 300 : 2.3026097805677215
Cost over iteration 400 : 2.302606290421627
Cost over iteration 500 : 2.302603812846365
Cost over iteration 600 : 2.3026018327885076
Cost over iteration 700 : 2.302600182404442
Cost over iteration 800 : 2.3025987819277027
Cost over iteration 900 : 2.3025975800591056
Cost over iteration 1000 : 2.3025965389949756


In [24]:
def predict(X : np.ndarray, Y : np.ndarray, parameters : dict[str,np.ndarray]):
    m = X.shape[1]
    L = len(parameters)//2

    A_prev = X
    
    for l in range(1,L):
        W = parameters[f"W{l}"]
        b = parameters[f"b{l}"]

        Z = np.dot(W,A_prev) + b
        A_prev = relu(Z)

    W = parameters[f"W{L}"]
    b = parameters[f"b{L}"]

    Z = np.dot(W,A_prev) + b
    AL = softmax(Z)

    predictions = np.argmax(AL, axis=0)       
    accuracy = np.mean(predictions == Y.flatten())
    print(f"predictions : {predictions[:10]}")
    print(f"y_train : {Y[:10]}")
    return accuracy*100       

In [25]:
print(f"Accuracy for Training set : {predict(x_train,y_train,parameters)}")

predictions : [7 4 3 7 7 1 1 1 1 4]
y_train : [[5 0 4 ... 5 6 8]]
Accuracy for Training set : 10.12
