<a href="https://colab.research.google.com/github/preranaprakashh/neuralnet/blob/main/mlpmnist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# loading the mnist dataset
import tensorflow.keras.datasets.mnist as mnist
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from keras.utils import np_utils

# load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# number of elements in training and testing datasets
m_train = x_train.shape[0]
m_test = x_test.shape[0]
image_size = x_train.shape[1]

# summarize loaded dataset
print('X_train: ' + str(x_train.shape))
print('Y_train: ' + str(y_train.shape))
print('X_test:  ' + str(x_test.shape))
print('Y_test:  ' + str(y_test.shape[0]))
print('Training images:', m_train)
print('Test images:', m_test)
print('height = width =', image_size)


for i in range(9):
    # define subplot
    # plt.subplot(330 + 1 + i)
    # # plot raw pixel data
    # plt.imshow(x_train[i], cmap=plt.get_cmap('gray'))
    print("y = " + str(y_train[i]))
# show the figure
plt.show()

x_train_flat = x_train.reshape(x_train.shape[1]*x_train.shape[2], x_train.shape[0])
x_test_flat = x_test.reshape(x_test.shape[1]*x_test.shape[2], x_test.shape[0])
y_train = y_train.reshape(1, y_train.shape[0])
y_test = y_test.reshape(1, y_test.shape[0])

print ("x_train_flat: " + str(x_train_flat.shape))
print ("x_test_flat: " + str(x_test_flat.shape))
print ("y_train:", y_train.shape, "y_test", y_test.shape)

x_train = x_train_flat/255
x_test = x_test_flat/255

y_train = np_utils.to_categorical(y_train)
print("changeeeeeeeee", y_train.shape)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]



def relu(Z):
    A = np.maximum(0,Z)
    
    assert(A.shape == Z.shape)
    
    cache = Z 
    return A, cache

def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ


def softmax(x):
    x_exp = np.exp(x)
    x_sum = np.sum(x_exp, axis=1, keepdims = True)
    Z = x_exp/x_sum 
    cache = x
    assert(Z.shape == x.shape)
    return Z, x


def softmax_backward(x, cache):
    S = cache
    print("size of softmax = ", S.shape)
    S_vector = S.reshape(S.shape[0],1)
    S_matrix = np.tile(S_vector,S.shape[0])
    der = np.diag(S) - (S_matrix * S_matrix.T)
    assert (der.shape == x.shape)
    return der

#initialize parameters to random numbers for W and zeros for b
def parameter_init(layer_num):
    parameters = {}
    L = len(layer_num)
    for l in range(1, L):
        parameters['W'+str(l)] = np.random.randn(layer_num[l], layer_num[l-1])
        parameters['b'+ str(l)] = np.zeros((layer_num[l], 1))
    return parameters


def fwd_lin(A, W, b):
    # to calculate Z value for all layers
    Z = np.dot(W, A) + b #Z[L] = W[L]A[L-1] + b[L]
    lin_cache = (A, W, b) #inputs A, W and b are stored in cache
    
    return Z, lin_cache

def fwd_acti(Aprev, W, b, acti):
    if acti == "relu":
        Z, lin_cache = fwd_lin(Aprev, W, b) #calculate Z for each layer
        A, acti_cache = relu(Z) #use relu activation function 
    elif acti == "softmax":
        Z, lin_cache = fwd_lin(Aprev, W, b)
        A, acti_cache = softmax(Z)
    cache = (lin_cache, acti_cache)
    return A, cache

def fwd(X, parameters):
    #to define the activation function for every layer and implement it
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers
    for l in range(1, L):
        Aprev = A 
        A, cache = fwd_acti(Aprev, parameters['W' + str(l)], parameters['b' + str(l)], acti = "relu") #for hidden layers
        caches.append(cache)

    AL, cache = fwd_acti(A, parameters['W' + str(L)], parameters['b' + str(L)], acti = "softmax") #for output layer
    caches.append(cache)
   
    return AL, caches

def costx(AL, Y):
    m = Y.shape[1]
    print(Y.shape)
    print(AL.shape)
    cost = -1/m * np.sum(np.multiply(Y, np.log(AL))+np.multiply((1-Y), np.log(1-AL)), axis = 1, keepdims = True) #J = sum(yloga+(1-y)log(1-a))
    cost = np.squeeze(cost)    
    return cost

def back_lin(dZ, cache):
    Aprev, W, b = cache
    m = Aprev.shape[1]
    dW = 1/m * np.dot(dZ, Aprev.T)
    db = 1/m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

def back_acti(dA, cache, acti):
    lin_cache, acti_cache = cache
    
    if acti == "relu":
        dZ = relu_backward(dA, acti_cache)
        dA_prev, dW, db = back_lin(dZ, lin_cache)
        
    else: #if acti == "softmax"
        dZ = softmax_backward(dA, acti_cache)
        dA_prev, dW, db = back_lin(dZ, lin_cache)
        
    return dA_prev, dW, db

def bwd(AL, Y, caches):
    grads = {} # the gradients are stored in grads
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL

    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
    
    # last layer is softmax
    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = back_acti(dAL, current_cache, acti = "sigmoid")
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    # hidden layers are relu
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = back_acti(grads["dA" + str(l+1)], current_cache, acti = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp
    
    return grads

def parameter_update(params, grads, learning_rate):
    parameters = params.copy()
    L = len(parameters) // 2 # number of layers

    # Update rule for each parameter
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads["dW"+str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads["db"+str(l+1)]

    return parameters



layer_dim = [784, 300, 100, 10] #  3-layer model

def three_layer_model(X, Y, layer_dim, learning_rate = 0.0075, iterations = 3000, print_cost=False):
    costs = []                         # keep track of cost
    
    parameters = parameter_init(layer_dim)
    
    for i in range(0, iterations):
        AL, caches = fwd(X, parameters) #forward propagation
        cost = costx(AL, Y)#compute cost
        grads = bwd(AL, Y, caches)#backward propagation
        parameters = parameter_update(parameters, grads, learning_rate)#update parameters

        if print_cost and i % 100 == 0 or i == iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == iterations:
            costs.append(cost)
    
    return parameters, costs

parameters, costs = three_layer_model(x_train, y_train, layer_dim, iterations = 2500, print_cost = True)




X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  10000
Training images: 60000
Test images: 10000
height = width = 28
y = 5
y = 0
y = 4
y = 1
y = 9
y = 2
y = 1
y = 3
y = 1
x_train_flat: (784, 60000)
x_test_flat: (784, 10000)
y_train: (1, 60000) y_test (1, 10000)
changeeeeeeeee (1, 60000, 10)
(1, 60000, 10)
(10, 60000)




ValueError: ignored