#Packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras 

#Initializing parameters

In [None]:
def initialize_parameters(layer_dims_incl_input):
    """
    Builds the initial weights and biases for every layer of the network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero. The NumPy global seed is reset to 3 on every call, so
    repeated calls with the same layer sizes return identical parameters.

    Arguments :
    layer_dims_incl_input -- python list with the number of units in each layer, input layer first

    Returns :
    parameters -- python dictionary with keys "W1", "b1", ..., "WL", "bL" where
                  Wl is a numpy array of shape (layer_dims_incl_input[l], layer_dims_incl_input[l-1])
                  bl is a numpy array of shape (layer_dims_incl_input[l], 1)
    """
    np.random.seed(3)
    parameters = {}

    # Pair every layer size with the size of the layer before it; layer numbering starts at 1
    size_pairs = zip(layer_dims_incl_input[:-1], layer_dims_incl_input[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        parameters[f"W{layer}"] = 0.01 * np.random.randn(n_curr, n_prev)
        parameters[f"b{layer}"] = np.zeros((n_curr, 1))

    return parameters

#Activation functions

In [None]:
def sigmoid(Z):
    """
    Logistic sigmoid activation, applied element-wise.

    Arguments:
    Z -- scalar or numpy array of pre-activation values

    Returns:
    1 / (1 + e^(-Z)), with the same shape as Z
    """
    exp_neg_z = np.exp(-Z)
    return 1 / (1 + exp_neg_z)

In [None]:
def relu(Z):
    """
    Rectified linear unit activation, applied element-wise.

    Arguments:
    Z -- scalar or numpy array of pre-activation values

    Returns:
    the element-wise maximum of 0 and Z
    """
    floor = 0
    return np.maximum(floor, Z)

#Forward propagation

In [None]:
def forward_1_layer(A_prev, W, b, activation):
    """
    Implements the forward propagation for a single layer

    Arguments:
    A_prev -- activations from previous layer (or input data) with shape =(size of previous layer, number of examples)
    W -- weights matrix for current layer, numpy array of shape =(size of current layer, size of previous layer)
    b -- bias vector for current layer, numpy array of shape =(size of the current layer, 1)
    activation -- the activation function used in current layer , stored as a text string : "sigmoid" or "relu"

    Returns
    A -- activation of current layer, obtained as the output of the activation function for the current layer
    cache -- a python dictionary containing A_prev,W,b and Z;stored for computing the backward pass efficiently

    Raises
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Linear part of the layer; b has one column and is broadcast across the examples
    Z = np.dot(W, A_prev) + b

    if activation == "sigmoid":
        A = sigmoid(Z)
    elif activation == "relu":
        A = relu(Z)
    else:
        # Fail loudly here: without this branch an unknown name left A unbound
        # and surfaced as an unrelated UnboundLocalError at the return statement.
        raise ValueError(
            'Unsupported activation {!r}; expected "sigmoid" or "relu"'.format(activation)
        )

    cache = {
        "A_prev": A_prev,
        "W": W,
        "b": b,
        "Z": Z,
    }

    return A, cache

In [None]:
def forward_L_layers(X, parameters,activations):
    """
    Runs the forward pass through every layer of the network, one layer after another.

    Arguments:
    X -- input features (pixel values in our case) for all records, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters()
    activations -- python list indexed by layer number; activations[l] names the activation function of layer l, activations[0] is unused

    Returns:
    AL -- activation value from the output layer (last layer of the network)
    caches -- python list indexed by layer number; caches[l] is the cache returned by forward_1_layer() for layer l, caches[0] is an empty placeholder
    """
    # Every layer contributes one W and one b entry, hence the halving
    num_layers = len(parameters) // 2

    # Index 0 stands for the input layer, which has no cache of its own
    caches = [{}]
    current_activation = X

    for layer in range(1, num_layers + 1):
        current_activation, layer_cache = forward_1_layer(
            current_activation,
            parameters["W" + str(layer)],
            parameters["b" + str(layer)],
            activations[layer],
        )
        caches.append(layer_cache)

    # After the last layer the running activation is AL
    return current_activation, caches

#Calculating cost

Computes the binary cross-entropy cost $J$, using the following formula: $$J=-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right))$$

In [None]:
def compute_cost(AL, Y):
    """
    Implements the binary cross entropy cost function.

    Arguments:
    AL -- activation of the last layer of the network which denotes the probability vector corresponding to our label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if the digit is zero, 1 if non-zero), shape (1, number of examples)

    Returns:
    cost -- binary cross-entropy cost
    """
    m = Y.shape[1]

    # A saturated sigmoid can return exactly 0.0 or 1.0; log(0) is -inf and
    # 0 * -inf is NaN, which would poison the whole cost. Keeping AL strictly
    # inside (0, 1) leaves ordinary probabilities untouched.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    cost = (-1 / m) * (np.dot(Y, np.log(AL_safe).T) + np.dot(1 - Y, np.log(1 - AL_safe).T))

    cost = np.squeeze(cost) # To make sure cost is a number (int or float); e.g. this turns [[10]] into 10
    return cost

#Derivatives of activation functions

In [None]:
def sigmoid_derivative(Z):
    """
    Derivative of the sigmoid with respect to Z, evaluated element-wise.

    Arguments:
    Z -- scalar or numpy array of pre-activation values

    Returns:
    sigmoid(Z) * (1 - sigmoid(Z)), with the same shape as Z
    """
    s = sigmoid(Z)
    return s * (1 - s)

In [None]:
def relu_derivative(Z):
    """
    Derivative of relu with respect to Z, evaluated element-wise.

    Arguments:
    Z -- scalar or numpy array of pre-activation values

    Returns:
    boolean mask that is True where Z is strictly positive (it acts as 1/0 when multiplied with numbers)
    """
    is_positive = Z > 0
    return is_positive

#Backpropagation

Initialising backpropagation(for binary cross entropy loss):

$$dA^{[L]} = \frac{\partial \mathcal{L}}{\partial A^{[L]}} = -\left(\frac{Y}{A^{[L]}} - \frac{1 - Y}{1 - A^{[L]}}\right)$$

General formula:
$$dZ^{[l]} = \frac{\partial \mathcal{L}}{\partial Z^{[l]}} = dA^{[l]} * g'(Z^{[l]})$$
$$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} $$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} $$

In [None]:
def backward_1_layer(dA, cache, activation):
    """
    Implements the backward propagation for a single layer

    Arguments:
    dA -- gradient of the loss with respect to the activation of the current layer
    cache -- python dictionary produced by forward_1_layer() for this layer; the keys "A_prev", "W" and "Z" are read
    activation -- the activation function used in current layer , stored as a text string : "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient of the loss with respect to the activation of the previous layer, same shape as A_prev
    dW -- gradient of the cost with respect to W, same shape as W
    db -- gradient of the cost with respect to b, shape (size of current layer, 1)

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    A_prev = cache["A_prev"]
    W = cache["W"]
    Z = cache["Z"]
    m = A_prev.shape[1]  # number of examples

    if activation == "relu":
        dZ = dA * relu_derivative(Z)
    elif activation == "sigmoid":
        dZ = dA * sigmoid_derivative(Z)
    else:
        # Fail loudly here: without this branch an unknown name left dZ unbound
        # and surfaced as an unrelated UnboundLocalError a few lines below.
        raise ValueError(
            'Unsupported activation {!r}; expected "sigmoid" or "relu"'.format(activation)
        )

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    # keepdims keeps db as a column vector so it has the same shape as b
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [None]:
def backward_L_layers(AL, Y, caches , activations):
    """
    Implements backward propagation for all the layers of the neural network

    Arguments:
    AL -- activation of the last layer, output of forward_L_layers()
    Y -- true "label" vector, same shape as AL
    caches -- python list returned by forward_L_layers(); caches[l] is the cache of layer l, caches[0] is unused
    activations -- python list indexed by layer number; activations[l] names the activation function of layer l

    Returns:
    grads -- python dictionary with, for every layer l from 1 to L,
             grads["dA" + str(l-1)], grads["dW" + str(l)] and grads["db" + str(l)]
    """
    grads = {}
    L = len(caches) - 1 # the number of layers excluding input

    # Initializing the backpropagation (for cross entropy loss).
    # A saturated sigmoid can return exactly 0.0 or 1.0, which turns the
    # divisions below into x/0 or 0/0 and fills every gradient with inf/NaN.
    # Keeping AL strictly inside (0, 1) leaves ordinary probabilities untouched.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    dA = -(np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Loop from l=L to l=1
    for l in reversed(range(1, L + 1)):
        dA_prev, dW, db = backward_1_layer(dA, caches[l], activations[l])
        grads["dA" + str(l - 1)] = dA_prev
        grads["dW" + str(l)] = dW
        grads["db" + str(l)] = db
        # The gradient handed to the next (earlier) layer
        dA = dA_prev

    return grads

#Update parameters
Update the parameters using gradient descent following the update rule
$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]}$$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]}$$

where $\alpha$ is the learning rate.

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """
    Applies one gradient descent step to every weight matrix and bias vector.

    Arguments:
    parameters -- python dictionary with keys "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary with keys "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size of the update

    Returns:
    a new dictionary with the updated parameters; the dictionary passed in is left unchanged
    """
    updated = parameters.copy()
    num_layers = len(updated) // 2  # one W and one b per layer

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            # Rebinding the key to a fresh array keeps the caller's arrays intact
            updated[key] = updated[key] - learning_rate * grads["d" + key]

    return updated

#Combining all the functions to build the model

In [None]:
def neural_network_train(X, Y, layer_dims_incl_input,activations, learning_rate = 0.01, num_iterations = 3000, print_cost=False):
    """
    Trains the network with gradient descent on the whole data set and returns the learned parameters.

    Arguments:
    X -- input features, numpy array of shape (input size, number of examples)
    Y -- true "label" vector, shape (1, number of examples)
    layer_dims_incl_input -- python list containing the number of units in each layer, input layer first
    activations -- python list indexed by layer number; activations[l] names the activation function of layer l, activations[0] is unused
    learning_rate -- step size of the gradient descent update
    num_iterations -- number of gradient descent steps to run
    print_cost -- if True, the cost is printed every 200 iterations and after the final iteration

    Returns:
    parameters -- python dictionary with the parameters after the last update
    costs -- python list with the cost of every iteration, computed before that iteration's update
    """
    # NOTE: initialize_parameters() reseeds NumPy itself, so this seed does not
    # influence the initial weights; it is kept so the function's effect on the
    # global random state stays as it was.
    np.random.seed(1)
    costs = []                         # list to keep track of cost

    # Parameters initialization
    parameters = initialize_parameters(layer_dims_incl_input)

    # Loop (gradient descent)
    for i in range(num_iterations):

        # Forward propagation.
        AL, caches = forward_L_layers(X, parameters, activations)

        # Compute cost.
        cost = compute_cost(AL, Y)

        # Backward propagation.
        grads = backward_L_layers(AL, Y, caches, activations)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the cost every 200 iterations and on the last one.
        # The parentheses matter: "and" binds tighter than "or", so without
        # them the final cost was printed even when print_cost was False.
        if print_cost and (i % 200 == 0 or i == num_iterations - 1):
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))

        # Record the cost of every iteration
        costs.append(np.squeeze(cost))

    return parameters, costs

#Predict function

In [None]:
def neural_network_predict(X,Y,parameters,activations):
    """
    Predicts binary labels for X and prints the accuracy against Y.

    Arguments:
    X -- input features, numpy array of shape (input size, number of examples)
    Y -- true "label" vector of 0/1 values, same shape as the network output
    parameters -- python dictionary with the trained parameters
    activations -- python list indexed by layer number; activations[l] names the activation function of layer l

    Returns:
    Y_pred -- integer numpy array with the same shape as the network output; 1 where the output is at least 0.5, otherwise 0
    """
    AL, _ = forward_L_layers(X, parameters, activations)

    Y_pred = np.array((AL >= 0.5), dtype=int)

    # mean(|Y_pred - Y|) is the FRACTION of wrong predictions (between 0 and 1),
    # so it has to be scaled by 100 before being subtracted from 100;
    # without the scaling the reported accuracy could never drop below 99.
    accuracy = 100 - 100 * np.mean(np.abs(Y_pred - Y))
    print("Accuracy in percentage is " + str(accuracy))

    return Y_pred

#Loading and processing the dataset

In [None]:
# Load the MNIST digits through Keras; the loader returns a (train, test) pair of (images, labels) tuples
train_split, test_split = keras.datasets.mnist.load_data()
X_train, Y_train = train_split
X_test, Y_test = test_split

In [None]:
# Collapse the digit labels into a binary target: 0 for the digit zero, 1 for any other digit
Y_train = (Y_train != 0).astype(int)
Y_test = (Y_test != 0).astype(int)

In [None]:
# Sanity check: draw the test image at index 1 and print its (already binarised) label
plt.imshow(X_test[1])
print(Y_test[1])

In [None]:
# Divide the pixel values by 255 (assumes raw intensities in 0..255, which puts the features in [0, 1])
X_train, X_test = X_train / 255, X_test / 255

In [None]:
# Flatten every example into one column: arrays go from (examples, ...) to (features, examples).
# For 1-D label arrays this yields a single row of shape (1, examples).
X_train = X_train.reshape(len(X_train), -1).T
X_test = X_test.reshape(len(X_test), -1).T
Y_train = Y_train.reshape(len(Y_train), -1).T
Y_test = Y_test.reshape(len(Y_test), -1).T

In [None]:
# Show the shapes after reshaping, one per line: X_train, X_test, Y_train, Y_test
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

#Training and Testing the model

In [None]:
# Network layout: input layer -> 16 relu units -> 1 sigmoid output unit
layer_dims_incl_input = [X_train.shape[0], 16, 1]
# Index 0 is only a placeholder so that activations[l] lines up with layer l
activations = ['NULL', 'relu', 'sigmoid']
parameters, costs = neural_network_train(
    X_train,
    Y_train,
    layer_dims_incl_input,
    activations,
    learning_rate=0.01,
    num_iterations=200,
    print_cost=True,
)

In [None]:
# Learning curve: the cost recorded at every training iteration
plt.plot(costs)

In [None]:
# Evaluate the trained network on the test split (this also prints the accuracy)
predictions_test = neural_network_predict(
    X=X_test,
    Y=Y_test,
    parameters=parameters,
    activations=activations,
)

In [None]:
# Undo the flattening: back to one 28x28 picture per test example
X_test_pic = X_test.T.reshape((-1, 28, 28))

# Draw the first test picture and print the label predicted for it
plt.imshow(X_test_pic[0])
print(predictions_test[0, 0])

In [None]:
# Count the test examples whose predicted label differs from the true label
count = sum(
    1
    for predicted, actual in zip(predictions_test[0], Y_test[0])
    if predicted != actual
)

print("The no of incorrect predictions is "+str(count))