Notebook for implementing forward pass and backword pass of neural network from scratch.
- Motivation behind this notebook was the mathmatical explanation of [this course](https://www.coursera.org/learn/neural-networks-deep-learning/) by Andrew Ng

### Importing

In [36]:
import numpy as np
import pandas as pd 
import random 

Necessary input parameters [Layer Information and hyperameters]

In [37]:
layer_dims = [5, 5, 2] # list of layer of the neural networks 
# so this model will have 3 layer of neural network and the last layer is a classification layer
# layer dims also inclues input array shape like layer_dims[0] = X.shpae[0]

# Initializing the weights

### Initialization of the weights
- The follwoing function will initialize all the parameters(weights and biases) for the network.
 - xaviar initalizer is used for initialization of the weights and random for biases


In [38]:

def weight_initializer(layer_dims):
    """ This function will initalize all the perameter and stores these parameters
        in a dictionary named parameters. 
        xaviar initalizer is used for initialization for weights parameter and zero initialization for bias parameters.
        
        Args:
        layer_dims(list)= dimensions of all the layer of the neural network
        output: all the weights and bais parameter (w1, b1,..... etc)
    """
    parameters= {} # dictionary for all parameters 
    # parameters will be stored as w1, b1, w2, b2 .....
    L= len(layer_dims)
    
    for l in range(1, L):
        # w(weights) will be shaped as next layer nodes by previous layer nodes for example w1= (n1, n0)
        parameters['W'+str(l)]= np.random.rand(layer_dims[l], layer_dims[l-1]) * np.sqrt(2/(layer_dims[l]+layer_dims[l-1]))
        # b(bias variable) will be shaped as next layer nodes by 1.
        parameters['b'+str(l)] = np.random.rand(layer_dims[l], 1)
        
        # check wether the assignment have the desired shape.
        assert(parameters['W'+str(l)].shape==(layer_dims[l],layer_dims[l-1]))
        assert(parameters['b'+str(l)].shape==(layer_dims[l],1))
        
    return parameters

In [39]:
# layer_dims=[5,5,2]
param= weight_initializer(layer_dims)
param

{'W1': array([[0.32542753, 0.37364332, 0.22594846, 0.37229411, 0.25005722],
        [0.29733825, 0.40022195, 0.18433493, 0.18114278, 0.23502322],
        [0.12182328, 0.19385777, 0.28549221, 0.09497587, 0.04772753],
        [0.01186503, 0.3293282 , 0.19713594, 0.02197311, 0.4114601 ],
        [0.01418737, 0.24916951, 0.43096036, 0.00484937, 0.18539176]]),
 'b1': array([[0.07328775],
        [0.37671992],
        [0.45621285],
        [0.66116328],
        [0.67549705]]),
 'W2': array([[0.41384845, 0.16759992, 0.29326084, 0.22022787, 0.52523349],
        [0.51896394, 0.24773936, 0.12588104, 0.28093464, 0.47747299]]),
 'b2': array([[0.31731246],
        [0.78206464]])}

# Forward propagation

### Linear forward

- This function will compute linear computation of the forward propagation.
- More specifically, will compute this: Z= W . X + b (where w=weights, x=input vector, b=bias value) 

In [40]:
# linear_forward
# activation function to that linear_forward 
# Gets the node activation values

def linear_forward(A_prev, W, b):
    """This function will compute the linear activation for the nodes of the network 
        using the values of previous nodes activation values
        linear forward function:
        Z= W . X + b 

    Args:
        A_prev(nd array) : Previous layer activation values 
        W : weight matrix for the forward path between two nodes (shape=(n1, n0))
        b : bias varibale matrix 
        
    output:
        Z: next layer node z values. matrix.
    """
    # Z = np.dot(W, A_prev) + b
    Z = np.dot(W, A_prev) 
    # type(Z)
    assert (Z.shape == (W.shape[0], A_prev.shape[1]))
    
    return Z    

#### Activation functions

- Implements all the necessary activation function for NN
- These functions will add non-linearity in our linear nn system.

In [41]:
# all the related activation function for the neural network
# These functions will add non-linearity in our linear nn system.

def sigmoid(Z):
    """Computes the sigmoid of the linear function, hence introduce non-linearity.
        It will squashed the linear values of the forward path between 0 and 1.
    
    Args:
        Z (nd array): output of the linear forward path function.
    output: 
        A(nd array): activations of the nodes of a layer.
    """
    
    A= 1/ (1+np.exp(-Z))
    
    assert (A.shape == Z.shape)
    
    return A

def tanh(Z):
    """Computes the tanh (hyporabolic tangent) of the linear function, hence introduce non-linearity.
    
    Args:
        Z(nd array) : output of the linear forward path function.
    output: 
        A(nd array): activations of the nodes of a layer.
    """
    A= np.tanh(Z)
    assert (A.shape== Z.shape)
    
    return A


def relu(Z):
    """Evaluates the ReLU(Rectified Linear Unit) function value of the linear function, hence introduce non-linearity.
        ReLU(z) = max(0, z)
    Args:
        Z (nd array): output of the linear forward path function.
    output: 
        A(nd array): activations of the nodes of a layer.
    """
    
    A= np.maximum(0, Z)
    assert (A.shape == Z.shape)
    
    return A

def leaky_relu(Z):
    """Evaluates the leaky ReLU function(extension of ReLU) value of the linear function, hence introduce non-linearity.
        LeakyReLU(z) = max(0.01 * z, z)
    Args:
        Z (nd array): output of the linear forward path function.
    output: 
        A(nd array): activations of the nodes of a layer.
    """
    
    A= np.maximum(0.01*Z, Z)
    assert (A.shape == Z.shape)
    
    return A

## Forward pass of single layer

In [42]:
def linear_activation_forward(A_prev, W, b, activation_function='sigmoid'):
    """ Combines the linear function and activation function to get activations of the nodes
    Args:
        A_prev
        W 
        b 
        activation_function (str, optional):Which activation function to use. Defaults to 'sigmoid'.
    """
    # computing Z
    Z= linear_forward(A_prev, W, b)
    
    # computing A
    
    if activation_function == 'sigmoid' :
        A= sigmoid(Z)
    elif activation_function == 'tanh' :
        A= tanh(Z)
    elif activation_function == 'relu' :
        A= relu(Z)
    elif activation_function == 'leakyrelu' :
        A= leaky_relu(Z)
    
    cache= (Z, A, A_prev, W, b) # all the variables of a forward node in one list
                                # These will be needed for back propagation
    
    return A, cache
    

# Full forward propagation of full Neural network 

This function will calcaulte a complte forward propagation of full neural network and giving the prediction as output

In [43]:
def forward_propagation(X, parameters):
    """This function will execute the full forward path of a neural network and will return prediction or probalility.

    Args:
        X : Input nd array
        parameters: all the parameter to compute activations
        
    Output:
        AL= last layer activation values. (predicted Y)
        caches= All necessary values of a node for back propagation. (Z, A, A_prev, W, b)
    """
    
    caches= []
    
    A= X
    L= len(parameters)//2 # finding number of layers. As all layer have w, b parameter.
    
    #implementing [linear>relu((L-1)times)]
    for l in range (1, L): # interates untill the last layer
        A_prev= A
        A, temp_cache= linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation_function= 'leakyrelu')
        
        caches.append(temp_cache)
    
    # last layer
    AL, temp_cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation_function= 'sigmoid')
    caches.append(temp_cache)
    
    return AL, caches


# Cost/loss calculation

In [44]:
def compute_cost(AL, Y, parameters, lamb= 1.25):
    """ Computs loss/ cost of the forward path

    Args:
        AL : predicted label
        Y : Actual label
        parameters 
        lamb (float, optional): regularization constant lambda. Defaults to 1.25.
    """
    
    m= Y.shape[1] # output layer node number/ class of classifier
    
    L= len(parameters)//2 # no of layers
    
    # regularization part
    #"https://towardsdatascience.com/regularization-an-important-concept-in-machine-learning-5891628907ea#:~:text=Regularization%20is%20a%20technique%20used,don't%20take%20extreme%20values."
    
    reg= 0 
    for l in range(1, L+1) :
        reg= reg + np.sum(parameters['W'+ str(l)]**2)
    
    cost=(-1/m)*np.sum(Y*np.log(AL+1e-10) + (1 - Y) * np.log(1 - AL-1e-10)) + (lamb/(2*m)) * reg
    # 1e-10 is added in log sothat zero can go into log
    
    cost= np.squeeze(cost) # cost must be a number rather than a array.
    assert(cost.shape== ())
    
    return cost

In [45]:
### Checking everything working fine

K= np.array([[1], [1.01], [0], [2], [5]]) #input
parameters= weight_initializer(layer_dims) # initialization
AL, caches= forward_propagation(K, parameters) # compute prediction
Y= np.array([[0],[1]])
c= compute_cost(AL, Y, parameters)


In [46]:
parameters

{'W1': array([[0.30690338, 0.11173668, 0.44643018, 0.08852416, 0.02707642],
        [0.16384467, 0.25581957, 0.02770852, 0.18784855, 0.29387433],
        [0.25077069, 0.35368401, 0.4163648 , 0.02162521, 0.44599713],
        [0.18581151, 0.1432331 , 0.00589277, 0.06285917, 0.26218866],
        [0.10360754, 0.24546001, 0.06052531, 0.00656289, 0.42867652]]),
 'b1': array([[0.38185299],
        [0.2237979 ],
        [0.21490264],
        [0.26438148],
        [0.78754164]]),
 'W2': array([[0.50356514, 0.32030772, 0.52615517, 0.47563611, 0.10438827],
        [0.31259295, 0.00123094, 0.067075  , 0.03374472, 0.39424899]]),
 'b2': array([[0.38944142],
        [0.06892909]])}

### Back Propagation

In [47]:
def linear_backward(dZ, cache):
    """computes gradient for Z= W.A_prev + b
    Args:
        dZ (_type_): previous gradient of Z
        cache (_type_): All the required variables stored in forward propagation. (Z, A, A_prev, W, b)
    Output: 
        Gradient of the cost with respect to A, W, b
    """
    Z, A, A_prev, W, b = cache
    
    m= A_prev.shape[1]
    
    dW= (1/m) * np.dot(dZ, A_prev.T)
    db= (1/m) * np.sum(dZ, axis=1, keepdims= True)
    dA_prev= np.dot(W.T, dZ)
    
    assert(dA_prev.shape == A_prev.shape)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    
    return dA_prev, dW, db
    
    

In [48]:
def sigmoid_backward(dA, cache):
    """Implement the backward propagation for a single SIGMOID unit.
    
    Arguments:
    dA : post-activation gradient
    cache : All the required variables stored in forward propagation. (Z, A, A_prev, W, b)
    
    Returns:
    dZ : Gradient of the cost with respect to Z
    """
    Z, A, A_prev, W, b= cache
    
    dZ= dA * A * (1 - A)
    assert (dZ.shape == Z.shape)
    
    return dZ

def tanh_backward(dA, cache):
    """Implement the backward propagation for a single tanh unit.
    
    Arguments:
    dA : post-activation gradient
    cache : All the required variables stored in forward propagation. (Z, A, A_prev, W, b)
    
    Returns:
    dZ : Gradient of the cost with respect to Z
    """
    Z, A, A_prev, W, b= cache
    
    dZ= dA * A * (1-A) * (1-A)
    assert (dZ.shape == Z.shape)
    
    return dZ


def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.
    
    Arguments:
    dA : post-activation gradient
    cache:All the required variables stored in forward propagation. (Z, A, A_prev, W, b)
    
    Returns:
    dZ : Gradient of the cost with respect to Z
    """
    
    Z, A, A_prev, W, b= cache
    dZ = np.array(dA) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z<0]=0
    
    assert (dZ.shape == Z.shape)
    
    return dZ


def leakyrelu_backward(dA, cache):
    """
    Implement the backward propagation for a single  Leaky RELU unit.
    
    Arguments:
    dA : post-activation gradient
    cache:All the required variables stored in forward propagation. (Z, A, A_prev, W, b)
    
    Returns:
    dZ : Gradient of the cost with respect to Z
    """
    
    Z, A, A_prev, W, b= cache
    dZ = np.array(dA) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z<0]= 0.01 
    
    assert (dZ.shape == Z.shape)
    
    return dZ



In [49]:
def linear_activation_backword(dA, cache, activation_function= 'sigmoid'):
    # linear_cache,activation_cache=cache
    
    if activation_function =='relu':
        dZ=relu_backward(dA,cache)
        dA_prev, dW, db=linear_backward(dZ,cache)
    
    elif activation_function =='leakyrelu':
        dZ=leakyrelu_backward(dA,cache)
        dA_prev, dW, db=linear_backward(dZ,cache)
    
    elif activation_function =='sigmoid':
        dZ=sigmoid_backward(dA, cache)
        dA_prev, dW, db=linear_backward(dZ, cache)
    
    elif activation_function =='tanh':
        dZ=tanh_backward(dA, cache)
        dA_prev, dW, db=linear_backward(dZ, cache)
    
    return dA_prev, dW, db

In [50]:
def back_propagation(AL, Y, caches, parameters):
    """computes the backpropagation of the network

    Args:
        AL : Prediction
        Y : labels
        caches : collection of all layer cache. layer cache ==(Z, A, A_prev, W, b)
    """
    
    grads= {} # dictionary for all grad of the newtork
    
    L= len(parameters)//2 
    
    Y= Y.reshape(AL.shape)
    
    AL= -(np.divide(Y, AL) - np.divide(1-Y, 1-AL+1e-11))
    
    dAL= -1 * (Y/(AL+1e-11)- (1-Y)/(1-AL+1e-11))
    
    current_cache= caches[L-1]
    grads['dA'+str(L-1)], grads['dW'+str(L)], grads['db'+str(L)]= linear_activation_backword(dAL, current_cache, activation_function="sigmoid")
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads['dA'+str(l)], grads['dW'+str(l+1)], grads['db'+str(l+1)]= linear_activation_backword(grads['dA'+str(l+1)], current_cache, activation_function="relu")
    
    return grads

In [51]:
grads= back_propagation(AL, Y, caches, parameters)
grads

{'dA1': array([[ 3.82879788e-02],
        [-2.98047834e-05],
        [ 7.97851174e-03],
        [ 3.89424802e-03],
        [ 4.85906875e-02]]),
 'dW2': array([[-0.00041535, -0.00128617, -0.00163443, -0.00100244, -0.00142273],
        [ 0.0903512 ,  0.27978131,  0.35554041,  0.21806301,  0.30948829]]),
 'db2': array([[-0.00056727],
        [ 0.12339893]]),
 'dA0': array([[0.01950456],
        [0.01957727],
        [0.02337797],
        [0.00412003],
        [0.02643705]]),
 'dW1': array([[ 3.82879788e-02,  3.86708585e-02,  0.00000000e+00,
          7.65759575e-02,  1.91439894e-01],
        [-2.98047834e-05, -3.01028312e-05,  0.00000000e+00,
         -5.96095667e-05, -1.49023917e-04],
        [ 7.97851174e-03,  8.05829686e-03,  0.00000000e+00,
          1.59570235e-02,  3.98925587e-02],
        [ 3.89424802e-03,  3.93319050e-03,  0.00000000e+00,
          7.78849604e-03,  1.94712401e-02],
        [ 4.85906875e-02,  4.90765943e-02,  0.00000000e+00,
          9.71813749e-02,  2.42953437e-0

# Optimization and Update parameters

In [52]:
def update_parameters(X, parameters, grads, learning_rate= 0.001, lamb= 1.25):
    """ This function will update parameter values according the gradient of the network
    Args:
        X : input array
        parameters : all weight and bias parameters
        grads (_type_): gradients of corresponding parameters
        learning_rate (float, optional): learning rate. Defaults to 0.001.
        lamb (float, optional): lambda for regularization. Defaults to 1.25.
    """
    
    L= len(parameters)//2
    m= X.shape[1]
    
    for l in range(L):
        parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * (grads['dW'+str(l+1)] )#+(lamb/m)*parameters['W'+str(l+1)])
        parameters['b'+str(l+1)]=parameters['b'+str(l+1)]-learning_rate*grads['db'+str(l+1)]
        
    return parameters

In [53]:
update_parameters(K, parameters, grads, learning_rate= 0.001, lamb= 1.25)

{'W1': array([[0.30686509, 0.11169801, 0.44643018, 0.08844759, 0.02688498],
        [0.1638447 , 0.2558196 , 0.02770852, 0.18784861, 0.29387448],
        [0.25076271, 0.35367595, 0.4163648 , 0.02160926, 0.44595724],
        [0.18580762, 0.14322917, 0.00589277, 0.06285138, 0.26216919],
        [0.10355895, 0.24541093, 0.06052531, 0.00646571, 0.42843357]]),
 'b1': array([[0.3818147 ],
        [0.22379793],
        [0.21489466],
        [0.26437758],
        [0.78749305]]),
 'W2': array([[0.50356556, 0.320309  , 0.52615681, 0.47563711, 0.10438969],
        [0.3125026 , 0.00095115, 0.06671946, 0.03352666, 0.3939395 ]]),
 'b2': array([[0.38944198],
        [0.06880569]])}

# Full Training (Forward pass + backward pass + parameter updates)

In [54]:
def optimize(X, Y, num_iterations, learning_rate= 0.001, lamb= 0):
    """Runs all the function of the model.
        initialize parameter ->forward propagation -> cost -> back propagation -> update parameter -> repeat.
        

    Args:
        X :
        Y : 
        num_iterations : iteration number
        learning_rate (float, optional): . Defaults to 0.001.
        lamb (int, optional): . Defaults to 0.
    """
    costs= []
    n= X.shape[0]
    layer_dims= [n, 5, 5, 2]
    parameters= weight_initializer(layer_dims)
    
    for i in range(num_iterations):
        
        # forward propagation
        AL, caches= forward_propagation(X, parameters)
        cost= compute_cost(AL, Y, parameters)
        
        #back propagation
        grads= back_propagation(AL, Y, caches, parameters)
        parameters= update_parameters(X, parameters, grads, learning_rate, lamb= 2)
        
        #printing cost
        # if i%1000 == 0:
        #     costs.append(cost)
        #     print(cost)
        costs.append(cost)
        print(cost)
        
    
    return parameters, costs
    

In [57]:
p, costs= optimize(K, Y, learning_rate= 0.01, num_iterations= 10, lamb=0.5)

5.4389071667775655
5.419422681163059
5.400025516572007
5.380728249850053
5.361543576956823
5.342484272957304
5.323563150410922
5.304793016494477
5.286186629225046
5.267756653173809


### Prediction

In [58]:
def predict(X,parameters):
    #m=X.shape[1]
    #Y_prediction=np.zeros((1,m))
    AL,_=forward_propagation (X,parameters)
    
    Y_prediction=np.round(AL)