**Importing the Libraries**

In [24]:
import numpy as np
np.random.seed(42)

--------------------------------

**Nodes in each layer**

In [2]:
input_nodes = 5
hidden_1_nodes = 3
hidden_2_nodes = 5
output_nodes = 4

-------------------------------------

**Inputs and true outputs**

In [27]:
x = np.random.randint(1, 100, size = (input_nodes, 1)) / 100
x

array([[0.88],
       [0.24],
       [0.03],
       [0.22],
       [0.53]])

In [4]:
y = np.array([[0], [1], [0], [0]])
y

array([[0],
       [1],
       [0],
       [0]])

------------------

**Defining Activation functions and loss with their derivatives**

Sigmoid for first hidden layer

In [5]:
def sig(x):
    return 1/(1 + np.exp(-x))           

In [6]:
def sig_dash(x):
    return sig(x) * (1 - sig(x))

Softmax for second hidden layer and output layer

In [7]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))      

In [8]:
def softmax_dash(x):
    
    I = np.eye(x.shape[0])
    
    return softmax(x) * (I - softmax(x).T)

Categorical cross-entropy loss

In [9]:
def cross_E(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred + 10**-100))

In [10]:
def cross_E_grad(y_true, y_pred):
    return -y_true/(y_pred + 10**-100)

--------------------

**Random initialization of weights and biases**

In [11]:
w1 = np.random.random(size = (hidden_1_nodes, input_nodes))
b1 = np.zeros(shape = (hidden_1_nodes, 1))

In [12]:
w2 = np.random.random(size = (hidden_2_nodes, hidden_1_nodes))
b2 = np.zeros(shape = (hidden_2_nodes, 1))

In [13]:
w3 = np.random.random(size = (output_nodes, hidden_2_nodes))
b3 = np.zeros(shape = (output_nodes, 1))

------------------

**Forward feed before training**

In [14]:
in_hidden_1 = w1.dot(x) + b1
out_hidden_1 = sig(in_hidden_1)

in_hidden_2 = w2.dot(out_hidden_1) + b2
out_hidden_2 = softmax(in_hidden_2)

in_output_layer = w3.dot(out_hidden_2) + b3
y_hat = softmax(in_output_layer)

y_hat

array([[0.22154299],
       [0.2905346 ],
       [0.23804737],
       [0.24987504]])

In [15]:
y

array([[0],
       [1],
       [0],
       [0]])

In [16]:
cross_E(y, y_hat)

1.2360326123452905

-------------------------

**SGD Momentum**

In [17]:
learning_rate = 0.01
momentum = 0.9

In [18]:
update_w1 = np.zeros(w1.shape)

update_b1 = np.zeros(b1.shape)

update_w2 = np.zeros(w2.shape)

update_b2 = np.zeros(b2.shape)

update_w3 = np.zeros(w3.shape)

update_b3 = np.zeros(b3.shape)

---------------------------------------------

**Total number of epochs**

In [19]:
epochs = 1000

------------------------

**Backpropagation in ANNs**

In [20]:
for epoch in range(epochs):

#------------------------------------------Forward Propagation-------------------------------------------------
    
    in_hidden_1 = w1.dot(x) + b1
    out_hidden_1 = sig(in_hidden_1)

    in_hidden_2 = w2.dot(out_hidden_1) + b2
    out_hidden_2 = softmax(in_hidden_2)

    in_output_layer = w3.dot(out_hidden_2) + b3
    y_hat = softmax(in_output_layer)
    
    loss = cross_E(y, y_hat)
    print(f'loss before training is {loss} -- epoch number {epoch + 1}')
    print('\n')
    
#-----------------------------------------Gradient Calculations via Back Propagation---------------------------

    error_upto_softmax = np.sum(cross_E_grad(y, y_hat) * softmax_dash(in_output_layer), axis = 0).reshape((-1, 1))
    
    grad_w3 = error_upto_softmax .dot( out_hidden_2.T )
    
    grad_b3 = error_upto_softmax
    
    #-----------------------------------------
    
    error_grad_upto_H2 = np.sum(error_upto_softmax * w3, axis = 0) .reshape((-1, 1))
    
    error_upto_softmax_H2 = np.sum(error_grad_upto_H2 * softmax_dash(in_hidden_2), axis = 0).reshape((-1, 1))
    
    grad_w2 = error_upto_softmax_H2 .dot( out_hidden_1.T )
    
    grad_b2 = error_upto_softmax_H2
    
    #-----------------------------------------
    
    error_grad_upto_H1 = np.sum(error_upto_softmax_H2 * w2, axis = 0) .reshape((-1, 1))
    
    grad_w1 = error_grad_upto_H1 * sig_dash(in_hidden_1) .dot( x.T )
    
    grad_b1 = error_grad_upto_H1 * sig_dash(in_hidden_1)
    
#-----------------------------------------Updating weights and biases via SGD Momentum------------------------

    update_w1 = - learning_rate * grad_w1 + momentum * update_w1
    w1 += update_w1
    
    update_b1 = - learning_rate * grad_b1 + momentum * update_b1
    b1 += update_b1
    
    update_w2 = - learning_rate * grad_w2 + momentum * update_w2
    w2 += update_w2
    
    update_b2 = - learning_rate * grad_b2 + momentum * update_b2
    b2 += update_b2
    
    update_w3 = - learning_rate * grad_w3 + momentum * update_w3
    w3 += update_w3
    
    update_b3 = - learning_rate * grad_b3 + momentum * update_b3
    b3 += update_b3

loss before training is 1.2360326123452905 -- epoch number 1


loss before training is 1.2272601886539678 -- epoch number 2


loss before training is 1.2107023401823842 -- epoch number 3


loss before training is 1.1873387402582236 -- epoch number 4


loss before training is 1.1581267572348306 -- epoch number 5


loss before training is 1.1239907842101362 -- epoch number 6


loss before training is 1.0858143749274538 -- epoch number 7


loss before training is 1.0444341603351217 -- epoch number 8


loss before training is 1.0006348855323508 -- epoch number 9


loss before training is 0.9551451872946534 -- epoch number 10


loss before training is 0.9086339463885449 -- epoch number 11


loss before training is 0.8617072069352467 -- epoch number 12


loss before training is 0.8149057622040771 -- epoch number 13


loss before training is 0.7687035648418804 -- epoch number 14


loss before training is 0.7235071322594603 -- epoch number 15


loss before training is 0.6796560896120585 -- epo

-------------------------

**Forward feed after training**

In [21]:
in_hidden_1 = w1.dot(x) + b1
out_hidden_1 = sig(in_hidden_1)

in_hidden_2 = w2.dot(out_hidden_1) + b2
out_hidden_2 = softmax(in_hidden_2)

in_output_layer = w3.dot(out_hidden_2) + b3
y_hat = softmax(in_output_layer)

y_hat

array([[0.0011485 ],
       [0.99616402],
       [0.00134445],
       [0.00134303]])

In [22]:
y

array([[0],
       [1],
       [0],
       [0]])

In [23]:
cross_E(y, y_hat)

0.0038433545161639282