**Importing the Libraries**

In [1]:
import numpy as np
np.random.seed(42)

-------------------------------

**Nodes in each layer**

In [2]:
input_nodes = 5
hidden_1_nodes = 3
hidden_2_nodes = 5
output_nodes = 4

-------------------------

**Inputs and true outputs**

In [3]:
x = np.random.randint(1, 100, size = (input_nodes, 1)) / 100
x

array([[0.52],
       [0.93],
       [0.15],
       [0.72],
       [0.61]])

In [4]:
y = np.random.randint(1, 100, size = (output_nodes, 1)) / 100
y

array([[0.21],
       [0.83],
       [0.87],
       [0.75]])

------------------------------

**Defining Activation functions and loss with their derivatives**

Relu for first hidden layer

In [5]:
def relu(x, leak = 0):
    return np.where(x <= 0, leak * x, x)

In [6]:
def relu_dash(x, leak = 0):
    return np.where(x <= 0, leak, 1)

<br>

Sigmoid for second hidden layer and output layer

In [7]:
def sig(x):
    return 1/(1 + np.exp(-x))           

In [8]:
def sig_dash(x):
    return sig(x) * (1 - sig(x))

<br>

Mean square loss

In [9]:
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)

In [10]:
def mse_grad(y_true, y_pred):
    
    N = y_true.shape[0]
    
    return -2*(y_true - y_pred)/N

--------------------------------------------

**Random initialization of weights and biases**

In [12]:
w1 = np.random.random(size = (hidden_1_nodes, input_nodes))
b1 = np.zeros(shape = (hidden_1_nodes, 1))

print(w1, b1)

[[0.04666566 0.97375552 0.23277134 0.09060643 0.61838601]
 [0.38246199 0.98323089 0.46676289 0.85994041 0.68030754]
 [0.45049925 0.01326496 0.94220176 0.56328822 0.3854165 ]] [[0.]
 [0.]
 [0.]]


In [13]:
w2 = np.random.random(size = (hidden_2_nodes, hidden_1_nodes))
b2 = np.zeros(shape = (hidden_2_nodes, 1))

print(w2, b2)

[[0.01596625 0.23089383 0.24102547]
 [0.68326352 0.60999666 0.83319491]
 [0.17336465 0.39106061 0.18223609]
 [0.75536141 0.42515587 0.20794166]
 [0.56770033 0.03131329 0.84228477]] [[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [14]:
w3 = np.random.random(size = (output_nodes, hidden_2_nodes))
b3 = np.zeros(shape = (output_nodes, 1))

print(w3, b3)

[[0.44975413 0.39515024 0.92665887 0.727272   0.32654077]
 [0.57044397 0.52083426 0.96117202 0.84453385 0.74732011]
 [0.53969213 0.58675117 0.96525531 0.60703425 0.27599918]
 [0.29627351 0.16526694 0.01563641 0.42340148 0.39488152]] [[0.]
 [0.]
 [0.]
 [0.]]


-----------------------------------

**Forward feed before training**

In [15]:
in_hidden_1 = w1.dot(x) + b1
out_hidden_1 = relu(in_hidden_1, leak = 0.1)

in_hidden_2 = w2.dot(out_hidden_1) + b2
out_hidden_2 = sig(in_hidden_2)

in_output_layer = w3.dot(out_hidden_2) + b3
y_hat = sig(in_output_layer)

y_hat

array([[0.91288614],
       [0.95449609],
       [0.92229189],
       [0.74871256]])

In [16]:
y

array([[0.21],
       [0.83],
       [0.87],
       [0.75]])

In [17]:
mse(y, y_hat)

0.1280710759216072

----------------------

**SGD**

In [18]:
learning_rate = 0.01

------------------------

**Total number of epochs**

In [19]:
epochs = 10000

-------------------

**Backpropagation in ANNs**

In [20]:
for epoch in range(epochs):

#------------------------------------------Forward Propagation-------------------------------------------------
    
    in_hidden_1 = w1.dot(x) + b1
    out_hidden_1 = relu(in_hidden_1, leak = 0.1)

    in_hidden_2 = w2.dot(out_hidden_1) + b2
    out_hidden_2 = sig(in_hidden_2)

    in_output_layer = w3.dot(out_hidden_2) + b3
    y_hat = sig(in_output_layer)
    
    loss = mse(y, y_hat)
    print(f'loss before training is {loss} -- epoch number {epoch + 1}')
    print('\n')
    
#-----------------------------------------Gradient Calculations via Back Propagation---------------------------
    
    grad_w3 = mse_grad(y, y_hat) * sig_dash(in_output_layer) .dot( out_hidden_2.T )
    
    grad_b3 = mse_grad(y, y_hat) * sig_dash(in_output_layer)
    
    #-----------------------------------------
    
    error_grad_upto_H2 = np.sum(mse_grad(y, y_hat) * sig_dash(in_output_layer) * w3, axis = 0) .reshape((-1, 1))
    
    grad_w2 = error_grad_upto_H2 * sig_dash(in_hidden_2) .dot( out_hidden_1.T )
    
    grad_b2 = error_grad_upto_H2 * sig_dash(in_hidden_2)
    
    #-----------------------------------------
    
    error_grad_upto_H1 = np.sum(error_grad_upto_H2 * sig_dash(in_hidden_2) * w2, axis = 0) .reshape((-1, 1))
    
    grad_w1 = error_grad_upto_H1 * relu_dash(in_hidden_1, leak = 0.1) .dot( x.T )
    
    grad_b1 = error_grad_upto_H1 * relu_dash(in_hidden_1, leak = 0.1)
    
#-----------------------------------------Updating weights and biases with SGD---------------------------------

    update_w1 = - learning_rate * grad_w1
    w1 += update_w1
    
    update_b1 = - learning_rate * grad_b1
    b1 += update_b1
    
    update_w2 = - learning_rate * grad_w2
    w2 += update_w2
    
    update_b2 = - learning_rate * grad_b2
    b2 += update_b2
    
    update_w3 = - learning_rate * grad_w3
    w3 += update_w3
    
    update_b3 = - learning_rate * grad_b3
    b3 += update_b3

loss before training is 0.1280710759216072 -- epoch number 1


loss before training is 0.1280298111486201 -- epoch number 2


loss before training is 0.12798846929994118 -- epoch number 3


loss before training is 0.12794705020473168 -- epoch number 4


loss before training is 0.12790555369182613 -- epoch number 5


loss before training is 0.1278639795897324 -- epoch number 6


loss before training is 0.12782232772663207 -- epoch number 7


loss before training is 0.12778059793038052 -- epoch number 8


loss before training is 0.12773879002850724 -- epoch number 9


loss before training is 0.12769690384821603 -- epoch number 10


loss before training is 0.1276549392163852 -- epoch number 11


loss before training is 0.12761289595956804 -- epoch number 12


loss before training is 0.12757077390399263 -- epoch number 13


loss before training is 0.12752857287556285 -- epoch number 14


loss before training is 0.12748629269985778 -- epoch number 15


loss before training is 0.127443933202

--------------------------------

**Forward feed after training**

In [21]:
in_hidden_1 = w1.dot(x) + b1
out_hidden_1 = relu(in_hidden_1, leak = 0.1)

in_hidden_2 = w2.dot(out_hidden_1) + b2
out_hidden_2 = sig(in_hidden_2)

in_output_layer = w3.dot(out_hidden_2) + b3
y_hat = sig(in_output_layer)

y_hat

array([[0.2107099 ],
       [0.8395499 ],
       [0.86775392],
       [0.74933989]])

In [22]:
y

array([[0.21],
       [0.83],
       [0.87],
       [0.75]])

In [23]:
mse(y, y_hat)

2.4296304586091913e-05

In [24]:
w1

array([[ 0.01384127,  0.91505036,  0.22330277,  0.04515728,  0.57988047],
       [ 0.34599997,  0.91801996,  0.456245  ,  0.80945453,  0.63753478],
       [ 0.41832215, -0.04428255,  0.9329199 ,  0.51873531,  0.34767029]])