In [183]:
import numpy as np
import pandas as pd
from pprint import *

In [184]:
df = pd.DataFrame([[8,8,4],[7,9,5],[6,10,6],[5,12,7]], columns=['cgpa','profile_score','lpa'])

In [185]:
df

Unnamed: 0,cgpa,profile_score,lpa
0,8,8,4
1,7,9,5
2,6,10,6
3,5,12,7


In [186]:
class Layer:
  def __init__(self, nodes,input_matrix):
    self.nodes = nodes
    self.input_matrix = input_matrix
    self.w = np.ones((self.input_matrix.shape[1],self.nodes))*0.1
    self.b = np.zeros((self.nodes))

  def forward_prop(self):
    dot = np.dot(self.input_matrix, self.w)
    self.output = dot + self.b
    return self.output

N = no. of example inputs  
M = no. of features  
P = no. of nodes in l1  
Q = no. of nodes in l2 (in regression this is always 1)

$X_{N \times M}$  
$W^{[1]}_{M \times P} \; and \;b^{[1]}_P$ b = 1D array  
$A^{[1]}_{N \times P}$  
$W^{[2]}_{P \times Q} \; and \;b^{[2]}_Q$ b = 1D array  
$A^{[2]}_{N \times Q}$ or Prediction matrix

In [187]:
def compute_loss(predictions, target):
    # Compute the squared errors element-wise
    squared_errors = (predictions - target) ** 2

    # Calculate the mean of these squared errors
    # Flatten the matrix to count all elements
    loss = (1 / (2 * predictions.size)) * squared_errors.sum()

    return loss

def g_loss(predictions, targets):
  # Compute the error
  error = predictions - targets

  # Compute the gradient of the loss with respect to predictions
  # For MSE: grad_loss = (1 / N) * error
  N = len(predictions)
  grad_loss = error / N
  return grad_loss

$$Loss = \frac{1}{2N}\sum_{n=1}^{N}(y_n-\hat{y}_n)^2_{N \times 1}$$
Note that this is a 2D matrix of size $N \times 1$  
The $\sum$ doesnt really stand for summation here, it just means for each n value calculate..

\begin{align}
  \frac{\partial{L}}{\partial{W^{[2]}}}&=\frac{\partial{L}}{\partial{\hat{y}}}\cdot\frac{\partial{\hat{y}}}{\partial{W^{[2]}}}\\
  &= (a^{[1]}_{N \times P})^T \cdot [\frac{1}{N}(y_n-\hat{y_n}))]_{N\times 1}
  \end{align}
\
\begin{align}
  \frac{\partial{L}}{\partial{b^{[2]}}}&=\frac{\partial{L}}{\partial{\hat{y}}}\cdot\frac{\partial{\hat{y}}}{\partial{b^{[2]}}}\\
  &=\frac{1}{N}(y_n-\hat{y_n})
  \end{align}

In case of $\frac{\partial{L}}{\partial{b^{[2]}}}$, the summation does really stand for what it is, we need to sum the losses for all the examples and the graident should be **1D array** with one element.

In [188]:
def derivative_w2(grad_loss, l1_output):
  gradient_w2 = np.dot(l1_output.T,grad_loss)
  return gradient_w2
def derivative_b2(grad_loss):
  gradient_b2 = np.mean(grad_loss, axis=0)
  return gradient_b2

In [189]:
def gradient_step_l2(learning_rate,w2,b2,grad_loss,l1_output):
  w2 = w2 - learning_rate*derivative_w2(grad_loss, l1_output)
  b2 = b2 - learning_rate*derivative_b2(grad_loss)
  return w2, b2

\begin{align}
  \frac{\partial{L}}{\partial{W^{[1]}}}&=\frac{\partial{L}}{\partial{\hat{y}}}\cdot\frac{\partial{\hat{y}}}{\partial{A^{[1]}}}\cdot\frac{\partial{A^{[1]}}}{\partial{W^{[1]}}}\\
  &= (X_{N\times M})^T \cdot [\frac{1}{N}(y_n-\hat{y_n})]_{N\times 1}\cdot (W^{[2]}_{P \times 1})^T \;\;(Q\,has\,been\,substituted\,for\,1)
  \end{align}

\begin{align}
  \frac{\partial{L}}{\partial{b^{[1]}}}&=\frac{\partial{L}}{\partial{\hat{y}}}\cdot\frac{\partial{\hat{y}}}{\partial{A^{[1]}}}\cdot\frac{\partial{A^{[1]}}}{\partial{b^{[1]}}}\\
  &=  [\frac{1}{N}(y_n-\hat{y_n})]_{N\times 1}\cdot (W^{[2]}_{P \times 1})^T \;\;(Q\,has\,been\,substituted\,for\,1)
  \end{align}

Here the final matrix for $\frac{\partial{L}}{\partial{W^{[1]}}}$ is $M\times P$ and the final matrix for $\frac{\partial{L}}{\partial{b^{[1]}}}$ is $N \times P$ but the dimension of b is $P$ which is a 1D array, hence we need to sum along the columns of $N \times P$ matrix which will give us a 1D array of $P$ length.

In [190]:
def derivative_w1(grad_loss, w2, X):
  gradient_w1 = np.dot(np.dot(X.T, grad_loss),w2.T)
  return gradient_w1
def derivative_b1(grad_loss, w2):
  gradient_b1 = np.mean(np.dot(grad_loss,w2.T), axis=0) # gives 1d array of P length
  return gradient_b1

In [191]:
def gradient_step_l1(learning_rate,w1,b1,w2,X,grad_loss):
  w1 = w1 - learning_rate*derivative_w1(grad_loss,w2,X)
  b1 = b1 - learning_rate*derivative_b1(grad_loss, w2)
  return w1, b1

In [192]:
X = np.array([[8,8],[7,9],[6,10],[5,12]])

l1 = Layer(2, X)
a_1 = l1.forward_prop()
l2 = Layer(1,a_1)
a_2 = l2.forward_prop()

In [193]:
for i in range(100):

  a_1 = l1.forward_prop()
  a_2 = l2.forward_prop()

  loss = compute_loss(a_2, np.array([[4],[5],[6],[7]]))
  grad_loss = g_loss(a_2, np.array([[4],[5],[6],[7]]))

  print(f'loss = {loss}')

  l1.w, l1.b = gradient_step_l1(0.001, l1.w, l1.b, l2.w, X, grad_loss)
  l2.w, l2.b = gradient_step_l2(0.001, l2.w, l2.b, grad_loss, a_1)

loss = 14.007850000000001
loss = 13.85888152070699
loss = 13.705298800084005
loss = 13.546812267637408
loss = 13.383147490755938
loss = 13.21404670800729
loss = 13.039270589178804
loss = 12.858600212564244
loss = 12.671839246491372
loss = 12.478816317894491
loss = 12.279387545876954
loss = 12.073439212717625
loss = 11.860890538718444
loss = 11.641696520765873
loss = 11.415850787625258
loss = 11.183388417983965
loss = 10.94438866033326
loss = 10.698977487202765
loss = 10.447329910352927
loss = 10.189671978648226
loss = 9.926282376865752
loss = 9.657493542047769
loss = 9.383692214590768
loss = 9.105319344463302
loss = 8.822869279099372
loss = 8.536888168884017
loss = 8.247971538886848
loss = 7.956760991621945
loss = 7.663940024968118
loss = 7.370228971633308
loss = 7.07637909115227
loss = 6.78316587162837
loss = 6.491381625341182
loss = 6.20182748885679
loss = 5.915304963200069
loss = 5.632607151728504
loss = 5.354509871357489
loss = 5.081762825601613
loss = 4.815081034573447
loss = 4.55

In [194]:
a1 = l1.forward_prop()
a2 = l2.forward_prop()
print(a2)

[[5.37221978]
 [5.37221978]
 [5.37221978]
 [5.70448124]]
