In [None]:
import numpy as np
import numpy.matlib

# This is referred to above as f(u).
class nn_MSECriterion:
    """Squared-error criterion: f(u) = sum((u - labels) ** 2).

    The loss is summed (not averaged) over every element of the inputs.
    """

    def forward(self, predictions, labels):
        """Return the scalar sum of squared differences.

        Args:
            predictions: array of model outputs.
            labels: array of targets, broadcast-compatible with predictions.
        """
        return np.sum(np.square(predictions - labels))

    def backward(self, predictions, labels):
        """Return the gradient of forward() with respect to predictions.

        Because forward() is a plain sum of squares, the derivative with
        respect to each prediction is exactly 2 * (prediction - label).
        No sample-count factor belongs here: scaling by the number of rows
        would make the gradient disagree with the loss for any batch
        larger than one sample.
        """
        return 2 * (predictions - labels)

# This is referred to above as g(v).
class nn_Sigmoid:
    """Element-wise logistic activation: g(v) = 1 / (1 + exp(-v))."""

    def forward(self, x):
        """Apply the logistic function to every element of x."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def backward(self, x, gradOutput):
        """Scale gradOutput by the sigmoid's derivative evaluated at x.

        Args:
            x: the same input that was given to forward().
            gradOutput: gradient arriving from the next stage, with the
                same shape as forward(x).

        Returns:
            g(x) * (1 - g(x)) * gradOutput, computed element-wise.
        """
        # Nothing is cached by forward(), so the activation is recomputed.
        activation = self.forward(x)
        # np.multiply keeps the product element-wise even for np.matrix
        # inputs, where the * operator would mean a matrix product.
        local_slope = np.multiply(activation, 1 - activation)
        return np.multiply(local_slope, gradOutput)

# This is referred to above as h(W, b)
class nn_Linear:
    """Fully connected layer: h(W, b) = x . W + b.

    Inputs are expected to be 2-D with one sample per row, i.e. shape
    (batch, input_dim); outputs have shape (batch, output_dim).
    """

    def __init__(self, input_dim, output_dim):
        """Create a layer mapping input_dim features to output_dim features."""
        # Standard-normal samples scaled by 0.01, i.e. a standard deviation
        # of 0.01. np.matlib.randn returns np.matrix objects.
        self.weight = np.matlib.randn(input_dim, output_dim) * 0.01
        self.bias = np.matlib.randn(1, output_dim) * 0.01
        # Gradients from the most recent backward() call; zero until then.
        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = np.zeros_like(self.bias)

    def forward(self, x):
        """Return x . W + b, with the bias row broadcast over every sample."""
        return np.dot(x, self.weight) + self.bias

    def backward(self, x, gradOutput):
        """Store the parameter gradients and return the input gradient.

        Args:
            x: the input that was given to forward(), shape
                (batch, input_dim).
            gradOutput: gradient of the loss with respect to forward(x),
                shape (batch, output_dim).

        Returns:
            Gradient of the loss with respect to x, shape
            (batch, input_dim).
        """
        # dL/dW = x^T . dL/dh
        self.gradWeight = np.dot(x.T, gradOutput)
        # dL/db: the single bias row is added to every sample in forward(),
        # so its gradient is the sum of gradOutput over the batch axis.
        # This keeps gradBias at the bias shape (1, output_dim) for any
        # batch size. np.asarray is needed because np.matrix.sum does not
        # accept keepdims.
        self.gradBias = np.asarray(gradOutput).sum(axis=0, keepdims=True)
        # dL/dx = dL/dh . W^T
        return np.dot(gradOutput, self.weight.T)

    def getParameters(self):
        """Return ([weight, bias], [gradWeight, gradBias]).

        The lists hold the objects bound at call time; backward() rebinds
        the gradient attributes, so call this again after each backward().
        """
        params = [self.weight, self.bias]
        gradParams = [self.gradWeight, self.gradBias]
        return params, gradParams
    

In [None]:
# Step size for the per-sample gradient-descent updates below.
learningRate = 0.1

# Two-layer network: 4 inputs -> 5 hidden units -> 3 outputs, with a sigmoid
# after each linear layer and a summed squared-error criterion on top.
model = {}
model['linear1'] = nn_Linear(4, 5)
model['linear2'] = nn_Linear(5, 3)
model['sigmoid'] = nn_Sigmoid()
model['loss'] = nn_MSECriterion()

epochsToRun = 401
# NOTE: the upper bound is inclusive, so epochs 0..epochsToRun all run.
for epoch in range(0, epochsToRun + 1):
    loss = 0
    for i in range(0, dataset_size):
        # Slice rather than index so each sample keeps its leading axis.
        xi = x[i:i+1, :]
        yi = y[i:i+1, :]

        # Forward layer 1
        a0_L1 = model['linear1'].forward(xi)
        a1_L1 = model['sigmoid'].forward(a0_L1)
        # Forward layer 2
        a0_L2 = model['linear2'].forward(a1_L1)
        a1_L2 = model['sigmoid'].forward(a0_L2)

        loss += model['loss'].forward(a1_L2, yi)

        # Backward layer 2. Each backward() takes the gradient of the loss
        # with respect to that module's output and returns the gradient
        # with respect to its input, so the calls chain in reverse order.
        da1_L2 = model['loss'].backward(a1_L2, yi)
        da0_L2 = model['sigmoid'].backward(a0_L2, da1_L2)
        da1_L1 = model['linear2'].backward(a1_L1, da0_L2)

        # Backward layer 1. The criterion is applied only once, at the
        # network output; the hidden layer continues the chain from the
        # gradient returned by linear2.
        da0_L1 = model['sigmoid'].backward(a0_L1, da1_L1)
        model['linear1'].backward(xi, da0_L1)

        # Gradient-descent step. Every gradient above was computed with the
        # pre-update weights, so the order of the layers does not matter.
        for layer in (model['linear2'], model['linear1']):
            layer.weight = layer.weight - learningRate * layer.gradWeight
            layer.bias = layer.bias - learningRate * layer.gradBias

    # Report the mean per-sample loss every 100 epochs and on the last one.
    if epoch % 100 == 0 or epoch == epochsToRun:
        print('epoch[%d] = %.8f' % (epoch, loss / dataset_size))
        print('************')
        