![img](https://drive.google.com/uc?id=1TYfSkbrOKiAKwXF_sAF_XyMnk4ShfPJP)



In [2]:
%run "BackpropModule.ipynb"

Populating the interactive namespace from numpy and matplotlib


In [3]:
# PACKAGE
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Here is the activation function and its derivative.
sigma = lambda z : 1 / (1 + np.exp(-z))
d_sigma = lambda z : np.cosh(z/2)**(-2) / 4
    

# This is the cost function of a neural network with respect to a training set.
def cost(x, y) :
    """
    network_function(x)[-1] is the last layer a3
    division by x.size... isn't that always equal to one in this scenario ?? why divide ??
    """
    return np.linalg.norm(network_function(x)[-1] - y)**2 / x.size


In [5]:
# LAYER 3

def J_W3 (y) :
    # First get all the activations and weighted sums at each layer of the network.
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x)
    
    # dC/da3,
    J = 2 * (a3 - y) # layer 3 has 2 nodes: [2 x 1] 
    
    # da3/dz3
    J = J * d_sigma(z3)
    
    # dz3/dW3
    J = J @ a2.T # [2 x 1] x [1 x 2] = [2 x 2]
    
    # then divide by the number of training examples, for the average over all training examples.
    J = J / m 
    print('shape J_W3: ', J.shape)
    
    return J


def J_b3 (y) :
    # As last time, we'll first set up the activations.
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x)
    
    # dC/da3
    J = 2 * (a3 - y) # [2 x 1] 
    
    # da3/dz3
    J = J * d_sigma(z3)
    
    # dz3/db3
    J = J 
    
    # sum over all training examples however.
    J = np.sum(J, axis=1, keepdims=True) / m
    print('shape J_b3: ', J.shape)
    
    return J

In [6]:
# LAYER 2

def J_W2 (y) :
    #The first two lines are identical to in J_W3.
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x) 
    
    # dC/da3
    J = 2 * (a3 - y) # [2 x 1] 
    
    # da3/dz3
    J = J * d_sigma(z3) # [2 x 1] 
    
    # dz3/da2
    # layer 3 has two nodes thus W3 = [2 x n2]
    J = (J.T @ W3).T # [1 x 2] x [2 x n2] = [1 x n2]
    
    # da2/dz2
    J = J * d_sigma(z2)
    
    # dz2/dW2
    J = J @ a1.T # [1 x n2] x [n2 x 1]  = [1 x 1]
    
    # then divide by the number of training examples, for the average over all training examples.
    J = J / m 
    
    return J


def J_b2 (y) :
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x)
    
    # dC/da3
    J = 2 * (a3 - y) # [2 x 1] 
    
    # da3/dz3
    J = J * d_sigma(z3) # [2 x 1] 
    
    # dz3/da2
    # layer 3 has two nodes thus W3 = [2 x n2]
    J = (J.T @ W3).T # [1 x 2] x [2 x n2] = [1 x n2]
    
    # da2/dz2
    J = J * d_sigma(z2)
    
    # dz2/db2
    J = J 
    
    # sum over all training examples
    J = np.sum(J, axis=1, keepdims=True) / x.size
    
    return J

In [7]:
# LAYER 1

def J_W1 (y) :
    #The first two lines are identical to in J_W3.
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x) 
    
    # dC/da3
    J = 2 * (a3 - y) # [2 x 1] 
    
    ##### da3/dz3 #####################################
    J = J * d_sigma(z3) # [2 x 1] 
    
    # dz3/da2
    # layer 3 has 2 nodes: W3 = [2 x n2]
    J = (J.T @ W3).T # [1 x 2] x [2 x n2] = [1 x n2].T = [n2 x 1]
    ###################################################
    
    ##### da2/z2 ######################################
    J = J * d_sigma(z2) # [n2 x 1]
    
    # dz2/da1
    # layer 2 has n2 nodes thus W2 = [n2 x n1]
    J = (J.T @ W2).T # [1 x n2] x [n2 x n1] = [1 x n1].T = [n1 x 1]
    ###################################################
    
    # da1/dz1
    J = J * d_sigma(z1) # [n1 x 1]
    
    # dz1/dW1
    # a0 shape is [m x 1]
    
    J = J @ a0.T # [n1 x 1] x [1 x m]  = [n1 x m]
    
    # then divide by the number of training examples, for the average over all training examples.
    J = J / m 
    
    return J
  

def J_b1 (y) :
    a0, z1, a1, z2, a2, z3, a3 = feed_forward(x)

    # dC/da3
    J = 2 * (a3 - y) # [2 x 1] 
    
    ##### da3/dz3 ######################################
    J = J * d_sigma(z3) # [2 x 1] 
    
    # dz3/da2
    # layer 3 has two nodes: W3 = [2 x n2]
    J = (J.T @ W3).T # [1 x 2] x [2 x n2] = [1 x n2].T = [n2 x 1]
    ####################################################
    
    ##### da2/z2 #######################################
    J = J * d_sigma(z2) # [n2 x 1]
    
    # dz2/da1
    # layer 2 has n2 nodes thus W2 = [n2 x n1]
    J = (J.T @ W2).T # [1 x n2] x [n2 x n1] = [1 x n1].T = [n1 x 1]
    ####################################################
    
    # da1/dz1
    J = J * d_sigma(z1) # [n1 x 1]
    
    # dz1/db1
    J = J 
    
    J = np.sum(J, axis=1, keepdims=True) / x.size
    return J

In [8]:
global W1, W2, W3, b1, b2, b3
def backprop(x, y, iterations=1, aggression=3.5, noise=1) :
    global W1, W2, W3, b1, b2, b3
    
    while iterations>=0:
        # compute
        j_W1 = J_W1(y)
        j_W2 = J_W2(y)
        j_W3 = J_W3(y)
        j_b1 = J_b1(y)
        j_b2 = J_b2(y)
        j_b3 = J_b3(y)

        # update
        W1 = W1 - j_W1 
        W2 = W2 - j_W2 
        W3 = W3 - j_W3 
        b1 = b1 - j_b1 
        b2 = b2 - j_b2 
        b3 = b3 - j_b3 

#         if (iterations%100==0) :
#             nf = network_function(x)[-1]
    
        iterations -= 1

    a3 = feed_forward(x)[-1]
    return a3

In [9]:
np.random.seed(seed=123)

def training_data (M = 5) :
    x = np.arange(0,1,1/M) # nbr btw 0 and 1 in 1/N steps
    
    y = np.array([0,1])
    
    x = np.reshape(x, (M, 1))
    print('x shape: ', x.shape)
    
    y = np.reshape(y, (2, 1))
     
    m = x.size
    
    return x, y, m
  
# This function feeds forward each activation to the next layer. It returns all weighted sums and activations.
def feed_forward(a0) :
    
    # [3 x m] x [m x 1] + [3 x 1]
    z1 = W1 @ a0 + b1
    a1 = sigma(z1) # [3 x 1]
    
    # [2 x 3] x [3 x 1] + [2 x 3]
    z2 = W2 @ a1 + b2
    a2 = sigma(z2) # [2 x 1]
    
    # [2 x 2] x [2 x 1] + [2 x 1]
    z3 = W3 @ a2 + b3
    a3 = sigma(z3) # [2 x 1]
    
    print('shape a1', a1.shape)
    print('shape a2', a2.shape)
    print('shape a3', a3.shape)
    
    return a0, z1, a1, z2, a2, z3, a3
  
  
# This function initialises the network with it's structure, it also resets any training already done.
# n1 = 6, n2 = 7
def reset_network (n0, n1 = 3, n2 = 2, n3=2, random=np.random) :
    global W1, W2, W3, b1, b2, b3
    W1 = random.randn(n1, n0) / 2 ## 3 x 1
    W2 = random.randn(n2, n1) / 2 ## 2 x 3
    W3 = random.randn(n3, n2) / 2 ## 2 x 2
    print('shape W1', W1.shape)
    print('shape W2', W2.shape)
    print('shape W3', W3.shape)
    
    b1 = random.randn(n1, 1) / 2
    b2 = random.randn(n2, 1) / 2
    b3 = random.randn(n3, 1) / 2
    print('shape b1', b1.shape)
    print('shape b2', b2.shape)
    print('shape b3', b3.shape)

    
# test
# global W1, W2, W3, b1, b2, b3
# x, y, m = training_data()
# reset_network(n0=m)
# a0, z1, a1, z2, a2, z3, a3 = feed_forward(x)

In [10]:
reset_network(n0=5)
x, y, m = training_data()
a3 = backprop(x, y, iterations=1, aggression=7, noise=1)
print(a3)

shape W1 (3, 5)
shape W2 (2, 3)
shape W3 (2, 2)
shape b1 (3, 1)
shape b2 (2, 1)
shape b3 (2, 1)
x shape:  (5, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape J_W3:  (2, 2)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape J_b3:  (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape J_W3:  (2, 2)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
shape J_b3:  (2, 1)
shape a1 (3, 1)
shape a2 (2, 1)
shape a3 (2, 1)
[[0.49492989]
 [0.24723845]]
