##### Implementing Backpropagation From Scratch on Python 3+
- Let's see if theory and practice are the same thing.

In [2]:
import numpy as np
from sympy import *

In [3]:
# Provide sigmoid and sigmoid_derivative defined function
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z) * (1 - sigmoid(z))

In [4]:
# Provide some structure of the network
class Network:
    def __init__(self, structure):   
        # A list that contains the number of neurons in each layer of the network
        self.structure = structure
        # Number of layers in the network
        self.num_layers = len(structure)
        # A list of all the bias vectors in the network, _n indicates it has all the bias vectors
        self.B_n = [np.random.randn(l, 1) for l in structure[1:]]
        # A list of all the weight vectors in the network
        self.W_n = [np.random.randn(l, next_l) for l, next_l in zip(structure[:-1], structure[1:])]
        
    # Call forth all the equations for forward and backward process
    def backprop(self, x, y):
        # Initialize bias vector & weights matrix of each layer in the network
        e_Je_B_ns = [np.zeros(b.shape) for b in self.B_n]
        e_Je_W_ns = [np.zeros(W.shape) for W in self.W_n]
        
        # Forward pass
        # Create two lists that contain all the neuron values before & after activation        
        Z_n, A_n = [], []
        
        # Forward pass layer by layer from L=0 thru L=H
        for b, W in zip(self.B_n, self.W_n):
            a = x
            z = np.dot(W.T, a) + b
            a = sigmoid(z)
            
            Z_n.append(z)
            A_n.append(a)             
            x = a
            
        # H : output layer
        H = self.num_layers - 2
        
        # backpropagation
        for L in range(H, -1, -1):
            if L != H:
                delta = sigmoid_derivative(Z_n[L]) * np.dot(self.W_n[L+1], delta)
            else:
                delta = sigmoid_derivative(Z_n[L]) * (A_n[L] - y)
                
            e_Je_B_ns[L] = delta
            
            if L != 0:
                e_Je_W_ns[L] = np.dot(A_n[L-1], delta.T)    
            else:
                e_Je_W_ns[L] = np.dot(x, delta.T) 
        
        return e_Je_B_ns, e_Je_W_ns
    
    # Gradient Descent
    def gradient_descent(self, mini_batch, learning_rate):
        # Initialize bias vector & weights matrix of each layer in the network
        e_Je_B_n = [np.zeros(b.shape) for b in self.B_n]
        e_Je_W_n = [np.zeros(W.shape) for W in self.W_n]
        
        for x, y in mini_batch:
            e_Je_B_ns, e_Je_W_ns = self.backprop(x, y)
            e_Je_B_n = [e_Je_b + e_Je_b_s for e_Je_b, e_Je_b_s in zip(e_Je_B_n, e_Je_B_ns)] 
            e_Je_W_n = [e_Je_W + e_Je_W_s for e_Je_W, e_Je_W_s in zip(e_Je_W_n, e_Je_W_ns)]
            
        d = len(mini_batch)
        self.W_n = [W - learning_rate/d * e_Je_W for W, e_Je_W in zip(self.W_n, e_Je_W_n)]
        self.B_n = [b - learning_rate/d * e_Je_b for b, e_Je_b in zip(self.B_n, e_Je_B_n)]
        
    def train(self, epochs, training_data, learning_rate):
        for j in range(epochs):
            for mini_batch in training_data:
                self.gradient_descent(mini_batch, learning_rate)               

In [7]:
# Insert data
np.random.seed(2023)
my_net = Network([3, 2, 2])

print("Initial Weights:")
print(my_net.W_n[0])

Initial Weights:
[[-0.10215984 -1.14129263]
 [ 2.65440726  1.44060519]
 [ 0.09890227 -3.12153215]]


In [14]:
np.random.seed(2023)
random_vectors = lambda dim, cnt : [np.random.rand(dim, 1) for i in range(cnt)]
a = list(zip(random_vectors(4, 4), random_vectors(2, 4)))
for i in a[:3]:
    print(i)
    print()

(array([[0.3219883 ],
       [0.89042245],
       [0.58805226],
       [0.12659609]]), array([[0.33795869],
       [0.18032328]]))

(array([[0.14134122],
       [0.46789559],
       [0.02208966],
       [0.72727471]]), array([[0.3909914 ],
       [0.03564821]]))

(array([[0.52438734],
       [0.54493524],
       [0.45637326],
       [0.50138226]]), array([[0.56486165],
       [0.20346149]]))



In [15]:
my_net.gradient_descent(a, 2.0)

ValueError: shapes (2,3) and (4,1) not aligned: 3 (dim 1) != 4 (dim 0)

In [6]:
# The following generates a list of cnt vectors of length dim.
np.random.seed(2023)
random_vectors = lambda dim, cnt : [np.random.rand(dim, 1) for i in range(cnt)]
random_batch = list(zip(random_vectors(3, 64), random_vectors(2, 64)))

my_net.gradient_descent(random_batch, 3.0)
print("Optimized Weights:")
print(my_net.W_n[0])

ValueError: operands could not be broadcast together with shapes (3,2) (2,2) 

In [23]:
for rand_vector in random_vectors(3, 64)[-2:]:
    print(rand_vector)
    print()

[[0.37214444]
 [0.57783618]
 [0.01429624]]

[[0.92490225]
 [0.49228575]
 [0.66290319]]



In [12]:
x = np.array([[1,0], [3,4]])
y = 1.0 / (1.0 + np.exp(-x))
print(x)
print()
print(y)

[[1 0]
 [3 4]]

[[0.73105858 0.5       ]
 [0.95257413 0.98201379]]


In [22]:
x = symbols("x")
fx = 4 * (x**3) + 3 * x
print(fx)

first_deriv = Derivative(fx, x).doit()
print(first_deriv)

value = first_deriv.subs({x:3})
print(value)

4*x**3 + 3*x
12*x**2 + 3
111


In [3]:
x = np.array([2,3,5])
y = np.array([5,3,2])

x * y

array([10,  9, 10])