# Backpropagation

Changes in this version
- Add a `LinearModel` class to make stacking multiple layers easy
- Add backpropagation (`backward` methods on the activations, `Layer` and `LinearModel`)

In [284]:
import numpy as np
# Seed NumPy's global RNG so the random weight initialisation below is reproducible.
np.random.seed(0) # in a notebook, re-run the seed in the same cell that draws the random numbers
# Print arrays with 4 digits after the decimal point.
np.set_printoptions(precision=4)

class Activation:
    """Base class for activation functions.

    Subclasses override ``forward`` (apply the function) and ``backward``
    (derivative of the function). Calling an instance is shorthand for
    ``forward``.
    """

    def __init__(self):
        pass

    def __repr__(self):
        # __repr__ must return a str; returning None makes repr()/print()
        # raise TypeError. The class name is a sensible default.
        return type(self).__name__

    def forward(self, inputs):
        """Apply the activation to ``inputs``; subclasses must override."""
        raise NotImplementedError(
            f"{type(self).__name__} does not implement forward()"
        )

    def backward(self, outputs):
        """Return the activation's derivative; subclasses must override."""
        raise NotImplementedError(
            f"{type(self).__name__} does not implement backward()"
        )

    def __call__(self, arg):
        return self.forward(arg)

# Linear (identity) activation
class Act_Linear(Activation):
    """Identity activation: the output equals the input."""

    def forward(self, inputs):
        """Return ``inputs`` unchanged."""
        return inputs

    def backward(self, outputs):
        """Return the derivative of the identity, which is 1 for every element.

        The result has the same shape as ``outputs`` (rather than a bare
        scalar) so it behaves like the element-wise gradients returned by the
        other activations when it is multiplied with an upstream gradient.
        """
        return np.ones_like(outputs)

    def __repr__(self):
        return "Act_Linear"

class Act_ReLU(Activation):
    """Rectified linear unit: element-wise ``max(0, x)``."""

    def __repr__(self):
        return "Act_ReLU"

    def forward(self, inputs):
        """Return the element-wise maximum of 0 and ``inputs``."""
        rectified = np.maximum(0, inputs)
        return rectified

    def backward(self, outputs):
        """Return 1 where ``outputs`` is strictly positive and 0 elsewhere."""
        is_active = outputs > 0
        return np.where(is_active, 1, 0)

class Act_Tanh(Activation):
    """Hyperbolic tangent activation."""

    def __repr__(self):
        return "Act_Tanh"

    def forward(self, inputs):
        """Return ``tanh(inputs)`` element-wise."""
        activated = np.tanh(inputs)
        return activated

    def backward(self, outputs):
        """Return the derivative expressed through the output y: 1 - y**2."""
        squared = outputs**2
        return 1 - squared

class Act_Sigmoid(Activation):
    """Logistic sigmoid activation."""

    def __repr__(self):
        return "Act_Sigmoid"

    def forward(self, inputs):
        """Return ``1 / (1 + exp(-inputs))`` element-wise."""
        denominator = 1 + np.exp(-inputs)
        return 1 / denominator

    def backward(self, outputs):
        """Return the derivative expressed through the output y: y * (1 - y)."""
        complement = 1 - outputs
        return outputs * complement

class Act_Softmax(Activation):
    """Softmax over the last axis of the input."""

    def forward(self, inputs):
        """Return the softmax of ``inputs`` along the last axis.

        Each row of a batched input is normalised on its own, so every row
        sums to 1. The per-row maximum is subtracted before exponentiating;
        this leaves the result unchanged mathematically but keeps ``exp``
        from overflowing on large values.
        """
        inputs = np.asarray(inputs)
        # A 0-d input has no axis to reduce over; fall back to a full reduction.
        axis = -1 if inputs.ndim else None
        shifted = inputs - np.max(inputs, axis=axis, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=axis, keepdims=True)

    def backward(self, outputs):
        """Return the softmax Jacobian computed from the softmax output y.

        J[..., i, k] = y_i * (1 - y_i) when i == k
                     = -y_i * y_k      when i != k

        For ``outputs`` of shape (..., n) the result has shape (..., n, n).
        Unlike the element-wise activations this is a full matrix per sample,
        because every softmax output depends on every input.
        """
        outputs = np.asarray(outputs)
        n = outputs.shape[-1]
        identity = np.eye(n)
        return outputs[..., :, None] * (identity - outputs[..., None, :])

    def __repr__(self):
        return "Act_Softmax"

class Layer:
    """Fully connected layer computing ``activation(inputs @ weights.T + biases)``.

    Args:
        n_inputs: Number of input features (columns of ``weights``).
        n_neurons: Number of neurons (rows of ``weights``).
        activation_fn: Activation class (or zero-argument factory) to
            instantiate, an already-built ``Activation`` instance, or ``None``
            for ``Act_Linear``.
        weights: Optional preset weight matrix; when omitted, small random
            weights of shape ``(n_neurons, n_inputs)`` are drawn.
        biases: Optional preset biases; when omitted, zeros of shape
            ``(1, n_neurons)`` are used.

    Raises:
        ValueError: If the activation is a softmax (apply it after the output).
    """

    def __init__(self, n_inputs, n_neurons, activation_fn, weights=None, biases=None):
        if activation_fn is None:
            activation_fn = Act_Linear

        # Accept a class/factory (instantiate it) or a ready-made instance.
        if isinstance(activation_fn, Activation):
            activation = activation_fn
        else:
            activation = activation_fn()

        # isinstance also rejects softmax subclasses and instances, not just
        # the Act_Softmax class object itself.
        if isinstance(activation, Act_Softmax):
            raise ValueError("Softmax is not supported as an activation function, use it after the output")

        if weights is None:
            # scale the standard-normal draws by 0.1 to start with small weights
            self.weights = 0.1 * np.random.randn(n_neurons, n_inputs)
        else:
            self.weights = weights  # preset weights, used to check the code against known values

        if biases is None:
            self.biases = np.zeros((1, n_neurons))
        else:
            self.biases = biases

        self.activation = activation
        self.inputs = []
        self.output = None  # set by forward(); backward() needs it
        self.grad_act = []
        self.grad_new = []
        self.grad_weights = []
        self.grad_biases = []

    def forward(self, inputs):
        """Run the layer on ``inputs`` and cache inputs/output for backward()."""
        self.inputs = inputs
        output_raw = np.dot(self.inputs, self.weights.T) + self.biases
        self.output = self.activation.forward(output_raw)
        return self.output

    def backward(self, prev_grad):
        """Back-propagate ``prev_grad`` (gradient w.r.t. this layer's output).

        Stores ``grad_weights`` and ``grad_biases`` on the layer and returns
        the gradient w.r.t. the layer's inputs.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if getattr(self, "output", None) is None:
            raise RuntimeError("Layer.backward() called before forward()")

        self.grad_act = self.activation.backward(self.output)  # gradient of the activation fn
        self.grad_new = np.multiply(prev_grad, self.grad_act)
        self.grad_weights = np.dot(self.grad_new.T, self.inputs)
        # biases are shared by every sample, so sum the gradient over the batch axis
        self.grad_biases = np.sum(self.grad_new, axis=0, keepdims=True)
        return np.dot(self.grad_new, self.weights)

    def __call__(self, arg):
        return self.forward(arg)

    def __repr__(self):
        # Adjacent f-strings instead of backslash continuations, which would
        # embed the source indentation in the resulting string.
        return (
            f"Layer(n_inp={self.weights.shape[1]}, "
            f"n_neurons={self.weights.shape[0]}, "
            f"activation_fn={self.activation.__repr__()})"
        )

# Sequential container that chains layers one after another
class LinearModel:
    """Ordered stack of layers applied in sequence.

    The positional arguments are stored, in order, in ``self.layers``.
    """

    def __init__(self, *args):
        self.layers = list(args)

    def __repr__(self):
        rows = "".join(f"{repr(layer)}\n" for layer in self.layers)
        return "LinearModel(\n" + rows + ")"

    def __call__(self, arg):
        return self.forward(arg)

    def forward(self, arg):
        """Feed ``arg`` through every layer in order and return the final output."""
        current = arg
        for stage in self.layers:
            current = stage(current)
        return current

    def backward(self, grad):
        """Pass ``grad`` through each layer's ``backward`` from last to first.

        Returns whatever the first layer's ``backward`` returns.
        """
        # TODO: how to handle different dimension data for the prev gradient
        for stage in reversed(self.layers):
            grad = stage.backward(grad)
        return grad

In [285]:
# Smoke test: a single tanh layer with 3 inputs and 4 neurons, applied to one sample.
l = Layer(3, 4, Act_Tanh)
inp = np.array([[1.,2.,3.]])  # one row with three features
l(inp)  # calling the layer runs forward()

array([[0.5006, 0.2954, 0.0338, 0.4669]])

In [286]:
# Back-propagate an upstream gradient of 1.0; the (1, 1) array is broadcast
# against the activation gradient inside Layer.backward's np.multiply.
l.backward(np.array([[1.0]]))
l.grad_weights  # gradient w.r.t. the weights, stored on the layer by backward()

array([[0.7494, 1.4989, 2.2483],
       [0.9128, 1.8255, 2.7383],
       [0.9989, 1.9977, 2.9966],
       [0.782 , 1.5639, 2.3459]])

In [287]:
# Three-layer network: 3 -> 4 (ReLU) -> 5 (tanh) -> 2 (sigmoid).
m = LinearModel(
    Layer(3, 4, Act_ReLU),
    Layer(4, 5, Act_Tanh),
    Layer(5, 2, Act_Sigmoid)
)

# Run forward first so every layer caches its inputs and output, then push a
# hand-picked gradient w.r.t. the two outputs back through the stack.
m(np.array([[1.,2.,3.]]))
m.backward(np.array([[-1.0, 2.0]]))

array([[-1.7500e-03, -3.0817e-03,  1.2401e-05]])

## Testing my solution with PyTorch

In [288]:
import numpy as np

# Your existing code for Activation, Layer, LinearModel here...

# Build a fresh three-layer network: 3 -> 4 (ReLU) -> 5 (tanh) -> 2 (sigmoid)
m = LinearModel(
    Layer(3, 4, Act_ReLU),
    Layer(4, 5, Act_Tanh),
    Layer(5, 2, Act_Sigmoid),
)

# One input sample and the gradient of the loss w.r.t. the model output
input_np = np.array([[1., 2., 3.]])
grad_output = np.array([[-1.0, 2.0]])

# Forward pass, then backward pass so each layer stores its gradients
output_custom = m(input_np)
m.backward(grad_output)

# Gather every layer's parameter gradients, in layer order
custom_grads = [
    {"weights": layer.grad_weights, "biases": layer.grad_biases}
    for layer in m.layers
]

In [289]:
# This code was generated by DeepSeek
import torch
import torch.nn as nn

# Seed PyTorch's RNG (the initial parameters are overwritten below anyway)
torch.manual_seed(0)

# PyTorch model with the same architecture as the custom one
pytorch_model = nn.Sequential(
    nn.Linear(3, 4),    # Layer 1 (weights: 4x3, biases: 4)
    nn.ReLU(),
    nn.Linear(4, 5),    # Layer 2 (weights: 5x4, biases: 5)
    nn.Tanh(),
    nn.Linear(5, 2),    # Layer 3 (weights: 2x5, biases: 2)
    nn.Sigmoid(),
)

# Overwrite the PyTorch parameters with the custom model's weights and biases.
# Positions 0, 2 and 4 of the Sequential are the Linear layers.
with torch.no_grad():
    for position, custom_layer in zip([0, 2, 4], m.layers):
        torch_linear = pytorch_model[position]
        torch_linear.weight.data = torch.from_numpy(custom_layer.weights.astype(np.float32))
        # squeeze(0) drops the leading axis of the (1, n_neurons) bias array
        torch_linear.bias.data = torch.from_numpy(custom_layer.biases.squeeze(0).astype(np.float32))

# Forward pass on the same input
input_torch = torch.from_numpy(input_np.astype(np.float32))
output_torch = pytorch_model(input_torch)

# Backward pass seeded with the same upstream gradient as the custom model
upstream = torch.tensor(grad_output, dtype=torch.float32)
output_torch.backward(gradient=upstream)

# Collect the Linear layers' gradients in the same layout as custom_grads
pytorch_grads = [
    {
        "weights": pytorch_model[i].weight.grad.numpy(),
        "biases": pytorch_model[i].bias.grad.numpy().reshape(1, -1),  # match the (1, n_neurons) shape
    }
    for i in [0, 2, 4]
]

In [290]:
# Print both gradient lists, separated by a banner, for a visual comparison.
print(custom_grads)
print('\n-----------------------------*******===-------------------===******-------------\n')
print(pytorch_grads)

[{'weights': array([[0.0014, 0.0027, 0.0041],
       [0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    ]]), 'biases': array([[0.0014, 0.    , 0.    , 0.    ]])}, {'weights': array([[ 0.0033,  0.    ,  0.    ,  0.    ],
       [ 0.0015,  0.    ,  0.    ,  0.    ],
       [ 0.0078,  0.    ,  0.    ,  0.    ],
       [-0.0009,  0.    ,  0.    ,  0.    ],
       [ 0.0047,  0.    ,  0.    ,  0.    ]]), 'biases': array([[ 0.0316,  0.0149,  0.0756, -0.0085,  0.0454]])}, {'weights': array([[ 0.0042, -0.0019, -0.001 ,  0.0008, -0.0012],
       [-0.0084,  0.0037,  0.0021, -0.0016,  0.0024]]), 'biases': array([[-0.25,  0.5 ]])}]

-----------------------------*******===-------------------===******-------------

[{'weights': array([[0.0014, 0.0027, 0.0041],
       [0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    ]], dtype=float32), 'biases': array([[0.0014, 0.    , 0.    , 0.    ]], dtype=float32)}, {'weights': array([[ 

**My backprop seems correct!**