# Building a Multi-Layer Perceptron from Scratch in NumPy

**Objective:** This notebook implements a Multi-Layer Perceptron (MLP) from scratch using only NumPy for the model itself (Matplotlib is used solely to plot the loss curve). The goal is to demonstrate the mechanics of forward and backward propagation on a simple single-sample regression task, based on the MLP architecture discussed in the lecture exercise.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Activation Functions

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise.

    The value is computed from exp(-|x|), which always lies in (0, 1], so the
    exponential can never overflow -- the naive form 1 / (1 + exp(-x)) does
    for large negative x. Accepts a scalar or any array-like; a scalar input
    yields a NumPy scalar and an array input yields an array of the same shape.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0: e^x / (1 + e^x).  Both branches
    # are finite for every finite x, so evaluating both inside np.where is safe.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d arrays alone.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at the pre-activation x.

    Uses the identity sigma'(x) = sigma(x) * (1 - sigma(x)), so the sigmoid is
    evaluated only once.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

## MLP Class

In [None]:
class MLP:
    """A 3-2-1 fully connected network with sigmoid activations on both layers.

    Weights are stored as (inputs, outputs) matrices and biases as separate
    row vectors, so a layer computes ``z = a @ W + b`` with one sample per row
    of ``a``. ``forward`` caches the intermediate values that ``backward``
    needs, so ``backward`` always refers to the most recent ``forward`` call.
    """

    def __init__(self):
        # Initial weights from the exercise sheet diagram. Biases are kept
        # separate from the connection weights and start at zero.
        self.W1 = np.array([[0.1, 0.4],
                            [0.2, 0.5],
                            [0.3, 0.6]])  # (3 inputs, 2 hidden neurons)
        self.b1 = np.zeros((1, 2))        # one bias per hidden neuron

        self.W2 = np.array([[0.7],
                            [0.8]])       # (2 hidden neurons, 1 output neuron)
        self.b2 = np.zeros((1, 1))        # one bias for the output neuron

        # Values cached by forward() for use in backward().
        self.z1 = None  # hidden-layer pre-activation
        self.h1 = None  # hidden-layer activation
        self.z2 = None  # output-layer pre-activation

    @staticmethod
    def _as_column_input(x):
        """Return x as a 2-D array with features along axis 0 (one column per sample)."""
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        return x

    @staticmethod
    def _match_target(y, y_hat):
        """Return y as a 2-D float array that lines up with the 2-D prediction y_hat."""
        y = np.asarray(y, dtype=float)
        if y.ndim >= 2:
            return y
        if y.size == y_hat.size:
            # Scalar or 1-D target with one value per prediction.
            return y.reshape(y_hat.shape)
        # Otherwise rely on broadcasting (e.g. one scalar target for all samples).
        return np.atleast_2d(y)

    def forward(self, x):
        """Run the forward pass and cache the intermediates for backward().

        Args:
            x: Input with features along axis 0, i.e. shape (num_features,) or
                (num_features, num_samples). A 1-D input is treated as a
                single sample.

        Returns:
            Predictions of shape (num_samples, num_outputs).
        """
        x = self._as_column_input(x)

        # Hidden layer: x.T puts one sample per row, matching W1's (in, out) layout.
        self.z1 = np.dot(x.T, self.W1) + self.b1
        self.h1 = sigmoid(self.z1)

        # Output layer.
        self.z2 = np.dot(self.h1, self.W2) + self.b2
        return sigmoid(self.z2)

    def backward(self, x, y, y_hat):
        """Compute gradients of the loss L = 0.5 * (y_hat - y)^2.

        Must be called after forward() on the same x, because it reads the
        cached z1, h1 and z2.

        Args:
            x: The input given to forward(), features along axis 0.
            y: Target; a scalar, a 1-D array, or a 2-D array compatible with y_hat.
            y_hat: Prediction returned by forward().

        Returns:
            Tuple (dW2, db2, dW1, db1); each gradient has the shape of the
            parameter it belongs to and is summed over samples.

        Raises:
            RuntimeError: If forward() has not been called yet.
        """
        if self.z1 is None or self.h1 is None or self.z2 is None:
            raise RuntimeError("backward() called before forward(): no cached activations")

        x = self._as_column_input(x)
        y_hat = np.atleast_2d(np.asarray(y_hat, dtype=float))
        y = self._match_target(y, y_hat)

        # Output layer: delta2 = dL/dz2 = (y_hat - y) * sigmoid'(z2).
        delta2 = (y_hat - y) * sigmoid_derivative(self.z2)
        dW2 = np.dot(self.h1.T, delta2)               # (hidden, samples) @ (samples, out)
        db2 = np.sum(delta2, axis=0, keepdims=True)   # sum over samples

        # Hidden layer: push delta2 back through W2, then through the sigmoid.
        delta1 = np.dot(delta2, self.W2.T) * sigmoid_derivative(self.z1)
        dW1 = np.dot(x, delta1)                       # (features, samples) @ (samples, hidden)
        db1 = np.sum(delta1, axis=0, keepdims=True)   # sum over samples

        return dW2, db2, dW1, db1

    def update_params(self, dW2, db2, dW1, db1, learning_rate):
        """Apply one in-place gradient-descent step to all weights and biases."""
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1


## Training Setup

In [None]:
# Gradient-descent hyperparameters.
learning_rate = 0.1
epochs = 1000

# Network initialised with the exercise-sheet weights.
mlp = MLP()

# Single training sample from the exercise (forward-pass example page):
#   x = [x1, x2, x3] = [36, 70, 1], target y = 1.
# NOTE: the inputs are unscaled; with the initial W1 the hidden pre-activations
# are about 17.9 and 50.0, deep in the sigmoid's flat region, so W1 and b1
# receive near-zero gradients.
x_train = np.array([36, 70, 1]).reshape(-1, 1)  # column vector, shape (3, 1)
y_train = 1  # scalar target

# Per-epoch loss values, appended to by the training loop.
loss_history = []

## Training Loop

In [None]:
for epoch in range(epochs):
    # Forward pass: x_train is a (3, 1) column vector; forward() transposes it
    # internally and returns a (1, 1) prediction.
    y_hat = mlp.forward(x_train)

    # Half squared error for the single sample; still a (1, 1) array here.
    residual = y_hat - y_train
    loss = 0.5 * residual**2
    loss_history.append(loss.item())  # store as a plain Python float

    # Backward pass, followed by one gradient-descent step.
    dW2, db2, dW1, db1 = mlp.backward(x_train, y_train, y_hat)
    mlp.update_params(dW2=dW2, db2=db2, dW1=dW1, db1=db1,
                      learning_rate=learning_rate)

    # Progress report every 100 epochs.
    if not (epoch + 1) % 100:
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.6f}')

## Results and Visualization

In [None]:
# Loss curve. The x-axis is derived from the recorded history rather than from
# `epochs`, so the plot still works if the training cell was run more than
# once and loss_history holds more than `epochs` entries.
plt.figure(figsize=(10, 6))
plt.plot(range(len(loss_history)), loss_history)
plt.title('Training Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.grid(True)
plt.show()

# Learned parameters.
print("\nFinal Trained Weights and Biases:")
print("W1:", mlp.W1)
print("b1:", mlp.b1)
print("W2:", mlp.W2)
print("b2:", mlp.b2)

# Prediction on the training sample after training, next to its target.
final_prediction = mlp.forward(x_train)
print(f"\nFinal prediction for x_train = {x_train.T.tolist()}: {final_prediction.item():.6f}")
print(f"Target value y_train: {y_train}")