In [1]:
import numpy as np

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``.

    Args:
        x: Scalar or array-like of pre-activation values.

    Returns:
        Whatever ``np.maximum`` produces for ``x``: negative entries are
        replaced by 0, the rest are passed through.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Element-wise derivative of ReLU: 1.0 where ``x > 0``, otherwise 0.0.

    The derivative at exactly 0 is taken to be 0.

    Args:
        x: Scalar, list, or array of pre-activation values.

    Returns:
        Float value(s) with the same shape as ``x``.
    """
    # np.greater (unlike the bare ``x > 0``) also accepts Python scalars and
    # lists, so this matches what ``relu`` accepts as input.
    return np.greater(x, 0).astype(float)

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, computed without overflow.

    Args:
        x: Scalar or array-like of pre-activation values.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``; a scalar input
        yields a NumPy scalar.
    """
    x = np.asarray(x)
    # Only ever exponentiate a non-positive number: exp(-|x|) lies in (0, 1],
    # so it cannot overflow the way exp(-x) does for large negative x.
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 it is the algebraically
    # equivalent exp(x) / (1 + exp(x)).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return result[()]

def sigmoid_derivative(x):
    """Derivative of the logistic sigmoid evaluated at ``x``.

    Uses the identity sigma'(x) = sigma(x) * (1 - sigma(x)), so the sigmoid
    itself is evaluated only once.

    Args:
        x: Pre-activation value(s), passed straight to ``sigmoid``.

    Returns:
        The derivative, with the same shape as ``sigmoid(x)``.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

In [2]:
def forward_pass(x, W, b, activation_fn):
    """Propagate one input through every layer of the network.

    Args:
        x: Input vector for a single sample.
        W: List of weight matrices, one per layer.
        b: List of bias vectors, indexed like ``W``.
        activation_fn: List of activation names ('relu' or 'sigmoid'),
            indexed like ``W``.

    Returns:
        Tuple ``(z, a)``: ``z`` holds each layer's pre-activation and ``a``
        holds the activations, with ``a[0]`` being ``x`` itself (so ``a`` has
        one more entry than ``z``).

    Raises:
        ValueError: If a layer names an activation other than 'relu' or
            'sigmoid'.
    """
    pre_activations = []
    activations = [x]

    for layer, weights in enumerate(W):
        # Affine step on the output of the previous layer.
        pre = weights @ activations[-1] + b[layer]
        pre_activations.append(pre)

        kind = activation_fn[layer]
        if kind not in ('relu', 'sigmoid'):
            raise ValueError("Unsupported activation")

        squash = relu if kind == 'relu' else sigmoid
        activations.append(squash(pre))

    return pre_activations, activations

In [3]:
def _activation_derivative(name, z):
    """Return the element-wise derivative of the named activation at ``z``."""
    if name == 'sigmoid':
        return sigmoid_derivative(z)
    if name == 'relu':
        return relu_derivative(z)
    raise ValueError("Unsupported activation")


def backward_pass(a, z, y, W, activation_fn, loss_fn):
    """Backpropagate the loss for one sample and return per-layer gradients.

    Args:
        a: Activations from ``forward_pass`` (``a[0]`` is the input, ``a[-1]``
            the network output).
        z: Pre-activations from ``forward_pass``, one per layer.
        y: Target for this sample, broadcastable against ``a[-1]``.
        W: List of weight matrices, one per layer.
        activation_fn: List of activation names ('relu' or 'sigmoid'), one
            per layer; the last entry is used for the output layer.
        loss_fn: 'MSE' or 'cross_entropy'.

    Returns:
        Tuple ``(gradients_W, gradients_b)`` of lists indexed like ``W``.

    Raises:
        ValueError: If ``loss_fn`` is not 'MSE' or 'cross_entropy', or if any
            layer names an activation other than 'relu' or 'sigmoid'.
    """
    L = len(W)
    gradients_W = [None] * L
    gradients_b = [None] * L
    delta = [None] * L

    # Output layer. Both supported losses use the same expression for the
    # gradient with respect to the network output.
    if loss_fn not in ('MSE', 'cross_entropy'):
        raise ValueError("Unsupported loss")
    loss_grad = a[-1] - y

    output_activation = activation_fn[-1]
    if output_activation == 'sigmoid' and loss_fn == 'cross_entropy':
        # For sigmoid + cross-entropy, a - y is already the gradient with
        # respect to z, so no activation derivative is applied.
        delta[-1] = loss_grad
    else:
        # NOTE(review): for 'cross_entropy' with a ReLU output this keeps the
        # existing a - y shortcut times relu'(z); confirm that is intended.
        delta[-1] = loss_grad * _activation_derivative(output_activation, z[-1])

    # Hidden layers, walking from the last hidden layer back to the first.
    for layer in reversed(range(L - 1)):
        back_signal = W[layer + 1].T @ delta[layer + 1]
        # Unknown names raise here instead of silently being treated as ReLU.
        delta[layer] = back_signal * _activation_derivative(activation_fn[layer], z[layer])

    # Gradients: outer product of each layer's error with its input.
    for layer in range(L):
        gradients_W[layer] = np.outer(delta[layer], a[layer])
        gradients_b[layer] = delta[layer]

    return gradients_W, gradients_b

In [4]:
def gradient_descent(W, b, gradients_W, gradients_b, lr=0.01):
    """Apply one gradient-descent step to every layer, in place.

    Args:
        W: List of weight matrices; each entry is updated in place.
        b: List of bias vectors; each entry is updated in place.
        gradients_W: Weight gradients, indexed like ``W``.
        gradients_b: Bias gradients, indexed like ``b``.
        lr: Step size applied to every gradient.

    Returns:
        None. The parameters in ``W`` and ``b`` are modified directly.
    """
    n_layers = len(W)
    for layer in range(n_layers):
        weight_step = lr * gradients_W[layer]
        W[layer] -= weight_step
        bias_step = lr * gradients_b[layer]
        b[layer] -= bias_step

In [5]:
def train_backpropagation(X, Y, lr=0.1, epochs=1000, hidden_units=None, seed=None):
    """Train a one-hidden-layer network with per-sample gradient descent.

    The network uses a ReLU hidden layer and a sigmoid output layer, and is
    trained with the 'MSE' loss.

    Args:
        X: 2-D array of inputs, one sample per row.
        Y: Targets, one entry per row of ``X``. If 2-D, its second dimension
            sets the number of output units; otherwise a single output unit
            is used.
        lr: Learning rate passed to ``gradient_descent``.
        epochs: Number of full passes over ``(X, Y)``.
        hidden_units: Size of the hidden layer. When None, a size in [3, 9]
            is drawn at random.
        seed: When given, the hidden size and initial weights are drawn from
            a private ``np.random.RandomState(seed)`` so the run is
            reproducible. When None, NumPy's global generator is used.

    Returns:
        Tuple ``(W, b)``: lists of the trained weight matrices and bias
        vectors, ordered input->hidden then hidden->output.
    """
    # np.random and RandomState expose the same randint/randn methods, so the
    # default (seed=None) path draws exactly as it did before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    input_dim = X.shape[1]
    targets = np.asarray(Y)
    output_dim = targets.shape[1] if targets.ndim > 1 else 1
    if hidden_units is None:
        hidden_units = rng.randint(3, 10)

    # Initialize weights and biases for 2-layer NN
    W = [
        rng.randn(hidden_units, input_dim) * 0.1,  # input -> hidden
        rng.randn(output_dim, hidden_units) * 0.1  # hidden -> output
    ]
    b = [
        np.zeros(hidden_units),
        np.zeros(output_dim)
    ]

    activation_fn = ['relu', 'sigmoid']
    loss_fn = 'MSE'

    for _ in range(epochs):
        # One parameter update per sample.
        for x, y in zip(X, Y):
            z, a = forward_pass(x, W, b, activation_fn)
            gradients_W, gradients_b = backward_pass(a, z, y, W, activation_fn, loss_fn)
            gradient_descent(W, b, gradients_W, gradients_b, lr)

    return W, b

In [6]:
def predict(x, W, b, activation_fn):
    """Return the network's output for a single input.

    Args:
        x: Input vector for one sample.
        W: List of weight matrices, one per layer.
        b: List of bias vectors, one per layer.
        activation_fn: List of activation names, one per layer.

    Returns:
        The last activation produced by ``forward_pass``.
    """
    activations = forward_pass(x, W, b, activation_fn)[1]
    return activations[-1]

In [16]:
# XOR truth table: each row of X is one input pair and the matching row of Y
# is its target.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([[0], [1], [1], [0]])  # XOR

# The hidden-layer size and the initial weights are drawn from NumPy's global
# random generator and no seed is set in this cell, so the trained parameters
# (and the numbers printed below) vary from run to run.
W, b = train_backpropagation(X, Y, lr=0.5, epochs=5000)

# The activation list is passed again by hand here; it has to match the
# ['relu', 'sigmoid'] list hard-coded inside train_backpropagation.
print([predict(x, W, b, ['relu', 'sigmoid']) for x in X])

[array([0.01201378]), array([0.99420449]), array([0.99419912]), array([0.01201131])]
