# In [2]:
import numpy as np

class Parameters:
    """Trainable weight matrix and bias row of one dense layer."""

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights and zero biases.

        Args:
            n_inputs: Number of input features (rows of the weight matrix).
            n_neurons: Number of neurons (columns of the weight matrix).
        """
        weight_shape = (n_inputs, n_neurons)
        # Standard-normal draws scaled down by 100 so initial weights are small.
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.bias = np.zeros(shape=(1, n_neurons))

    def update(self, dW, dB, learning_rate):
        """Apply one gradient-descent step to weights and bias, in place.

        Args:
            dW: Gradient with respect to the weights.
            dB: Gradient with respect to the bias.
            learning_rate: Step size multiplying each gradient.
        """
        weight_step = learning_rate * dW
        self.weights -= weight_step
        bias_step = learning_rate * dB
        self.bias -= bias_step

class ActivationFunction:
    """Namespace of activation functions and their backward-pass helpers.

    The ``*_derivative`` helpers do not return the bare derivative: they take
    the upstream gradient ``dA`` and return it already multiplied by the
    activation's derivative evaluated at the pre-activation ``Z``.
    """

    @staticmethod
    def relu(Z):
        """Return the element-wise rectified linear unit ``max(0, Z)``."""
        return np.maximum(0, Z)

    @staticmethod
    def relu_derivative(dA, Z):
        """Back-propagate ``dA`` through ReLU.

        Args:
            dA: Gradient with respect to the ReLU output.
            Z: Pre-activation values the ReLU was applied to.

        Returns:
            A copy of ``dA`` with entries zeroed wherever ``Z <= 0``; ``dA``
            itself is left untouched.
        """
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ

    @staticmethod
    def sigmoid(Z):
        """Return the element-wise logistic sigmoid ``1 / (1 + exp(-Z))``.

        The value is computed as ``exp(min(Z, 0)) / (1 + exp(-|Z|))``, which
        is algebraically the same function but only ever exponentiates
        non-positive numbers, so very negative ``Z`` cannot overflow ``exp``.
        """
        numerator = np.exp(np.minimum(Z, 0))
        denominator = 1 + np.exp(-np.abs(Z))
        return numerator / denominator

    @staticmethod
    def sigmoid_derivative(dA, Z):
        """Back-propagate ``dA`` through the sigmoid.

        Args:
            dA: Gradient with respect to the sigmoid output.
            Z: Pre-activation values the sigmoid was applied to.

        Returns:
            ``dA * s * (1 - s)`` where ``s = sigmoid(Z)``.
        """
        # Reuse the overflow-safe sigmoid instead of re-deriving it here.
        s = ActivationFunction.sigmoid(Z)
        return dA * s * (1 - s)

class Layer:
    """Fully connected layer: an affine transform followed by an activation.

    Inputs are laid out with one sample per row: ``forward`` multiplies the
    inputs by an ``(n_inputs, n_neurons)`` weight matrix and ``backward``
    sums bias gradients over axis 0.
    """

    # Activation names understood by forward() and backward().
    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid')

    def __init__(self, n_inputs, n_neurons, activation_type):
        """Create the layer's parameters and empty forward-pass caches.

        Args:
            n_inputs: Number of input features.
            n_neurons: Number of neurons (output features).
            activation_type: Either ``'relu'`` or ``'sigmoid'``.

        Raises:
            ValueError: If ``activation_type`` is not supported.
        """
        self.activation_type = activation_type
        # Fail fast: an unknown activation could never produce an output.
        self._check_activation()
        self.parameters = Parameters(n_inputs, n_neurons)
        self.Z = None  # pre-activation values of the last forward pass
        self.A = None  # activation output of the last forward pass

    def _check_activation(self):
        """Raise ValueError if the current activation type is unsupported."""
        if self.activation_type not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f'Unsupported activation type {self.activation_type!r}; '
                f'expected one of {self._SUPPORTED_ACTIVATIONS}'
            )

    def forward(self, inputs):
        """Compute and cache the layer output for ``inputs``.

        Args:
            inputs: Array whose last axis has ``n_inputs`` entries.

        Returns:
            The activation output, also stored in ``self.A`` (the
            pre-activation is stored in ``self.Z``).

        Raises:
            ValueError: If the activation type is not supported.
        """
        self._check_activation()
        self.Z = np.dot(inputs, self.parameters.weights) + self.parameters.bias
        if self.activation_type == 'relu':
            self.A = ActivationFunction.relu(self.Z)
        else:
            self.A = ActivationFunction.sigmoid(self.Z)
        return self.A

    def backward(self, dA, prev_activation, learning_rate):
        """Back-propagate through the layer and update its parameters.

        Args:
            dA: Gradient of the loss with respect to this layer's output.
                Any averaging over the batch must already be part of it
                (as it is for the derivative of a mean-based loss); no
                further division by the batch size is applied here.
            prev_activation: The input this layer received in the matching
                forward pass.
            learning_rate: Step size for the parameter update.

        Returns:
            Gradient of the loss with respect to ``prev_activation``.

        Raises:
            ValueError: If the activation type is not supported.
        """
        self._check_activation()
        if self.activation_type == 'relu':
            dZ = ActivationFunction.relu_derivative(dA, self.Z)
        else:
            dZ = ActivationFunction.sigmoid_derivative(dA, self.Z)

        # Plain chain rule. dA already carries the loss normalisation, so
        # dividing again (previously by dA.shape[1], the neuron count rather
        # than the batch size) would mis-scale the parameter gradients
        # relative to dA_prev.
        dW = np.dot(prev_activation.T, dZ)
        dB = np.sum(dZ, axis=0, keepdims=True)
        # Must use the pre-update weights, so compute this before update().
        dA_prev = np.dot(dZ, self.parameters.weights.T)

        self.parameters.update(dW, dB, learning_rate)
        return dA_prev

class Model:
    """Sequential stack of layers trained with full-batch gradient descent."""

    def __init__(self):
        """Start with no layers and no cached input."""
        self.layers = []
        # Input of the most recent forward_pass; the first layer needs it
        # as its "previous activation" during back-propagation.
        self._last_input = None

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward_pass(self, X):
        """Run ``X`` through every layer in order and return the output.

        The input is remembered so a following ``backward_pass`` can use it.
        """
        self._last_input = X
        A = X
        for layer in self.layers:
            A = layer.forward(A)
        return A

    def backward_pass(self, Y, Y_hat, learning_rate, X=None):
        """Back-propagate the MSE loss and update every layer.

        Args:
            Y: Target values.
            Y_hat: Predictions returned by the matching ``forward_pass``.
            learning_rate: Step size handed to each layer.
            X: Input of the matching forward pass. Optional; defaults to the
                input cached by the most recent ``forward_pass``.

        Raises:
            RuntimeError: If there are layers to update but no input is
                available (no ``X`` given and ``forward_pass`` never called).
        """
        if X is None:
            X = self._last_input
        if self.layers and X is None:
            raise RuntimeError(
                'backward_pass needs the network input: call forward_pass '
                'first or pass X explicitly'
            )

        dA = compute_mse_loss_derivative(Y, Y_hat)
        # Input seen by layer i is X for the first layer, otherwise the
        # cached activation of layer i-1.
        layer_inputs = [X] + [layer.A for layer in self.layers[:-1]]
        for layer, layer_input in zip(reversed(self.layers),
                                      reversed(layer_inputs)):
            dA = layer.backward(dA, layer_input, learning_rate)

    def compute_loss(self, Y, Y_hat):
        """Return the mean squared error between ``Y`` and ``Y_hat``."""
        return compute_mse_loss(Y, Y_hat)

    def train(self, X, Y, learning_rate, epochs):
        """Run ``epochs`` full-batch gradient-descent steps on ``(X, Y)``.

        The loss is printed every 100th epoch, starting with epoch 0; it is
        the loss measured before that epoch's parameter update.
        """
        for epoch in range(epochs):
            Y_hat = self.forward_pass(X)
            loss = self.compute_loss(Y, Y_hat)
            self.backward_pass(Y, Y_hat, learning_rate)
            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss}')

def compute_mse_loss(Y, Y_hat):
    """Return the mean of the squared differences between Y and Y_hat."""
    residual = Y - Y_hat
    squared_error = residual ** 2
    return np.mean(squared_error)

def compute_mse_loss_derivative(Y, Y_hat):
    """Return the gradient of ``np.mean((Y - Y_hat) ** 2)`` w.r.t. ``Y_hat``.

    The mean runs over every element, so each entry of the gradient is
    ``2 * (Y_hat - Y)`` divided by the total element count. For 1-D or
    single-column targets that count equals the number of samples.

    Args:
        Y: Target values.
        Y_hat: Predicted values, broadcast-compatible with ``Y``.

    Returns:
        Array of element-wise loss gradients.
    """
    residual = np.asarray(Y_hat) - np.asarray(Y)
    # Divide by the element count, not the row count, so the result stays
    # the true derivative of the mean when there are several output columns.
    return 2 * residual / residual.size
