# In [None]:
import numpy as np

def normalize_inputs(X):
    """Standardize each column of X to zero mean and unit standard deviation.

    Args:
        X: Array whose first axis indexes samples.

    Returns:
        A tuple ``(normalized_X, mean, std)`` where ``mean`` and ``std`` are
        the statistics taken along axis 0. ``std`` is the divisor that was
        actually used: entries that would have been 0 are reported as 1, so
        the returned pair can be reused to transform other data without
        dividing by zero.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by it would yield NaN
    # (0 / 0). Using 1 instead leaves such a feature at exactly 0 after
    # centering.
    std = np.where(std == 0, 1.0, std)
    normalized_X = (X - mean) / std
    return normalized_X, mean, std

class Parameters:
    """Trainable weight matrix and bias row of one dense layer."""

    def __init__(self, n_inputs, n_neurons):
        # Small random weights, shape (n_inputs, n_neurons); biases start at 0.
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.bias = np.zeros(shape=(1, n_neurons))

    def update(self, dW, dB, learning_rate, lambda_reg, n_samples):
        """Take one gradient-descent step in place.

        The weight step uses ``dW`` plus the L2 penalty gradient
        ``lambda_reg * weights / n_samples``; the bias step uses ``dB`` only
        (biases are not regularized).
        """
        l2_gradient = lambda_reg * self.weights / n_samples
        weight_step = learning_rate * (dW + l2_gradient)
        bias_step = learning_rate * dB
        self.weights -= weight_step
        self.bias -= bias_step

class ActivationFunction:
    """Element-wise activation functions and their backward counterparts.

    Each ``*_derivative(dA, Z)`` takes the gradient ``dA`` with respect to the
    activation output and the pre-activation ``Z`` and returns the gradient
    with respect to ``Z``.
    """

    @staticmethod
    def relu(Z):
        """Return ``max(0, Z)`` element-wise."""
        return np.maximum(0, Z)

    @staticmethod
    def relu_derivative(dA, Z):
        """Return a copy of ``dA`` with entries zeroed wherever ``Z <= 0``."""
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ

    @staticmethod
    def sigmoid(Z):
        """Return the logistic function ``1 / (1 + exp(-Z))`` element-wise.

        Evaluated through ``exp(-|Z|)``, which lies in (0, 1], so very
        negative inputs do not overflow the exponential.
        """
        Z = np.asarray(Z)
        decay = np.exp(-np.abs(Z))
        # For Z >= 0 this is the textbook form; for Z < 0 the algebraically
        # equal exp(Z) / (1 + exp(Z)) is used so exp never sees a large
        # positive argument.
        result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
        # Indexing with an empty tuple turns a 0-d result back into a scalar
        # (scalar in, scalar out) and is a no-op view for real arrays.
        return result[()]

    @staticmethod
    def sigmoid_derivative(dA, Z):
        """Return ``dA * s * (1 - s)`` where ``s = sigmoid(Z)``."""
        s = ActivationFunction.sigmoid(Z)
        return dA * s * (1 - s)

class Layer:
    """Dense layer: affine transform, activation, optional inverted dropout.

    ``forward`` caches the values that ``backward`` needs, so ``backward``
    must be called after a ``forward`` on the same batch.
    """

    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid')

    def __init__(self, n_inputs, n_neurons, activation_type, dropout_rate=0):
        """Create the layer.

        Args:
            n_inputs: Number of input features.
            n_neurons: Number of units (output features).
            activation_type: ``'relu'`` or ``'sigmoid'``.
            dropout_rate: Probability of zeroing each output; 0 disables
                dropout.

        Raises:
            ValueError: If ``activation_type`` is not supported or
                ``dropout_rate`` is outside ``[0, 1)``.
        """
        if activation_type not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation_type {activation_type!r}; "
                f"expected one of {self._SUPPORTED_ACTIVATIONS}"
            )
        # A rate of 1 would make the 1 / (1 - rate) rescaling divide by zero.
        if not 0 <= dropout_rate < 1:
            raise ValueError(
                f"dropout_rate must be in [0, 1), got {dropout_rate!r}"
            )
        self.parameters = Parameters(n_inputs, n_neurons)
        self.activation_type = activation_type
        self.dropout_rate = dropout_rate
        self.mask = None  # boolean keep-mask drawn by the last forward call
        self.Z = None     # pre-activation values of the last forward call
        self.A = None     # output of the last forward call (after dropout)

    def forward(self, inputs):
        """Compute the layer output for ``inputs`` and cache ``Z``/``A``.

        Args:
            inputs: Array of shape (n_samples, n_inputs).

        Returns:
            Array of shape (n_samples, n_neurons).

        Raises:
            ValueError: If ``activation_type`` was changed to an unsupported
                value after construction.
        """
        self.Z = np.dot(inputs, self.parameters.weights) + self.parameters.bias
        if self.activation_type == 'relu':
            self.A = ActivationFunction.relu(self.Z)
        elif self.activation_type == 'sigmoid':
            self.A = ActivationFunction.sigmoid(self.Z)
        else:
            raise ValueError(
                f"Unsupported activation_type {self.activation_type!r}"
            )

        if self.dropout_rate > 0:
            # Inverted dropout: zero a random subset of outputs and rescale
            # the survivors so the expected output is unchanged.
            self.mask = np.random.rand(*self.A.shape) > self.dropout_rate
            self.A = self.A * self.mask / (1 - self.dropout_rate)

        return self.A

    def backward(self, dA, prev_activation, learning_rate, lambda_reg):
        """Backpropagate through the layer and update its parameters.

        Args:
            dA: Gradient of the loss with respect to this layer's output,
                shape (n_samples, n_neurons). It is used as given: any
                averaging over the batch must already be contained in it
                (``compute_mse_loss_derivative`` divides by the sample count),
                so it is not divided by the batch size again here.
            prev_activation: The input this layer received in ``forward``.
            learning_rate: Step size passed to ``Parameters.update``.
            lambda_reg: L2 strength passed to ``Parameters.update``.

        Returns:
            Gradient of the loss with respect to ``prev_activation``.

        Raises:
            ValueError: If ``activation_type`` is unsupported.
        """
        if self.dropout_rate > 0:
            # Mirror the forward pass: dropped units pass no gradient, kept
            # units carry the same 1 / (1 - rate) scale.
            dA = dA * self.mask / (1 - self.dropout_rate)

        # Samples are the rows; the columns are this layer's neurons.
        n_samples = dA.shape[0]

        if self.activation_type == 'relu':
            dZ = ActivationFunction.relu_derivative(dA, self.Z)
        elif self.activation_type == 'sigmoid':
            dZ = ActivationFunction.sigmoid_derivative(dA, self.Z)
        else:
            raise ValueError(
                f"Unsupported activation_type {self.activation_type!r}"
            )

        # Data-term gradients only. The L2 term is added exactly once, inside
        # Parameters.update, which receives lambda_reg and n_samples for that
        # purpose.
        dW = np.dot(prev_activation.T, dZ)
        dB = np.sum(dZ, axis=0, keepdims=True)
        # Must use the weights from before the update below.
        dA_prev = np.dot(dZ, self.parameters.weights.T)

        self.parameters.update(dW, dB, learning_rate, lambda_reg, n_samples)
        return dA_prev

class Model:
    """Sequential stack of layers trained with mini-batch gradient descent."""

    def __init__(self):
        self.layers = []
        # Input of the most recent forward_pass; the first layer needs it
        # during backpropagation.
        self._last_input = None

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward_pass(self, X):
        """Run ``X`` through every layer in order and return the output.

        The input is remembered so that a following ``backward_pass`` can
        hand it to the first layer.
        """
        self._last_input = X
        A = X
        for layer in self.layers:
            A = layer.forward(A)
        return A

    def backward_pass(self, Y, Y_hat, learning_rate, lambda_reg):
        """Backpropagate the MSE gradient and update every layer.

        Must follow a ``forward_pass`` on the batch that produced ``Y_hat``.

        Args:
            Y: Targets for the batch.
            Y_hat: Predictions returned by ``forward_pass`` for the batch.
            learning_rate: Step size forwarded to each layer.
            lambda_reg: L2 strength forwarded to each layer.

        Raises:
            RuntimeError: If ``forward_pass`` has not been called yet.
        """
        batch_input = getattr(self, '_last_input', None)
        if batch_input is None:
            raise RuntimeError(
                "forward_pass must be called before backward_pass"
            )

        dA = compute_mse_loss_derivative(Y, Y_hat)
        # Each layer's input is the previous layer's output; the first layer's
        # input is the batch itself (the one given to forward_pass, not the
        # full dataset).
        layer_inputs = [batch_input] + [layer.A for layer in self.layers[:-1]]
        for layer, layer_input in zip(reversed(self.layers),
                                      reversed(layer_inputs)):
            dA = layer.backward(dA, layer_input, learning_rate, lambda_reg)

    def compute_loss(self, Y, Y_hat, lambda_reg):
        """Return the MSE plus ``lambda_reg / (2 * n_samples)`` times the sum
        of squared weights over all layers."""
        mse_loss = np.mean((Y - Y_hat) ** 2)
        l2_loss = sum(np.sum(layer.parameters.weights ** 2)
                      for layer in self.layers)
        return mse_loss + (lambda_reg / (2 * Y.shape[0])) * l2_loss

    # MINI BATCH Implementation
    def train(self, X, Y, learning_rate, epochs, lambda_reg, batch_size=64):
        """Train with mini-batch gradient descent.

        Every epoch reshuffles the data, then runs one forward and one
        backward pass per batch (the last batch may be smaller). Every 100th
        epoch, starting with epoch 0, the loss on the whole dataset is
        printed.

        Args:
            X: Inputs, one sample per row.
            Y: Targets, one sample per row.
            learning_rate: Gradient-descent step size.
            epochs: Number of passes over the data.
            lambda_reg: L2 regularization strength.
            batch_size: Samples per mini-batch; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")

        n_samples = X.shape[0]

        for epoch in range(epochs):
            # Shuffle the dataset at the beginning of each epoch
            permutation = np.random.permutation(n_samples)
            X_shuffled = X[permutation]
            Y_shuffled = Y[permutation]

            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                X_batch = X_shuffled[start:end]
                Y_batch = Y_shuffled[start:end]

                # Forward pass, then backward pass, which also updates the
                # weights. The per-batch loss value is not needed for the
                # update, so it is not computed here.
                Y_hat = self.forward_pass(X_batch)
                self.backward_pass(Y_batch, Y_hat, learning_rate, lambda_reg)

            if epoch % 100 == 0:
                # Compute the loss on the entire dataset for monitoring
                Y_hat_full = self.forward_pass(X)
                full_loss = self.compute_loss(Y, Y_hat_full, lambda_reg)
                print(f'Epoch {epoch}, Loss: {full_loss}')

def compute_mse_loss(Y, Y_hat):
    """Return the mean of the squared differences between Y and Y_hat."""
    residual = Y - Y_hat
    squared_error = residual ** 2
    return np.mean(squared_error)

def compute_mse_loss_derivative(Y, Y_hat):
    """Return the gradient of ``np.mean((Y - Y_hat) ** 2)`` w.r.t. ``Y_hat``.

    The mean runs over every element of ``Y``, so the divisor is ``Y.size``.
    For 1-D or single-column targets that equals the number of samples; for
    multi-column targets, dividing by the row count alone would overstate the
    gradient by the number of columns.

    Args:
        Y: Target array.
        Y_hat: Prediction array with the same shape as ``Y``.

    Returns:
        Array with the same shape as ``Y``.
    """
    return -2 * (Y - Y_hat) / Y.size
