In [12]:
import numpy as np
import matplotlib.pyplot as plt

In [13]:
class MSE:
    def __init__(self):
        pass

    def get_loss(self, y_pred, y_true):
        errors = y_pred - y_true
        return np.mean(errors**2)

    def get_coeff_gradient(self, y_pred, y_true, X):
        errors = y_pred - y_true
        return (2 / len(X)) * np.dot(errors, X)

    def get_bias_gradient(self, y_pred, y_true, X):
        errors = y_pred - y_true
        return (2 / len(X)) * np.sum(errors, axis=0)

In [None]:
(1,3) (3,2)

In [20]:
lf = MSE()
lf.get_coeff_gradient(np.array([1,2,4]), np.array([1.2, 2.3,3]), np.array([[1,2],[1,5], [4,2]]))

array([2.33333333, 0.06666667])

In [14]:
class Adam:
    def __init__(self, learning_rate=0.01, beta_1=0.85, beta_2=0.99):
        self.lr = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        
        self.m = 0 
        self.v = 0
        
        self.m_prev = 0
        self.v_prev = 0
        
    def adam(self, gradients:list, epoch):
        self.prev_m = self.m
        self.prev_v = self.v
        
        self.m = self.beta_1 * self.prev_m + (1 - self.beta_1) * gradients
        self.v = self.beta_2 * self.prev_v + (1 - self.beta_2) * (gradients**2)
        
        m_hat = self.m / (1 - self.beta_1 ** (epoch+1))
        v_hat = self.v / (1 - self.beta_2 ** (epoch+1))
        
        
        learning_rate = self.lr / (np.sqrt(v_hat) + 1e-8)
        return learning_rate * m_hat      
        
        
    def step(self, parameters:list, gradients:list, epoch):
        parameters = np.array(parameters).flatten()
        gradients = np.array(gradients).flatten()
        
        new_parameters = []
        for param, gradient in zip(parameters, gradients):
            update = self.adam(gradient, epoch)
            param -= update
            new_parameters.append(param)
        return np.array(new_parameters)
        
        

In [15]:
class DenseLayer:
    def __init__(self, verbose=True):
        self.verbose = verbose
        self.weights = None
        self.bias = None

        # add bool to check if the layer has been initialized
        self.initialized = False

    def forward(self, X):
        if not self.initialized:
            raise Exception("Layer not initialized")
        return np.dot(self.weights, X.T) + self.bias

    def initialize(self, input_dim, output_dim=1):
        self.input_dim = input_dim
        self.output_dim = 1

        self.weights = np.random.randn(self.input_dim)
        self.bias = np.random.randn(1)
        if self.verbose:
            print("Weights: ", self.weights, "Shape: ", self.weights.shape)
            print("Bias: ", self.bias, "Shape: ", self.bias.shape)
        
        self.initialized = True

In [16]:
class Model:
    def __init__(self, layer, loss, otimizer):
        self.layer = layer(verbose=False)
        self.loss = loss
        self.optimizer = otimizer
        self.initialized = False

    def forward(self, X):
        if not self.initialized:
            raise Exception("Model not initialized. Please call the fit method")
        return self.layer.forward(X)

    def backward(self, _y_pred, _y_true, _X):
        if not self.initialized:
            raise Exception("Model not initialized. Please call the fit method")
        # Get the gradient
        coeff_gradient = self.loss.get_coeff_gradient(_y_pred, _y_true, _X)
        bias_gradient = self.loss.get_bias_gradient(_y_pred, _y_true, _X)
        return coeff_gradient, bias_gradient

    def fit(self, X, y, epochs=100, batch_size=32):
        # Initialize the layer
        input_dim = X.shape[-1]
        self.layer.initialize(input_dim)
        self.initialized = True

        loss: float = None

        for epoch in range(epochs):
            
            batch_start = 0
            batch_end = batch_size
            print(f"Epoch: {epoch} | Loss: {loss}")
            
            while batch_start < len(X):
            
                batch_x = X[batch_start:batch_end]
                batch_y = y[batch_start:batch_end]

                # Forward pass
                y_pred = self.forward(batch_x)
                loss = self.loss.get_loss(y_pred, batch_y)
                
                print(np.array(batch_x).shape, np.array(batch_y).shape, np.array(y_pred).shape)

                # Backward pass
                gradient_coeff, gradient_bias = self.backward(y_pred, batch_y, batch_x)

                self.layer.weights = self.optimizer.step(
                    [self.layer.weights], [gradient_coeff], epoch
                )
                self.layer.bias = self.optimizer.step(
                    [self.layer.bias], [gradient_bias], epoch
                )

                batch_start = batch_end
                batch_end += batch_size

In [17]:
X = np.random.rand(100, 2)
y = [0.5 * x1 + 1.5 * x2 + 3 for x1, x2 in X]


model = Model(DenseLayer, MSE(), Adam(learning_rate=0.01, beta_1=0.85, beta_2=0.9))
model.fit(X, y, epochs=100, batch_size=16)

Epoch: 0 | Loss: None
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(4, 2) (4,) (4,)
Epoch: 1 | Loss: 25.12038378325603
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(4, 2) (4,) (4,)
Epoch: 2 | Loss: 22.786039088056842
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(4, 2) (4,) (4,)
Epoch: 3 | Loss: 20.94880783195272
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(4, 2) (4,) (4,)
Epoch: 4 | Loss: 19.364224258073463
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(4, 2) (4,) (4,)
Epoch: 5 | Loss: 17.942703919868354
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (16,)
(16, 2) (16,) (