# TODO

- add mini batches

In [1]:
import numpy as np

In [2]:
class Adam:
    def __init__(self, learning_rate=0.01, beta_1=0.85, beta_2=0.99):
        self.lr = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2

        self.m = 0
        self.v = 0

        self.m_prev = 0
        self.v_prev = 0

    def adam(self, gradients: list, epoch):
        self.prev_m = self.m
        self.prev_v = self.v

        self.m = self.beta_1 * self.prev_m + (1 - self.beta_1) * gradients
        self.v = self.beta_2 * self.prev_v + (1 - self.beta_2) * (gradients**2)

        m_hat = self.m / (1 - self.beta_1 ** (epoch + 1))
        v_hat = self.v / (1 - self.beta_2 ** (epoch + 1))

        learning_rate = self.lr / (np.sqrt(v_hat) + 1e-8)
        return learning_rate * m_hat

    def step(self, parameters: list, gradients: list, epoch):
        parameters = np.array(parameters).flatten()
        gradients = np.array(gradients).flatten()

        new_parameters = []
        for param, gradient in zip(parameters, gradients):
            update = self.adam(gradient, epoch)
            param -= update
            new_parameters.append(param)
        return np.array(new_parameters)

In [3]:
class MSELoss:
    def __init__(self):
        pass

    def calculate_loss(self, y_pred, y_true):
        return np.mean((y_pred - y_true) ** 2)

    def calculate_gradient_coeff(self, y_pred, y_true, X):
        errors = y_pred - y_true
        return (2 / len(y_true)) * np.dot(errors, X)

    def calculate_gradient_bias(self, y_pred, y_true):
        errors = y_pred - y_true
        return (2 / len(y_true)) * np.sum(errors)

In [4]:
# For now, let's manually define the output size of the Dense layer, but in the future, we can infer it from the next layer
class Dense:
    def __init__(self, input_size, output_size, verbose=True):
        self.input_size = input_size
        self.output_size = output_size
        self.verbose = verbose

        self.initialize_params()

    def forward(self, X):
        return np.dot(self.weights, X.T) + self.bias

    def initialize_params(self):
        # new axis is because the dot product of x[100,3] and weights[3,2] will result in a shape of [100,2], but we have bias of shape [2]
        # so we need to add a new axis to the bias to make it [2,1] so that we can add it to the dot product result
        self.weights = np.random.randn(self.output_size, self.input_size)
        self.bias = np.random.randn(self.output_size, 1)
        if self.verbose:
            print(f"Weight shape: {self.weights.shape}")
            print(f"Bias shape: {self.bias.shape}")
            print("Initialized weights:", self.weights)
            print("Initialized bias:", self.bias)

In [5]:
class Model:
    def __init__(self, layer):
        self.layer = layer  # not initialized
        self.initiated = False
        self.compiled = False

    def compile(self, loss, optimizer):
        self.loss = loss
        self.optimizer = optimizer
        self.compiled = True

    def _forward(self, x):
        if not self.initiated:
            raise Exception("Model not initiated - call `fit()` method first")

        return self.layer.forward(x)

    def _backward(self, y_pred, y_real, X):
        if not self.initiated:
            raise Exception("Model not initiated - call `fit()` method first")

        gradient_coeff = self.loss.calculate_gradient_coeff(y_pred, y_real, X)
        gradient_bias = self.loss.calculate_gradient_bias(y_pred, y_real)

        return gradient_coeff, gradient_bias

    def fit(self, X, y, epochs=100):
        if not self.compiled:
            raise Exception("Model not compiled - call `compile()` method first")

        input_dim = X.shape[-1]
        output_dim = y.shape[-1]
        self.layer = self.layer(
            input_size=input_dim, output_size=output_dim, verbose=True
        )
        self.initiated = True
        
        
        loss = None
        print("\n------ Model initiated ------")
        for epoch in range(epochs):
            # we need to find gradients of weights for each output separately
            # so we need to iterate over each output
            # for now let's use the batch (full) gradient descent

            updated_weights = []
            updated_bias = []
            
            print(f"\nEpoch {epoch} | Loss: {loss}")
            for i in range(output_dim):
                # print(f"\nOutput {i}")
                weights = self.layer.weights[i, :]
                bias = self.layer.bias[i]
                # print(f"Weights: {weights}")
                # print(f"Bias: {bias}")
                
                y_pred = self._forward(X)[i, :]
                y_real = y[:, i]
                # print(f"X shape: {X.shape}")
                # print(f"y_pred shape: {y_pred.shape}")
                # print(f"y_real shape: {y_real.shape}")
                
                loss = self.loss.calculate_loss(y_pred, y_real)
                
                
                grad_coeff, grad_bias = self._backward(y_pred, y_real, X)
                # print(f"Gradient coeff: {grad_coeff}")
                # print(f"Gradient bias: {grad_bias}")
                
                new_weights = self.optimizer.step(weights, grad_coeff, epoch)
                new_bias = self.optimizer.step(bias, grad_bias, epoch)
                
                updated_weights.append(new_weights)
                updated_bias.append(new_bias)
                
            self.layer.weights = np.array(updated_weights)
            self.layer.bias = np.array(updated_bias)   


In [6]:
model = Model(layer=Dense)
model.compile(loss=MSELoss(), optimizer=Adam(learning_rate=0.01))

X_train = np.random.normal(0, 1, (100,3))
y_train = np.array(
    [[x1 * 1.5 + x3 * 3.5, x2 * 2.5 + x3 * 3.5] for x1, x2, x3 in X_train]
)


print(X_train.shape, y_train.shape)

model.fit(X_train, y_train, epochs=1000)

(100, 3) (100, 2)
Weight shape: (2, 3)
Bias shape: (2, 1)
Initialized weights: [[-0.71465269 -0.1594483   1.00182898]
 [-0.42849634  0.19283412 -0.33051069]]
Initialized bias: [[ 1.22796239]
 [-0.21034963]]

------ Model initiated ------

Epoch 0 | Loss: None

Epoch 1 | Loss: 21.517050787004056

Epoch 2 | Loss: 21.376553984476736

Epoch 3 | Loss: 21.279441182484216

Epoch 4 | Loss: 21.204939542895918

Epoch 5 | Loss: 21.14339593540862

Epoch 6 | Loss: 21.089805657700207

Epoch 7 | Loss: 21.041392915246675

Epoch 8 | Loss: 20.996508484408213

Epoch 9 | Loss: 20.95410925882116

Epoch 10 | Loss: 20.913499364671033

Epoch 11 | Loss: 20.87419376296551

Epoch 12 | Loss: 20.835842181647454

Epoch 13 | Loss: 20.798184387161708

Epoch 14 | Loss: 20.76102260497095

Epoch 15 | Loss: 20.724203789926975

Epoch 16 | Loss: 20.687607807343213

Epoch 17 | Loss: 20.65113930153176

Epoch 18 | Loss: 20.61472194493761

Epoch 19 | Loss: 20.578294271217345

Epoch 20 | Loss: 20.541806590701537

Epoch 21 | Los