# In [2]:
import numpy as np
from sklearn.datasets import load_diabetes

# Load the diabetes regression dataset as a (features, targets) pair;
# the resulting shapes are (442, 10) and (442,).
X_diab, y_diab = load_diabetes(return_X_y=True)

# In [70]:
from sklearn.datasets import make_regression

# Small synthetic regression problem: 60 samples with 10 features, generated
# with noise=0.5 and a fixed random_state so every run produces the same data.
X_reg, y_reg = make_regression(n_samples=60, n_features=10, noise=0.5, random_state=42)

# In [77]:
class Layer:
    """
    Fully connected layer that owns its weights, gradients and optimizer state.

    A layer is used in three steps: ``_weightInit`` allocates the parameters,
    ``_setOptimizer`` allocates the optimizer state, and then ``call`` /
    ``_setGrad`` / ``_updateGrad`` perform the forward pass, the backward pass
    and the parameter update.
    """

    # Names accepted for ``activation`` and for ``_setOptimizer``.
    _ACTIVATIONS = ("linear", "relu", "sigmoid")
    _OPTIMIZERS = ("gd", "sgd", "adagrad", "adam", "rms_prop", "gdm")

    # Added to the denominators of the adaptive optimizers to avoid division
    # by zero (numerically the same value as the literal ``10e-8``).
    _EPS = 1e-7

    def __init__(
            self,
            units,
            *,
            input_layer: bool = False,
            activation: str = "linear",
            use_bias: bool = True,
            ):
        """
        Initialize a neural network layer.

        Args:
            units (int): Count of neurons in the layer.
            input_layer (bool, optional): Whether the layer is an input layer. Defaults to False.
            activation (str, optional): Activation function for the layer. Can be "linear", "relu", or "sigmoid". Defaults to "linear".
            use_bias (bool, optional): Whether to use bias in the layer. Defaults to True.

        Raises:
            ValueError: If ``activation`` is not one of the supported names.
        """

        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {self._ACTIVATIONS}"
            )

        self.units = units
        self.input_layer = input_layer
        self.activation = activation
        self.use_bias = use_bias

        self.optimizer = None  # Optimizer name, set by _setOptimizer

        self._input = None  # Input of the last forward pass
        self._output = None  # Activated output of the last forward pass

        self.w = None  # Weights matrix, shape (input_size, units)
        self.bias = None  # Bias row vector, shape (1, units)
        self._weight_gradient = None  # Weights derivative matrix
        self._bias_gradient = None  # Biases derivative vector

    def activationFunction(self, z):
        """
        Apply the activation function to the given input.

        Args:
            z (numpy.ndarray): Input to the activation function.

        Returns:
            numpy.ndarray: Output after applying the activation function.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """

        if self.activation == "linear":
            return z

        if self.activation == "relu":
            return np.maximum(z, 0.0)

        if self.activation == "sigmoid":
            return 1 / (1 + np.exp(-z))

        raise ValueError(f"Unsupported activation {self.activation!r}")

    def _weightInit(self, input_size):
        """
        Initialize the weights matrix and the bias based on the input size.

        Args:
            input_size (int): Size of the input.

        Notes:
            Only executed for layers other than the input layer.
        """

        if self.input_layer:
            return  # input_layer doesn't need weights

        # ``scale`` is the standard deviation of the normal distribution, so
        # the weights are drawn with mean 0 and standard deviation
        # 1 / input_size (i.e. variance 1 / input_size ** 2).
        self.w = np.random.normal(loc=0, scale=1 / input_size, size=(input_size, self.units))

        # Biases start at zero.
        self.bias = np.zeros((1, self.units))

    def _setOptimizer(self, optimizer, beta_1, beta_2):
        """
        Set the optimizer and initialize optimizer-specific variables.

        Args:
            optimizer (str): Optimization algorithm to use. One of "gd",
                "sgd", "adagrad", "adam", "rms_prop" or "gdm".
            beta_1 (float): Decay rate of the first-moment (momentum) average.
            beta_2 (float): Decay rate of the second-moment average.

        Raises:
            ValueError: If ``optimizer`` is not a supported name.

        Notes:
            - Only executed for layers other than the input layer.
            - Must be called after ``_weightInit`` because the state arrays
              take their shapes from the weights and the bias.
        """

        if self.input_layer:
            return

        if optimizer not in self._OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer {optimizer!r}; expected one of {self._OPTIMIZERS}"
            )

        self.optimizer = optimizer
        self._b1 = beta_1
        self._b2 = beta_2

        if optimizer == "sgd":
            # Kept for backward compatibility. The layer never reads this
            # attribute itself: batching is done by whoever calls the layer.
            self.batch_size = 1

        if optimizer in ("adam", "gdm"):
            # First-moment (momentum) accumulators.
            self._weight_m = np.zeros(self.w.shape)
            self._bias_m = np.zeros(self.bias.shape)

        if optimizer in ("adagrad", "adam", "rms_prop"):
            # Second-moment (squared-gradient) accumulators.
            self._weight_v = np.zeros(self.w.shape)
            self._bias_v = np.zeros(self.bias.shape)

        if optimizer == "adam":
            self._iter = 0  # Number of updates so far, used for bias correction

    def _activationDerivative(self):
        """
        Compute the derivative of the activation function.

        The derivative is expressed through the stored output of the last
        forward pass, so ``call`` must have been executed first.

        Returns:
            numpy.ndarray or int: Derivative of the activation function
            (the scalar 1 for the linear activation).

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """

        if self.activation == "linear":
            return 1

        if self.activation == "relu":
            return (self._output > 0) * 1

        if self.activation == "sigmoid":
            return self._output * (1 - self._output)

        raise ValueError(f"Unsupported activation {self.activation!r}")

    def _setGrad(self, grad):
        """
        Calculate the gradients of weights and bias for backpropagation.

        Args:
            grad (numpy.ndarray): Gradient of the loss with respect to this
                layer's output, coming from the following layer.

        Returns:
            numpy.ndarray: Gradient to be passed to the previous layer
            (None for the input layer).

        Notes:
            Only executed for layers other than the input layer.
        """

        if self.input_layer:
            return

        # Chain rule through the activation, then through the affine map.
        grad = grad * self._activationDerivative()
        self._weight_gradient = self._input.T @ grad

        if self.use_bias:
            self._bias_gradient = grad.sum(axis=0, keepdims=True)

        return grad @ self.w.T

    def _updateGrad(self, learning_rate):
        """
        Update the weights and bias based on the computed gradients.

        Args:
            learning_rate (float): Learning rate for gradient descent.

        Raises:
            ValueError: If no supported optimizer has been set.

        Notes:
            - Only executed for layers other than the input layer.
            - The same update rule is applied to the weights and, when
              ``use_bias`` is true, to the bias.
        """

        if self.input_layer:
            return

        if self.optimizer == "adam":
            # One shared step counter for weights and bias.
            self._iter += 1

        self.w -= self._optimizerStep("weight", self._weight_gradient, learning_rate)

        if self.use_bias:
            self.bias -= self._optimizerStep("bias", self._bias_gradient, learning_rate)

    def _optimizerStep(self, name, gradient, learning_rate):
        """
        Advance the optimizer state of one parameter and return its update.

        Args:
            name (str): "weight" or "bias"; selects the ``_<name>_m`` and
                ``_<name>_v`` state attributes.
            gradient (numpy.ndarray): Gradient of the loss for that parameter.
            learning_rate (float): Base learning rate.

        Returns:
            numpy.ndarray: Amount to subtract from the parameter.

        Raises:
            ValueError: If ``self.optimizer`` is not a supported name.
        """

        optimizer = self.optimizer
        m_attr = f"_{name}_m"
        v_attr = f"_{name}_v"

        if optimizer in ("gd", "sgd"):
            return learning_rate * gradient

        if optimizer == "adagrad":
            # Accumulate all past squared gradients.
            v = getattr(self, v_attr) + np.square(gradient)
            setattr(self, v_attr, v)
            return learning_rate / (np.sqrt(v) + self._EPS) * gradient

        if optimizer == "rms_prop":
            # Exponential moving average of squared gradients.
            v = self._b2 * getattr(self, v_attr) + (1 - self._b2) * np.square(gradient)
            setattr(self, v_attr, v)
            return learning_rate / (np.sqrt(v) + self._EPS) * gradient

        if optimizer == "gdm":
            # Exponential moving average of the gradient. It is a first-moment
            # estimate, so it decays with beta_1, as Adam's first moment does.
            m = self._b1 * getattr(self, m_attr) + (1 - self._b1) * gradient
            setattr(self, m_attr, m)
            return learning_rate * m

        if optimizer == "adam":
            m = self._b1 * getattr(self, m_attr) + (1 - self._b1) * gradient
            v = self._b2 * getattr(self, v_attr) + (1 - self._b2) * np.square(gradient)
            setattr(self, m_attr, m)
            setattr(self, v_attr, v)

            # Bias correction for the zero-initialized moving averages.
            m_hat = m / (1 - np.power(self._b1, self._iter))
            v_hat = v / (1 - np.power(self._b2, self._iter))
            return learning_rate * m_hat / (np.sqrt(v_hat) + self._EPS)

        raise ValueError(f"Unsupported optimizer {optimizer!r}")

    def call(self, X):
        """
        Perform a forward pass through the layer.

        Args:
            X (numpy.ndarray): Input to the layer.

        Returns:
            numpy.ndarray: Output of the layer after applying the activation
            function (the input itself for an input layer).
        """
        if self.input_layer:
            return X

        # Both are kept because the backward pass needs them.
        self._input = X
        self._output = self.activationFunction(X @ self.w + self.bias)

        return self._output

# In [78]:
class NeauralNetwork:
    """
    Sequential feed-forward network trained with mini-batch backpropagation.

    The network only relies on each layer exposing ``units``, ``call``,
    ``_weightInit``, ``_setOptimizer``, ``_setGrad`` and ``_updateGrad``.
    """

    def __init__(
            self,
            layers: list,
            loss_function: str = "mse",
            learning_rate = 0.01,
            verbose: bool = False,
            optimizer: str = "gd",
            epochs: int = 1,
            batch_size: int = 32,
            beta_1: float = 0.9,
            beta_2: float = 0.999
            ):
        """
        Initialize a neural network.

        Args:
            layers (list): List of Layer objects defining the network architecture.
            loss_function (str, optional): Loss function to use. Defaults to "mse".
            optimizer (str, optional): Optimization algorithm to use for updating weights during training.
                Options include:
                - "gd" (Gradient Descent): Standard gradient descent.
                - "sgd" (Stochastic Gradient Descent): Update weights using a single sample at a time.
                - "adagrad" (Adaptive Gradient): Adjust the learning rate based on the frequency of feature occurrences.
                - "adam" (Adam): Adaptive Moment Estimation algorithm.
                - "rms_prop" (Root Mean Square Propagation): Adapt the learning rate based on the moving average of squared gradients.
                - "gdm" (Gradient Descent with Momentum): Add momentum to the gradient descent algorithm.
                Defaults to "gd".

            learning_rate (float, optional): Learning rate for gradient descent. Defaults to 0.01.
            epochs (int, optional): Number of epochs for training. Defaults to 1.
            batch_size (int, optional): Batch size for training. Defaults to 32.
                Ignored for the "sgd" optimizer, which always uses batches of one sample.
            verbose (bool, optional): Whether to display training progress. Defaults to False.

            beta_1 (float, optional): Parameter for the optimizer. Defaults to 0.9.
            beta_2 (float, optional): Parameter for the optimizer. Defaults to 0.999.
        """

        self.layers = layers
        self.loss_function = loss_function
        self.learning_rate = learning_rate
        self.verbose = verbose
        self.optimizer = optimizer  # Optimizer for all layers
        self.epochs = epochs

        # "sgd" is documented as one sample per update, and the batching is
        # done here in fit(), so the batch size has to be forced here.
        self.batch_size = 1 if optimizer == "sgd" else batch_size

        self.beta_1 = beta_1  # Optimizer parameters
        self.beta_2 = beta_2  # Optimizer parameters

        # Initialize the weights of each layer and set its optimizer. For the
        # first layer the index i - 1 wraps around to the last layer, exactly
        # as before; the first layer is expected to be an input layer, which
        # allocates no weights.
        for i, layer in enumerate(self.layers):
            layer._weightInit(self.layers[i - 1].units)
            layer._setOptimizer(self.optimizer, self.beta_1, self.beta_2)

    def lossFunction(self, y_true, y_pred):
        """
        Compute the loss between the true values and predicted values.

        Args:
            y_true (numpy.ndarray): True values, shape (n_samples, output_size).
            y_pred (numpy.ndarray): Predicted values, same shape.

        Returns:
            float: Loss value. For "mse" this is half of the mean (over
            samples) squared Euclidean error.

        Raises:
            ValueError: If ``self.loss_function`` is not supported.
        """

        if self.loss_function == "mse":
            return 0.5 * np.mean(np.sum(np.square(y_pred - y_true), axis=1))

        # Further losses can be added here.
        raise ValueError(f"Unsupported loss function {self.loss_function!r}")

    def _lossFunctionDerivative(self, y_pred, y_true):
        """
        Compute the derivative of the loss function with respect to y_pred.

        Note that the argument order is (y_pred, y_true), the opposite of
        ``lossFunction``.

        Args:
            y_pred (numpy.ndarray): Predicted values.
            y_true (numpy.ndarray): True values.

        Returns:
            numpy.ndarray: Derivative of the loss function.

        Raises:
            ValueError: If ``self.loss_function`` is not supported.
        """

        if self.loss_function == "mse":
            return 1 / len(y_pred) * (y_pred - y_true)

        # Further losses can be added here.
        raise ValueError(f"Unsupported loss function {self.loss_function!r}")

    def fit(self, X, y):
        """
        Train the neural network on the given input-output pairs.

        Args:
            X (numpy.ndarray): Input data, one row per sample.
            y (numpy.ndarray): Output data. A 1-D array is reshaped to a
                column vector of shape (n_samples, 1).

        Raises:
            ValueError: If ``batch_size`` is not positive or X and y have a
                different number of samples.

        Notes:
            - With ``verbose`` enabled, the reported loss is that of the
              current batch, computed before its weight update.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            # Without this, (n, 1) predictions minus (n,) targets would
            # silently broadcast to an (n, n) matrix.
            y = y.reshape(-1, 1)

        n_samples = len(X)
        if len(y) != n_samples:
            raise ValueError(
                f"X and y must contain the same number of samples, got {n_samples} and {len(y)}"
            )
        if n_samples == 0:
            return  # Nothing to train on

        # (start, stop) slice bounds of every batch; the last one may be short.
        batch_bounds = [
            (start, start + self.batch_size)
            for start in range(0, n_samples, self.batch_size)
        ]
        n_batches = len(batch_bounds)

        order = np.arange(n_samples)

        for epoch in range(self.epochs):
            np.random.shuffle(order)  # Shuffle the training data

            for batch_no, (start, stop) in enumerate(batch_bounds):
                rows = order[start:stop]
                X_batch = X[rows]
                y_batch = y[rows]

                pred = self.forward(X_batch)

                if self.verbose:
                    filled = int(batch_no / n_batches * 10)
                    print(f"\r Epoch {epoch + 1}/{self.epochs}; Batch {batch_no}/{n_batches}: [{filled * '=' + '>' + (10 - filled) * '-'}] - loss: {self.lossFunction(y_batch, pred)}",end='')

                self.backward(pred, y_batch)

            if self.verbose:
                print(f"\r Epoch {epoch + 1}/{self.epochs}; Batch {n_batches}/{n_batches}: [{11 * '='}] - loss: {self.lossFunction(y_batch, pred)}")

    def predict(self, X):
        """
        Perform predictions using the trained neural network.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Predicted output data.
        """

        return self.forward(X)

    def forward(self, X):
        """
        Perform a forward pass through the network.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Output of the last layer.
        """

        # Work on a copy so the caller's array is never handed to the layers.
        X_ = np.copy(X)

        for layer in self.layers:
            X_ = layer.call(X_)
        return X_

    def backward(self, y_pred, y_true):
        """
        Perform backpropagation to update the weights of the network.

        Args:
            y_pred (numpy.ndarray): Predicted values.
            y_true (numpy.ndarray): True values.
        """

        gradient = self._lossFunctionDerivative(y_pred, y_true)

        # Each layer first computes the gradient for the layer before it
        # (using its not-yet-updated weights) and is updated afterwards.
        for layer in reversed(self.layers):
            gradient = layer._setGrad(gradient)
            layer._updateGrad(self.learning_rate)

# In [85]:
# Regression network for the diabetes data: 10 inputs, two hidden ReLU layers
# of 40 units each, and a single linear output unit.
nn = NeauralNetwork(
    layers=[
        Layer(units=10, input_layer=True),
        # Layer(units=40, activation="sigmoid"),
        Layer(units=40, activation="relu"),
        Layer(units=40, activation="relu"),
        Layer(units=1),
    ],
    optimizer="gdm",
    loss_function="mse",
    learning_rate=0.001,
    batch_size=64,
    epochs=1000,
    verbose=True,
)

# The network requires the targets as a column vector of shape (n_samples, 1).
y_diab = y_diab.reshape((-1, 1))

nn.fit(X_diab, y_diab)

