In [110]:
import os
import numpy as np
from sklearn.datasets import load_diabetes
from PIL import Image

In [2]:
class DenseLayer:
    """
    Represents a fully connected neural network layer.
    ==========
    Attributes:
    ----------
        size (int): Number of neurons in the layer.
        input_layer (bool): Whether the layer is an input layer.
        activation (str): Activation function for the layer.
        use_bias (bool): Whether to use bias in the layer.
        optimizer (str): Optimization algorithm used for the layer.
        w (numpy.ndarray): Weights matrix of shape (input_size, size).
        bias (numpy.ndarray): Bias row vector of shape (1, size).

    Methods:
    ---------
        activationFunction(z):
            Apply the activation function to the given input.

        __call__(X):
            Perform a forward pass through the layer.

    """

    # Optimizer names accepted by _setOptimizer. "sgd" is handled exactly like
    # "gd": the layer applies one plain gradient step per _updateGrad call.
    _OPTIMIZERS = ("gd", "sgd", "adagrad", "adam", "rms_prop", "gdm")

    def __init__(
            self,
            size,
            *,
            input_layer: bool = False,
            activation: str = "linear",
            use_bias: bool = True,
            ):
        """
        Initialize a neural network layer.

        Args:
            size (int): Count of neurons in the layer.
            input_layer (bool, optional): Whether the layer is an input layer. Defaults to False.
            activation (str, optional): Activation function for the layer. Can be "linear", "relu", or "sigmoid". Defaults to "linear".
            use_bias (bool, optional): Whether to use bias in the layer. Defaults to True.
        """

        self.size = size
        self.input_layer = input_layer
        self.activation = activation
        self.use_bias = use_bias

        self.optimizer = None # Optimizer for layer

        self._input = None
        self._output = None

        self.w = None # Weights matrix
        self.bias = None # Biases row vector, created by _weightInit
        self._weight_gradient = None # Weights derivative matrix
        self._bias_gradient = None # Biases derivative vector

    def _weightInit(self, input_size):
        """
        Initialize the weights matrix based on the input size.

        Args:
            input_size (int): Size of the input.

        Notes:
            Only executed for layers other than the input layer.
        """

        if self.input_layer:
            return # input_layer doesn't need weights

        # Normal distribution with mean 0 and variance 1 / input_size.
        # np.random.normal takes the standard deviation, hence the square root.
        self.w = np.random.normal(
            loc=0, scale=np.sqrt(1 / input_size), size=(input_size, self.size)
        )

        # Biases start at zero
        self.bias = np.zeros((1, self.size))

    def activationFunction(self, z):
        """
        Apply the activation function to the given input.

        Args:
            z (numpy.ndarray): Input to the activation function.

        Returns:
            numpy.ndarray: Output after applying the activation function.

        Raises:
            ValueError: If the configured activation is not supported.
        """

        if self.activation == "linear":
            return z

        if self.activation == "relu":
            return np.maximum(z, np.zeros(z.shape))

        if self.activation == "sigmoid":
            # exp(-|z|) never exceeds 1, so neither branch can overflow.
            decay = np.exp(-np.abs(z))
            return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

        raise ValueError(f"Unsupported activation: {self.activation!r}")

    def _setOptimizer(self, optimizer, beta_1, beta_2):
        """
        Set the optimizer and initialize optimizer-specific variables.

        Args:
            optimizer (str): Optimization algorithm to use.
            beta_1 (float): Value for the optimizer parameter beta_1.
            beta_2 (float): Value for the optimizer parameter beta_2.

        Raises:
            ValueError: If the optimizer name is not supported.

        Notes:
            - Only executed for layers other than the input layer.
            - Must be called after _weightInit, because the state arrays copy
              the shapes of the weights and biases.
        """

        if self.input_layer:
            return

        if optimizer not in self._OPTIMIZERS:
            raise ValueError(f"Unsupported optimizer: {optimizer!r}")

        self.optimizer = optimizer
        self._b1 = beta_1
        self._b2 = beta_2

        if optimizer == "adam":
            self._iter = 0  # Number of update steps, used for bias correction

        if optimizer in ("adam", "gdm"):
            # First-moment (momentum) accumulators
            self._weight_m = np.zeros(self.w.shape)
            if self.use_bias:
                self._bias_m = np.zeros(self.bias.shape)

        if optimizer in ("adagrad", "adam", "rms_prop"):
            # Squared-gradient accumulators
            self._weight_v = np.zeros(self.w.shape)
            if self.use_bias:
                self._bias_v = np.zeros(self.bias.shape)

    def _activationDerivative(self):
        """
        Compute the derivative of the activation function.

        Returns:
            numpy.ndarray: Derivative of the activation function, evaluated
            from the output stored by the last forward pass.

        Raises:
            ValueError: If the configured activation is not supported.

        Notes:
            Only supports the "linear", "relu", and "sigmoid" activation functions.
        """

        if self.activation == "linear":
            return 1

        if self.activation == "relu":
            return (self._output > 0) * 1

        if self.activation == "sigmoid":
            return self._output * (1 - self._output)

        raise ValueError(f"Unsupported activation: {self.activation!r}")

    def _setGrad(self, grad):
        """
        Calculate the gradients of weights and bias for backpropagation.

        Args:
            grad (numpy.ndarray): Gradient of the loss with respect to this
                layer's output. A 1-D vector is treated as a single row.

        Returns:
            numpy.ndarray: Gradient to be passed to the previous layer.

        Notes:
            Only executed for layers other than the input layer.
        """

        if self.input_layer:
            return

        grad = np.atleast_2d(grad) * self._activationDerivative()
        self._weight_gradient = self._input.T @ grad

        if self.use_bias:
            self._bias_gradient = grad.sum(axis=0, keepdims=True)

        return grad @ self.w.T

    def _applyUpdate(self, param, grad, prefix, learning_rate):
        """
        Apply one optimizer step to a single parameter array, in place.

        Args:
            param (numpy.ndarray): Parameter to update (weights or biases).
            grad (numpy.ndarray): Gradient of the loss with respect to param.
            prefix (str): "_weight" or "_bias"; selects the optimizer state
                attributes (prefix + "_m", prefix + "_v").
            learning_rate (float): Step size.

        Raises:
            ValueError: If no supported optimizer has been set.
        """

        eps = 1e-7 # Optimizer's epsilon (the same value as the literal 10e-8)
        name_m, name_v = prefix + "_m", prefix + "_v"

        if self.optimizer in ("gd", "sgd"):
            param -= learning_rate * grad

        elif self.optimizer == "adagrad":
            squared = getattr(self, name_v)
            squared += np.square(grad)
            param -= learning_rate / (np.sqrt(squared) + eps) * grad

        elif self.optimizer == "adam":
            moment = self._b1 * getattr(self, name_m) + (1 - self._b1) * grad
            squared = self._b2 * getattr(self, name_v) + (1 - self._b2) * np.square(grad)
            setattr(self, name_m, moment)
            setattr(self, name_v, squared)

            # Bias-corrected moment estimates
            moment_hat = moment / (1 - np.power(self._b1, self._iter))
            squared_hat = squared / (1 - np.power(self._b2, self._iter))

            param -= learning_rate * moment_hat / (np.sqrt(squared_hat) + eps)

        elif self.optimizer == "rms_prop":
            squared = self._b2 * getattr(self, name_v) + (1 - self._b2) * np.square(grad)
            setattr(self, name_v, squared)
            param -= learning_rate / (np.sqrt(squared) + eps) * grad

        elif self.optimizer == "gdm":
            # NOTE(review): the momentum coefficient is beta_2, not beta_1.
            # Kept as-is to preserve behavior; confirm this is intended.
            moment = self._b2 * getattr(self, name_m) + (1 - self._b2) * grad
            setattr(self, name_m, moment)
            param -= learning_rate * moment

        else:
            raise ValueError(f"Unsupported optimizer: {self.optimizer!r}")

    def _updateGrad(self, learning_rate):
        """
        Update the weights and bias based on the computed gradients.

        Args:
            learning_rate (float): Learning rate for gradient descent.

        Notes:
            - Only executed for layers other than the input layer.
            - Updates the weights and biases based on the computed gradients and the chosen optimizer.
            - The bias is only updated when use_bias is True.
        """

        if self.input_layer:
            return

        if self.optimizer == "adam":
            self._iter += 1 # One step counter shared by weights and biases

        self._applyUpdate(self.w, self._weight_gradient, "_weight", learning_rate)

        if self.use_bias:
            self._applyUpdate(self.bias, self._bias_gradient, "_bias", learning_rate)

    def __call__(self, X):
        """
        Perform a forward pass through the layer.

        Args:
            X (numpy.ndarray): Input to the layer. A 1-D vector is treated as
                a single row so the backward pass can form its matrix products.

        Returns:
            numpy.ndarray: Output of the layer after applying the activation function.
        """
        if self.input_layer:
            return X

        self._input = np.atleast_2d(X)

        pre_activation = self._input @ self.w
        if self.use_bias:
            pre_activation = pre_activation + self.bias

        self._output = self.activationFunction(pre_activation)

        return self._output

In [231]:
class Conv2d:
    """
    2-D convolutional layer operating on a single (height, width, channels) tensor.
    ==========
    Attributes:
    ----------
        size (int): Number of filters (output channels).
        kernel_size (tuple): (height, width) of the kernel; after _weightInit
            it becomes (height, width, depth).
        padding (int): Number of zero rows/columns added on every side of the input.
        stride (int): Step between neighbouring kernel windows.
        activation (str): "linear" or "relu".
        mode (str): Stored but not used by this class.
        pooling (str): None for no pooling, or "average" for global average pooling.
        use_bias (bool): Whether a per-filter bias is added and trained.
        kernel (numpy.ndarray): Weights of shape (height, width, depth, size).
        bias (numpy.ndarray): Biases of shape (size,); only created when use_bias is True.

    Methods:
    ---------
        activationFunction(z):
            Apply the activation function to the given input.

        __call__(tensor):
            Perform a forward pass through the layer.
    """

    # Optimizer names accepted by _setOptimizer. "sgd" is handled exactly like "gd".
    _OPTIMIZERS = ("gd", "sgd", "adagrad", "adam", "rms_prop", "gdm")

    def __init__(
            self,
            size: int,
            kernel_size: tuple,
            *,
            padding: int = 0,
            stride: int = 1,
            activation: str = "linear",
            mode: str = "rgb",
            pooling: str = None,
            use_bias: bool = True,
        ):
        """
        Initialize a convolutional layer.

        Args:
            size (int): Number of filters.
            kernel_size (tuple): (height, width) of the kernel. A single int
                is accepted and means a square kernel.
            padding (int, optional): Zero padding on every side. Defaults to 0.
            stride (int, optional): Step between windows. Defaults to 1.
            activation (str, optional): "linear" or "relu". Defaults to "linear".
            mode (str, optional): Stored only. Defaults to "rgb".
            pooling (str, optional): None or "average". Defaults to None.
            use_bias (bool, optional): Whether to use a bias. Defaults to True.
        """
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)

        self.size = size
        self.kernel_size = kernel_size

        self.padding = padding
        self.stride = stride
        self.activation = activation
        self.mode = mode
        self.pooling = pooling
        self.use_bias = use_bias

        self.kernel = None
        self.bias = None
        self.optimizer = None

        self._input = None # Padded input of the last forward pass
        self._output = None # Activated, un-pooled output of the last forward pass

        self._kernel_gradient = None # Kernel derivative tensor
        self._bias_gradient = None # Biases derivative vector

    def _weightInit(self, depth):
        """
        Create the kernel and bias for an input with the given channel count.

        Args:
            depth (int): Number of channels of the incoming tensor.

        Notes:
            Safe to call more than once: only the two spatial entries of
            kernel_size are reused, so the depth is replaced, not appended again.
        """
        height, width = self.kernel_size[:2]
        self.kernel_size = (height, width, depth)

        # Normal distribution with mean 0 and variance 1 / fan_in.
        # np.random.normal takes the standard deviation, hence the square root.
        fan_in = height * width * depth
        self.kernel = np.random.normal(
            loc=0, scale=np.sqrt(1 / fan_in), size=self.kernel_size + (self.size,)
        )
        self._kernel_gradient = np.zeros_like(self.kernel)

        if self.use_bias:
            # Biases start at zero
            self.bias = np.zeros(self.size)
            self._bias_gradient = np.zeros_like(self.bias)

    def activationFunction(self, z):
        """
        Apply the activation function to the given input.

        Args:
            z (numpy.ndarray): Input to the activation function.

        Returns:
            numpy.ndarray: Output after applying the activation function.

        Raises:
            ValueError: If the configured activation is not supported.
        """

        if self.activation == "linear":
            return z

        if self.activation == "relu":
            return np.maximum(z, np.zeros(z.shape))

        raise ValueError(f"Unsupported activation: {self.activation!r}")

    def _activationDerivative(self):
        """
        Derivative of the activation, evaluated from the stored output.

        Returns:
            int or numpy.ndarray: 1 for "linear"; a 0/1 mask shaped like the
            un-pooled output for "relu".

        Raises:
            ValueError: If the configured activation is not supported.
        """
        if self.activation == "linear":
            return 1

        if self.activation == "relu":
            return (self._output > 0) * 1

        raise ValueError(f"Unsupported activation: {self.activation!r}")

    def _setOptimizer(self, optimizer, beta_1, beta_2):
        """
        Set the optimizer and initialize optimizer-specific variables.

        Args:
            optimizer (str): Optimization algorithm to use.
            beta_1 (float): Value for the optimizer parameter beta_1.
            beta_2 (float): Value for the optimizer parameter beta_2.

        Raises:
            ValueError: If the optimizer name is not supported.

        Notes:
            Must be called after _weightInit, because the state arrays copy
            the shapes of the kernel and biases.
        """
        if optimizer not in self._OPTIMIZERS:
            raise ValueError(f"Unsupported optimizer: {optimizer!r}")

        self.optimizer = optimizer
        self._b1 = beta_1
        self._b2 = beta_2

        if optimizer == "adam":
            self._iter = 0  # Number of update steps, used for bias correction

        if optimizer in ("adam", "gdm"):
            # First-moment (momentum) accumulators
            self._weight_m = np.zeros(self.kernel.shape)
            if self.use_bias:
                self._bias_m = np.zeros(self.bias.shape)

        if optimizer in ("adagrad", "adam", "rms_prop"):
            # Squared-gradient accumulators
            self._weight_v = np.zeros(self.kernel.shape)
            if self.use_bias:
                self._bias_v = np.zeros(self.bias.shape)

    def _setGrad(self, grad):
        """
        Back-propagate a gradient through the layer.

        Computes the gradients of the loss with respect to the kernel and the
        bias (stored on the instance) and with respect to the layer input
        (returned). Gradients are rebuilt from zero on every call, so nothing
        carries over from earlier samples.

        Args:
            grad (numpy.ndarray): Gradient of the loss with respect to the
                value returned by the last forward pass: `size` values when
                pooling is "average", otherwise one value per element of the
                un-pooled output.

        Returns:
            numpy.ndarray: Gradient with respect to the (un-padded) input,
            shaped (height, width, channels).
        """
        activated = self._output
        n_rows, n_cols, n_filters = activated.shape
        grad = np.asarray(grad, dtype=float)

        if self.pooling == "average":
            # Every position contributes 1 / (rows * cols) to the pooled mean.
            delta = np.broadcast_to(
                grad.reshape(1, 1, n_filters) / (n_rows * n_cols), activated.shape
            )
        else:
            delta = grad.reshape(activated.shape)

        # Gradient with respect to the pre-activation values
        delta = delta * self._activationDerivative()

        kernel_gradient = np.zeros_like(self.kernel)
        input_gradient = np.zeros_like(self._input)

        for i, (row_start, row_end) in enumerate(self._indices_axis1):
            for j, (col_start, col_end) in enumerate(self._indices_axis2):
                position_delta = delta[i, j]
                if not position_delta.any():
                    continue # Nothing flows back through this window

                window = self._input[row_start:row_end, col_start:col_end, :, np.newaxis]
                kernel_gradient += window * position_delta
                input_gradient[row_start:row_end, col_start:col_end] += self.kernel @ position_delta

        self._kernel_gradient = kernel_gradient

        if self.use_bias:
            self._bias_gradient = delta.sum(axis=(0, 1))

        if self.padding:
            # Drop the gradient that belongs to the zero padding
            pad = self.padding
            input_gradient = input_gradient[pad:-pad, pad:-pad]

        return input_gradient

    def _applyUpdate(self, param, grad, prefix, learning_rate):
        """
        Apply one optimizer step to a single parameter array, in place.

        Args:
            param (numpy.ndarray): Parameter to update (kernel or biases).
            grad (numpy.ndarray): Gradient of the loss with respect to param.
            prefix (str): "_weight" or "_bias"; selects the optimizer state
                attributes (prefix + "_m", prefix + "_v").
            learning_rate (float): Step size.

        Raises:
            ValueError: If no supported optimizer has been set.
        """
        eps = 1e-7 # Optimizer's epsilon (the same value as the literal 10e-8)
        name_m, name_v = prefix + "_m", prefix + "_v"

        if self.optimizer in ("gd", "sgd"):
            param -= learning_rate * grad

        elif self.optimizer == "adagrad":
            squared = getattr(self, name_v)
            squared += np.square(grad)
            param -= learning_rate / (np.sqrt(squared) + eps) * grad

        elif self.optimizer == "adam":
            moment = self._b1 * getattr(self, name_m) + (1 - self._b1) * grad
            squared = self._b2 * getattr(self, name_v) + (1 - self._b2) * np.square(grad)
            setattr(self, name_m, moment)
            setattr(self, name_v, squared)

            # Bias-corrected moment estimates
            moment_hat = moment / (1 - np.power(self._b1, self._iter))
            squared_hat = squared / (1 - np.power(self._b2, self._iter))

            param -= learning_rate * moment_hat / (np.sqrt(squared_hat) + eps)

        elif self.optimizer == "rms_prop":
            squared = self._b2 * getattr(self, name_v) + (1 - self._b2) * np.square(grad)
            setattr(self, name_v, squared)
            param -= learning_rate / (np.sqrt(squared) + eps) * grad

        elif self.optimizer == "gdm":
            # NOTE(review): the momentum coefficient is beta_2, not beta_1.
            # Kept as-is to preserve behavior; confirm this is intended.
            moment = self._b2 * getattr(self, name_m) + (1 - self._b2) * grad
            setattr(self, name_m, moment)
            param -= learning_rate * moment

        else:
            raise ValueError(f"Unsupported optimizer: {self.optimizer!r}")

    def _updateGrad(self, learning_rate):
        """
        Update the kernel and bias from the gradients stored by _setGrad.

        Args:
            learning_rate (float): Learning rate for the update step.
        """
        if self.optimizer == "adam":
            self._iter += 1 # One step counter shared by kernel and biases

        self._applyUpdate(self.kernel, self._kernel_gradient, "_weight", learning_rate)

        if self.use_bias:
            self._applyUpdate(self.bias, self._bias_gradient, "_bias", learning_rate)

    def __call__(self, tensor):
        """
        Perform a forward pass through the layer.

        Args:
            tensor (numpy.ndarray): Input of shape (height, width, channels).
                A 2-D array is treated as a single-channel image.

        Returns:
            numpy.ndarray: The activated feature map of shape
            (out_height, out_width, size), or its mean over both spatial axes
            with shape (1, size) when pooling is "average".

        Raises:
            ValueError: If the pooling mode is unsupported, the channel count
                does not match the kernel, or the (padded) input is smaller
                than the kernel.
        """
        if self.pooling and self.pooling != "average":
            raise ValueError(f"Unsupported pooling: {self.pooling!r}")

        image = np.asarray(tensor, dtype=float)
        if image.ndim == 2:
            image = image[..., np.newaxis]

        if image.ndim != 3 or image.shape[2] != self.kernel.shape[2]:
            raise ValueError(
                f"Expected input of shape (height, width, {self.kernel.shape[2]}), "
                f"got {image.shape}"
            )

        if self.padding:
            pad = self.padding
            image = np.pad(image, ((pad, pad), (pad, pad), (0, 0)))

        kernel_height, kernel_width = self.kernel_size[:2]
        if image.shape[0] < kernel_height or image.shape[1] < kernel_width:
            raise ValueError("Input is smaller than the kernel")

        self._input = image

        # (start, end) slices of every kernel window along each spatial axis
        self._indices_axis1 = [
            (end - kernel_height, end)
            for end in range(kernel_height, image.shape[0] + 1, self.stride)
        ]
        self._indices_axis2 = [
            (end - kernel_width, end)
            for end in range(kernel_width, image.shape[1] + 1, self.stride)
        ]

        feature_map = np.empty((len(self._indices_axis1), len(self._indices_axis2), self.size))

        for i, (row_start, row_end) in enumerate(self._indices_axis1):
            for j, (col_start, col_end) in enumerate(self._indices_axis2):
                # Contract the window with every filter at once -> (size,)
                feature_map[i, j] = np.tensordot(
                    image[row_start:row_end, col_start:col_end], self.kernel, axes=3
                )

        if self.use_bias:
            feature_map = feature_map + self.bias

        self._output = self.activationFunction(feature_map)

        if self.pooling == "average":
            return self._output.mean(axis=(0, 1))[np.newaxis, ...] # (1, size)

        return self._output

In [232]:
class NeauralNetwork:
    """
    Neural Network
    ==============

    A neural network model for deep learning.

    The `NeauralNetwork` class allows you to create and train a neural network model with customizable architecture and
    training parameters.

    Args:
    -------
        layers (list): List of Layer objects defining the network architecture.
        loss_function (str, optional): Loss function to use ("mse" or "cross_entropy"). Defaults to "mse".
        softmax (bool, optional): Whether to apply a softmax to the output of the last layer. Defaults to False.
        learning_rate (float, optional): Learning rate for gradient descent. Defaults to 0.01.
        verbose (bool, optional): Whether to display training progress. Defaults to False.
        optimizer (str, optional): Optimization algorithm to use for updating weights during training. Defaults to "gd".
        epochs (int, optional): Number of epochs for training. Defaults to 1.
        batch_size (int, optional): Stored but not used: fit trains on one sample at a time. Defaults to 32.
        beta_1 (float, optional): Parameter for the optimizer. Defaults to 0.9.
        beta_2 (float, optional): Parameter for the optimizer. Defaults to 0.999.
        input_size (int, optional): Size passed to the first layer's weight initialization. Defaults to 3.

    Methods:
    --------
        lossFunction(self, y_true, y_pred)
            Compute the loss between the true values and predicted values.
        fit(self, X, y)
            Train the neural network on the given input-output pairs.
        predict(self, X)
            Perform predictions using the trained neural network.
        forward(self, X)
            Perform a forward pass through the network.
        backward(self, y_pred, y_true)
            Perform backpropagation to update the weights of the network.
    """
    def __init__(
            self,
            layers: list,
            loss_function: str = "mse",
            softmax: bool = False,
            learning_rate = 0.01,
            verbose: bool = False,
            optimizer: str = "gd",
            epochs: int = 1,
            batch_size: int = 32,
            beta_1: float = 0.9,
            beta_2: float = 0.999,
            input_size: int = 3,
            ):
        """
        Initialize a neural network.
        --------
        Args:
        --------
            layers (list): List of Layer objects defining the network architecture.
            loss_function (str, optional): Loss function to use. Defaults to "mse".
            softmax (bool, optional): Whether to apply a softmax to the network output. Defaults to False.
            optimizer (str, optional): Optimization algorithm name, passed unchanged to every layer's
                `_setOptimizer`. Defaults to "gd".

            learning_rate (float, optional): Learning rate for gradient descent. Defaults to 0.01.
            epochs (int, optional): Number of epochs for training. Defaults to 1.
            batch_size (int, optional): Stored only. Defaults to 32.
            verbose (bool, optional): Whether to display training progress. Defaults to False.

            beta_1 (float, optional): Parameter for the optimizer. Defaults to 0.9.
            beta_2 (float, optional): Parameter for the optimizer. Defaults to 0.999.
            input_size (int, optional): Value given to the first layer's `_weightInit`
                (for example the number of image channels). Defaults to 3.
        """

        self.layers = layers
        self.loss_function = loss_function
        self.softmax = softmax
        self.learning_rate = learning_rate
        self.verbose = verbose
        self.optimizer = optimizer  # Optimizer for all layers
        self.epochs = epochs
        self.batch_size = batch_size

        self.beta_1 = beta_1  # Optimizer parameters
        self.beta_2 = beta_2  # Optimizer parameters

        # Initialize the weights of each layer from the size of the layer
        # before it, then set its optimizer (which needs the weight shapes).
        previous_size = input_size
        for layer in self.layers:
            layer._weightInit(previous_size)
            layer._setOptimizer(self.optimizer, self.beta_1, self.beta_2)
            previous_size = layer.size

    @staticmethod
    def _asBatch(y_pred, y_true):
        """
        Bring predictions and targets to matching 2-D (n_samples, n_outputs) arrays.

        Args:
            y_pred (numpy.ndarray): Predicted values.
            y_true (numpy.ndarray): True values.

        Returns:
            tuple: (y_pred, y_true) as float arrays with at least two dimensions.
        """
        y_pred = np.atleast_2d(np.asarray(y_pred, dtype=float))
        y_true = np.asarray(y_true, dtype=float)

        if y_true.size == y_pred.size:
            # Same number of values: line them up element by element.
            y_true = y_true.reshape(y_pred.shape)
        else:
            y_true = np.atleast_2d(y_true)

        return y_pred, y_true

    def lossFunction(self, y_true, y_pred):
        """
        Compute the loss between the true values and predicted values.

        Args:
            y_true (numpy.ndarray): True values.
            y_pred (numpy.ndarray): Predicted values.

        Returns:
            float: Loss value.

        Raises:
            ValueError: If the configured loss function is not supported.
        """
        y_pred, y_true = self._asBatch(y_pred, y_true)

        if self.loss_function == "mse":
            return 0.5 * np.mean(np.linalg.norm(y_pred - y_true, axis=1)**2)

        if self.loss_function == "cross_entropy":
            # The target class of each row is the position of its largest entry.
            rows = np.arange(len(y_pred))
            targets = np.argmax(y_true, axis=1)
            return -np.mean(np.log(y_pred[rows, targets]))

        raise ValueError(f"Unsupported loss function: {self.loss_function!r}")

    def _lossFunctionDerivative(self, y_pred, y_true):
        """
        Compute the derivative of the loss function.

        Args:
            y_pred (numpy.ndarray): Predicted values.
            y_true (numpy.ndarray): True values.

        Returns:
            numpy.ndarray: 2-D gradient of the loss with respect to the output
            of the last layer, i.e. with respect to the softmax inputs when
            softmax is enabled.

        Raises:
            ValueError: If the configured loss function is not supported.
        """
        y_pred, y_true = self._asBatch(y_pred, y_true)
        n_samples = len(y_pred)
        rows = np.arange(n_samples)

        if self.loss_function == "mse":
            derivative = 1 / n_samples * (y_pred - y_true)

        elif self.loss_function == "cross_entropy":
            targets = np.argmax(y_true, axis=1)

            if self.softmax:
                # Softmax followed by cross-entropy simplifies to p - onehot,
                # which also avoids dividing by a vanishing probability.
                one_hot = np.zeros_like(y_pred)
                one_hot[rows, targets] = 1
                return (y_pred - one_hot) / n_samples

            derivative = np.zeros_like(y_pred)
            derivative[rows, targets] = -1 / (n_samples * y_pred[rows, targets])

        else:
            raise ValueError(f"Unsupported loss function: {self.loss_function!r}")

        if self.softmax:
            # Chain rule through the full softmax Jacobian:
            # dL/dz_j = p_j * (dL/dp_j - sum_k dL/dp_k * p_k)
            weighted = np.sum(derivative * y_pred, axis=1, keepdims=True)
            derivative = y_pred * (derivative - weighted)

        return derivative

    def fit(self, X, y):
        """
        Train the neural network on the given input-output pairs.

        Args:
            X (sequence): Input samples; X[i] is passed to forward on its own.
            y (sequence): Target of every sample; y[i] belongs to X[i].

        Raises:
            ValueError: If X and y have different lengths.

        Notes:
            - The weights are updated after every single sample.
        """
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        pred = None
        sample = None

        for epoch in range(self.epochs):
            for sample in range(len(y)):
                pred = self.forward(X[sample])

                if self.verbose:
                    print(f"\r Epoch {epoch + 1}/{self.epochs}: loss: {self.lossFunction(y[sample], pred)}",end='')

                self.backward(pred, y[sample])

        # Final line reports the last sample seen; skipped when nothing was trained.
        if self.verbose and pred is not None:
            print(f"\r Epoch {self.epochs}: loss: {self.lossFunction(y[sample], pred)}")

    def predict(self, X):
        """
        Perform predictions using the trained neural network.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Predicted output data.
        """

        return self.forward(X)

    def forward(self, X):
        """
        Perform a forward pass through the network.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Output of the last layer, turned into probabilities
            along the last axis when softmax is enabled.
        """

        X_ = np.copy(X)

        for layer in self.layers:
            X_ = layer(X_)

        if self.softmax:
            # Subtracting the row maximum leaves the result unchanged
            # mathematically and keeps np.exp from overflowing.
            shifted = X_ - np.max(X_, axis=-1, keepdims=True)
            exponentials = np.exp(shifted)
            self.probs = exponentials / np.sum(exponentials, axis=-1, keepdims=True)
            return self.probs

        return X_

    def backward(self, y_pred, y_true):
        """
        Perform backpropagation to update the weights of the network.

        Args:
            y_pred (numpy.ndarray): Predicted values.
            y_true (numpy.ndarray): True values.
        """

        # Nothing to learn from a sample that is already fitted.
        loss = self.lossFunction(y_true, y_pred)
        if loss < 1e-20:
            return

        gradient = self._lossFunctionDerivative(y_pred, y_true)

        # Each layer computes its gradients from the current weights and hands
        # the input gradient to the layer before it, then updates itself.
        for layer in reversed(self.layers):
            gradient = layer._setGrad(gradient)
            layer._updateGrad(self.learning_rate)

In [233]:
def makeData(path, labels):
    """
    Load every image in a directory together with a one-hot class label.

    The class of a file is taken from the part of its name before the first
    underscore (for example "cucumber_12.png" -> "cucumber").

    Args:
        path (str): Directory containing the image files. A trailing
            separator is optional.
        labels (dict): Maps a file-name prefix to its integer class index.

    Returns:
        tuple: (X_train, y_train), two lists of equal length. X_train holds
        the pixel arrays divided by 256; y_train holds one-hot vectors whose
        length is the largest class index in `labels` plus one.

    Raises:
        KeyError: If a file-name prefix is missing from `labels`.
    """
    X_train, y_train = [], []

    # Size the one-hot vector from the mapping rather than a fixed class count.
    n_classes = max(labels.values(), default=-1) + 1

    for file_name in os.listdir(path):
        # The context manager closes the underlying file once the pixels are read.
        with Image.open(os.path.join(path, file_name)) as image:
            # Scales 8-bit pixel values into [0, 1) - assumes 8-bit images.
            image_array = np.asarray(image) / 256

        label = np.zeros(n_classes)
        label[labels[file_name.split("_")[0]]] = 1

        X_train.append(image_array)
        y_train.append(label)

    return X_train, y_train

# Maps the file-name prefix (the text before the first "_") to the index that
# is set to 1 in the one-hot label vector built by makeData.
labels = {
    "cucumber": 0,
    "eggplant": 1,
    "mushroom": 2,
}

# Directory whose files are read by makeData; relative to the notebook's
# working directory.
path = "./data/train/"

# X_train: list of image arrays, y_train: list of one-hot label vectors.
X_train, y_train = makeData(path, labels)   

In [236]:
# Build the classifier: four convolutional layers (the last one with average
# pooling) followed by a 3-unit dense layer, one unit per entry in `labels`.
nn = NeauralNetwork(layers=[
        Conv2d(10, (5, 5), stride=3, activation='relu'),
        Conv2d(20, (7, 7), stride=5, activation='relu'),
        Conv2d(30, (3, 3), stride=2, activation='relu'),
        Conv2d(80, (3, 3), stride=1, activation='relu', pooling="average"),

        DenseLayer(size=3),
        # Disabled alternative dense layers, kept for reference:
        # DenseLayer(size=40, activation="sigmoid"),
        # DenseLayer(size=40, activation="relu"),
        # DenseLayer(size=40, activation="relu"),
        # DenseLayer(size=1),
    ],
    loss_function = "cross_entropy",
    learning_rate=0.01,
    softmax=True,
    verbose=True,
    optimizer="gd",
    epochs=10
)

# Train on the loaded images; with verbose=True the loss is printed after
# every sample.
nn.fit(X_train, y_train)

 Epoch 1/1: loss: 1.2344886417258072

KeyboardInterrupt: 