# Import Files and Libraries

In [1]:
import os
import sys
import pandas as pd
import numpy as np



# Locate the project root: keep every component of the current working
# directory up to and including the 'Machine_Learning_project' folder.
# (list.index raises ValueError when that folder is not part of the path.)
dir_path = os.getcwd().split(os.path.sep)
root_index = dir_path.index('Machine_Learning_project')
root_path = os.path.sep.join(dir_path[:root_index + 1])

# Make the project's code folders importable for the imports that follow.
sys.path.extend(
    root_path + code_dir
    for code_dir in ('/code/', '/code/data_loaders/', '/code/utils_visualizations/')
)


from data import *
from data_cup import *
from Nostra_Neural_Network import *
from Nostra_Neural_Network.Batch import Batch
from Nostra_Neural_Network.Train import Train
from Nostra_Neural_Network.K_FOLD import k_fold
from Nostra_Neural_Network.NN_creation import NN_creation
from Nostra_Neural_Network.Average import Average
from Nostra_Neural_Network.Neural_Network import Neural_Network
from Nostra_Neural_Network.Activation_Function import activation_function
from plot import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder




# **NEURAL NETWORK**

#### In this project, we have developed a neural network in Python, specifically a fully connected Multi-Layer Perceptron (MLP) that utilizes a backpropagation algorithm for training. The network is designed to be highly flexible, allowing for a general number of layers and units per layer. The training algorithm has been implemented exclusively using computational libraries such as NumPy. Key features of our implementation include:

## - **OPTIMIZERS** :
#### The network can be trained using the following optimizers:
- ##### **Batch Gradient Descent (GD)** : The most basic and widely used optimization algorithm.
- ##### **Stochastic Gradient Descent (SGD)** : A simple yet effective optimization algorithm. The network supports variable batch sizes and shuffles the training data at each epoch, so that the model generalizes well and does not overfit to the order of the data.
- ##### **ADAM Optimizer**: The network can also be trained using the Adam optimizer, which combines the advantages of both Adaptive Gradient Algorithm (AdaGrad) and Root Mean Square Propagation (RMSProp), providing an efficient and effective optimization method.

## - **Regularization Techniques**:
#### We have integrated several regularization techniques to help prevent overfitting and improve generalization:
##### - **L2 Regularization**: Also known as weight decay, this technique helps prevent overfitting by penalizing large weights, encouraging the network to maintain smaller and more stable weights.
##### - **L1 Regularization**: This technique promotes sparsity by penalizing the absolute values of the weights, which can lead to some weights becoming zero and thereby creating a sparse network.
##### - **Early Stopping**: Early stopping halts the training process when the performance on a validation set starts to deteriorate, preventing overfitting and saving computational resources.

## **Additional Features**:
##### - **Momentum**: To accelerate the convergence of the gradient descent algorithm, we have implemented momentum, which helps the network navigate through ravines and avoid local minima more effectively.
##### - **Gradient Clipping**: We have implemented gradient thresholding to prevent exploding gradients, ensuring that the updates to the model parameters remain stable.
##### - **Activation Functions**: The implementation includes a variety of activation functions such as tanh, sigmoid, ReLU, etc., allowing users to experiment with different non-linearities in the network.
##### - **Loss Functions**: We have incorporated several loss functions, including Huber Loss and Mean Squared Error (MSE), providing flexibility in optimizing the network for different types of regression tasks.




##



Data Split:

- TR (Training Set): This dataset consists of 80% of the total data and is used to train the machine learning model.

- VL (Validation Set): This dataset comprises 10% of the total data and is used to evaluate the model's performance.

- TS (Test Set): This dataset makes up the remaining 10% of the total data and is used for evaluation of the final model's performance.


Data Preprocessing:

- Polynomial Feature (Degree 2): We create second-degree polynomial terms of the original features to capture non-linear relationships.

- Arctanh Normalization: We apply the arctanh function to normalize the data for improved training convergence and feature scaling.

# BASE MODEL CREATION

## SGD

In [2]:

def MLP_SGD(X_TR, Y_TR, X_TS, Y_TS, epochs, learning_rate, lamda, alfa, batch_size):
    """Train a small MLP with SGD and record accuracy and MSE after every epoch.

    The network is created as ``Neural_Network(1, 17)``, gets a sigmoid
    activation on ``layers[0]`` and one extra layer of 4 ``tanh`` units.

    Parameters:
      - X_TR: array-like, training set features.
      - Y_TR: array-like, training set target.
      - X_TS: array-like, test set features.
      - Y_TS: array-like, test set target.
      - epochs: int, number of training epochs.
      - learning_rate: float, learning rate passed to every training step.
      - lamda: float, L2 regularization parameter (stored in ``nn.lamda_2``).
      - alfa: float, momentum parameter (stored in ``nn.alfa``).
      - batch_size: int, mini-batch size forwarded to ``nn.set_train``.

    Returns:
      Four lists with one entry per epoch, in this order:
      training accuracy, test accuracy, training MSE, test MSE.
    """
    # Create and configure the neural network object
    nn = Neural_Network(1, 17)
    nn.layers[0].activation_function = activation_function("sigmoid")
    nn.add_layer(4, activation="tanh")
    nn.gradient_treshold = -1
    # NOTE(review): nn.batch_size is hard-coded to 16 while the `batch_size`
    # argument only reaches set_train(). iter_epoch below reads
    # nn.batch_size, so the steps per epoch follow the argument only if
    # set_train() overwrites this attribute -- confirm.
    nn.batch_size = 16
    nn.lamda_2 = lamda
    nn.alfa = alfa
    nn.Inizialization(type="Xavier_uniform", scale=1)

    # Set the training data
    nn.set_train(X_TR, Y_TR, batch_size=batch_size, Loss_Function="MSE", random_state=1)

    # Per-epoch performance metrics
    accuracy_list_TR, accuracy_list_TS = [], []
    MSE_list_TR, MSE_list_TS = [], []

    # Number of training steps per epoch; a batch size of -1 means one step per epoch
    iter_epoch = X_TR.shape[0] // nn.batch_size if nn.batch_size != -1 else 1

    # Training loop
    for _epoch in range(epochs):
        for _step in range(iter_epoch):
            nn.step_train(learning_rate)

        y_pred_train = nn.forward(X_TR)
        y_pred_test = nn.forward(X_TS)

        # Accuracy is computed on the outputs rounded to the nearest integer,
        # MSE on the raw network outputs.
        accuracy_list_TR.append(accuracy_score(Y_TR, np.round(y_pred_train)))
        accuracy_list_TS.append(accuracy_score(Y_TS, np.round(y_pred_test)))
        MSE_list_TR.append(mean_squared_error(Y_TR, y_pred_train))
        MSE_list_TS.append(mean_squared_error(Y_TS, y_pred_test))

    # Return performance metrics
    return accuracy_list_TR, accuracy_list_TS, MSE_list_TR, MSE_list_TS


## ADAM

In [3]:

def MLP_ADAM(X_TR, Y_TR, X_TS, Y_TS, epochs, learning_rate, lamda, batch_size):
    """Train a small MLP with the Adam optimizer and record accuracy and MSE after every epoch.

    The network is created as ``Neural_Network(1, 17, adam=True)``, gets a
    sigmoid activation on ``layers[0]`` and one extra layer of 4 ``tanh``
    units. ``beta_1`` and ``beta_2`` are fixed to 0.9 and 0.999.

    Parameters:
      - X_TR: array-like, training set features.
      - Y_TR: array-like, training set target.
      - X_TS: array-like, test set features.
      - Y_TS: array-like, test set target.
      - epochs: int, number of training epochs.
      - learning_rate: float, learning rate passed to every training step.
      - lamda: float, L2 regularization parameter (stored in ``nn.lamda_2``).
      - batch_size: int, mini-batch size forwarded to ``nn.set_train``.

    Returns:
      Four lists with one entry per epoch, in this order:
      training accuracy, test accuracy, training MSE, test MSE.
    """
    # Create and configure the neural network object
    nn = Neural_Network(1, 17, adam=True)
    nn.layers[0].activation_function = activation_function("sigmoid")
    nn.add_layer(4, activation="tanh")
    nn.gradient_treshold = 10
    # NOTE(review): nn.batch_size is hard-coded to 16 while the `batch_size`
    # argument only reaches set_train(). iter_epoch below reads
    # nn.batch_size, so the steps per epoch follow the argument only if
    # set_train() overwrites this attribute -- confirm.
    nn.batch_size = 16
    nn.lamda_2 = lamda
    nn.beta_1 = 0.9
    nn.beta_2 = 0.999
    nn.Inizialization(type="Xavier_uniform", scale=1)

    # Set the training data (random_state=None, unlike MLP_SGD which uses 1)
    nn.set_train(X_TR, Y_TR, batch_size=batch_size, Loss_Function="MSE", random_state=None)

    # Per-epoch performance metrics
    accuracy_list_TR, accuracy_list_TS = [], []
    MSE_list_TR, MSE_list_TS = [], []

    # Number of training steps per epoch; a batch size of -1 means one step per epoch
    iter_epoch = X_TR.shape[0] // nn.batch_size if nn.batch_size != -1 else 1

    # Training loop
    for _epoch in range(epochs):
        for _step in range(iter_epoch):
            nn.step_train(learning_rate)

        y_pred_train = nn.forward(X_TR)
        y_pred_test = nn.forward(X_TS)

        # Accuracy is computed on the outputs rounded to the nearest integer,
        # MSE on the raw network outputs.
        accuracy_list_TR.append(accuracy_score(Y_TR, np.round(y_pred_train)))
        accuracy_list_TS.append(accuracy_score(Y_TS, np.round(y_pred_test)))
        MSE_list_TR.append(mean_squared_error(Y_TR, y_pred_train))
        MSE_list_TS.append(mean_squared_error(Y_TS, y_pred_test))

    # Return performance metrics
    return accuracy_list_TR, accuracy_list_TS, MSE_list_TR, MSE_list_TS

# MONK 1

In [6]:

# Load the MONK-1 training and test datasets
m1_train = MonksDataset('monk1_train')
m1_test = MonksDataset('monk1_test')

# Separate features and targets for both sets
X_TR_m1, Y_TR_m1, X_TS_m1, Y_TS_m1 = get_monks_data(m1_train, m1_test)

# Keep only the underlying arrays
X_TR_m1, X_TS_m1 = X_TR_m1.values, X_TS_m1.values
Y_TR_m1, Y_TS_m1 = Y_TR_m1.values, Y_TS_m1.values

# One-hot encoding with scikit-learn's OneHotEncoder: the encoder is fitted on
# the training features and then reused, unchanged, on the test features
encoder = OneHotEncoder()
X_TR_m1 = encoder.fit_transform(X_TR_m1).toarray()
X_TS_m1 = encoder.transform(X_TS_m1).toarray()

In [7]:

# Train on MONK-1 with SGD and keep the per-epoch learning curves
accuracy_TR, accuracy_TS, MSE_TR, MSE_TS = MLP_SGD(
    X_TR_m1, Y_TR_m1, X_TS_m1, Y_TS_m1,
    epochs=300,
    learning_rate=0.8,
    lamda=0,
    alfa=0.8,
    batch_size=8,
)

In [None]:
# Plot the MONK-1 learning curves, then print the last-epoch value of each metric
plot_history_monk('Monk 1', accuracy_TR, accuracy_TS, MSE_TR, MSE_TS)
final_metrics = [history[-1] for history in (accuracy_TR, accuracy_TS, MSE_TR, MSE_TS)]
print('Monk 1', *final_metrics)

# MONK 2

In [9]:
# NOTE(review): current_directory / data_directory are not read anywhere in
# this cell; kept in case other cells rely on them.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, "DATA")
m2_train = MonksDataset('monk2_train')
m2_test = MonksDataset('monk2_test')

# Separate features and targets for the training and test sets
X_dev, Y_TR_m2, X_test_m2, Y_TS_m2 = get_monks_data(m2_train, m2_test)

Y_TS_m2 = Y_TS_m2.values
Y_TR_m2 = Y_TR_m2.values

# One-hot encoding with pandas get_dummies().
# The two sets are encoded independently, so the test frame is re-indexed on
# the training columns: a category missing from one set can then neither
# shift nor change the number of the network's input columns. When both sets
# already produce the same columns this is a no-op.
X_TR_m2_df = pd.get_dummies(X_dev, columns=['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
X_TS_m2_df = pd.get_dummies(X_test_m2, columns=['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
X_TS_m2_df = X_TS_m2_df.reindex(columns=X_TR_m2_df.columns, fill_value=0)
# Columns created by the re-index get the same dtype as the training columns
X_TS_m2_df = X_TS_m2_df.astype(X_TR_m2_df.dtypes.to_dict())

X_TR_m2 = X_TR_m2_df.values
X_TS_m2 = X_TS_m2_df.values

In [10]:

# Train on MONK-2 with SGD (batch size 1) and print the last-epoch metrics
accuracy_TR, accuracy_TS, MSE_TR, MSE_TS = MLP_SGD(
    X_TR_m2, Y_TR_m2, X_TS_m2, Y_TS_m2,
    epochs=300,
    learning_rate=0.9,
    lamda=0,
    alfa=0.8,
    batch_size=1,
)
final_metrics = [history[-1] for history in (accuracy_TR, accuracy_TS, MSE_TR, MSE_TS)]
print('Monk 2', *final_metrics)

Monk 2 1.0 1.0 3.238688992054987e-05 4.648470641720167e-05


In [None]:
# Plot the per-epoch accuracy and MSE curves recorded for MONK-2
plot_history_monk('Monk 2', accuracy_TR, accuracy_TS, MSE_TR, MSE_TS)

# MONK 3

In [12]:
# NOTE(review): current_directory / data_directory are not read anywhere in
# this cell; kept in case other cells rely on them.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, "DATA")
m3_train = MonksDataset('monk3_train')
m3_test = MonksDataset('monk3_test')

# Separate features and targets for the training and test sets
X_dev, Y_TR_m3, X_test_m3, Y_TS_m3 = get_monks_data(m3_train, m3_test)

Y_TS_m3 = Y_TS_m3.values
Y_TR_m3 = Y_TR_m3.values

# One-hot encoding with pandas get_dummies().
# The two sets are encoded independently, so the test frame is re-indexed on
# the training columns: a category missing from one set can then neither
# shift nor change the number of the network's input columns. When both sets
# already produce the same columns this is a no-op.
X_TR_m3_df = pd.get_dummies(X_dev, columns=['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
X_TS_m3_df = pd.get_dummies(X_test_m3, columns=['a1', 'a2', 'a3', 'a4', 'a5', 'a6'])
X_TS_m3_df = X_TS_m3_df.reindex(columns=X_TR_m3_df.columns, fill_value=0)
# Columns created by the re-index get the same dtype as the training columns
X_TS_m3_df = X_TS_m3_df.astype(X_TR_m3_df.dtypes.to_dict())

X_TR_m3 = X_TR_m3_df.values
X_TS_m3 = X_TS_m3_df.values

In [13]:

# Train on MONK-3 with the Adam optimizer and L2 regularization (lamda = 0.5)
accuracy_TR, accuracy_TS, MSE_TR, MSE_TS = MLP_ADAM(
    X_TR_m3, Y_TR_m3, X_TS_m3, Y_TS_m3,
    epochs=300,
    learning_rate=0.01,
    lamda=0.5,
    batch_size=8,
)
# Last-epoch metrics; the label now matches the dataset (it used to say 'Monk 2')
print('Monk 3', accuracy_TR[-1], accuracy_TS[-1], MSE_TR[-1], MSE_TS[-1])


Monk 2 0.9426229508196722 0.9675925925925926 0.056040645327578936 0.04865039909749305


In [None]:
# Plot the per-epoch accuracy and MSE curves recorded for MONK-3
plot_history_monk('Monk 3', accuracy_TR, accuracy_TS, MSE_TR, MSE_TS)

# CUP

In [None]:
# Load the CUP training data through the project data loader
cup = CupDataset('Cup_tr')
df = cup.data

# Alternative direct load, kept for reference:
# df = pd.read_csv("DATA/ML-CUP23-TR.csv", comment="#", header=None)

# The last three columns are the targets; the first column is skipped
# (presumably a row id -- confirm) and the remaining ones are the inputs.
y = df.iloc[:, -3:]
x = df.iloc[:, 1:-3]

# First split: hold out 10% of the data as the internal test set
X_TR, X_TS, Y_TR, Y_TS = train_test_split(x, y, test_size=0.1, random_state=0)

# Second split: 11.1% of the remaining 90% (about 10% of all the data)
# becomes the validation set
X_TR, X_VL, Y_TR, Y_VL = train_test_split(X_TR, Y_TR, test_size=0.111, random_state=0)

# Work on plain arrays. X_VL is converted too, so that the three feature sets
# reach PolynomialFeatures in the same form (the transformer is fitted on an
# array, and handing it a DataFrame afterwards is inconsistent).
X_TR = X_TR.values
X_TS = X_TS.values
X_VL = X_VL.values
Y_TR = Y_TR.values
Y_TS = Y_TS.values
# NOTE(review): Y_VL is left as returned by train_test_split, unlike Y_TR and
# Y_TS -- confirm whether it should be converted with .values as well.

# Generate degree-2 polynomial features (fitted on the training set only)
poly = PolynomialFeatures(2)
X_TR = poly.fit_transform(X_TR)
X_TS = poly.transform(X_TS)
X_VL = poly.transform(X_VL)

# Drop column 0 (the constant bias term that PolynomialFeatures emits first by
# default) and apply arctanh to the remaining features.
# NOTE(review): np.arctanh is finite only for inputs strictly inside (-1, 1);
# values outside that range become NaN/inf -- confirm the data range.
X_TR = np.arctanh(X_TR[:, 1:])
X_TS = np.arctanh(X_TS[:, 1:])
X_VL = np.arctanh(X_VL[:, 1:])

## Hyper-parameters Tuning
A common approach is to start with a coarse search across a wide range of values to find promising sub-ranges of our parameter space. Then, you would zoom into these ranges and perform another search to fine-tune the configurations.

Here, we proceed as follows:
1. (coarse) Grid-search across a wide range of hyper-parameters and values;
2. (fine-tune) Random-search into zoomed intervals w.r.t. best configuration found by grid-search.

Then, we perform a single run of grid-search and a single run of random-search, each on its own search space, while applying a PolynomialFeatures pre-processing with a fixed degree. The best configuration, which is used for the final re-training and for the evaluation on the internal test set, is the one with the best mean MEE on the validation folds of the cross-validation.

Note that, tuning of the polynomial degree wasn't performed because it would be very expensive. Thus, we simply decided to use a fixed degree value.

## Grid Search

In [36]:
from itertools import product
def k_fold_grid_search(X, Y, grid, epoche, k):
    """
    Run a K-Fold grid search on a Neural Network model.

    For every fold produced by ``k_fold(k, X, Y)`` and for every combination of
    the values in ``grid``, a network is built with ``NN_creation`` and trained
    with ``automatic_learning``; the last value of each returned metric is recorded.

    Args:
        X: Input data handed to ``k_fold``.
        Y: Target data handed to ``k_fold``.
        grid (dict): Hyperparameter names mapped to the list of values to try.
        epoche (int): Number of training epochs passed to ``automatic_learning``.
        k (int): Number of folds passed to ``k_fold``.

    Returns:
        dict: One list per key, all of the same length, with one entry per
        (fold, combination) pair in the order they were trained. The keys are
        the hyperparameter names of ``grid`` plus "MEE_TR", "MEE_TS", "MSE_TR",
        "MSE_TS" and "MEE_std"; the "_TS" entries are measured on the fold's
        validation split.
    """
    # Project splitter that hands out the folds one at a time
    fold = k_fold(k, X, Y)

    # One result column per hyperparameter, followed by one per tracked metric
    dic_df = {name: [] for name in grid}
    dic_df.update({metric: [] for metric in ("MEE_TR", "MEE_TS", "MSE_TR", "MSE_TS", "MEE_std")})

    # Start from the default Neural Network configuration
    dizionario_NN = NN_creation({}).dic
    dizionario_NN["NN_shape"] = (3, 65)

    # Loop while the splitter reports folds left (presumably -- see k_fold.EO)
    while fold.EO():

        # Training / validation split of the current fold
        _X_TR, _Y_TR, _X_VL, _Y_VL = fold.data()
        dizionario_NN["X_TR"] = _X_TR
        dizionario_NN["Y_TR"] = _Y_TR
        dizionario_NN["X_TS"] = _X_VL
        dizionario_NN["Y_TS"] = _Y_VL

        # Every combination of the grid values, in the grid's key order
        for i, iperparameters in enumerate(product(*grid.values())):

            # Record the combination and write it into the network configuration
            for name_iperparameter, iperparameter in zip(grid, iperparameters):
                dic_df[name_iperparameter].append(iperparameter)
                dizionario_NN[name_iperparameter] = iperparameter

            # Build and train a network with the current hyperparameters
            nn = NN_creation(dizionario_NN)
            MSE_TR, MSE_TS, MEE_TR, MEE_TS, MEE_TS_std = nn.automatic_learning(epoche=epoche, verbouse=False)

            # Keep only the value each metric has at the end of training
            dic_df["MSE_TR"].append(MSE_TR[-1])
            dic_df["MSE_TS"].append(MSE_TS[-1])
            dic_df["MEE_TR"].append(MEE_TR[-1])
            dic_df["MEE_TS"].append(MEE_TS[-1])
            dic_df["MEE_std"].append(MEE_TS_std[-1])

            # Progress line: combination index, its settings and the final MEE values
            settings = "".join(
                f"  {name_iperparameter} : {iperparameter}"
                for name_iperparameter, iperparameter in zip(grid, iperparameters)
            )
            print(
                f"{i} addestramento --> {settings}"
                f"  MEE_TR : {MEE_TR[-1]}  MEE_TS : {MEE_TS[-1]} +- {MEE_TS_std[-1]}"
            )

    # Return the results of the Grid Search.
    return dic_df


In [None]:
# Coarse grid: every combination of these values is trained on each of the 5 folds
grid = {
    "N_layer": [1, 2, 3],
    "N_units": [10, 30, 50, 100, 150],
    "lamda_2": [0.01, 0.001, 0.0001],
    "alfa": [0.25, 0.5, 0.75],
}

result = k_fold_grid_search(X_TR, Y_TR, grid, epoche=5000, k=5)

## Random Search

In [38]:
def random_search_kfold(X, Y, random_grid, epoche, k, n_iterazioni):
    """
    Run a K-Fold random search on a Neural Network model.

    ``n_iterazioni`` configurations are drawn by picking, for every
    hyperparameter, one value of ``random_grid`` with ``np.random.choice``.
    Each configuration is trained on the folds produced by ``k_fold(k, X, Y)``
    and the last value of each metric returned by ``automatic_learning`` is recorded.

    Args:
        X: Input data handed to ``k_fold``.
        Y: Target data handed to ``k_fold``.
        random_grid (dict): Hyperparameter names mapped to the candidate values to sample from.
        epoche (int): Number of training epochs passed to ``automatic_learning``.
        k (int): Number of folds passed to ``k_fold``.
        n_iterazioni (int): Number of random configurations to evaluate.

    Returns:
        dict: One list per key, all of the same length, with one entry per
        (configuration, fold) pair in the order they were trained. The keys are
        the hyperparameter names of ``random_grid`` plus "MEE_TR", "MEE_TS",
        "MSE_TR", "MSE_TS" and "MEE_std"; the "_TS" entries are measured on
        the fold's validation split.
    """
    # One result column per hyperparameter, followed by one per tracked metric
    dic_df = {name: [] for name in random_grid}
    dic_df.update({metric: [] for metric in ("MEE_TR", "MEE_TS", "MSE_TR", "MSE_TS", "MEE_std")})

    # Start from the default Neural Network configuration
    dizionario_NN = NN_creation({}).dic
    dizionario_NN["NN_shape"] = (3, 65)

    for i in range(n_iterazioni):
        # Draw one value per hyperparameter and write it into the configuration
        sampled = {
            name_iperparameter: np.random.choice(values)
            for name_iperparameter, values in random_grid.items()
        }
        dizionario_NN.update(sampled)
        stringa = f"{i} addestramento --> " + "".join(
            f"  {name_iperparameter} : {iperparameter}"
            for name_iperparameter, iperparameter in sampled.items()
        )

        # A fresh splitter for every configuration, so each one is evaluated on
        # all k folds. The previous single shared splitter would be exhausted
        # after the first configuration unless k_fold rewinds itself
        # (TODO confirm against k_fold).
        fold = k_fold(k, X, Y)
        while fold.EO():
            _X_TR, _Y_TR, _X_VL, _Y_VL = fold.data()
            dizionario_NN["X_TR"] = _X_TR
            dizionario_NN["Y_TR"] = _Y_TR
            dizionario_NN["X_TS"] = _X_VL
            dizionario_NN["Y_TS"] = _Y_VL

            # Record the configuration once per fold, so the hyperparameter
            # columns stay aligned with the metric columns appended below
            # (same layout as the grid search results).
            for name_iperparameter, iperparameter in sampled.items():
                dic_df[name_iperparameter].append(iperparameter)

            # Build and train a network with the current hyperparameters
            nn = NN_creation(dizionario_NN)
            MSE_TR, MSE_TS, MEE_TR, MEE_TS, MEE_TS_std = nn.automatic_learning(epoche=epoche, verbouse=False)

            # Keep only the value each metric has at the end of training
            dic_df["MSE_TR"].append(MSE_TR[-1])
            dic_df["MSE_TS"].append(MSE_TS[-1])
            dic_df["MEE_TR"].append(MEE_TR[-1])
            dic_df["MEE_TS"].append(MEE_TS[-1])
            dic_df["MEE_std"].append(MEE_TS_std[-1])

            # Progress line: configuration index, its settings and the final MEE values
            print(
                stringa
                + f"  MEE_TR : {MEE_TR[-1]}  MEE_TS : {MEE_TS[-1]} +- {MEE_TS_std[-1]}"
            )
    return dic_df

In [None]:
# Fine-tuning: architecture fixed to one layer of 150 units, while lamda_2 and
# alfa are sampled from 100 evenly spaced candidates each
random_grid = {
    "N_layer": [1],
    "N_units": [150],
    "lamda_2": np.linspace(0.0003, 0.0007, 100),
    "alfa": np.linspace(0.15, 0.65, 100),
}
dic_df = random_search_kfold(
    X_TR, Y_TR, random_grid,
    epoche=15000,
    k=5,
    n_iterazioni=30,
)

## Training Best Model

In [None]:
# Best hyper-parameters found during model selection
alfa, lamda = 0.35, 0.00055
batch_size = -1  # not read anywhere else in this cell
epoche, epoche_bath = 130000, 15000

# Network with 3 outputs and 65 inputs, plus one layer of 150 tanh units
nn = Neural_Network(3, 65)
nn.add_layer(150, activation="tanh")

nn.gradient_treshold = 10
nn.lamda_2 = lamda
nn.alfa = alfa

# Average ensemble built from the network, the training data and the
# validation inputs to predict on
avg = Average(nn, X_TR, Y_TR, X_VL, epoche, epoche_bath)

# Final prediction of the ensemble for the validation inputs
y_pred = avg.predict()

#np.savetxt("y_pred_VL_NNN.csv", y_pred, delimiter=",")

## Training Best Model for Model Assessment


In [None]:
# Best hyper-parameters found during model selection
alfa, lamda = 0.35, 0.00055
batch_size = -1  # not read anywhere else in this cell
epoche, epoche_bath = 130000, 15000

# Development set: training and validation data stacked row-wise
X_DEV = np.concatenate((X_TR, X_VL), axis=0)
Y_DEV = np.concatenate((Y_TR, Y_VL), axis=0)

# Network with 3 outputs and 65 inputs, plus one layer of 150 tanh units
nn = Neural_Network(3, 65)
nn.add_layer(150, activation="tanh")

nn.gradient_treshold = 10
nn.lamda_2 = lamda
nn.alfa = alfa

# Average ensemble built from the network, the development data and the
# internal test inputs to predict on
avg = Average(nn, X_DEV, Y_DEV, X_TS, epoche, epoche_bath)

# Final prediction of the ensemble for the internal test inputs
y_pred = avg.predict()

#np.savetxt("y_pred_internal_TS_NNN.csv", y_pred, delimiter=",")


## Final Retraining

In [None]:
# All labelled CUP examples (x and y come from the CUP data-preparation cell)
X = x.values
Y = y.values

# Blind test inputs: every column after the first one.
# NOTE(review): this cell used to read "DATA/ML-CUP23-TR.csv" here, i.e. the
# labelled training file (target columns included), as the blind set. It now
# points at the blind test file -- confirm the file name and location.
df1 = pd.read_csv("DATA/ML-CUP23-TS.csv", comment="#", header=None)
X_BLIND = df1.iloc[:, 1:].values

# Same preprocessing used for model selection and assessment: degree-2
# polynomial features, drop the constant first column, then arctanh. Without
# it the raw inputs do not match the 65 inputs the network below expects.
poly_final = PolynomialFeatures(2)
X = np.arctanh(poly_final.fit_transform(X)[:, 1:])
X_BLIND = np.arctanh(poly_final.transform(X_BLIND)[:, 1:])

# Best hyper-parameters found during model selection
alfa = 0.35
lamda = 0.00055
batch_size = -1  # not read anywhere else in this cell
epoche = 130000
epoche_bath = 15000

# Network with 3 outputs and 65 inputs, plus one layer of 150 tanh units
nn = Neural_Network(3, 65)
nn.add_layer(150, activation="tanh")

nn.gradient_treshold = 10
nn.lamda_2 = lamda
nn.alfa = alfa

# Average ensemble built from the network, all the labelled data and the
# blind test inputs to predict on
avg = Average(nn, X, Y, X_BLIND, epoche, epoche_bath)

# Final prediction of the ensemble for the blind test inputs
y_pred = avg.predict()

#np.savetxt("y_pred_TS_NNN.csv", y_pred, delimiter=",")