### Defining imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.decomposition import PCA
from torch.optim.lr_scheduler import ReduceLROnPlateau

seed=42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.benchmark = False

### Loading the dataset

After dataset loading, we remove duplicated or NaN rows.
We expect to get the same NaN and duplicates results as in ML notebook, so we skip this print and we pass to test, validation and training split. We also do a winsorization to handle outliers.

In [2]:
df = pd.read_csv("../datasets/train.csv")

df = df.drop_duplicates()

df = df.dropna()

X = df.drop('Year', axis=1)
y = df['Year']

def winsorize_outliers(df, column, lower_limit, upper_limit):
    df[column] = np.where(df[column] < lower_limit, lower_limit, df[column])
    df[column] = np.where(df[column] > upper_limit, upper_limit, df[column])
    return df

lower_limit = X.quantile(0.05, axis=0)
upper_limit = X.quantile(0.95, axis=0)

for col in X.columns:
    X = winsorize_outliers(X.copy(), col, lower_limit[col], upper_limit[col])

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.15, random_state=seed)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.15, random_state=seed)

print("----Training Set----")
print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)

print("\n----Validation Set----")
print("X_val Shape:", X_val.shape)
print("y_val Shape:", y_val.shape)

print("\n----Test Set----")
print("X_test Shape:", X_test.shape)
print("y_test Shape:", y_test.shape)

----Training Set----
X_train Shape: (182158, 90)
y_train Shape: (182158,)

----Validation Set----
X_val Shape: (32146, 90)
y_val Shape: (32146,)

----Test Set----
X_test Shape: (37819, 90)
y_test Shape: (37819,)


### Standard Scaling
A Standard scaler is fitted and saved for later usage. The MinMaxScaler, L1/L2 normalization and PCA were also tried but gave worse results on the network.

In [3]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_val = scaler.transform(X_val)
#X_test = scaler.transform(X_test)

#normalization l1
#X_train = normalize(X_train, norm="l1")
#X_val = normalize(X_val, norm="l1")
#X_test = normalize(X_test, norm="l1")

#normalization l2
#X_train = normalize(X_train, norm="l2")
#X_val = normalize(X_val, norm="l2")
#X_test = normalize(X_test, norm="l2")

#pca
#pca = PCA(0.99)
#X_train = pca.fit_transform(X_train)
#X_val = pca.transform(X_val)
#X_test = pca.transform(X_test)

X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)
X_test = pd.DataFrame(X_test)

file = open("scaler.save","wb")
pickle.dump(scaler, file)
file.close()

#file = open("pca.save","wb")
#pickle.dump(pca, file)
#file.close()

### Custom Dataset
The dataset is converted in Torch tensors and a CustomDataset class is created and later used to convert the three dataset we obtained before.

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X.values)
        self.y = torch.FloatTensor(y.values)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

### FeedForward Neural Network architecture
After several tries we settled on this architecture as the one with the best performances.
This network use linear layers and the RELU activation function.

In [5]:
class FeedForward(nn.Module):
    def __init__(self, input_size):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.relu(x)
        x = self.fc5(x)
        x = self.relu(x)
        x = self.fc6(x)
        return x

### Network training parameters
Here are all the settings we applied to our NN training:
* MSE is picked as loss function
* The batch size is set at 1024 items
* The chosen optimizer is Adam

We let the training run for 100 epochs, checking the loss at every epoch and reducing the learning rate by 10% every time the loss doesn't improve for 5 epochs in a row.

In [6]:
input_size = X_train.shape[1]
model = FeedForward(input_size)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

num_epochs = 100
batch_size = 1024
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Neural Network training
During the training we also restore the model to the epoch in which it had the best validation loss, before saving it in `best_model.pth`

In [7]:
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}")

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_outputs = model(val_inputs)
            val_loss += criterion(val_outputs, val_labels.view(-1, 1)).item() * val_inputs.size(0)
    val_loss /= len(val_dataset)
    print(f"Validation Loss: {val_loss:.4f}")
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state = model.state_dict()
        torch.save(best_model_state, 'best_model.pth')

Epoch [1/100], Training Loss: 344819.5424
Validation Loss: 34235.4482
Epoch [2/100], Training Loss: 24627.4769
Validation Loss: 14768.7085
Epoch [3/100], Training Loss: 7788.0141
Validation Loss: 2760.8226
Epoch [4/100], Training Loss: 1333.9090
Validation Loss: 662.6027
Epoch [5/100], Training Loss: 371.7464
Validation Loss: 250.7145
Epoch [6/100], Training Loss: 200.3449
Validation Loss: 205.3655
Epoch [7/100], Training Loss: 167.1373
Validation Loss: 167.0869
Epoch [8/100], Training Loss: 170.7569
Validation Loss: 223.1501
Epoch [9/100], Training Loss: 255.2103
Validation Loss: 195.7880
Epoch [10/100], Training Loss: 201.2988
Validation Loss: 160.9152
Epoch [11/100], Training Loss: 215.5266
Validation Loss: 143.1721
Epoch [12/100], Training Loss: 187.7732
Validation Loss: 149.1644
Epoch [13/100], Training Loss: 234.8771
Validation Loss: 300.2647
Epoch [14/100], Training Loss: 288.9444
Validation Loss: 213.8176
Epoch [15/100], Training Loss: 191.9556
Validation Loss: 240.9037
Epoch [

### Network testing
Here are the metrics from `best_model.pth` on the test set

In [8]:
input_size = X_train.shape[1]
clf = FeedForward(input_size)
best_model_state = torch.load('best_model.pth')
clf.load_state_dict(best_model_state)

clf.eval()
test_loss = 0.0
predictions = []
true_labels = []

with torch.no_grad():
    for test_inputs, test_labels in test_loader:
        test_outputs = model(test_inputs)
        test_loss += criterion(test_outputs, test_labels.view(-1, 1)).item() * test_inputs.size(0)
        predictions.extend(test_outputs.numpy())
        true_labels.extend(test_labels.numpy())

test_loss /= len(test_dataset)
print(f"Test Loss: {test_loss:.4f}")
r2_test = r2_score(true_labels, predictions)
mse_test = mean_squared_error(true_labels, predictions)
mape_test = mean_absolute_percentage_error(true_labels, predictions)
mae_test = mean_absolute_error(true_labels, predictions)
print(f"Test R2 Score: {r2_test:.4f}, Test MSE: {mse_test:.4f}, Test MAPE: {mape_test:.4f}, Test MAE: {mae_test:.4f}")

Test Loss: 76.0278
Test R2 Score: 0.2955, Test MSE: 76.0278, Test MAPE: 0.0032, Test MAE: 6.3458
