### Defining imports

In [1]:
#!pip install pytorch_tabular --quiet
#!pip install pytorch_tabular[extra] --quiet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (TabNetModelConfig, TabTransformerConfig)
from pytorch_tabular.config import (DataConfig, OptimizerConfig, TrainerConfig)

### Loading the dataset

After loading the dataset, we remove duplicate rows and rows that contain NaN values.
We expect the same NaN and duplicate counts as in the ML notebook, so we skip printing them and move on. Before splitting the data into training, validation and test sets, we winsorize every feature at its 5th and 95th percentiles to limit the effect of outliers.

In [2]:
# Load the training CSV, then drop duplicate rows and, after that, every row
# that still contains a missing value.
df = pd.read_csv("../datasets/train.csv").drop_duplicates().dropna()

# 'Year' is the regression target; every remaining column is a feature.
y = df['Year']
X = df.drop(columns='Year')

def winsorize_outliers(df, column, lower_limit, upper_limit):
    """Clamp the values of ``df[column]`` to ``[lower_limit, upper_limit]``.

    Values below ``lower_limit`` are replaced by ``lower_limit`` and values
    above ``upper_limit`` by ``upper_limit``; everything else is kept.

    Note that ``df`` is modified in place: the column is overwritten and the
    very same frame object is returned. Pass a copy if the caller's frame
    must stay untouched.
    """
    values = df[column]
    # First raise everything that lies under the floor ...
    floored = np.where(values < lower_limit, lower_limit, values)
    # ... then cap whatever still exceeds the ceiling, and write the result back.
    df[column] = np.where(floored > upper_limit, upper_limit, floored)
    return df

# Winsorization bounds: the 5th and 95th percentile of every feature column.
# NOTE(review): these percentiles are computed on the full dataset, i.e. before
# the train/validation/test split below, so test rows influence the bounds.
# Consider deriving them from the training split only.
lower_limit = X.quantile(0.05, axis=0)
upper_limit = X.quantile(0.95, axis=0)

# One copy up front is enough: winsorize_outliers overwrites the column of the
# frame it receives and hands that frame back. Copying inside the loop would
# duplicate the whole feature frame once per column without changing the result.
X = X.copy()
for col in X.columns:
    X = winsorize_outliers(X, col, lower_limit[col], upper_limit[col])

# Re-attach the (unmodified) target to the winsorized features.
df = pd.concat([X, y], axis = 1)

# Hold out 15% of the rows as the test set, then take 15% of what is left as
# the validation set. Both splits use the same fixed random_state.
train_val, test = train_test_split(df, test_size=0.15, random_state=42)
train, val = train_test_split(train_val, test_size=0.15, random_state=42)
print(f"Train Shape: {train.shape} | Val Shape: {val.shape} | Test Shape: {test.shape}")

# The target is 'Year'; every other column is a model input.
target_column = ['Year']
feature_columns = [name for name in df.columns if name not in target_column]

Train Shape: (182158, 91) | Val Shape: (32146, 91) | Test Shape: (37819, 91)


### Data, Trainer, Model configuration for TabNet

We configure the data preprocessing, the trainer (i.e. epochs, batch size, etc.) and the model for TabNet.

In [3]:
# TabNet hyper-parameters for the regression task.
model_config_tabnet = TabNetModelConfig(task="regression", n_d=64, n_a=32, n_steps=5)

# Data handling: every feature is declared continuous (there are no categorical
# columns), normalisation is switched on and the "quantile_uniform" transform
# is requested for the continuous features.
data_config_tabnet = DataConfig(
    target=target_column,
    continuous_cols=feature_columns,
    categorical_cols=[],
    continuous_feature_transform="quantile_uniform",
    normalize_continuous_features=True,
    num_workers=0,
)

# Training loop: at most 100 epochs with batches of 1024 rows and automatic
# learning-rate search. Early stopping and checkpointing both monitor
# "valid_loss" (lower is better, patience of 5), and load_best is enabled.
trainer_config_tabnet = TrainerConfig(
    max_epochs=100,
    batch_size=1024,
    auto_lr_find=True,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True,
)

# Optimizer settings are left at the library defaults.
optimizer_config_tabnet = OptimizerConfig()

# Bundle the four configurations into one model object, with logging kept quiet.
tabular_model_tabnet = TabularModel(
    data_config=data_config_tabnet,
    model_config=model_config_tabnet,
    optimizer_config=optimizer_config_tabnet,
    trainer_config=trainer_config_tabnet,
    suppress_lightning_logger=True,
    verbose=False,
)

# Fit on the training split, using the validation split for monitoring.
tabular_model_tabnet.fit(train=train, validation=val)

C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x1dba757a250>

### Prediction and performance

We test the previously trained TabNet on the test set and calculate the metrics.

In [4]:
# predict() returns a DataFrame. Select the prediction column of the target
# explicitly instead of handing the whole frame to the sklearn metrics: that
# only works while the returned frame happens to hold exactly one column.
pred_df_tabnet = tabular_model_tabnet.predict(test)
y_true = test["Year"]
y_pred = pred_df_tabnet["Year_prediction"]

# Metrics reported by the library itself, printed as a cross-check for the
# sklearn values computed below.
result = tabular_model_tabnet.evaluate(test)
print(result)

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

perf = {"model": "tb", "mse": mse, "mae": mae, "mape": mape, "r2score": r2}
print(perf)

Output()

C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


[{'test_loss': 101.26390838623047, 'test_mean_squared_error': 101.26390838623047}]
{'model': 'tb', 'mse': 101.2639374782676, 'mae': 7.736785139026721, 'mape': 0.003880427293564938, 'r2score': 0.061589442874220524}


In [5]:
# Save the fitted TabNet model to the test-module folder (path is relative to
# the notebook's working directory).
tabular_model_tabnet.save_model("../test-module/tabnet/")

### Data, Trainer, Model configuration for TabTransformer

We configure the data preprocessing, the trainer (i.e. epochs, batch size, etc.) and the model for TabTransformer.

In [6]:
# TabTransformer hyper-parameters for the regression task.
# NOTE(review): no categorical columns are supplied below; verify that the
# attention settings (num_heads, num_attn_blocks) still have an effect when the
# model only receives continuous inputs — TODO confirm against the
# TabTransformer implementation.
model_config_tabtransformer = TabTransformerConfig(
    task="regression",
    num_attn_blocks=6,
    num_heads=32,
)

# Data handling: every feature is declared continuous (there are no categorical
# columns), normalisation is switched on and the "quantile_uniform" transform
# is requested for the continuous features.
data_config_tabtransformer = DataConfig(
    target=target_column,
    continuous_cols=feature_columns,
    categorical_cols=[],
    continuous_feature_transform="quantile_uniform",
    normalize_continuous_features=True,
    num_workers=0,
)

# Training loop: at most 100 epochs with batches of 1024 rows and automatic
# learning-rate search. Early stopping and checkpointing both monitor
# "valid_loss" (lower is better, patience of 5), and load_best is enabled.
trainer_config_tabtransformer = TrainerConfig(
    max_epochs=100,
    batch_size=1024,
    auto_lr_find=True,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True,
)

# Optimizer settings are left at the library defaults.
optimizer_config_tabtransformer = OptimizerConfig()

# Bundle the four configurations into one model object, with logging kept quiet.
tabular_model_tabtransformer = TabularModel(
    data_config=data_config_tabtransformer,
    model_config=model_config_tabtransformer,
    optimizer_config=optimizer_config_tabtransformer,
    trainer_config=trainer_config_tabtransformer,
    suppress_lightning_logger=True,
    verbose=False,
)

# Fit on the training split, using the validation split for monitoring.
tabular_model_tabtransformer.fit(train=train, validation=val)

C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\samue\Desktop\data-analytics\DA\train-module\saved_models exists and is not empty.
C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing 

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x1dbbcfb8fd0>

### Prediction and performance

We test the previously trained TabTransformer on the test set and calculate the metrics.

In [7]:
# predict() returns a DataFrame. Select the prediction column of the target
# explicitly instead of handing the whole frame to the sklearn metrics: that
# only works while the returned frame happens to hold exactly one column.
pred_df_tabtransformer = tabular_model_tabtransformer.predict(test)
y_true = test["Year"]
y_pred = pred_df_tabtransformer["Year_prediction"]

# Metrics reported by the library itself, printed as a cross-check for the
# sklearn values computed below.
result = tabular_model_tabtransformer.evaluate(test)
print(result)

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

perf = {"model": "tf", "mse": mse, "mae": mae, "mape": mape, "r2score": r2}
print(perf)

Output()

C:\Users\samue\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


[{'test_loss': 86.19668579101562, 'test_mean_squared_error': 86.19668579101562}]
{'model': 'tf', 'mse': 86.19669107333917, 'mae': 6.8501751319556226, 'mape': 0.0034365272844129498, 'r2score': 0.2012172654268911}


In [8]:
# Save the fitted TabTransformer model to the test-module folder (path is
# relative to the notebook's working directory).
tabular_model_tabtransformer.save_model("../test-module/tabtransformer/")