## Colab

<a target="_blank" href="https://colab.research.google.com/github/yandex-research/tabular-dl-revisiting-models/blob/main/package/example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Notes

- This notebook provides a usage example of [this Python package](https://yandex-research.github.io/tabular-dl-revisiting-models).
- Some specific details (data preprocessing, hyperparameters, etc.) may be suboptimal.

In [None]:
%pip install delu
%pip install paper_tabular_dl_revisiting_models

In [None]:
import math
from typing import Dict, List, Optional

import delu  # Documentation: https://yura52.github.io/delu
import numpy as np
import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
from torch import Tensor
from tqdm.std import tqdm

from paper_tabular_dl_revisiting_models import MLP, ResNet, FTTransformer

In [3]:
# Set random seeds in all libraries before anything stochastic runs.
delu.random.seed(0)
# Device on which the data tensors and the model are placed.
device = torch.device('cpu')

## Data

In [4]:
# Load the California Housing dataset (regression); its feature matrix is used
# as the continuous features.
dataset = sklearn.datasets.fetch_california_housing()
task_type = 'regression'
X_cont_all = dataset['data']  # type: ignore
y_all = dataset['target']  # type: ignore

# NOTE: uncomment the classification dataset if needed.
# The features must be assigned to `X_cont_all`, because this is the name that
# the rest of the notebook reads; any other name would leave the housing
# features above paired with the new classification labels.
# n_classes = 3
# task_type = 'binclass' if n_classes == 2 else 'multiclass'
# X_cont_all, y_all = sklearn.datasets.make_classification(
#     n_samples=20000,
#     n_features=10,
#     n_informative=5,
#     n_redundant=5,
#     n_classes=n_classes,
# )

# The rest of the notebook supports exactly these task types.
assert task_type in ['binclass', 'multiclass', 'regression']

# NOTE: if the dataset has categorical features, you must compute cat_cardinalities;
# see `paper_tabular_dl_revisiting_models.CategoricalFeatureEmbeddings` for details
cat_cardinalities: List[int] = []
n_cont_features = X_cont_all.shape[1]

# Features become float32; labels become float32 (regression) or int64 (classification).
X_cont_all: np.ndarray = X_cont_all.astype('float32')
if task_type == 'regression':
    y_all: np.ndarray = y_all.astype('float32')
else:
    # Encode in range(0, n_classes).
    y_all = (
        sklearn.preprocessing.LabelEncoder()
        .fit_transform(y_all.astype('int64'))
        .astype('int64')
    )

# Start with everything in 'train', then carve out 'test' and 'val' in turn:
# each step keeps 80% of the current training part and holds out the rest.
X_cont: Dict[str, np.ndarray] = {'train': X_cont_all}
y: Dict[str, np.ndarray] = {'train': y_all}
for held_out_part in ['test', 'val']:
    (
        X_cont['train'],
        X_cont[held_out_part],
        y['train'],
        y[held_out_part],
    ) = sklearn.model_selection.train_test_split(
        X_cont['train'], y['train'], train_size=0.8
    )
train_size = len(X_cont['train'])

# NOTE
# The choice between preprocessing strategies depends on the task and the model.

# (A) Simple preprocessing strategy.
# preprocessing = sklearn.preprocessing.StandardScaler().fit(X_cont['train'])

# (B) Fancy preprocessing strategy: quantile transformation to a normal distribution.
# The transformer is fitted on the training features plus a small Gaussian noise
# whose per-feature scale is `noise_hint * std`, capped at `noise_hint`.
X_std = X_cont['train'].std(axis=0)
noise_hint = 1e-3
noise_std = np.minimum(noise_hint, noise_hint * X_std)
noise = np.random.normal(0.0, noise_std, X_cont['train'].shape)
preprocessing = sklearn.preprocessing.QuantileTransformer(
    # `train_size // 30` is 0 for training sets with fewer than 30 rows, and
    # QuantileTransformer requires a positive number of quantiles, so the value
    # is bounded from below. For larger datasets the result is unchanged.
    n_quantiles=max(min(train_size // 30, 1000), 10),
    output_distribution='normal',
    subsample=10**9,
).fit(X_cont['train'] + noise)

# Apply the fitted preprocessing to every part and move the data to `device`.
X_cont = {
    part: torch.tensor(preprocessing.transform(features), device=device)
    for part, features in X_cont.items()
}  # type: ignore[code]
y = {
    part: torch.tensor(labels, device=device) for part, labels in y.items()
}  # type: ignore[code]

# NOTE: standardizing regression labels is CRUCIAL for neural networks.
# The statistics are computed on the training part only and applied to all parts.
y_mean = y_std = None
if task_type == 'regression':
    y_mean = y['train'].mean().item()
    y_std = y['train'].std().item()
    for part in y:
        y[part] = (y[part] - y_mean) / y_std

# Everything except multiclass labels is stored as float.
if task_type != 'multiclass':
    for part in y:
        y[part] = y[part].float()


## Model

In [5]:
# One output per class for multiclass tasks, a single output otherwise.
if task_type == 'multiclass':
    d_out = len(np.unique(y_all))
else:
    d_out = 1


# NOTE: uncomment to train MLP
# model = MLP(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=128,
#     dropout=0.1,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)

# NOTE: uncomment to train ResNet
# model = ResNet(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=128,
#     d_hidden=None,
#     d_hidden_multiplier=2.0,
#     dropout1=0.3,
#     dropout2=0.0,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)

# Active choice: FT-Transformer built from the package's default keyword
# arguments, trained with the optimizer the model itself provides.
ft_transformer_kwargs = FTTransformer.get_default_kwargs()
model = FTTransformer(
    n_cont_features=n_cont_features,
    cat_cardinalities=cat_cardinalities,
    d_out=d_out,
    **ft_transformer_kwargs,
).to(device)
optimizer = model.make_default_optimizer()

## Training

In [6]:
def apply_model(x_cont: Tensor, x_cat: Optional[Tensor]) -> Tensor:
    """Run the global `model` on one batch, adapting the inputs to the model type.

    Args:
        x_cont: continuous features (float32).
        x_cat: categorical features (int64), or None when there are none.

    Returns:
        The output of `model` for the batch.

    Raises:
        RuntimeError: if `model` is not an FTTransformer, MLP or ResNet.
    """
    # FTTransformer receives both kinds of features as they are.
    if isinstance(model, FTTransformer):
        return model(x_cont, x_cat)
    if not isinstance(model, (MLP, ResNet)):
        raise RuntimeError(f'Unknown model type: {type(model)}')

    # MLP and ResNet receive a single tensor, so categorical features (if any)
    # are one-hot encoded and stacked to the right of the continuous ones.
    if x_cat is None:
        return model(x_cont)
    assert (
        cat_cardinalities
    ), 'cat_cardinalities must be non-empty when x_cat is not None'
    one_hot_features = [
        F.one_hot(column, cardinality)
        for column, cardinality in zip(x_cat.T, cat_cardinalities)
    ]
    return model(torch.column_stack([x_cont, *one_hot_features]))


# Loss per task type: BCE on logits for binary classification, cross-entropy
# for multiclass classification, MSE in all remaining cases (regression).
if task_type == 'binclass':
    loss_fn = F.binary_cross_entropy_with_logits
elif task_type == 'multiclass':
    loss_fn = F.cross_entropy
else:
    loss_fn = F.mse_loss

@torch.no_grad()
def evaluate(part: str) -> float:
    """Score the global `model` on one part of the data.

    Args:
        part: key into `X_cont` and `y` (for example 'val' or 'test').

    Returns:
        Accuracy for classification tasks, or the negated RMSE multiplied by
        `y_std` for regression, so that a higher score is always better.
    """
    model.eval()

    # Predict in large batches and collect the outputs as one NumPy array.
    eval_batch_size = 8096
    batch_outputs = []
    for x_batch in delu.iter_batches(X_cont[part], eval_batch_size):
        batch_outputs.append(apply_model(x_batch, None))
    prediction = torch.cat(batch_outputs).squeeze(1).cpu().numpy()
    target = y[part].cpu().numpy()

    if task_type == 'binclass':
        # Logits -> probabilities -> hard 0/1 labels.
        predicted_labels = np.round(scipy.special.expit(prediction))
        return sklearn.metrics.accuracy_score(target, predicted_labels)
    if task_type == 'multiclass':
        predicted_labels = prediction.argmax(1)
        return sklearn.metrics.accuracy_score(target, predicted_labels)

    assert task_type == 'regression'
    # Multiplying by y_std undoes the scaling applied to the regression labels.
    rmse = sklearn.metrics.mean_squared_error(target, prediction) ** 0.5
    return -(rmse * y_std)  # The higher -- the better.


# Baseline: score of the freshly initialized model on the test part.
initial_test_score = evaluate("test")
print(f'Test score before training: {initial_test_score:.4f}')

Test score before training: -1.2040


In [None]:
batch_size = 256

# (A) Faster training, worse task performance.
n_epochs = 100
patience = 5

# (B) Longer training, better task performance.
# n_epochs = 1_000_000_000
# patience = 16

# `evaluate` returns "higher is better" scores, hence mode='max'.
early_stopping = delu.EarlyStopping(patience, mode='max')
# Scores and index of the epoch with the best validation score so far.
best = dict(val=-math.inf, test=-math.inf, epoch=-1)
# Only used to give the progress bar a length.
n_batches_per_epoch = math.ceil(len(X_cont['train']) / batch_size)

for epoch in range(n_epochs):
    train_batches = delu.iter_batches(
        (X_cont['train'], y['train']), batch_size, shuffle=True
    )
    for x_batch, y_batch in tqdm(
        train_batches, desc=f'Epoch {epoch}', total=n_batches_per_epoch
    ):
        # One optimization step; `evaluate` switches the model to eval mode,
        # so training mode is re-enabled here.
        model.train()
        optimizer.zero_grad()
        batch_output = apply_model(x_batch, None).squeeze(1)
        loss = loss_fn(batch_output, y_batch)
        loss.backward()
        optimizer.step()

    val_score, test_score = evaluate('val'), evaluate('test')
    print(f'(val) {val_score:.4f} (test) {test_score:.4f}')

    # Stop once the validation score has not improved for `patience` epochs.
    early_stopping.update(val_score)
    if early_stopping.should_stop():
        break

    if val_score > best['val']:
        print('🌸 New best epoch! 🌸')
        best.update(val=val_score, test=test_score, epoch=epoch)
    print()

print('\n\nResult:')
print(best)