In [None]:
# Requirements:
!pip install rtdl
!pip install libzero==0.0.4

In [None]:
import rtdl
import sklearn.datasets
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import zero

In [None]:
# All tensors, the model and the index loader in the cells below are created on
# (or moved to) this device.
device = torch.device('cpu')
# Docs: https://yura52.github.io/zero/0.0.4/reference/api/zero.improve_reproducibility.html
# Seed the run with a fixed value (see the linked docs for exactly what gets seeded).
zero.improve_reproducibility(seed=123456)

### Data

In [None]:
# !!! NOTE !!! The dataset splits, preprocessing and other details are
# significantly different from those used in the
# paper "Revisiting Deep Learning Models for Tabular Data",
# so the results will be different from those reported in the paper.
dataset = sklearn.datasets.fetch_california_housing()
X_all = dataset['data'].astype('float32')
y_all = dataset['target'].astype('float32')

# Split: 80% train+val / 20% test, then 80% train / 20% val of the first part.
X = {}
y = {}
X['train'], X['test'], y['train'], y['test'] = sklearn.model_selection.train_test_split(
    X_all, y_all, train_size=0.8
)
X['train'], X['val'], y['train'], y['val'] = sklearn.model_selection.train_test_split(
    X['train'], y['train'], train_size=0.8
)

# not the best way to preprocess features, but enough for the demonstration
# The scaler is fitted on the training part ONLY ...
preprocess = sklearn.preprocessing.StandardScaler().fit(X['train'])
# ... and then merely applied (`transform`, not `fit_transform`) to every part,
# so that val/test are scaled with the training statistics and no information
# from the evaluation parts leaks into preprocessing.
X = {
    k: torch.tensor(preprocess.transform(v), device=device)
    for k, v in X.items()
}

# !!! CRUCIAL for neural networks when solving regression problems !!!
# Targets are standardized with the TRAINING mean/std; `y_std` is kept to
# convert metrics back to the original target scale later.
y_mean = float(y['train'].mean())
y_std = float(y['train'].std())
y = {
    k: torch.tensor((v - y_mean) / y_std, device=device)
    for k, v in y.items()
}

### Model
Carefully read the comments and uncomment the code for the model you want to test.

In [None]:
# Exactly one of the three model definitions below should be active.
#
# Option 1: MLP baseline. `lr` and `weight_decay` must be uncommented together
# with the model: the optimizer setup at the end of this cell reads them for
# every model that is not an FTTransformer.
# model = rtdl.MLP.make_baseline(
#     d_in=X_all.shape[1],
#     d_layers=[128, 256, 128],
#     dropout=0.1,
#     d_out=1
# )
# lr = 0.001
# weight_decay = 0.0

# Option 2: ResNet baseline. Same remark about `lr` and `weight_decay`.
# model = rtdl.ResNet.make_baseline(
#     d_in=X_all.shape[1],
#     d_main=128,
#     d_intermidiate=256,
#     dropout_first=0.2,
#     dropout_second=0.0,
#     n_blocks=2,
#     d_out=1
# )
# lr = 0.001
# weight_decay = 0.0

# Option 3 (active): FT-Transformer built with `make_default`. No `lr` /
# `weight_decay` is needed here, because for FTTransformer the optimizer is
# obtained from `model.make_default_optimizer()` at the end of this cell.
# Only numerical features are passed (`cat_cardinalities=None`); the single
# regression output is `d_out=1`.
model = rtdl.FTTransformer.make_default(
    n_num_features=X_all.shape[1],
    cat_cardinalities=None,
    last_layer_query_idx=[-1],  # it makes the model faster and does NOT affect its output
    d_out=1,
)

# === ABOUT CATEGORICAL FEATURES ===
# IF you use MLP, ResNet or any other simple feed-forward model (NOT transformer-based model)
# AND there are categorical features
# AND you want to transform the categorical features to embeddings
# THEN continue reading this comment.
# ==================================
# 1. When you have both numerical and categorical features, you should prepare your data like this:
#    (X_num<float32>, X_cat<int64>) instead of X<float32>
#    Each column in X_cat should contain values within the range from 0 to <(the number of unique values in column) - 1>;
#    use sklearn.preprocessing.OrdinalEncoder to achieve this;
# 2. Prepare a list of so called "cardinalities":
#    cardinalities[i] = <the number of unique values of the i-th categorical feature>
# 3. Uncomment the following snippet and set `d_token` to any appropriate value
#
# class Model(nn.Module):
#     def __init__(self, embedding: rtdl.FlatEmbedding, model: nn.Module):
#         super().__init__()
#         self.embedding = embedding
#         self.model = model
#
#     def forward(self, x_num, x_cat):
#         return self.model(self.embedding(x_num, x_cat))
#
# model = Model(
#     # `None` means "Do not transform numerical features"
#     # `d_token` is the size of embedding for ONE categorical feature
#     rtdl.FlatEmbedding(None, rtdl.CategoricalFeatureTokenizer(cardinalities, d_token, True, 'uniform')),
#     base_model  # a model such as MLP, ResNet, etc.
# )
# Then the model should be used as `model(x_num, x_cat)` instead of `model(x)`.

# Move the parameters to the target device before the optimizer is created.
model.to(device)

# FTTransformer provides its own default optimizer; any other model is trained
# with AdamW using the `lr` and `weight_decay` chosen next to the model above.
if isinstance(model, rtdl.FTTransformer):
    optimizer = model.make_default_optimizer()
else:
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay,
    )

### Training

In [None]:
def apply_model(x_num, x_cat=None):
    """Run the global `model` on a batch, hiding the difference in call signatures.

    rtdl.FTTransformer is called with two inputs (`x_num`, `x_cat`); every
    other model is called with `x_num` only and `x_cat` is ignored.
    """
    if isinstance(model, rtdl.FTTransformer):
        return model(x_num, x_cat)
    return model(x_num)


@torch.no_grad()
def evaluate(part):
    """Return the RMSE of the global `model` on `X[part]` / `y[part]`.

    The model is switched to eval mode and applied to the whole part in a
    single forward pass (so the part has to fit in memory). The error is
    computed on the standardized targets and multiplied by `y_std` before
    being returned.
    """
    model.eval()
    predictions = apply_model(X[part]).squeeze(1)
    normalized_mse = F.mse_loss(predictions, y[part]).item()
    return normalized_mse ** 0.5 * y_std


# Create a dataloader for batches of indices
# Docs: https://yura52.github.io/zero/reference/api/zero.data.IndexLoader.html
# The loader is built from the NUMBER of training rows, not from the data;
# the training loop uses every yielded batch to index X['train'] and y['train'].
batch_size = 256
train_loader = zero.data.IndexLoader(len(X['train']), batch_size, device=device)

# Create a progress tracker for early stopping
# Docs: https://yura52.github.io/zero/reference/api/zero.ProgressTracker.html
# The training loop feeds it the negated validation RMSE after every epoch and
# stops as soon as `progress.fail` is set.
# NOTE(review): `patience` is presumably the number of non-improving updates
# tolerated before `fail` is raised -- confirm in the linked docs.
progress = zero.ProgressTracker(patience=100)

# Baseline: test RMSE of the untrained model (evaluate() rescales it by y_std).
print(f'Test RMSE before training: {evaluate("test"):.4f}')

In [None]:
# Train for at most `n_epochs` epochs; early stopping is driven by `progress`.
n_epochs = 1000
for epoch in range(1, n_epochs + 1):
    # evaluate() leaves the model in eval mode, so switch back once per epoch.
    model.train()
    for batch_idx in train_loader:
        optimizer.zero_grad()
        x_batch = X['train'][batch_idx]
        y_batch = y['train'][batch_idx]
        predictions = apply_model(x_batch).squeeze(1)
        loss = F.mse_loss(predictions, y_batch)
        loss.backward()
        optimizer.step()

    # Per-epoch report: validation and test RMSE on one line.
    val_rmse = evaluate('val')
    test_rmse = evaluate('test')
    print(f'Epoch {epoch:03d} | Validation RMSE: {val_rmse:.4f} | Test RMSE: {test_rmse:.4f}', end='')

    # The tracker is given the negated RMSE (lower RMSE -> larger score).
    progress.update(-val_rmse)
    # Finish the line, marking epochs that set a new best validation score.
    print(' <<< BEST VALIDATION EPOCH' if progress.success else '')
    if progress.fail:
        break