# Extending the fastai tabular classification model to handle missing and categorical data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sklearn.datasets as sk_data
import sklearn.model_selection as sk_selection
import sklearn.metrics as sk_metrics
import random_neural_net_models.tabular as rnnm_tab
import torch.optim as optim
import random_neural_net_models.losses as rnnm_loss
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.utils as rnnm_utils
from pathlib import Path
from torch.utils.data import DataLoader

In [None]:
# Fix the seed (42) through the project helper before any random data is drawn.
# presumably seeds every RNG used below — confirm in rnnm_utils
rnnm_utils.make_deterministic(42)

In [None]:
# Pick the compute device via the project helper and display it.
device = rnnm_utils.get_device()
device

## Generating data

In [None]:
# Synthetic classification data: 1,000 samples with 3 numerical features,
# drawn from one blob per class (centers=n_classes) with a fixed random_state.
n_samples = 1_000
n_features = 3
n_classes = 2
X, y = sk_data.make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    random_state=42,
    centers=n_classes,
)

In [None]:
# Cast the labels to int; they are reused below as the categorical feature.
y = y.astype(int)

Intentionally leak the target class as a categorical feature: if the embedding is used properly, the model should pick up on it.

In [None]:
import numpy as np

# The labels themselves become the single categorical column (one column, one row per sample).
X_cat = y[:, np.newaxis]
X_cat.shape, X.shape

In [None]:
# Peek at the first three feature rows and labels.
X[:3], y[:3]

In [None]:
# Pure-noise numerical features with the same shape as X. These, not X, are
# passed on to the split below, so only the leaked categorical feature carries signal.
X_noise = np.random.normal(size=X.shape)

In [None]:
# 80/20 split of the noise features, the leaked categorical feature and the labels
# (suffix 0 = train, suffix 1 = validation). No random_state is passed, so the
# split follows NumPy's global RNG state.
X0_num, X1_num, X0_cat, X1_cat, y0, y1 = sk_selection.train_test_split(
    X_noise, X_cat, y, test_size=0.2
)

In [None]:
# Wrap numerical features, categorical features and labels of each split in a dataset.
ds_train = rnnm_data.NumpyNumCatTrainingDataset(X0_num, X0_cat, y0)
ds_valid = rnnm_data.NumpyNumCatTrainingDataset(X1_num, X1_cat, y1)

In [None]:
# Inspect a single training item (index 2).
ds_train[2]

In [None]:
# Train and validation loaders share every setting; only the dataset differs.
batch_size = 50
dl_train, dl_valid = (
    DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=rnnm_data.collate_numpy_numcat_dataset_to_xyblock_keep_orig_y,
    )
    for dataset in (ds_train, ds_valid)
)

In [None]:
# Pull the first training batch to inspect what the collate function produces.
next(iter(dl_train))

In [None]:
# First ten entries of the batch's x_categorical field.
next(iter(dl_train)).x_categorical[:10]

## TabularModel - categorical

In [None]:
# Plotting setup: apply seaborn's default theme to the plots below.
import seaborn as sns

sns.set_theme()

In [None]:
# Plot the embedding size chosen by the project's rule against the ordinality
# of a categorical feature.
# Category counts are whole numbers, so the rule is evaluated on the integer
# grid 1..100; np.linspace(1, 100) would give 50 mostly fractional values.
ordinalities = np.arange(1, 101)
emb_dims = np.array(
    [
        # int(...) hands the helper a plain Python int rather than a NumPy scalar.
        rnnm_tab.calc_categorical_feature_embedding_dimension(int(o))
        for o in ordinalities
    ]
)

ax = sns.lineplot(x=ordinalities, y=emb_dims)
# Trailing semicolon suppresses the notebook's echo of the return value.
ax.set(xlabel="ordinality", ylabel="embedding dimensionality");

In [None]:
# Data layout: three numerical noise columns plus one categorical column (the leaked target).
# Category counts are derived from the training split's categorical features
# (presumably one count per column — confirm in rnnm_data).
n_categories_per_column = rnnm_data.calc_n_categories_per_column(X0_cat)
n_categories_per_column

In [None]:
# Numerical + categorical model, built with imputation switched off.
do_impute = False
impute_bias_source = rnnm_tab.BiasSources.zero
n_features_in = n_features + 1  # +1 because of the added leakage feature

# First positional argument: presumably the layer widths (inputs, two hidden
# layers of 4, n_classes outputs) — confirm against rnnm_tab.
model = rnnm_tab.TabularModelNumericalAndCategorical(
    [n_features_in, 4, 4, n_classes],
    n_categories_per_column=n_categories_per_column,
    use_batch_norm=False,
    do_impute=do_impute,
    impute_bias_source=impute_bias_source,
)

In [None]:
# Display the model's repr.
model

In [None]:
# Training components: Adam over the model parameters, the project's
# cross-entropy loss, and a loss callback that is plotted after training.
learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()

# Directory handed to the learner as save_dir.
save_dir = Path("./models")

callbacks = [loss_callback]

In [None]:
# Bundle model, optimizer, loss and callbacks into the project's Learner.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search over the training loader (n_epochs=10).
# NOTE(review): the positional arguments (1e-5, 100, 100) are not self-explanatory;
# confirm their meaning against LRFinderCallback's signature.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# Plot the LR-finder result with a log-scaled y axis.
lr_find_callback.plot(yscale="log")

In [None]:
# One-cycle schedule peaking at learning_rate; its length is
# n_epochs * len(dl_train) optimizer steps.
# NOTE(review): this reuses the optimizer the LR finder just ran with —
# confirm find_learning_rate restores model/optimizer state.
learning_rate = 1e-1
n_epochs = 5

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=learning_rate,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train),
)
# Register the scheduler with the learner through a callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train for n_epochs on the training loader, passing the validation loader as well.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot what the loss callback collected during training.
loss_callback.plot()

In [None]:
# Weights of the first embedding (index 0); there is only one categorical column here.
model.embeddings[0].weight

In [None]:
# Weights of the `lin` module inside the first element of model.net.
model.net[0].lin.weight

In [None]:
# Column 1 of the row-wise softmax over the predictions, i.e. the score for class 1.
# .cpu() precedes .numpy() because Tensor.numpy() only works on CPU tensors and
# `device` may be an accelerator; it is a no-op for tensors already on the CPU.
probs = learner.predict(dl_valid).detach().softmax(dim=1).cpu().numpy()[:, 1]
probs[:3]

In [None]:
# Validation labels, for comparison with the predicted scores.
y1

In [None]:
# All predicted class-1 scores for the validation set.
probs

In [None]:
# ROC AUC of the class-1 scores against the validation labels.
print(sk_metrics.roc_auc_score(y1, probs))

## TabularModelClassification - categorical

In [None]:
# Same experiment through TabularModelClassification, which takes the hidden
# widths and class count as separate arguments. n_features_in is reused from
# the previous section.
n_hidden = [4, 4]
do_impute = False
impute_bias_source = rnnm_tab.BiasSources.zero

model = rnnm_tab.TabularModelClassification(
    n_features=n_features_in,
    n_hidden=n_hidden,
    n_classes=n_classes,
    use_batch_norm=False,
    do_impute=do_impute,
    impute_bias_source=impute_bias_source,
    n_categories_per_column=n_categories_per_column,
)

In [None]:
# Fresh optimizer, loss and loss callback for the newly created model.
learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()

# Directory handed to the learner as save_dir.
save_dir = Path("./models")

callbacks = [loss_callback]

In [None]:
# New Learner around the classification model and its fresh optimizer.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search for the classification model, same settings as before.
# NOTE(review): confirm the meaning of (1e-5, 100, 100) against LRFinderCallback.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# LR-finder curve on a log-scaled y axis, limited to the range 0.1 to 1.
lr_find_callback.plot(yscale="log", ylim=(0.1, 1))

In [None]:
# One-cycle schedule peaking at learning_rate over n_epochs * len(dl_train) steps.
# NOTE(review): the optimizer was just used by the LR finder — confirm its
# state is restored before this schedule starts.
learning_rate = 1e-1
n_epochs = 5

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=learning_rate,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train),
)
# Register the scheduler with the learner through a callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train the classification model, passing the validation loader as well.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot what the loss callback collected during training.
loss_callback.plot()

In [None]:
# Class-1 score: column 1 of the row-wise softmax over the predictions.
# Tensor.numpy() requires a CPU tensor, so move the result off `device` first;
# .cpu() is a no-op when the tensor already lives on the CPU.
probs = learner.predict(dl_valid).detach().softmax(dim=1).cpu().numpy()[:, 1]
probs[:3]

In [None]:
# ROC AUC of the class-1 scores against the validation labels.
print(sk_metrics.roc_auc_score(y1, probs))

## TabularModel - categorical + missingness

In [None]:
# Introduce missing values into numerical column 1 with p_missing=0.1.
# The helper's second return value is not needed here.
cols_with_missingness_num = (1,)
X_noise_miss, _ = rnnm_tab.make_missing_numerical(
    X_noise, p_missing=0.1, cols_with_missing=cols_with_missingness_num
)
X_noise_miss[:5, :]

In [None]:
# Introduce missing values into categorical column 0 (the leaked target) with p_missing=0.1.
# The helper's second return value is not needed here.
cols_with_missingness_cat = (0,)
X_cat_miss, _ = rnnm_tab.make_missing_categorical(
    X_cat, p_missing=0.1, cols_with_missing=cols_with_missingness_cat
)
X_cat_miss[:10, :]

In [None]:
# Independent 80/20 split of the data with missing values (suffix 0 = train,
# suffix 1 = validation). No random_state is passed, so the rows need not match
# the earlier split.
X0_num_miss, X1_num_miss, X0_cat_miss, X1_cat_miss, y0_miss, y1_miss = (
    sk_selection.train_test_split(X_noise_miss, X_cat_miss, y, test_size=0.2)
)

In [None]:
# One dataset per split, each built from (numerical, categorical, labels).
ds_train_miss, ds_valid_miss = (
    rnnm_data.NumpyNumCatTrainingDataset(x_num, x_cat, target)
    for x_num, x_cat, target in (
        (X0_num_miss, X0_cat_miss, y0_miss),
        (X1_num_miss, X1_cat_miss, y1_miss),
    )
)

In [None]:
# Inspect a single training item (index 2) from the data with missing values.
ds_train_miss[2]

In [None]:
# Train and validation loaders for the data with missing values; settings are
# identical, only the dataset differs.
batch_size = 50
dl_train_miss, dl_valid_miss = (
    DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=rnnm_data.collate_numpy_numcat_dataset_to_xyblock_keep_orig_y,
    )
    for dataset in (ds_train_miss, ds_valid_miss)
)

In [None]:
# Pull the first training batch to see how missing values come through the collate function.
next(iter(dl_train_miss))

In [None]:
# First ten entries of the batch's x_categorical field.
next(iter(dl_train_miss)).x_categorical[:10]

In [None]:
# Recompute the category counts from the training split that now contains missing entries.
# NOTE(review): how missing entries are counted depends on
# calc_n_categories_per_column — confirm in rnnm_data.
n_categories_per_column = rnnm_data.calc_n_categories_per_column(X0_cat_miss)
n_categories_per_column

In [None]:
# Numerical + categorical model with imputation enabled for the numerical
# column(s) that contain missing values.
do_impute = True
impute_bias_source = rnnm_tab.BiasSources.zero
n_features_in = n_features + 1  # +1 because of the added leakage feature

# First positional argument: presumably the layer widths (inputs, two hidden
# layers of 4, n_classes outputs) — confirm against rnnm_tab.
model = rnnm_tab.TabularModelNumericalAndCategorical(
    [n_features_in, 4, 4, n_classes],
    n_categories_per_column=n_categories_per_column,
    use_batch_norm=False,
    do_impute=do_impute,
    impute_bias_source=impute_bias_source,
    cols_with_missing_num=cols_with_missingness_num,
)

In [None]:
# Display the model's repr.
model

In [None]:
# Fresh optimizer, loss and loss callback for the imputing model.
learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()

# Directory handed to the learner as save_dir.
save_dir = Path("./models")

callbacks = [loss_callback]

In [None]:
# New Learner around the imputing model and its fresh optimizer.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search on the training loader with missing values.
# NOTE(review): confirm the meaning of (1e-5, 100, 100) against LRFinderCallback.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train_miss, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# LR-finder curve on a log-scaled y axis, limited to the range 0.2 to 0.9.
lr_find_callback.plot(yscale="log", ylim=(0.2, 0.9))

In [None]:
# One-cycle schedule peaking at learning_rate over n_epochs * len(dl_train_miss) steps.
# NOTE(review): the optimizer was just used by the LR finder — confirm its
# state is restored before this schedule starts.
learning_rate = 1e-1
n_epochs = 5

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=learning_rate,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train_miss),
)
# Register the scheduler with the learner through a callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train on the data with missing values, passing the matching validation loader.
learner.fit(dl_train_miss, n_epochs=n_epochs, dataloader_valid=dl_valid_miss)

In [None]:
# Plot what the loss callback collected during training.
loss_callback.plot()

In [None]:
# Weights of the first embedding (index 0), now trained on data with missing categories.
model.embeddings[0].weight

In [None]:
# Weights of the `lin` module inside the first element of model.net.
model.net[0].lin.weight

In [None]:
# Class-1 score: column 1 of the row-wise softmax over the predictions.
# .cpu() is inserted ahead of .numpy() since NumPy conversion only works for
# CPU tensors and `device` may be an accelerator; on CPU it changes nothing.
probs = learner.predict(dl_valid_miss).detach().softmax(dim=1).cpu().numpy()[:, 1]
probs[:3]

In [None]:
# Validation labels of the split with missing values.
y1_miss

In [None]:
# All predicted class-1 scores for the validation set.
probs

In [None]:
# ROC AUC of the class-1 scores against the validation labels.
print(sk_metrics.roc_auc_score(y1_miss, probs))

## TabularModelClassification - categorical + missingness

In [None]:
# Classification wrapper with imputation enabled. n_features_in and
# n_categories_per_column are reused from the previous section.
n_hidden = [4, 4]
do_impute = True
impute_bias_source = rnnm_tab.BiasSources.zero

# NOTE(review): the keyword here is `cols_with_missing`, whereas
# TabularModelNumericalAndCategorical is called with `cols_with_missing_num` —
# confirm this is the intended parameter name for this class.
model = rnnm_tab.TabularModelClassification(
    n_features=n_features_in,
    n_hidden=n_hidden,
    n_classes=n_classes,
    use_batch_norm=False,
    do_impute=do_impute,
    impute_bias_source=impute_bias_source,
    n_categories_per_column=n_categories_per_column,
    cols_with_missing=cols_with_missingness_num,
)

In [None]:
# Fresh optimizer, loss and loss callback for the imputing classification model.
learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()

# Directory handed to the learner as save_dir.
save_dir = Path("./models")

callbacks = [loss_callback]

In [None]:
# New Learner around the imputing classification model and its fresh optimizer.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search on the training loader with missing values.
# NOTE(review): confirm the meaning of (1e-5, 100, 100) against LRFinderCallback.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train_miss, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# LR-finder curve on a log-scaled y axis, limited to the range 0.1 to 1.
lr_find_callback.plot(yscale="log", ylim=(0.1, 1))

In [None]:
# One-cycle schedule peaking at learning_rate over n_epochs * len(dl_train_miss) steps.
# NOTE(review): the optimizer was just used by the LR finder — confirm its
# state is restored before this schedule starts.
learning_rate = 1e-1
n_epochs = 5

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=learning_rate,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train_miss),
)
# Register the scheduler with the learner through a callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train on the data with missing values, passing the matching validation loader.
learner.fit(dl_train_miss, n_epochs=n_epochs, dataloader_valid=dl_valid_miss)

In [None]:
# Plot what the loss callback collected during training.
loss_callback.plot()

In [None]:
# Class-1 score: column 1 of the row-wise softmax over the predictions.
# Move the tensor to the CPU before .numpy(): the conversion fails for tensors
# on an accelerator `device`, and .cpu() is a no-op for CPU tensors.
probs = learner.predict(dl_valid_miss).detach().softmax(dim=1).cpu().numpy()[:, 1]
probs[:3]

In [None]:
# ROC AUC of the class-1 scores against the validation labels.
print(sk_metrics.roc_auc_score(y1_miss, probs))