# Implementing the fastai tabular regression model

In [None]:
# IPython magics: load the autoreload extension and switch it to mode 2, so
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sklearn.datasets as sk_data
import sklearn.model_selection as sk_selection
import sklearn.metrics as sk_metrics
import random_neural_net_models.tabular as rnnm_tab
import torch.optim as optim
import random_neural_net_models.losses as rnnm_loss
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.utils as rnnm_utils
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler

In [None]:
# Pass seed 42 to the project's determinism helper.
# NOTE(review): presumably seeds the Python / NumPy / torch RNGs — confirm in rnnm_utils.
rnnm_utils.make_deterministic(42)

In [None]:
# Ask the project helper which compute device to use and display it; the
# value is later passed to the Learner via its `device` argument.
device = rnnm_utils.get_device()
device

## Generating data

In [None]:
# Shape of the synthetic regression problem: 1000 rows, 3 features of which
# 2 are informative, and a single target.
n_samples, n_features = 1_000, 3
n_informative, n_targets = 2, 1

# The fixed random_state keeps the generated data identical across runs.
X, y = sk_data.make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_targets=n_targets,
    random_state=42,
)

In [None]:
# Peek at the first three feature rows and their targets.
X[:3], y[:3]

In [None]:
# Only feature column 1 is selected for missingness; the same tuple is passed
# to the model later so it knows which columns may have gaps.
cols_with_missingness = (1,)

# NOTE(review): p_missing is presumably the per-entry probability / fraction
# of values removed — confirm in rnnm_tab.make_missing_numerical.
missing_fraction = 0.1
X_miss, _ = rnnm_tab.make_missing_numerical(
    X,
    cols_with_missing=cols_with_missingness,
    p_missing=missing_fraction,
)

# Show the first five rows of the result.
X_miss[:5, :]

In [None]:
# Hold out 20% of the rows for validation (X0/y0 = train, X1/y1 = validation).
# An explicit random_state makes the split reproducible even when this cell is
# re-run out of order, consistent with the fixed seed given to make_regression.
X0, X1, y0, y1 = sk_selection.train_test_split(
    X_miss, y, test_size=0.2, random_state=42
)

In [None]:
import seaborn as sns

# Apply seaborn's default theme to all subsequent plots.
sns.set_theme()

# Scatter the training targets against the last feature column (index 2).
plot_feature = X0[:, 2]
sns.scatterplot(x=plot_feature, y=y0)

In [None]:
# Per-feature minimum and maximum over the training rows.
# NOTE(review): if make_missing_numerical encodes missing values as NaN, the
# column(s) with missingness will report NaN here — confirm the encoding.
X0.min(axis=0), X0.max(axis=0)

In [None]:
# Wrap the train and validation arrays in the project's dataset class so they
# can be fed to torch DataLoaders.
ds_train = rnnm_data.NumpyTrainingDataset(X0, y0)
ds_valid = rnnm_data.NumpyTrainingDataset(X1, y1)

In [None]:
# Draw 100k training indices per pass, with replacement, so the length of one
# "epoch" is set by num_samples rather than by the size of ds_train.
#
# A dedicated generator keeps the sampling order reproducible without touching
# torch's global RNG: torch.manual_seed() reseeds the global state and returns
# the global default generator, which would couple the sampler to every other
# torch random call (e.g. weight initialisation).
sampler = RandomSampler(
    ds_train,
    replacement=True,
    num_samples=int(1e5),
    generator=torch.Generator().manual_seed(3407),
)

In [None]:
batch_size = 50

# Settings shared by both loaders: same batch size and the project's collate
# function for turning dataset items into a batch.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=rnnm_data.collate_numpy_dataset_to_xyblock,
)

# Training batches are drawn through the random sampler; the validation loader
# is given no sampler.
dl_train = DataLoader(ds_train, sampler=sampler, **loader_kwargs)
dl_valid = DataLoader(ds_valid, **loader_kwargs)

In [None]:
# Pull a single batch from the training loader to inspect what the collate
# function produces.
next(iter(dl_train))

## TabularModelRegression

In [None]:
# Range of the training targets.
y0.min(), y0.max()

In [None]:
# Hidden layer sizes handed to the model.
n_hidden = [200, 100]

# Missing-value handling: imputation is switched on, using the `zero` bias
# source, for the columns that were given missingness earlier.
do_impute = True
impute_bias_source = rnnm_tab.BiasSources.zero

# Target statistics are taken from the training split only.
mean, std = y0.mean(), y0.std()

model_config = dict(
    n_features=n_features,
    n_hidden=n_hidden,
    mean=mean,
    std=std,
    use_batch_norm=False,
    do_impute=do_impute,
    impute_bias_source=impute_bias_source,
    cols_with_missing=cols_with_missingness,
)
model = rnnm_tab.TabularModelRegression(**model_config)

In [None]:
# Adam over all model parameters, created with an initial step size of 0.1.
learning_rate = 0.1
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Project-provided MSE loss.
loss = rnnm_loss.MSELossXy()

# The loss callback is kept under its own name so its plot() method can be
# called after training; it is also the only entry in the callback list.
loss_callback = rnnm_learner.TrainLossCallback()
callbacks = [loss_callback]

# Directory passed to the Learner as `save_dir`.
save_dir = Path("./models")

In [None]:
# Bundle model, optimizer, loss and callbacks into the project's Learner,
# telling it which device to use and where its save directory is.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search over the training loader.
# NOTE(review): the three positional arguments are unnamed here; presumably
# start LR, end LR and number of steps — confirm against LRFinderCallback.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# Plot what the LR finder recorded, requesting a log scale via `yscale`.
lr_find_callback.plot(yscale="log")

In [None]:
n_epochs = 2
# Peak learning rate of the one-cycle schedule.
learning_rate = 1e-3

# The schedule length is n_epochs * len(dl_train), i.e. one scheduler step per
# training batch.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    steps_per_epoch=len(dl_train),
    epochs=n_epochs,
)

# Register the scheduler with the learner through the every-batch callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train for n_epochs on the training loader, passing the validation loader
# alongside.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot what the loss callback recorded during training.
loss_callback.plot()

In [None]:
# Inspect the bias of the first sub-module in model.net.net.
# NOTE(review): presumably the imputation layer, given do_impute=True — confirm.
model.net.net[0].bias

In [None]:
# Per-feature means of the original feature matrix (before missingness was
# introduced), shown for comparison with the bias above.
X.mean(axis=0)

In [None]:
# Predict on the validation loader and convert to a NumPy array.
# Tensor.numpy() only works on CPU tensors and `device` may be an accelerator,
# so move the result to host memory first; .cpu() is a no-op when the tensor
# already lives on the CPU.
preds = learner.predict(dl_valid).detach().cpu().numpy()

# Show the first three predictions.
preds[:3]

In [None]:
# Range of the validation targets.
y1.min(), y1.max()

In [None]:
# Range of the predictions, to compare against the target range.
preds.min(), preds.max()

In [None]:
# Histogram of the validation targets (flattened to 1-D).
sns.histplot(x=y1.ravel())

In [None]:
# Histogram of the predictions (flattened to 1-D).
sns.histplot(x=preds.ravel())

In [None]:
# Mean squared error between validation targets and predictions.
print(sk_metrics.mean_squared_error(y1, preds))