# Implementing the fastai tabular classification model

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sklearn.datasets as sk_data
import sklearn.model_selection as sk_selection
import sklearn.metrics as sk_metrics
import random_neural_net_models.tabular as rnnm_tab
import torch.optim as optim
import random_neural_net_models.losses as rnnm_loss
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.utils as rnnm_utils
from pathlib import Path
from torch.utils.data import DataLoader

In [None]:
# Seed the run through the project helper so repeated executions are
# comparable; which RNGs it covers is defined in rnnm_utils — confirm there.
rnnm_utils.make_deterministic(42)

In [None]:
# Let the project helper choose the compute device; it is handed to the
# Learner further down. The bare name echoes the choice in the notebook.
device = rnnm_utils.get_device()
device

## Generating data

In [None]:
# Toy classification problem: one Gaussian blob per class, generated with a
# fixed seed so the data set is the same on every run.
n_samples, n_features, n_classes = 1_000, 3, 2
X, y = sk_data.make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_classes,
    random_state=42,
)

In [None]:
# Explicitly cast the labels to an integer dtype before they go into the
# dataset — presumably so the cross-entropy loss gets class indices.
y = y.astype(int)

In [None]:
# Peek at the first three feature rows and their labels.
X[:3], y[:3]

In [None]:
# Hold out 20% of the samples for validation (X0/y0 = train, X1/y1 = valid).
# The seed is passed explicitly, like for make_blobs, so the split does not
# depend on the state of the global NumPy RNG — i.e. on which cells happened
# to run before this one.
X0, X1, y0, y1 = sk_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Wrap the train and validation arrays in the project's numpy-backed dataset
# class so they can be fed to a torch DataLoader.
ds_train = rnnm_data.NumpyTrainingDataset(X0, y0)
ds_valid = rnnm_data.NumpyTrainingDataset(X1, y1)

In [None]:
batch_size = 50

# Training loader: reshuffle every epoch so the optimizer does not see the
# exact same batches in the exact same order each time (DataLoader's default
# is shuffle=False).
dl_train = DataLoader(
    ds_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=rnnm_data.collate_numpy_dataset_to_xyblock_keep_orig_y,
)

# Validation loader: deliberately NOT shuffled — the predictions made from it
# are compared position-by-position with y1 when computing the ROC AUC.
dl_valid = DataLoader(
    ds_valid,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=rnnm_data.collate_numpy_dataset_to_xyblock_keep_orig_y,
)

In [None]:
# Pull a single batch to inspect what the collate function produces.
next(iter(dl_train))

## TabularModel

fastai builds its tabular model out of `LinBnDrop` blocks ([source](https://github.com/fastai/fastai/blob/master/fastai/layers.py#L175)):

```python
class LinBnDrop(nn.Sequential):
    "Module grouping `BatchNorm1d`, `Dropout` and `Linear` layers"
    def __init__(self, n_in, n_out, bn=True, p=0., act=None, lin_first=False):
        layers = [BatchNorm(n_out if lin_first else n_in, ndim=1)] if bn else []
        if p != 0: layers.append(nn.Dropout(p))
        lin = [nn.Linear(n_in, n_out, bias=not bn)]
        if act is not None: lin.append(act)
        layers = lin+layers if lin_first else layers+lin
        super().__init__(*layers)
```

The fastai `TabularModel` itself ([source](https://github.com/fastai/fastai/blob/master/fastai/tabular/model.py#L35)):

```python
class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, 
        emb_szs:list, # Sequence of (num_embeddings, embedding_dim) for each categorical variable
        n_cont:int, # Number of continuous variables
        out_sz:int, # Number of outputs for final `LinBnDrop` layer
        layers:list, # Sequence of ints used to specify the input and output size of each `LinBnDrop` layer
        ps:float|MutableSequence=None, # Sequence of dropout probabilities for `LinBnDrop`
        embed_p:float=0., # Dropout probability for `Embedding` layer
        y_range=None, # Low and high for `SigmoidRange` activation 
        use_bn:bool=True, # Use `BatchNorm1d` in `LinBnDrop` layers
        bn_final:bool=False, # Use `BatchNorm1d` on final layer
        bn_cont:bool=True, # Use `BatchNorm1d` on continuous variables
        act_cls=nn.ReLU(inplace=True), # Activation type for `LinBnDrop` layers
        lin_first:bool=True # Linear layer is first or last in `LinBnDrop` layers
    ):
        ps = ifnone(ps, [0]*len(layers))
        if not is_listy(ps): ps = [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont) if bn_cont else None
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont = n_emb,n_cont
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [act_cls for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and (i!=len(actns)-1 or bn_final), p=p, act=a, lin_first=lin_first)
                       for i,(p,a) in enumerate(zip(ps+[0.],actns))]
        if y_range is not None: _layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*_layers)

    def forward(self, x_cat, x_cont=None):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            if self.bn_cont is not None: x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        return self.layers(x)
```

In [None]:
# The list runs from n_features to n_classes with 200 and 100 in between —
# presumably the layer widths from input to output; confirm against
# rnnm_tab.TabularModel. Batch norm is switched off.
layer_sizes = [n_features, 200, 100, n_classes]
model = rnnm_tab.TabularModel(layer_sizes, use_batch_norm=False)

In [None]:
# Ingredients for the Learner: where to save, what to minimise, what to
# record, and how to update the weights.
save_dir = Path("./models")

loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()
callbacks = [loss_callback]

learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Tie model, optimizer and loss together in the project's Learner; the
# remaining settings are collected first to keep the constructor call short.
learner_options = dict(
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)
learner = rnnm_learner.Learner(model, optimizer, loss, **learner_options)

In [None]:
# Learning-rate sweep on the training loader before the real fit.
# NOTE(review): the positional arguments (1e-5, 100, 100) look like
# (start lr, end lr, number of steps) — confirm against LRFinderCallback.
# NOTE(review): the same learner is fitted afterwards; verify that
# find_learning_rate leaves model/optimizer in a usable state.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# Show what the learning-rate sweep recorded, on a logarithmic y axis.
lr_find_callback.plot(yscale="log")

In [None]:
# One-cycle learning-rate schedule that peaks at `learning_rate` and spans
# n_epochs passes over the training loader.
n_epochs = 5
learning_rate = 0.1
steps_per_epoch = len(dl_train)

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    steps_per_epoch=steps_per_epoch,
    epochs=n_epochs,
)

# Hand the scheduler to the learner through the project's callback wrapper
# (presumably stepping it once per batch — confirm in rnnm_learner).
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train for n_epochs on dl_train, with dl_valid passed as validation loader.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot what the TrainLossCallback collected during fitting.
loss_callback.plot()

In [None]:
# Class-1 probability for every validation sample: softmax over the model
# outputs, then take column 1.
# `.cpu()` comes before `.numpy()` because Tensor.numpy() raises for tensors
# that live on an accelerator; predict may return them on `device`. On a
# tensor that is already on the CPU it is a no-op.
probs = (
    learner.predict(dl_valid).detach().cpu().softmax(dim=1).numpy()[:, 1]
)
probs[:3]

In [None]:
# ROC AUC of the predicted class-1 probabilities against the held-out labels.
print(sk_metrics.roc_auc_score(y1, probs))

## TabularModelClassification

In [None]:
# Classification wrapper: input, hidden and output sizes are passed by name
# instead of as one flat list. Batch norm is switched off.
n_hidden = [200, 100]
model = rnnm_tab.TabularModelClassification(
    n_classes=n_classes,
    n_hidden=n_hidden,
    n_features=n_features,
    use_batch_norm=False,
)

In [None]:
# Fresh optimizer, loss and loss recorder for the newly created model.
save_dir = Path("./models")

loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()
callbacks = [loss_callback]

learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# New Learner around the classification model; the keyword settings are
# collected first to keep the constructor call short.
learner_options = dict(
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
    show_epoch_progress=True,
)
learner = rnnm_learner.Learner(model, optimizer, loss, **learner_options)

In [None]:
# Learning-rate sweep on the training loader before the real fit.
# NOTE(review): the positional arguments (1e-5, 100, 100) look like
# (start lr, end lr, number of steps) — confirm against LRFinderCallback.
# NOTE(review): the same learner is fitted afterwards; verify that
# find_learning_rate leaves model/optimizer in a usable state.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# Show what the learning-rate sweep recorded, on a logarithmic y axis.
lr_find_callback.plot(yscale="log")

In [None]:
# One-cycle learning-rate schedule that peaks at `learning_rate` and spans
# n_epochs passes over the training loader.
n_epochs = 5
learning_rate = 0.1
steps_per_epoch = len(dl_train)

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    steps_per_epoch=steps_per_epoch,
    epochs=n_epochs,
)

# Hand the scheduler to the learner through the project's callback wrapper
# (presumably stepping it once per batch — confirm in rnnm_learner).
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train for n_epochs on dl_train, with dl_valid passed as validation loader.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot what the TrainLossCallback collected during fitting.
loss_callback.plot()

In [None]:
# Class-1 probability for every validation sample: softmax over the model
# outputs, then take column 1.
# `.cpu()` comes before `.numpy()` because Tensor.numpy() raises for tensors
# that live on an accelerator; predict may return them on `device`. On a
# tensor that is already on the CPU it is a no-op.
probs = (
    learner.predict(dl_valid).detach().cpu().softmax(dim=1).numpy()[:, 1]
)
probs[:3]

In [None]:
# ROC AUC of the predicted class-1 probabilities against the held-out labels.
print(sk_metrics.roc_auc_score(y1, probs))