# Multi Layer Perceptron (MLP)

> Simple feedforward Multilayer perceptron model

In [None]:
#| default_exp models.mlp

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [None]:
#| export
import torch.nn as nn
import torch
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

from lightning import LightningModule

from hydra.utils import instantiate
from omegaconf import OmegaConf
from matplotlib import pyplot as plt

from nimrod.utils import get_device
from nimrod.image.datasets import ImageDataset, MNISTDataModule
from nimrod.utils import logger
from nimrod.models.core import Classifier
# torch.set_num_interop_threads(1)

# from IPython.core.debugger import set_trace

In [None]:
# Render the API docs of the datamodule class used by the training examples below.
show_doc(MNISTDataModule)

---

[source](https://github.com/slegroux/nimrod/blob/main/nimrod/image/datasets.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### MNISTDataModule

>      MNISTDataModule (data_dir:str='~/Data/',
>                       train_val_test_split:List[float]=[0.8, 0.1, 0.1],
>                       batch_size:int=64, num_workers:int=0,
>                       pin_memory:bool=False, persistent_workers:bool=False)

*A DataModule standardizes the training, val, test splits, data preparation and transforms. The main advantage is
consistent data splits, data preparation and transforms across models.

Example::

    import lightning.pytorch as L
    import torch.utils.data as data
    from pytorch_lightning.demos.boring_classes import RandomDataset

    class MyDataModule(L.LightningDataModule):
        def prepare_data(self):
            # download, IO, etc. Useful with shared filesystems
            # only called on 1 GPU/TPU in distributed
            ...

        def setup(self, stage):
            # make assignments here (val/train/test split)
            # called on every process in DDP
            dataset = RandomDataset(1, 100)
            self.train, self.val, self.test = data.random_split(
                dataset, [80, 10, 10], generator=torch.Generator().manual_seed(42)
            )

        def train_dataloader(self):
            return data.DataLoader(self.train)

        def val_dataloader(self):
            return data.DataLoader(self.val)

        def test_dataloader(self):
            return data.DataLoader(self.test)

        def on_exception(self, exception):
            # clean up state after the trainer faced an exception
            ...

        def teardown(self):
            # clean up state after the trainer stops, delete files...
            # called on every process in DDP
            ...*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data_dir | str | ~/Data/ | path to source data dir |
| train_val_test_split | List | [0.8, 0.1, 0.1] | train val test % |
| batch_size | int | 64 | size of compute batch |
| num_workers | int | 0 | num_workers equal 0 means that it’s the main process that will do the data loading when needed, num_workers equal 1 is the same as any n, but you’ll only have a single worker, so it might be slow |
| pin_memory | bool | False | If you load your samples in the Dataset on CPU and would like to push it during training to the GPU, you can speed up the host to device transfer by enabling pin_memory. This lets your DataLoader allocate the samples in page-locked memory, which speeds-up the transfer |
| persistent_workers | bool | False |  |

## Basic model

In [None]:
#| export
class MLP(nn.Module):
    """Two-layer feedforward perceptron: Linear -> Dropout -> ReLU -> Linear.

    The network acts on the last dimension of its input, which must be of size
    `n_in`; any leading dimensions are carried through to the output unchanged.
    """
    def __init__(
                self,
                n_in:int=784, # input dimension e.g. (H,W) for image
                n_h:int=64, # hidden dimension
                n_out:int=10, # output dimension (= number of classes for classification)
                dropout:float=0.2, # dropout probability applied to the hidden layer
                **kwargs # forwarded to the next `__init__` in the MRO
                ) -> None:
        init_msg = "MLP initi: n_in: {}, n_h: {}, n_out: {}, dropout: {}".format(n_in, n_h, n_out, dropout)
        logger.info(init_msg)
        # The parent constructor must run before any submodule is assigned.
        super().__init__(**kwargs)
        # Positions inside the Sequential are part of the state_dict keys
        # ("layers.0.*" and "layers.3.*"), so the layer order is kept as is.
        self.layers = nn.Sequential(
            nn.Linear(n_in, n_h),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(n_h, n_out),
        )

    def forward(self, x: torch.Tensor # dim (B, H*W)
                ) -> torch.Tensor:
        """Map a batch of flattened inputs to `n_out` raw scores per sample."""
        scores = self.layers(x)
        return scores

### Usage

In [None]:
# Render the generated API docs (signature and parameter table) for MLP.
show_doc(MLP)

---

[source](https://github.com/slegroux/nimrod/blob/main/nimrod/models/mlp.py#L30){target="_blank" style="float:right; font-size:smaller"}

### MLP

>      MLP (n_in:int=784, n_h:int=64, n_out:int=10, dropout:float=0.2, **kwargs)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in
a tree structure. You can assign the submodules as regular attributes::

    import torch.nn as nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 20, 5)
            self.conv2 = nn.Conv2d(20, 20, 5)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their
parameters converted too when you call :meth:`to`, etc.

.. note::
    As per the example above, an ``__init__()`` call to the parent class
    must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or
                evaluation mode.
:vartype training: bool*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| n_in | int | 784 | input dimension e.g. (H,W) for image |
| n_h | int | 64 | hidden dimension |
| n_out | int | 10 | output dimension (= number of classes for classification) |
| dropout | float | 0.2 |  |
| kwargs |  |  |  |
| **Returns** | **None** |  |  |

In [None]:
# Smoke test: a random batch of 5 flattened 28x28 inputs goes in,
# one row of 10 scores per input comes out.
image = torch.rand((5, 28*28))
# `mlp` is reused further down in the notebook as the model for the manual training setup.
mlp = MLP(n_in=28*28, n_h=64, n_out=10)
out = mlp(image)
print(out.shape)

2024-12-13 09:27:34,175 - INFO - MLP initi: n_in: 784, n_h: 64, n_out: 10, dropout: 0.2


torch.Size([5, 10])


### Basic training
#### Data Module
The data module is instantiated from a YAML config file (see also `recipes/image/mnist`). To inspect the config:

```bash
cat ../config/image/data/mnist.yaml
```

In [None]:
# load from config file
cfg = OmegaConf.load('../config/image/data/mnist.yaml')
print(cfg.datamodule)
# cfg.datamodule.num_workers = 1
# Build the object named by the config's `_target_`, then run its prepare/setup
# steps before the data splits are accessed below.
datamodule = instantiate(cfg.datamodule)
datamodule.prepare_data()
datamodule.setup()
# Peek at the first test sample: index 0 is the image, index 1 the label.
x = datamodule.data_test[0][0] # (C, H, W)
print(len(datamodule.data_test))
label = datamodule.data_test[0][1] #(int)
print("original shape (C,H,W): ", x.shape)
# Collapsing H and W gives the flat per-image layout the MLP consumes (n_in = H*W).
print("reshape (C,HxW): ", x.view(x.size(0), -1).shape)
# Row 1 of channel 0.
print(x[0][1])

{'_target_': 'nimrod.image.datasets.MNISTDataModule', 'data_dir': '../data/image', 'train_val_test_split': [0.8, 0.1, 0.1], 'batch_size': 64, 'num_workers': 1, 'pin_memory': False, 'persistent_workers': False}
7000
original shape (C,H,W):  torch.Size([1, 28, 28])
reshape (C,HxW):  torch.Size([1, 784])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


In [None]:
# using default Pytorch datasets
# NOTE: in the cells shown here these two datasets are only referenced by the
# commented-out loaders below; the loaders actually used come from the datamodule.
train_dataset = MNIST("../data/image", train=True, download=True, transform=ToTensor())
test_dataset = MNIST("../data/image", train=False, download=True, transform=ToTensor())

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# using nimrod datamodule
# Requires the datamodule cell above to have run (prepare_data + setup).
train_loader = datamodule.train_dataloader()
val_loader = datamodule.val_dataloader()
test_loader = datamodule.test_dataloader()

In [None]:
# Check which dataset class backs the test split.
type(datamodule.data_test)

torch.utils.data.dataset.Subset

#### Hardware acceleration

In [None]:
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# CI runs on a CPU-only instance, so the device is pinned to CPU here.
device = torch.device("cpu")
# `Module.to` returns the module itself: `model` and `mlp` are the same object.
model = mlp.to(device)

#### Loss & optimizer setup

In [None]:
# Cross-entropy on the raw class scores; Adam updates the parameters of the plain MLP (`model`).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

#### Training loop

In [None]:

# %%time
# n_epochs = 1
# for epoch in range(n_epochs):
#     model.train()
#     for images, labels in train_loader:
#         images = images.view(-1, 28*28)
#         images = images.to(device)
#         labels = labels.to(device)
#         outputs = model(images)
#         loss = criterion(outputs, labels)
#         optimizer.zero_grad()  # clear gradients left over from the previous step
#         loss.backward()
#         optimizer.step()

#     model.eval()
#     with torch.no_grad():
#         correct = 0
#         total = 0
#         for images, labels in test_loader:
#             # model expects input (B,H*W)
#             images = images.view(-1, 28*28).to(device)
#             images = images.to(device)
#             labels = labels.to(device)
#             # Pass the input through the model
#             outputs = model(images)
#             # Get the predicted labels
#             _, predicted = torch.max(outputs.data, 1)

#             # Update the total and correct counts
#             total += labels.size(0)
#             correct += (predicted == labels).sum()

#         # Print the accuracy
#         print(f"Epoch {epoch + 1}: Accuracy = {100 * correct / total:.2f}%")


## Integrated model + training settings

In [None]:
#| export

class MLP_X(Classifier, MLP, LightningModule):
    """`MLP` network combined with the `Classifier` base and `LightningModule`.

    Input batches are flattened to (B, features) before being fed to the network.
    `self.loss`, `self.accuracy` and `self.lr` are not set in this class; they are
    expected to be provided by a base class (presumably `Classifier` -- confirm in
    nimrod.models.core).
    """
    def __init__(self,
                n_in:int, # input dimension e.g. (H,W) for image
                n_h:int, # hidden dimension
                n_out:int, # output dimension (= number of classes for classification)
                dropout:float=0.2, # dropout
                lr:float=1e-3 # learning rate
        ):
        
        logger.info("MLP_PL init: n_in: {}, n_h: {}, n_out: {}, dropout: {}, lr: {}".format(n_in, n_h, n_out, dropout, lr))        
        # Cooperative init: all keywords are handed to the first base in the MRO,
        # and each base is expected to consume its own and pass the rest along.
        super().__init__(num_classes=n_out, lr=lr, n_in=n_in, n_h=n_h, n_out=n_out, dropout=dropout)


    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters, using `self.lr`."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
    
    def _step(self, batch, batch_idx):
        """Compute loss and accuracy for one labeled `(inputs, targets)` batch.

        Returns a `(loss, acc)` tuple.
        """
        x, y = batch
        # reshape (rather than view) also accepts non-contiguous inputs
        x = x.reshape(x.size(0), -1)
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y)
        acc = self.accuracy(y_hat, y)
        return loss, acc
    
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the predicted class index for every sample in `batch`.

        `batch` may be an `(inputs, targets, ...)` list/tuple, in which case only
        the inputs are used, or a bare input tensor (unlabeled prediction data).
        """
        # Unpacking `x, y = batch` would crash on unlabeled data, or silently
        # split a bare tensor whose batch size happens to be 2.
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        x = x.reshape(x.size(0), -1)
        y_hat = self.forward(x)
        return y_hat.argmax(dim=1)

In [None]:
mlp_pl = MLP_X(n_in=28*28, n_h=128, n_out=10, dropout=0.1, lr=1e-3)
# Show the method resolution order, i.e. the order in which the cooperative
# `super().__init__` chain visits the base classes.
MLP_X.mro()

2024-12-13 09:27:34,505 - INFO - MLP_PL init: n_in: 784, n_h: 128, n_out: 10, dropout: 0.1, lr: 0.001
2024-12-13 09:27:34,505 - INFO - Classifier init: num_classes: 10, lr: 0.001
2024-12-13 09:27:34,506 - INFO - MLP initi: n_in: 784, n_h: 128, n_out: 10, dropout: 0.1


[__main__.MLP_X,
 nimrod.core.Classifier,
 abc.ABC,
 __main__.MLP,
 lightning.pytorch.core.module.LightningModule,
 lightning.fabric.utilities.device_dtype_mixin._DeviceDtypeModuleMixin,
 lightning.pytorch.core.mixins.hparams_mixin.HyperparametersMixin,
 lightning.pytorch.core.hooks.ModelHooks,
 lightning.pytorch.core.hooks.DataHooks,
 lightning.pytorch.core.hooks.CheckpointHooks,
 torch.nn.modules.module.Module,
 object]

In [None]:
# #| export

# class MLP_PL(MLP, LightningModule):
#     def __init__(self,
#                 n_in:int, # input dimension e.g. (H,W) for image
#                 n_h:int, # hidden dimension
#                 n_out:int, # output dimension (= number of classes for classification)
#                 dropout:float=0.2, # dropout factor
#                 lr:float=1e-3, # learning rate
#                 ):

#         super().__init__(n_in, n_h, n_out, dropout)

#         self.save_hyperparameters()
#         self.loss = nn.CrossEntropyLoss()
#         self.accuracy = Accuracy(task="multiclass", num_classes=n_out)
#         self.lr = lr

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
#         return optimizer
    
#     def _step(self, batch, batch_idx):
#         x, y = batch
#         x = x.view(x.size(0), -1)
#         y_hat = self.forward(x)
#         loss = self.loss(y_hat, y)
#         acc = self.accuracy(y_hat, y)
#         return loss, acc

#     def training_step(self, batch, batch_idx):
#         loss, acc = self._step(batch, batch_idx)
#         metrics = {"train/loss": loss, "train/acc": acc}
#         self.log_dict(metrics, on_epoch=True)
#         return loss
    
#     def validation_step(self, batch, batch_idx, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True):
#         loss, acc = self._step(batch, batch_idx)
#         metrics = {"val/loss":loss, "val/acc": acc}
#         self.log_dict(metrics, on_step=on_step, on_epoch=on_epoch, sync_dist=sync_dist)
    
#     def test_step(self, batch, batch_idx, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True):
#         loss, acc = self._step(batch, batch_idx)
#         metrics = {"test/loss":loss, "test/acc": acc}
#         self.log_dict(metrics, on_step=on_step, on_epoch=on_epoch, sync_dist=sync_dist)

#     def predict_step(self, batch, batch_idx, dataloader_idx=0):
#         x, y = batch
#         x = x.view(x.size(0), -1)
#         y_hat = self.forward(x)
#         return y_hat.argmax(dim=1)


### Usage

In [None]:
# Positional arguments are n_in, n_h, n_out.
mlp_pl = MLP_X(28*28, 64, 10, dropout=0.2, lr=1e-3)
# print(mlp_pl.training_step)
# from pprint import pprint 
# pprint(mlp_pl.__dict__)
# Random batch with an extra singleton dimension: (B, 1, H*W).
b = torch.rand((5,1, 28*28))
# Calling the module directly does not flatten the input (only `_step` and
# `predict_step` do), so the singleton dimension is kept in the output.
print(mlp_pl(b).shape)

2024-12-13 09:27:34,558 - INFO - MLP_PL init: n_in: 784, n_h: 64, n_out: 10, dropout: 0.2, lr: 0.001
2024-12-13 09:27:34,559 - INFO - Classifier init: num_classes: 10, lr: 0.001
2024-12-13 09:27:34,559 - INFO - MLP initi: n_in: 784, n_h: 64, n_out: 10, dropout: 0.2


torch.Size([5, 1, 10])


In [None]:
# move model and data to hardware
# `device` comes from the "Hardware acceleration" cell above.
mlp_pl = mlp_pl.to(device)

# Inputs must be moved to the same device as the model before the forward pass.
b = b.to(device)
y_hat = mlp_pl(b)
print(y_hat.shape)

torch.Size([5, 1, 10])


In [None]:
# real data
# Move the batch to the model's device, as is done for the synthetic batch above.
batch = [t.to(device) for t in next(iter(test_loader))]
print(batch[0].shape, batch[1].shape)
# Inference: switch dropout off and skip building an autograd graph.
# Without eval() the predictions are randomly perturbed by the active dropout layer.
mlp_pl.eval()
with torch.no_grad():
    print(mlp_pl.predict_step(batch, 0))

  torch.set_num_threads(1)


torch.Size([64, 1, 28, 28]) torch.Size([64])
tensor([7, 2, 8, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 3, 5, 9, 5, 2, 2, 2, 4, 2, 2, 8,
        4, 2, 6, 2, 7, 2, 4, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 5, 6, 2, 7, 2, 8,
        2, 8, 2, 2, 2, 2, 4, 6, 7, 2, 2, 7, 3, 2, 5, 5])


In [None]:
# Learning rate exposed on the module; `configure_optimizers` reads this attribute.
print(mlp_pl.lr)

0.001


## Integrated trainer

```python
from lightning import Trainer

trainer = Trainer(accelerator='mps', devices=1, max_epochs=1)
trainer.fit(mlp_pl, datamodule=datamodule)
```

## Training scripts with config file 

For an example script that trains the model from configurable YAML files, see the `recipes` folder:

```bash
cd recipes/image/mnist
python train.py trainer.max_epochs=20 trainer.accelerator='mps' datamodule.num_workers=0
```

In [None]:
#| hide
# Export the `#| export` cells of the project's notebooks to their Python modules.
import nbdev; nbdev.nbdev_export()