<font color='green'>

## Import libraries

</font>

In [2]:
from pathlib import Path
import torch
import torch.nn as nn
from loguru import logger
import gin
import warnings
warnings.simplefilter("ignore", UserWarning)
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import BasePreprocessor

<font color='green'>

**Let's use the same dataset from the previous session i.e. Fashion MNIST**
</font>

In [3]:
# Build the Fashion MNIST factory and ask it for batched, preprocessed data streamers.
batchsize = 64
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
preprocessor = BasePreprocessor()
streamers = fashionfactory.create_datastreamer(
    batchsize=batchsize,
    preprocessor=preprocessor,
)

# Keep both the streamer objects (their length is used for the step counts later on)
# and the batch streams they produce (these are what the trainer consumes).
train, valid = streamers["train"], streamers["valid"]
trainstreamer, validstreamer = train.stream(), valid.stream()

[32m2025-02-17 16:10:49.218[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/sarmad/.cache/mads_datasets/fashionmnist[0m
[32m2025-02-17 16:10:49.219[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /home/sarmad/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


# 1. Adding dropout and normalization layers
Study the pytorch documentation for:
- Dropout https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
- normalization layers https://pytorch.org/docs/stable/nn.html#normalization-layers

Experiment with adding dropout and normalization layers to your model. Some rough guidelines where to add them relative to Linear or Conv2d layers:
- Dropout: after Linear or Conv2d layers. Often added after the last Linear layer *before* the output layer, but could occur more often.
- Normalization layers: right after (blocks of) Linear or Conv2d layers, but before activation functions.

<font color='green'>

Dropout and normalization layers are added to keep the activations in a stable range during training and to help the model learn weights that generalize better.

- Adding a BatchNorm layer after each Conv2d layer and each hidden Linear layer to normalize the layer outputs (activations).

- Adding Dropout after each hidden Linear layer to reduce overfitting. The dropout rate (e.g., 0.5) is the probability of zeroing an activation during training; higher values increase regularization but can slow down learning.

- We will be using `gin-config` to configure our custom CNN model, so the model class has to be made configurable. Please read the following note to understand what is happening in the code.

**Important note: to make a class `gin`-configurable, we have to include two lines of code before the class definition.**
1. **gin.enter_interactive_mode()**
1. **@gin.configurable**

</font>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

# Pick the compute device: Apple MPS first, then the first CUDA GPU, then the CPU.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("Using MPS")
else:
    # No usable MPS backend: fall back to CUDA when present, otherwise to the CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("using cuda" if device == "cuda:0" else "using cpu")


# CNN model with BatchNorm and Dropout.
#
# The two lines in front of the class make it configurable with gin-config:
#   1. gin.enter_interactive_mode()  -- per the gin-config docs this allows a configurable
#      to be re-registered, which is what happens when this notebook cell is run again.
#   2. @gin.configurable             -- registers the class so that its constructor
#      arguments can be supplied from a .gin file.
gin.enter_interactive_mode()
@gin.configurable
class CNN_custom(nn.Module):
    """Three-block CNN classifier with BatchNorm after every conv/hidden linear layer and Dropout in the head.

    Each convolutional block is Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2). The resulting
    feature map is averaged down to 1x1 per channel and classified by a dense head with two
    hidden layers (Linear -> BatchNorm1d -> ReLU -> Dropout) followed by a 10-unit output layer.

    Args:
        filters: number of output channels of the first conv block; blocks two and
            three use ``filters * 2``.
        units1: width of the first hidden linear layer.
        units2: width of the second hidden linear layer.
        input_size: shape of the dummy batch that is pushed through the conv stack once,
            at construction time, to find the spatial size of the final feature map.
        dropout_rate: probability passed to both ``nn.Dropout`` layers.
    """

    def __init__(self, filters, units1, units2, input_size=(32, 1, 28, 28), dropout_rate=0.5) -> None:
        super().__init__()

        self.filters = filters
        self.units1 = units1
        self.units2 = units2
        self.dropout_rate = dropout_rate

        self.convolutions = nn.Sequential(
            nn.Conv2d(1, filters, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(filters),            # normalize before the activation
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(filters, filters * 2, kernel_size=3, stride=1, padding=0),
            nn.BatchNorm2d(filters * 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(filters * 2, filters * 2, kernel_size=3, stride=1, padding=0),
            nn.BatchNorm2d(filters * 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

        # The average-pool kernel covers the whole final feature map, so every channel
        # collapses to a single value and the dense head only needs `filters * 2` inputs.
        activation_map_size = self._conv_test(input_size)
        logger.info(f"Aggregating activation map with size {activation_map_size}")
        self.agg = nn.AvgPool2d(activation_map_size)

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(filters * 2, units1),
            nn.BatchNorm1d(units1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(units1, units2),
            nn.BatchNorm1d(units2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(units2, 10)
        )

    def _conv_test(self, input_size=(32, 1, 28, 28)):
        """Return the (height, width) of the feature map the conv stack produces.

        A dummy all-ones batch of shape ``input_size`` is passed through
        ``self.convolutions``. The pass runs in eval mode and without gradient tracking:
        in training mode every BatchNorm2d layer would fold the statistics of the dummy
        batch into its running mean/variance before any real data has been seen.
        The previous train/eval mode is restored afterwards.
        """
        was_training = self.convolutions.training
        self.convolutions.eval()
        try:
            with torch.no_grad():
                x = self.convolutions(torch.ones(input_size))
        finally:
            self.convolutions.train(was_training)
        return x.shape[-2:]

    def forward(self, x):
        """Run the conv stack, the global average pooling and the dense head; return the logits."""
        x = self.convolutions(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits

# Instantiate the model with explicit hyperparameters and keep it on the CPU.
model = CNN_custom(filters=32, units1=128, units2=64).to('cpu')
# Print a per-layer table of output shapes and parameter counts for a 1x28x28 input.
summary(model, input_size=(1, 28, 28), device="cpu")

[32m2025-02-17 16:10:49.265[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m47[0m - [1mAggregating activation map with size torch.Size([2, 2])[0m


using cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
       BatchNorm2d-2           [-1, 32, 28, 28]              64
              ReLU-3           [-1, 32, 28, 28]               0
         MaxPool2d-4           [-1, 32, 14, 14]               0
            Conv2d-5           [-1, 64, 12, 12]          18,496
       BatchNorm2d-6           [-1, 64, 12, 12]             128
              ReLU-7           [-1, 64, 12, 12]               0
         MaxPool2d-8             [-1, 64, 6, 6]               0
            Conv2d-9             [-1, 64, 4, 4]          36,928
      BatchNorm2d-10             [-1, 64, 4, 4]             128
             ReLU-11             [-1, 64, 4, 4]               0
        MaxPool2d-12             [-1, 64, 2, 2]               0
        AvgPool2d-13             [-1, 64, 1, 1]               0
          Flatten-14        

# 2. Adding convolutional and pooling layers
In previous lessons, you started to experiment with your model.
You might have tested the impact of the amount of units, the depth of layers and different learning rates.

This lesson, we have added some new types of layers: convolutional and pooling layers.
Experiment with adding these new layers.
<font color='green'>

**A pooling layer reduces the size of the feature maps by keeping only the most important information, usually by taking the maximum value (max pooling) or the average value (average pooling) in a small window. We use it to make the model faster, reduce memory usage, and make the model more stable by keeping only the key features. It helps to reduce overfitting and makes the model better at recognizing patterns even if they shift a little. Without pooling, the model would be larger, slower, and more sensitive to small changes in the input, which can make it less accurate. `nn.MaxPool2d(kernel_size=2)` adds a max pooling layer that takes the maximum value from each 2x2 window of the feature map.**

</font>

Also, have a look at the `ModuleList`: https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html#modulelist
It can be really useful to create a list of layers from a configfile, and then use that list to create your model.
Instead of just adding a single layer, you could also add a block of layers (eg a Conv2d layer, followed by a ReLU layer, followed by a BatchNorm2d layer, followed by a MaxPool2d layer) and repeat that in a loop, adding it to the `ModuleList`.

<font color='green'>

**For easy readability of the code, we can include `ModuleList` to combine the blocks of layers into one, just like a list. This helps in organizing multiple layers or blocks in a structured way, making the model definition cleaner and easier to manage.**

</font>

In [5]:
# CNN model whose convolutional blocks are built from a config list and kept in an nn.ModuleList
class CNN_ModuleList(nn.Module):
    """CNN classifier with a configurable number of convolutional blocks.

    Every entry of ``conv_layers_config`` becomes one block of
    Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2). The final feature map is averaged
    down to 1x1 per channel and classified by a dense head with two hidden layers
    (Linear -> BatchNorm1d -> ReLU -> Dropout) followed by a 10-unit output layer.
    """

    def __init__(self, conv_layers_config, units1, units2, input_size=(32, 1, 28, 28), dropout_rate=0.5):
        """
        conv_layers_config: List defining number of conv layers and filters per layer.
        Example: [(filters, kernel_size, stride, padding), (filters*2, kernel_size, stride, padding)]
        units1: width of the first hidden linear layer.
        units2: width of the second hidden linear layer.
        input_size: shape of the dummy batch used once, at construction time, to find
            the spatial size of the final feature map.
        dropout_rate: probability passed to both nn.Dropout layers.
        """
        super().__init__()

        # Creating convolutional layers dynamically using ModuleList
        self.convolutions = nn.ModuleList()
        in_channels = 1  # Initial input (grayscale image)

        # Instead of writing out every CNN block by hand, the blocks are appended to a
        # ModuleList, which registers them as submodules and can be iterated in forward().
        for filters, kernel_size, stride, padding in conv_layers_config:
            self.convolutions.append(nn.Sequential(
                nn.Conv2d(in_channels, filters, kernel_size=kernel_size, stride=stride, padding=padding),
                nn.BatchNorm2d(filters),        # normalize before the activation
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)     # halves height and width
            ))
            in_channels = filters  # the next block consumes what this one produces

        # The average-pool kernel covers the whole final feature map, so every channel
        # collapses to a single value and the dense head only needs `in_channels` inputs.
        activation_map_size = self._conv_test(input_size)
        logger.info(f"Aggregating activation map with size {activation_map_size}")
        self.agg = nn.AvgPool2d(activation_map_size)

        # Fully connected layers
        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels, units1),
            nn.BatchNorm1d(units1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(units1, units2),
            nn.BatchNorm1d(units2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(units2, 10)
        )

    def _conv_test(self, input_size=(32, 1, 28, 28)):
        """Return the (height, width) of the feature map the conv blocks produce.

        A dummy all-ones batch of shape ``input_size`` is passed through every block.
        The pass runs in eval mode and without gradient tracking: in training mode
        every BatchNorm2d layer would fold the statistics of the dummy batch into its
        running mean/variance before any real data has been seen. The previous
        train/eval mode is restored afterwards.
        """
        was_training = self.convolutions.training
        self.convolutions.eval()
        try:
            with torch.no_grad():
                x = torch.ones(input_size)
                for layer in self.convolutions:
                    x = layer(x)
        finally:
            self.convolutions.train(was_training)
        return x.shape[-2:]

    def forward(self, x):
        """Run all conv blocks in order, then the global average pooling and the dense head; return the logits."""
        for layer in self.convolutions:
            x = layer(x)
        x = self.agg(x)
        logits = self.dense(x)
        return logits


<font color='green'>

**To initialize this CNN model, we first need to define a list that holds the parameters of each block.**

</font>


In [6]:
# One tuple per CNN block, in the order (filters, kernel_size, stride, padding).
conv_layers_config = [
    (32, 3, 1, 1),  # 1st CNN-block
    (64, 3, 1, 0),  # 2nd CNN-block (twice the filters of the first)
    (64, 3, 1, 0),  # 3rd CNN-block
]

# Build the three-block model on the CPU and print its per-layer summary.
model = CNN_ModuleList(
    conv_layers_config=conv_layers_config,
    units1=128,
    units2=64,
)
model = model.to('cpu')
summary(model, input_size=(1, 28, 28), device="cpu")

[32m2025-02-17 16:10:49.290[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mAggregating activation map with size torch.Size([2, 2])[0m


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
       BatchNorm2d-2           [-1, 32, 28, 28]              64
              ReLU-3           [-1, 32, 28, 28]               0
         MaxPool2d-4           [-1, 32, 14, 14]               0
            Conv2d-5           [-1, 64, 12, 12]          18,496
       BatchNorm2d-6           [-1, 64, 12, 12]             128
              ReLU-7           [-1, 64, 12, 12]               0
         MaxPool2d-8             [-1, 64, 6, 6]               0
            Conv2d-9             [-1, 64, 4, 4]          36,928
      BatchNorm2d-10             [-1, 64, 4, 4]             128
             ReLU-11             [-1, 64, 4, 4]               0
        MaxPool2d-12             [-1, 64, 2, 2]               0
        AvgPool2d-13             [-1, 64, 1, 1]               0
          Flatten-14                   

# 3. Improve your pipeline
In addition to new layers, we have expanded our logging tools with MLFlow, so we currently can choose between gin-config, tensorboard and MLFlow.

Expand your training pipeline you started in the previous lesson such that:

- you can switch between models by changing a config file
<font color='green'>

**Answer: Instead of hardcoding architectures & hyperparameters, store them in a config file. You can check the parameters in the `model.gin` file.**

</font>
- you can test different hyperparameters by changing a config file
<font color='green'>

**Answer: The `model.gin` file contains the parameters for our `CNN_custom` model, so changing a hyperparameter only requires editing that file. After modifying `model.gin`, you have to parse the file again in the notebook so that the changed parameters are picked up.**

</font>
- you automatically log settings: model picked, hyperparameters, metrics, etc. : use either gin-config, tensorboard or MLFlow to log that, or a combination, whatever you prefer.
<font color='green'>

**Answer: All experiments are logged in the `modellogs` directory, in subdirectories named by timestamp. Each experiment directory contains 2 files: `saved_config.gin` (the experiment configuration) and an `events` file (the TensorBoard logs of the loss and accuracy values during training).**

</font>
- Important: doing a master means you don't just start engineering a pipeline, but you need to reflect. Why do you see the results you see? What does this mean, considering the theory? Write down lessons learned and reflections, based on experimental results.
<font color='green'>

**Answer: In this exercise, we are advised to use a CNN model to train on the `FashionMNIST` dataset. The main aim of this exercise is to develop a training pipeline. First, we develop a custom CNN architecture class, `CNN_custom`, and add `BatchNorm` and `dropout` layers to regularize the model so that it generalizes better. `CNN_custom` is configurable with `gin-config`, so we added a config file called `model.gin` with all the parameters required for our CNN architecture. The file can easily be edited to change the parameters of the model.**

**Now, let's reflect on the results we obtained from this exercise. Adding regularization generally helps the model generalize, because it keeps the model from overfitting. The `BatchNorm` layer normalizes the activations of a layer over each batch, which keeps them in a stable range and helps against exploding gradients; exploding gradients can push the optimizer away from a minimum, so that the model stops learning and the loss increases. The `dropout` layer adds a regularization effect that restrains the model from overfitting on the training examples.**

**From the results, we can observe that the added layers help the model learn faster than the model in the `02_convolutions` notebook. The final accuracy is almost the same, but the learning curve is improved for our new model. This may mean that we have maxed out the accuracy for the provided dataset, and that adding more layers might not have a large impact on the final accuracy.**

</font>

- continuously improve your code: 
    - clean up your experimental environment, such that it doesnt get too messy
    - automate the boring stuff: use a Makefile, use configfiles, automate logging, etc.
    - use git: commit your changes often and with descriptive messages
    - separate code for pipelines, configs, models, modeltraining and results.

<font color='green'>

**Instead of hardcoding architectures & hyperparameters, store them in a config file `model.gin`. You can check the parameters in the `model.gin` file.**

</font>


In [7]:
import torch.optim as optim
import gin

# Parse model.gin so its bindings are available when the gin-configurable model is built.
# Re-run this cell after editing model.gin so that the changed values are picked up.
gin.parse_config_file("model.gin")

ParsedConfigFileIncludesAndImports(filename='model.gin', imports=['gin.torch.external_configurables'], includes=[])

In [8]:
import torch.optim as optim
from mltrainer import Trainer, TrainerSettings, ReportTypes, metrics
import mlflow
import mlflow.pytorch
from torch.utils.tensorboard import SummaryWriter

<font color='green'>

**Now we can use the parameters defined in the config file to initialize the model. Hence, by changing only the config file we can change the parameters of the model. This is more convenient than working with hardcoded parameters.**

</font>

In [9]:
# Build CNN_custom without arguments: filters, units1 and units2 have no defaults in the
# class signature, so they have to be supplied by the gin bindings parsed from model.gin.
model = CNN_custom().to('cpu')

# Use Adam; the optimizer class itself (not an instance) is handed to the Trainer below.
optimizer = optim.Adam

# Cross-entropy loss for the 10-way classification, plus accuracy as the reported metric.
loss_fn = nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

[32m2025-02-17 16:10:49.367[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m47[0m - [1mAggregating activation map with size torch.Size([2, 2])[0m


In [10]:
# Training configuration shared by every Trainer created in this notebook.
settings = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir="modellogs/",  # the Trainer logs into a timestamped subdirectory of this folder
    # Step counts are taken from the streamer lengths -- presumably the number of
    # batches per pass over the data (the progress bars show 937 train steps per epoch).
    train_steps=len(streamers["train"]),
    valid_steps=len(streamers["valid"]),
    # Report to TensorBoard and save the gin configuration alongside the run.
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
)

In [11]:
# Wire the gin-configured model, the shared settings and the data streams into a Trainer.
# The optimizer and the scheduler are passed as classes, not as instances.
# NOTE(review): the `device` detected in the model-definition cell is not used here;
# training is pinned to the CPU even when that cell reports an accelerator.
trainer = Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optimizer,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device='cpu',
)

# Run the training/validation loop.
trainer.loop()

[32m2025-02-17 16:10:49.380[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to modellogs/20250217-161049[0m
[32m2025-02-17 16:10:49.869[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m72[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:45<00:00, 20.47it/s]
[32m2025-02-17 16:11:38.377[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 0.5744 test 0.3426 metric ['0.8775'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:44<00:00, 20.92it/s]
[32m2025-02-17 16:12:26.069[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 0.3407 test 0.2924 metric ['0.8971'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [00:46<00:00, 20.28it/s]
[32m2025-02-17 16:13:15.423[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[3

<font color='green'>

We can observe that the added layers help the model learn faster than the model in the `02_convolutions` notebook. The final accuracy is almost the same, but the learning curve is improved for our new model. This may mean that we have maxed out the accuracy for the provided dataset, and that adding more layers might not have a large impact on the final accuracy.
</font>

<font color='green'>

#### Extra experiment ...

**As we saw in our previous experiment, the accuracy is around 91%. Can we increase it by adding more layers and using more filters per layer? To find out, let's perform another experiment.**

**Let's again use the `CNN_ModuleList` class that we defined previously to initialize the CNN architecture. The configuration of the model is defined below. In total we have 4 CNN-blocks, followed by 2 hidden Linear layers and an output layer that together act as the classifier.**

**Note: this model will take some time to train. On my PC, it took around 20mins to train.**
</font>


In [18]:
# One tuple per CNN block, in the order (filters, kernel_size, stride, padding).
conv_layers_config = [
    (64, 3, 1, 1),        # 1st CNN-block
    (256, 3, 1, 'same'),  # 2nd CNN-block
    (256, 3, 1, 'same'),  # 3rd CNN-block
    (128, 3, 1, 'same'),  # 4th CNN-block
]

# Initialize the wider, four-block model on the CPU and print its per-layer summary.
model = CNN_ModuleList(
    conv_layers_config=conv_layers_config,
    units1=512,
    units2=256,
)
model = model.to('cpu')
summary(model, input_size=(1, 28, 28), device="cpu")

[32m2025-02-17 16:53:38.244[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m25[0m - [1mAggregating activation map with size torch.Size([1, 1])[0m


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 28, 28]             640
       BatchNorm2d-2           [-1, 64, 28, 28]             128
              ReLU-3           [-1, 64, 28, 28]               0
         MaxPool2d-4           [-1, 64, 14, 14]               0
            Conv2d-5          [-1, 256, 14, 14]         147,712
       BatchNorm2d-6          [-1, 256, 14, 14]             512
              ReLU-7          [-1, 256, 14, 14]               0
         MaxPool2d-8            [-1, 256, 7, 7]               0
            Conv2d-9            [-1, 256, 7, 7]         590,080
      BatchNorm2d-10            [-1, 256, 7, 7]             512
             ReLU-11            [-1, 256, 7, 7]               0
        MaxPool2d-12            [-1, 256, 3, 3]               0
           Conv2d-13            [-1, 128, 3, 3]         295,040
      BatchNorm2d-14            [-1, 12

In [19]:
# Train the four-block model. `settings`, `loss_fn`, `optimizer` and the two data streams
# are the notebook globals created in the earlier cells, so those cells have to be run first.
# NOTE(review): as in the first run, training is pinned to the CPU via device='cpu'.
trainer = Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optimizer,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device='cpu',
)

# Run the training/validation loop.
trainer.loop()

[32m2025-02-17 16:53:39.712[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to modellogs/20250217-165339[0m
[32m2025-02-17 16:53:39.713[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m72[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [01:43<00:00,  9.03it/s]
[32m2025-02-17 16:55:28.597[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 0.4187 test 0.3111 metric ['0.8862'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [01:48<00:00,  8.64it/s]
[32m2025-02-17 16:57:22.081[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 0.2732 test 0.2941 metric ['0.8907'][0m
100%|[38;2;30;71;6m██████████[0m| 937/937 [01:44<00:00,  8.98it/s]
[32m2025-02-17 16:59:12.006[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[3

<font color='green'>

**The above result suggests that even after increasing the number of CNN layers, we did not achieve a significant performance boost. This new model took almost double the time to train, but was still only able to achieve about 92% accuracy. Hence, we can conclude that we have maxed out the performance of the model on the `FashionMNIST` dataset.**
</font>
