In [1]:
import os.path as osp
from functools import partial
from pathlib import Path
from typing import Callable, List, Optional, Union

import lightning as L
from loguru import logger
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import os
from torchvision import datasets, transforms
from pathlib import Path
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
from torchvision import models
import torchvision.transforms as transforms
import torchmetrics

In [5]:
class MRDataModule(L.LightningDataModule):
    """Data module for the Mon Reader model."""

    def __init__(
        self,
        data_dir: Path | str,
        transform: Optional[Callable] = transforms.Compose(
            [
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        ),
        batch_size: int = 32,
    ):
        """Initialization.

        Args:
            data_dir (str): The root directory of the dataset.
        """
        super().__init__()
        self.save_hyperparameters()

    def setup(self, stage=None) -> None:
        """Setup.
        Args:
          stage: Optional[str], default=None. The stage of the setup.
        """
        # Loading the training dataset. We need to split it into a training and validation part
        train_set = datasets.ImageFolder(
            root=f"{self.hparams['data_dir']}/training",
            transform=self.hparams.transform,
        )
        self.train_set, self.val_set = data.random_split(train_set, [0.75, 0.25])
        # Loading the test set
        self.test_set = datasets.ImageFolder(
            root=f"{self.hparams['data_dir']}/testing", transform=self.hparams.transform
        )

    def train_dataloader(self):
        """Return the training dataloader.

        Returns:
            DataLoader: The training dataloader.
        """
        return data.DataLoader(
            self.train_set, batch_size=self.hparams["batch_size"], shuffle=True
        )

    def val_dataloader(self):
        """Return the validation dataloader.

        Returns:
            DataLoader: The validation dataloader.
        """
        return data.DataLoader(
            self.val_set, batch_size=self.hparams["batch_size"], shuffle=False
        )

    def test_dataloader(self):
        """Return the test dataloader.

        Returns:
            DataLoader: The test dataloader.
        """
        return data.DataLoader(
            self.test_set, batch_size=self.hparams["batch_size"], shuffle=False
        )


class MRNet(L.LightningModule):
    def __init__(self, model_params, optimizer_hparams):
        """CIFARModule.

        Args:
            model_name: Name of the model/CNN to run. Used for creating the model (see function below)
            model_hparams: Hyperparameters for the model, as dictionary.
            optimizer_hparams: Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
        """
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        # Create model
        self.model = create_model(**model_params)
        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()
        # Metrics
        self.metrics = torchmetrics.MetricCollection(
            {
                "f1": torchmetrics.F1Score(
                    num_classes=model_params["num_classes"], task="binary"
                ),
                "accuracy": torchmetrics.Accuracy(
                    num_classes=model_params["num_classes"], task="binary"
                ),
            }
        )

    def forward(self, imgs):
        # Forward function
        return self.model(imgs)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams)
        # We will reduce the learning rate by 0.1 after 100 and 150 epochs
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[50, 100], gamma=0.1
        )
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        # "batch" is the output of the training data loader.
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        self._compute_metrics(labels, preds, "train")
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs)
        # By default logs it per epoch (weighted average over batches)
        self._compute_metrics(labels, preds, "val")

    def test_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs)
        # By default logs it per epoch (weighted average over batches), and returns it afterwards
        self._compute_metrics(labels, preds, "test")

    def _compute_metrics(self, labels, preds, stage):
        metric_results = self.metrics(preds.argmax(dim=-1), labels)
        for name, result in metric_results.items():
            self.log(f"{stage}_{name}", result, on_epoch=True)


# Models


class CustomMRNet(nn.Module):
    def __init__(self):
        super(CustomMRNet, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.classifier = nn.Sequential(
            nn.Flatten(), nn.Linear(32 * 56 * 56, 128), nn.ReLU(), nn.Linear(128, 2)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x


def create_model(model_name: str, num_classes: int, **kwargs):
    # Choose a pre-trained model from torchvision
    if model_name == "custom":
        model = CustomMRNet()
    else:
        model = models.get_model(
            model_name, weights=None, num_classes=num_classes, **kwargs
        )
        model.train()
    # print(model)
    # # Replace the last fully connected layer with a new one for binary classification
    # num_features = model.fc.in_features
    # model.fc = nn.Linear(num_features, num_classes)

    return model

In [10]:
def train_model(
    data_params,
    model_params,
    optimizer_hparams,
    checkpoint_path: Optional[str] = None,
    num_epochs: int = 1,
    **kwargs,
):
    """Train model.

    Args:
        model_name: Name of the model you want to run. Is used to look up the class in "model_dict"
        save_name (optional): If specified, this name will be used for creating the checkpoint and logging directory.
    """
    L.seed_everything(42)

    data = MRDataModule(**data_params)
    data.setup()
    print(f"Train --->{len(data.train_set)}")
    print(f"Val --->{len(data.val_set)}")
    print(f"Test --->{len(data.test_set)}")
    # Create a PyTorch Lightning trainer with the generation callback
    trainer = L.Trainer(
        default_root_dir=os.path.join(
            checkpoint_path, model_params["model_name"]
        ),  # Where to save models
        # We run on a single GPU (if possible)
        accelerator="auto",
        devices=1,
        # How many epochs to train for if no patience is set
        max_epochs=num_epochs,
        # limit_train_batches= 2, # FOR DEBUG
        # limit_val_batches = 2, # FOR DEBUG
        # limit_test_batches= 2, #FOR DEBUG
        callbacks=[
            ModelCheckpoint(
                save_weights_only=True, mode="max", monitor="val_accuracy"
            ),  # Save the best checkpoint based on the maximum val_acc recorded. Saves only weights and not optimizer
            LearningRateMonitor("epoch"),
        ],  # Log learning rate every epoch
    )
    trainer.logger._default_hp_metric = (
        None  # Optional logging argument that we don't need
    )

    # Check whether pretrained model exists. If yes, load it and skip training
    model_name = model_params["model_name"]
    pretrained_filename = os.path.join(checkpoint_path, f"{model_name}.ckpt")
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        # Automatically loads the model with the saved hyperparameters
        model = MRNet.load_from_checkpoint(pretrained_filename)
    else:
        L.seed_everything(42)  # To be reproducible
        model = MRNet(model_params, optimizer_hparams)
        trainer.fit(model, data)
        model = MRNet.load_from_checkpoint(
            trainer.checkpoint_callback.best_model_path
        )  # Load best checkpoint after training

    # Test best model
    test_result = trainer.test(model, data, verbose=False)
    return {"model": model_name, **test_result[0]}

In [7]:
data_params = {"data_dir": "../images/images", "batch_size": 16}
model_params = {"num_classes": 2}
checkpoint_path = "../.checkpoints/"
optimizer_hparams = {"lr": 1e-3, "weight_decay": 1e-4}

In [18]:
baseline = train_model(
    data_params,
    {"model_name": "custom", **model_params},
    optimizer_hparams,
    checkpoint_path,
    num_epochs = 100
) # Only running 2 batches for 1 one epoch - just for Debug

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Seed set to 42

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | CustomMRNet      | 12.9 M
1 | loss_module | CrossEntropyLoss | 0     
2 | metrics     | MetricCollection | 0     
-------------------------------------------------
12.9 M    Trainable params
0         Non-trainable params
12.9 M    Total params
51.402    Total estimated model params size (MB)


Train --->1794
Val --->598
Test --->597
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/rogia/p99pe5XYBfCuAWBK/.venv/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/home/rogia/p99pe5XYBfCuAWBK/.venv/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 99: 100%|██████████| 113/113 [01:51<00:00,  1.01it/s, v_num=1]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 113/113 [01:51<00:00,  1.01it/s, v_num=1]


/home/rogia/p99pe5XYBfCuAWBK/.venv/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 38/38 [00:18<00:00,  2.00it/s]


In [6]:
pretrained_models = [
    train_model(
        data_params,
        {"model_name": model, **model_params},
        optimizer_hparams,
        checkpoint_path,
    )
    for model in models.list_models()
] # I was not able to run them on my computer because too heavy but the code is functionnal

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Seed set to 42


Train --->1794
Val --->598
Test --->597


Missing logger folder: ../.checkpoints/alexnet/lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | AlexNet          | 57.0 M
1 | loss_module | CrossEntropyLoss | 0     
2 | metrics     | MetricCollection | 0     
-------------------------------------------------
57.0 M    Trainable params
0         Non-trainable params
57.0 M    Total params
228.048   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2/2 [00:08<00:00,  0.25it/s, v_num=0]            

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2/2 [00:08<00:00,  0.23it/s, v_num=0]
Testing DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.82it/s]

Seed set to 42



Train --->1794
Val --->598
Test --->597


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Seed set to 42
Missing logger folder: ../.checkpoints/convnext_base/lightning_logs

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | ConvNeXt         | 87.6 M
1 | loss_module | CrossEntropyLoss | 0     
2 | metrics     | MetricCollection | 0     
-------------------------------------------------
87.6 M    Trainable params
0         Non-trainable params
87.6 M    Total params
350.274   Total estimated model params size (MB)


Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]                             

: 

In [20]:
from pandas import DataFrame

print(DataFrame([baseline])) #print(DataFrame([baseline] + pretrained_models))

    model  test_accuracy   test_f1
0  custom       0.998325  0.517588
