# 1. Introduction

 This notebook outlines the creation, compilation, and training of a deep learning network using the [TorchSuite](https://github.com/sergio-sanz-rodriguez/torchsuite) framework. In particular, a Vision Transformer (ViT) will be used to classify 101 types of food from the following dataset: https://huggingface.co/datasets/ethz/food101.
 
 The following table summarizes the model:

| Version | **ViT Type** | **Image Size** | **Patch Size** | **Encoding Layers** | **Hidden Size** | **Multi-layer Perceptron size** | **Attention Heads** | **Hidden Layer Units for Classification** | **Transfer Learning** | **Number of Epochs** | **Learning Rate** | **Scheduler** | **Params** |
| ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
| 1 | Base | 384x384 | 16x16 | 12 | 768 | 3072 | 12 | 64 | IMAGENET1K_SWAG_E2E_V1 | 60 | 0.0001 | CosineAnnealingLR | 86.2M |

The custom vision transformer architectures have been implemented from scratch based on the paper titled ["An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale"](https://arxiv.org/abs/2010.11929). The custom library is called **vision_transformer** where the **ViT class** can be imported.

# 2. Importing Libraries

In [None]:
import os
import torch
import torchvision
import torch.backends.cudnn as cudnn
import pandas as pd

from torchvision.transforms import v2
from torchinfo import summary
from pathlib import Path
from torchvision import datasets
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR, ConstantLR
from tqdm import tqdm

# Import custom libraries
from utils.classification_utils import set_seeds, display_random_images
from engines.classification import ClassificationEngine
from models.vision_transformer import ViT
from engines.schedulers import FixedLRSchedulerWrapper
from dataloaders.image_dataloaders import create_classification_dataloaders_vit, create_classification_dataloaders_swin
from models.pretrained_classifiers import build_pretrained_classifier

# Dataset
from datasets import load_dataset

import warnings
# Set TORCH_USE_CUDA_DSA in the process environment.
# NOTE(review): presumably meant to enable CUDA device-side assertions for clearer
# error reports - confirm it has any effect when set after `import torch`.
os.environ['TORCH_USE_CUDA_DSA'] = "1"
# Silence UserWarning messages originating from torch.autograd.graph
warnings.filterwarnings("ignore", category=UserWarning, module="torch.autograd.graph")
# Silence FutureWarning messages originating from onnxscript.converter
warnings.filterwarnings("ignore", category=FutureWarning, module="onnxscript.converter")

# 3. Importing Dataset

The dataset should be organized as follows, with one subdirectory per class containing the corresponding images:

```
dataset/
├── train/
│   └── <class_label>/
│       ├── img1.jpg
│       ├── img2.png
│       └── ...
└── test/ (or val/)/
    └── <class_label>/
        ├── img1.jpg
        ├── img2.png
        └── ...
```

In [None]:
# Global settings used throughout the notebook
NUM_WORKERS = os.cpu_count()   # CPU count reported by the OS; passed to the dataloaders as worker count
AMOUNT_TO_GET = 1.0            # Fraction of the dataset, encoded in the data directory name
SEED = 42                      # Seed for reproducibility

# Data layout: data/food-101_<percent>_percent/{train,test}
TARGET_DIR_NAME = f"data/food-101_{int(AMOUNT_TO_GET * 100)}_percent"
TARGET_DIR = Path(TARGET_DIR_NAME)
TRAIN_DIR, TEST_DIR = TARGET_DIR / "train", TARGET_DIR / "test"
TARGET_DIR.mkdir(parents=True, exist_ok=True)

# Directory that receives the trained models
MODEL_DIR = Path("outputs")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Make the run reproducible
set_seeds(SEED)

# Switch guarding the dataset download/export cells that follow
IMPORT_DATASET = False

In [None]:
# Only fetch the dataset when the IMPORT_DATASET switch is enabled
if IMPORT_DATASET:
    # Download dataset from Hugging Face; `ds` is indexed by split name in the export cell
    ds = load_dataset("ethz/food101")

In [None]:
if IMPORT_DATASET:
    # Get class names
    class_names = ds["train"].features["label"].names

    def save_images(split, target_dir):
        """Export one dataset split to ``target_dir/<class_name>/<index>.jpg``.

        Args:
            split: Name of the split inside ``ds`` to export (e.g. "train").
            target_dir: Root ``Path``; one sub-directory per class is created below it.
        """
        # Next free file index per class. It is seeded once from the number of entries
        # already present in the class directory, so numbering continues after existing
        # files without re-listing the whole directory for every single image.
        next_index = {}

        for example in tqdm(ds[split], desc=f"Saving {split} images"):
            image = example["image"]
            class_name = class_names[example["label"]]
            class_dir = target_dir / class_name

            # First image of this class: create the directory and count what is there
            if class_name not in next_index:
                class_dir.mkdir(parents=True, exist_ok=True)
                next_index[class_name] = sum(1 for _ in class_dir.iterdir())

            # Save image under the next free index
            image.save(class_dir / f"{next_index[class_name]}.jpg")
            next_index[class_name] += 1

    # Save training and test images (the "validation" split is used as the test set)
    save_images("train", TRAIN_DIR)
    save_images("validation", TEST_DIR)

    print("Dataset has been saved successfully!")

# 4. Specifying the Target Device

In [None]:
# Activate cuda benchmark
cudnn.benchmark = True

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

if device == "cuda":
    !nvidia-smi

# 5. Image Visualization

In [None]:
# Preview pipeline: resize, random 256x256 crop, then conversion to a float32 image tensor
manual_transforms = v2.Compose(
    [
        v2.Resize(256),
        v2.RandomCrop((256, 256)),
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
    ]
)

# Load the training folder and show a 5x5 grid of randomly picked samples
train_data = datasets.ImageFolder(TRAIN_DIR, transform=manual_transforms)
display_random_images(
    train_data,
    n=25,
    classes=train_data.classes,
    rows=5,
    cols=5,
    display_shape=False,
    seed=None,
)

# 6. Preparing Dataloaders

In [None]:
# Dataloader settings
IMG_SIZE_2 = 384
BATCH_SIZE = 32

# Arguments shared by both dataloader builds; only the augmentation flag differs
shared_loader_args = dict(
    model=f"vit_b_16_{IMG_SIZE_2}",  # corresponds to ViT-Base/16-384
    batch_size=BATCH_SIZE,
    train_dir=TRAIN_DIR,
    test_dir=TEST_DIR,
    num_workers=NUM_WORKERS,
)

# Training and test dataloaders with augmentation enabled
train_dataloader, test_dataloader, class_names = create_classification_dataloaders_vit(
    aug=True, **shared_loader_args
)

# Same build with augmentation disabled; only its training dataloader is kept
train_dataloader_no_aug, _, _ = create_classification_dataloaders_vit(
    aug=False, **shared_loader_args
)

# Dataloaders handed to the training engine
dataloaders = {
    'train': train_dataloader,
    # Optional: only if off_first/last or random augmentation is enabled.
    'train_aug_off': train_dataloader_no_aug,
    'test': test_dataloader,
}

# 7. Creating a Custom Vision Transformer (ViT) Model

In [None]:
# Build the custom ViT-Base/16 model with one output per class
NUM_CLASSES = len(class_names)
model = ViT(
    # Input geometry
    img_size=IMG_SIZE_2,
    in_channels=3,
    patch_size=16,
    # Transformer encoder
    num_transformer_layers=12,
    emb_dim=768,
    mlp_size=3072,
    num_heads=12,
    # Dropout rates
    attn_dropout=0,
    mlp_dropout=0.1,
    emb_dropout=0.1,
    # Classification head
    num_classes=NUM_CLASSES,
)

# Copy weights from torchvision.models, choosing the set that matches the input resolution
model.copy_weights(
    torchvision.models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1  # For image size of 384x384
    if IMG_SIZE_2 == 384
    else torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1      # For image size of 224x224
)

# Compile model (optional)
#model = torch.compile(model, backend="aot_eager")

# Print a per-layer summary for one batch-sized input
summary(
    model,
    input_size=(BATCH_SIZE, 3, IMG_SIZE_2, IMG_SIZE_2),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

# Or Using PyTorch's Default ViT

In [None]:
# Opt-in switch: replace the custom ViT with the classifier built by build_pretrained_classifier
USE_DEFAULT_PYTORCH_VIT = False
if USE_DEFAULT_PYTORCH_VIT:
    # ViT-Base/16-384 for 384x384 inputs, otherwise ViT-Base/16-224
    model = build_pretrained_classifier(
        model="vit_b_16_384" if IMG_SIZE_2 == 384 else "vit_b_16_224",
        num_classes=NUM_CLASSES,
        seed=SEED,
        freeze=False,
        device=device,
    )

# 8. Training the Model

## 8.1. Always-on Augmentation

This strategy is the most commonly used approach and is also the default configuration of the training engine, maximizing the effective diversity of the training set from the outset. However, if the selected augmentation operations are too strong while the model is still untrained, it may struggle to learn the underlying structure early on.

In [None]:
# Hyper-parameters for the always-on augmentation run
EPOCHS = 20
LR = 1e-4
MIN_LR = 1e-6
model_type = "model_always"
model_name = f"{model_type}.pth"

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

# Cross-entropy loss with label smoothing
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Two-phase learning-rate schedule chained with SequentialLR:
#   epochs 1-10:  cosine annealing, LR = 1e-4 -> 1e-6
#   epochs 11-20: constant,         LR = 1e-6
cosine = CosineAnnealingLR(optimizer, T_max=10, eta_min=MIN_LR)
fixed = ConstantLR(optimizer, factor=MIN_LR / LR, total_iters=10)
scheduler = SequentialLR(optimizer, schedulers=[cosine, fixed], milestones=[10])

In [None]:
# And train...

# Bundle model, optimizer, loss and scheduler into the classification engine on the target device
engine = ClassificationEngine(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    scheduler=scheduler,
    use_distillation=False,  # Optional: False is already the default
    color_map={              # Plot colours per split
        'train': 'light_red',
        'test': 'light_green',
    },
    log_verbose=True,
    device=device,
)

# Launch training
results = engine.train(
    # --- Checkpointing ---
    target_dir=MODEL_DIR,                     # Where the model is saved
    model_name=model_name,                    # File name of the model
    resume=False,                             # True resumes from the last saved checkpoint
    save_best_model=["last", "loss", "acc"],  # Criteria for which models are saved
    keep_best_models_in_memory=False,         # Saves training time and memory
    # --- Data ---
    dataloaders=dataloaders,                  # Dictionary with the dataloaders
    apply_validation=True,                    # Run the validation step
    augmentation_strategy="always",           # Augmentation strategy
    # --- Metrics ---
    recall_threshold=0.995,                   # False positive rate at recall_threshold recall
    recall_threshold_pauc=0.95,               # Partial AUC score above recall_threshold_pauc recall
    # --- Optimisation ---
    epochs=EPOCHS,                            # Total number of epochs
    amp=True,                                 # Automatic Mixed Precision (AMP)
    enable_clipping=False,                    # Gradient clipping off; only useful if training becomes unstable
    debug_mode=False,                         # Debug mode off
    accumulation_steps=2,                     # Effective batch size = batch_size x accumulation steps
)

In [None]:

# If training is interrupted, you can resume it by enabling 'resume'
#engine = ClassificationEngine(
#    model=model,
#    optimizer=optimizer,
#    loss_fn=loss_fn,
#    scheduler=scheduler,
#    use_distillation=False,
#    color_map={'train': 'light_red', 'test': 'light_magenta'},
#    log_verbose=True,
#    device=device
#    )
#results = engine.train(
#    target_dir=MODEL_DIR,                       # Directory where the model will be saved
#    model_name=model_name,                      # Name of the model
#    resume=True,                                # Resume training from the last saved checkpoint
#    save_best_model=["last", "loss", "acc"],    # Save the best models based on different criteria
#    keep_best_models_in_memory=False,           # Do not keep the models stored in memory for the sake of training time and memory efficiency    
#    dataloaders=dataloaders,                    # Dictionary with the dataloaders
#    apply_validation=True,                      # Enable validation step
#    augmentation_strategy="always",             # Augmentation strategy (same as the original run in this section)
#    recall_threshold=0.995,                     # False positive rate at recall_threshold recall
#    recall_threshold_pauc=0.95,                 # Partial AUC score above recall_threshold_pauc recall
#    epochs=EPOCHS,                              # Total number of epochs
#    amp=True,                                   # Enable Automatic Mixed Precision (AMP)
#    enable_clipping=False,                      # Disable clipping on gradients, only useful if training becomes unestable
#    debug_mode=False,                           # Disable debug mode    
#    accumulation_steps=2,                       # Accumulation steps 2: effective batch size = batch_size x accumulation steps
#    )

# Or simply
# results = engine.train(resume=True)

## 8.2. Disabling Augmentation in the First Epochs

The model quickly learns the simplest structure on clean data first, then regularisation (augmentation) is introduced when it already has a decent representation.

However, if training starts without augmentation and augmentation is then switched on, the model may see the augmented data as a new distribution, which can slow learning, especially if the augmentations are strong. 

This strategy works better if augmentations are mild.

In [None]:
# Fresh vanilla ViT-Base model for this experiment
NUM_CLASSES = len(class_names)
model = ViT(
    # Input geometry
    img_size=IMG_SIZE_2,
    in_channels=3,
    patch_size=16,
    # Transformer encoder
    num_transformer_layers=12,
    emb_dim=768,
    mlp_size=3072,
    num_heads=12,
    # Dropout rates
    attn_dropout=0,
    mlp_dropout=0.1,
    emb_dropout=0.1,
    # Classification head
    num_classes=NUM_CLASSES,
)

# Copy weights from torchvision.models, choosing the set that matches the input resolution
model.copy_weights(
    torchvision.models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1  # For image size of 384x384
    if IMG_SIZE_2 == 384
    else torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1      # For image size of 224x224
)

In [None]:
# Configure training strategy: optimizer, loss function, scheduler (optional)
EPOCHS = 20
LR = 0.0001
MIN_LR = 1e-6
OFF_EPOCHS = 5
model_type = "model_off_first"
model_name = model_type + ".pth"

# Create AdamW optimizer
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=0.01
)

# Create loss function
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Set scheduler: three phases chained by SequentialLR.
# The cosine phase spans the epochs between the first and the last OFF_EPOCHS epochs, so its
# length is derived from EPOCHS and OFF_EPOCHS (20 - 2*5 = 10) to stay in sync with the milestones.
# NOTE(review): LinearLR ramps the LR up from MIN_LR towards LR; if a constant MIN_LR was
# intended while augmentation is off, ConstantLR would be the scheduler to use here - confirm.
linear = LinearLR(optimizer, start_factor=MIN_LR/LR, total_iters=OFF_EPOCHS)         # 1-5:   LR = 1e-6 -> 1e-4 (linear warm-up)
cosine = CosineAnnealingLR(optimizer, T_max=EPOCHS - 2 * OFF_EPOCHS, eta_min=MIN_LR) # 6-15:  LR = 1e-4 -> 1e-6 (cosine)
fixed = ConstantLR(optimizer, factor=MIN_LR/LR, total_iters=OFF_EPOCHS)              # 16-20: LR = 1e-6
scheduler = SequentialLR(
    optimizer,
    schedulers=[linear, cosine, fixed],
    milestones=[OFF_EPOCHS, EPOCHS-OFF_EPOCHS]  # switch at these epochs
)

# And train...

# Instantiate the classification engine with the created model and the target device
engine = ClassificationEngine(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    scheduler=scheduler,
    use_distillation=False,
    log_verbose=True,
    device=device)

# Configure the training method
results = engine.train(
    target_dir=MODEL_DIR,                       # Directory where the model will be saved
    model_name=model_name,                      # Name of the model
    save_best_model=["last", "loss", "acc"],    # Save the best models based on different criteria
    keep_best_models_in_memory=False,           # Do not keep the models stored in memory for the sake of training time and memory efficiency
    dataloaders=dataloaders,                    # Dictionary with the dataloaders
    apply_validation=True,                      # Enable validation step
    augmentation_strategy="off_first",          # Augmentation strategy
    augmentation_off_epochs=OFF_EPOCHS,         # Number of epochs without augmentation
    recall_threshold=0.995,                     # False positive rate at recall_threshold recall
    recall_threshold_pauc=0.95,                 # Partial AUC score above recall_threshold_pauc recall
    epochs=EPOCHS,                              # Total number of epochs
    amp=True,                                   # Enable Automatic Mixed Precision (AMP)
    enable_clipping=False,                      # Disable clipping on gradients, only useful if training becomes unstable
    debug_mode=False,                           # Disable debug mode
    accumulation_steps=2,                       # Accumulation steps 2: effective batch size = batch_size x accumulation steps
    )

## 8.3. Disabling Augmentation in the Last Epochs

By training on clean data near the end, the model sees the true distribution and fine-tunes its decision boundaries without the noise of augmentation.

In [None]:
# Fresh vanilla ViT-Base model for this experiment
NUM_CLASSES = len(class_names)
model = ViT(
    # Input geometry
    img_size=IMG_SIZE_2,
    in_channels=3,
    patch_size=16,
    # Transformer encoder
    num_transformer_layers=12,
    emb_dim=768,
    mlp_size=3072,
    num_heads=12,
    # Dropout rates
    attn_dropout=0,
    mlp_dropout=0.1,
    emb_dropout=0.1,
    # Classification head
    num_classes=NUM_CLASSES,
)

# Copy weights from torchvision.models, choosing the set that matches the input resolution
model.copy_weights(
    torchvision.models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1  # For image size of 384x384
    if IMG_SIZE_2 == 384
    else torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1      # For image size of 224x224
)

In [None]:
# Hyper-parameters for the run that switches augmentation off in the last epochs
EPOCHS = 20
LR = 1e-4
MIN_LR = 1e-6
OFF_EPOCHS = 5
model_type = "model_off_last"
model_name = f"{model_type}.pth"

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

# Cross-entropy loss with label smoothing
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Two-phase learning-rate schedule chained with SequentialLR:
#   epochs 1-10:  cosine annealing, LR = 1e-4 -> 1e-6
#   epochs 11-20: constant,         LR = 1e-6
cosine = CosineAnnealingLR(optimizer, T_max=10, eta_min=MIN_LR)
fixed = ConstantLR(optimizer, factor=MIN_LR / LR, total_iters=10)
scheduler = SequentialLR(optimizer, schedulers=[cosine, fixed], milestones=[10])

# And train...

# Bundle model, optimizer, loss and scheduler into the classification engine on the target device
engine = ClassificationEngine(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    scheduler=scheduler,
    use_distillation=False,
    log_verbose=True,
    color_map={'train': 'red', 'test': 'yellow'},
    device=device,
)

# Launch training
results = engine.train(
    # --- Checkpointing ---
    target_dir=MODEL_DIR,                     # Where the model is saved
    model_name=model_name,                    # File name of the model
    save_best_model=["last", "loss", "acc"],  # Criteria for which models are saved
    keep_best_models_in_memory=False,         # Saves training time and memory
    # --- Data ---
    dataloaders=dataloaders,                  # Dictionary with the dataloaders
    apply_validation=True,                    # Run the validation step
    augmentation_strategy="off_last",         # Augmentation strategy
    augmentation_off_epochs=OFF_EPOCHS,       # Number of epochs without augmentation
    # --- Metrics ---
    recall_threshold=0.995,                   # False positive rate at recall_threshold recall
    recall_threshold_pauc=0.95,               # Partial AUC score above recall_threshold_pauc recall
    # --- Optimisation ---
    epochs=EPOCHS,                            # Total number of epochs
    amp=True,                                 # Automatic Mixed Precision (AMP)
    enable_clipping=False,                    # Gradient clipping off; only useful if training becomes unstable
    debug_mode=False,                         # Debug mode off
    accumulation_steps=2,                     # Effective batch size = batch_size x accumulation steps
)

## 8.4. Random Augmentation Schedules

This strategy acts like another layer of stochastic regularisation; the model never quite knows whether input will be clean or augmented. However, it can be harder to control.

In [None]:
# Fresh vanilla ViT-Base model for this experiment
NUM_CLASSES = len(class_names)
model = ViT(
    # Input geometry
    img_size=IMG_SIZE_2,
    in_channels=3,
    patch_size=16,
    # Transformer encoder
    num_transformer_layers=12,
    emb_dim=768,
    mlp_size=3072,
    num_heads=12,
    # Dropout rates
    attn_dropout=0,
    mlp_dropout=0.1,
    emb_dropout=0.1,
    # Classification head
    num_classes=NUM_CLASSES,
)

# Copy weights from torchvision.models, choosing the set that matches the input resolution
model.copy_weights(
    torchvision.models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1  # For image size of 384x384
    if IMG_SIZE_2 == 384
    else torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1      # For image size of 224x224
)

In [None]:
# Hyper-parameters for the random augmentation schedule run
EPOCHS = 20
LR = 1e-4
MIN_LR = 1e-6
RANDOM_PROB = 0.25
model_type = "model_random"
model_name = f"{model_type}.pth"

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

# Cross-entropy loss with label smoothing
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Two-phase learning-rate schedule chained with SequentialLR:
#   epochs 1-10:  cosine annealing, LR = 1e-4 -> 1e-6
#   epochs 11-20: constant,         LR = 1e-6
cosine = CosineAnnealingLR(optimizer, T_max=10, eta_min=MIN_LR)
fixed = ConstantLR(optimizer, factor=MIN_LR / LR, total_iters=10)
scheduler = SequentialLR(optimizer, schedulers=[cosine, fixed], milestones=[10])

# And train...

# Bundle model, optimizer, loss and scheduler into the classification engine on the target device
engine = ClassificationEngine(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    scheduler=scheduler,
    use_distillation=False,
    log_verbose=True,
    color_map={'train': 'magenta', 'test': 'green'},
    device=device,
)

# Launch training
results = engine.train(
    # --- Checkpointing ---
    target_dir=MODEL_DIR,                     # Where the model is saved
    model_name=model_name,                    # File name of the model
    save_best_model=["last", "loss", "acc"],  # Criteria for which models are saved
    keep_best_models_in_memory=False,         # Saves training time and memory
    # --- Data ---
    dataloaders=dataloaders,                  # Dictionary with the dataloaders
    apply_validation=True,                    # Run the validation step
    augmentation_strategy="random",           # Augmentation strategy
    augmentation_random_prob=RANDOM_PROB,     # Probability (0.0-1.0) of applying augmentation when augmentation_strategy is "random"
    # --- Metrics ---
    recall_threshold=0.995,                   # False positive rate at recall_threshold recall
    recall_threshold_pauc=0.95,               # Partial AUC score above recall_threshold_pauc recall
    # --- Optimisation ---
    epochs=EPOCHS,                            # Total number of epochs
    amp=True,                                 # Automatic Mixed Precision (AMP)
    enable_clipping=False,                    # Gradient clipping off; only useful if training becomes unstable
    debug_mode=False,                         # Debug mode off
    accumulation_steps=2,                     # Effective batch size = batch_size x accumulation steps
)

# 9. Evaluating the Model

In [None]:
# Evaluation pipeline without random operations: resize, centre crop to the model input size,
# conversion to a float32 tensor and per-channel normalisation
transforms = v2.Compose(
    [
        v2.Resize(IMG_SIZE_2),
        v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Predict on the whole test directory with the engine trained last and
# collect the prediction list together with the classification report
pred_list_gpu, classif_report_gpu = engine.predict_and_store(
    test_dir=TEST_DIR,
    transform=transforms,
    class_names=class_names,
    sample_fraction=1,
    seed=SEED,
)

In [None]:
# Render the classification report returned by predict_and_store as a pandas DataFrame
pd.DataFrame(classif_report_gpu)