# 1. Introduction

This notebook focuses on evaluating the best-performing Vision Transformer (ViT) model derived from the ViT_Modeling_v3 notebook. The model employs a ViT-Base/16-384 architecture, trained from scratch but initialized with pretrained weights. Its input image size is 384×384, which results in longer prediction times compared to the models employing input images of 224x224 pixels (see notebook ViT_Modeling_v1.ipynb).

# 2. Importing Libraries

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import torchvision

from torch import nn
from torchvision.transforms import v2
from pathlib import Path

from modules.helper_functions import set_seeds, get_most_wrong_examples, load_model
from modules.dataloaders import create_dataloaders
from modules.engine import predict_and_store, Trainer
from modules.vision_transformer import ViT

# Define some constants
NUM_WORKERS = os.cpu_count()
BATCH_SIZE = 64
AMOUNT_TO_GET = 1.0
SEED = 42

# Define target data directory
target_dir_name = f"../data/food-101_{str(int(AMOUNT_TO_GET*100))}_percent"

# Setup training and test directories
target_dir = Path(target_dir_name)
train_dir = target_dir / "train"
test_dir = target_dir / "test"
target_dir.mkdir(parents=True, exist_ok=True)

# Create target model directory
model_dir = Path("../models")

# Set seeds
set_seeds(SEED)

In [2]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# 2. Creating Dataloaders

In [3]:
# Create transforms
IMG_SIZE_1 = 272
IMG_SIZE_2 = 256
manual_transforms_train = v2.Compose([    
    v2.TrivialAugmentWide(),
    v2.Resize((IMG_SIZE_1, IMG_SIZE_1)),
    v2.RandomCrop((IMG_SIZE_2, IMG_SIZE_2)),    
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]) 
    ])

manual_transforms_test = v2.Compose([    
    v2.Resize((IMG_SIZE_1, IMG_SIZE_1)),
    v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),    
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]) 
    ])

In [4]:
# Create dataloaders
train_dataloader, test_dataloader, class_names = create_dataloaders(    
    train_dir=train_dir,
    test_dir=test_dir,
    train_transform=manual_transforms_train,
    test_transform=manual_transforms_test,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS
)

# 3. Evaluating ViT-Base/16-384 with 102 Classes

The model will be evaluated using the test dataset with the following metrics: accuracy, loss, model size (MB), average prediction time per image (sec), and average predited images per second (frames/sec)

In [5]:
# Train the model
model_type="swinb_101_2025-01-17"
model_name = f"{model_type}"
swinb_model_name = model_name + ".pth"
swinb_results_df = model_name + ".csv"

# Copy weights from torchvision.models
set_seeds(SEED)

# Instantiate the model
swinb_model = torchvision.models.swin_v2_b(
    weights=torchvision.models.Swin_V2_B_Weights.DEFAULT,
    )

swinb_model.head = nn.Linear(in_features=1024, out_features=len(class_names)).to(device)

# Compile model
swinb_model = torch.compile(swinb_model, backend="aot_eager")

# Load the trained weights
swinb_model = load_model(model=swinb_model,
                               model_weights_dir=model_dir,
                               model_weights_name=swinb_model_name)

# Load the training results
swinb_results = pd.read_csv(os.path.join(model_dir, swinb_results_df))

  param_schemas = callee.param_schemas()
  param_schemas = callee.param_schemas()


[INFO] Loading model from: ..\models\swinb_101_2025-01-17.pth


## 3.1 Making Predictions on the CPU: Intel Core i9-9900K@3.60GHz

In [6]:
# Make predictions on the CPU to compute the average prediction time per image
pred_list_cpu, classif_report_cpu = Trainer(
    model=swinb_model,
    device="cpu"
    ).predict_and_store(
        test_dir=test_dir,
        transform=manual_transforms_test,
        class_names=class_names
        )

[INFO] Finding all filepaths ending with '.jpg' in directory: ..\data\food-101_100_percent\test


  0%|          | 0/25250 [00:00<?, ?it/s]

## 3.2 Making Predictions on the GPU: NVIDIA GeForce RTX 4070

In [7]:
# Make predictions on the GPU to compute the average prediction time per image
from modules.engine import Trainer
pred_list_gpu, classif_report_gpu = Trainer(
    model=swinb_model,
    device=device
    ).predict_and_store(
        test_dir=test_dir,
        transform=manual_transforms_test,
        class_names=class_names
    )

[INFO] Finding all filepaths ending with '.jpg' in directory: ..\data\food-101_100_percent\test


  0%|          | 0/25250 [00:00<?, ?it/s]

## 3.3. Collecting Data

In [8]:
# Create a dictionary with the results for this model, in terms 
swins_dict = {}
swins_dict['Model Name'] = model_name
swins_dict['Test Acc'] = round(swinb_results['test_acc'].iloc[-1], 2)
swins_dict['Test los'] = round(swinb_results['test_loss'].iloc[-1], 2)
swins_dict['Model Size [MB]'] = round(Path(os.path.join(model_dir, swinb_model_name)).stat().st_size // (1024*1024), 2)
swins_dict['No Params [Million]'] = round(sum(p.numel() for p in swinb_model.parameters()) / 1000000, 2)
swins_dict['CPU: Prediction Time - Mean [s]'] = round(pd.DataFrame(pred_list_cpu)['time_for_pred'].mean(), 4)
swins_dict['CPU: Predicted Images per Sec [fps]'] = round(1.0 / swins_dict['CPU: Prediction Time - Mean [s]'], 4)
swins_dict['GPU: Prediction Time - Mean [s]'] = round(pd.DataFrame(pred_list_gpu)['time_for_pred'].mean(), 4)
swins_dict['GPU: Predicted Images per Sec [fps]'] = round(1.0 / swins_dict['GPU: Prediction Time - Mean [s]'], 4)

# Convert to dataframe 3
df_effnetb2 = pd.DataFrame.from_dict(swins_dict, orient="index").reset_index()
df_effnetb2.columns = ['Metric', 'Value']
df_effnetb2

Unnamed: 0,Metric,Value
0,Model Name,swinb_101_2025-01-17
1,Test Acc,0.93
2,Test los,1.04
3,Model Size [MB],332
4,No Params [Million],87.01
5,CPU: Prediction Time - Mean [s],0.3024
6,CPU: Predicted Images per Sec [fps],3.3069
7,GPU: Prediction Time - Mean [s],0.1693
8,GPU: Predicted Images per Sec [fps],5.9067
