# 1. Introduction

This notebook focuses on evaluating the ViT/16-224 model derived from Comp_ViT224_Modeling notebook.

# 2. Importing Libraries

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import torchvision

from torch import nn
from torchvision.transforms import v2
from pathlib import Path

from modules.helper_functions import set_seeds, get_most_wrong_examples, load_model
from modules.dataloaders import create_dataloaders
from modules.engine import predict_and_store, Trainer, sec_to_min_sec
from modules.vision_transformer import ViT

# Define some constants
NUM_WORKERS = os.cpu_count()
BATCH_SIZE = 64
AMOUNT_TO_GET = 1.0
SEED = 42

# Define target data directory
target_dir_name = f"../data/food-101_{str(int(AMOUNT_TO_GET*100))}_percent"

# Setup training and test directories
target_dir = Path(target_dir_name)
train_dir = target_dir / "train"
test_dir = target_dir / "test"
target_dir.mkdir(parents=True, exist_ok=True)

# Create target model directory
model_dir = Path("../models")

# Set seeds
set_seeds(SEED)

In [2]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# 2. Creating Dataloaders

In [3]:
# Create transforms
IMG_SIZE_1 = 256
IMG_SIZE_2 = 224
manual_transforms_train = v2.Compose([    
    v2.TrivialAugmentWide(),
    v2.Resize((IMG_SIZE_1, IMG_SIZE_1)),
    v2.RandomCrop((IMG_SIZE_2, IMG_SIZE_2)),    
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]) 
    ])

manual_transforms_test = v2.Compose([    
    v2.Resize((IMG_SIZE_1, IMG_SIZE_1)),
    v2.CenterCrop((IMG_SIZE_2, IMG_SIZE_2)),    
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]) 
    ])

In [4]:
# Create dataloaders
train_dataloader, test_dataloader, class_names = create_dataloaders(    
    train_dir=train_dir,
    test_dir=test_dir,
    train_transform=manual_transforms_train,
    test_transform=manual_transforms_test,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS
)

# 3. Evaluating ViT-Base/16-384 with 102 Classes

The model will be evaluated using the test dataset with the following metrics: accuracy, loss, model size (MB), average prediction time per image (sec), and average predited images per second (frames/sec)

In [5]:
# Train the model
model_type="vitb224_101_2025-01-19"
model_name = model_type + ".pth"
results_df = model_type + ".csv"

# Copy weights from torchvision.models
set_seeds(SEED)

# Instantiate the model
model = torchvision.models.vit_b_16(weights=torchvision.models.ViT_B_16_Weights.DEFAULT).to(device)
model.heads = nn.Linear(in_features=768, out_features=len(class_names)).to(device)

# Compile model
model = torch.compile(model, backend="aot_eager")

# Load the trained weights
model = load_model(model=model,
                   model_weights_dir=model_dir,
                   model_weights_name=model_name)

# Load the training results
results = pd.read_csv(os.path.join(model_dir, results_df))

  param_schemas = callee.param_schemas()
  param_schemas = callee.param_schemas()


[INFO] Loading model from: ..\models\vitb224_101_2025-01-19.pth


## 3.1 Making Predictions on the CPU: Intel Core i9-9900K@3.60GHz

In [6]:
# Make predictions on the CPU to compute the average prediction time per image
pred_list_cpu, classif_report_cpu = Trainer(
    model=model,
    device="cpu"
    ).predict_and_store(
        test_dir=test_dir,
        transform=manual_transforms_test,
        class_names=class_names,
        )

[INFO] Finding all filepaths ending with '.jpg' in directory: ..\data\food-101_100_percent\test


  0%|          | 0/25250 [00:00<?, ?it/s]

## 3.2 Making Predictions on the GPU: NVIDIA GeForce RTX 4070

In [7]:
# Make predictions on the GPU to compute the average prediction time per image
from modules.engine import Trainer
pred_list_gpu, classif_report_gpu = Trainer(
    model=model,
    device=device
    ).predict_and_store(
        test_dir=test_dir,
        transform=manual_transforms_test,
        class_names=class_names
    )

[INFO] Finding all filepaths ending with '.jpg' in directory: ..\data\food-101_100_percent\test


  0%|          | 0/25250 [00:00<?, ?it/s]

## 3.3. Collecting Data

In [8]:
# Create a dictionary with the results for this model, in terms 
dict = {}
dict['Model Name'] = model_name
dict['Test Acc'] = round(results['test_acc'].iloc[-1], 4)
dict['Test los'] = round(results['test_loss'].iloc[-1], 4)
dict['Model Size [MB]'] = round(Path(os.path.join(model_dir, model_name)).stat().st_size // (1024*1024), 2)
dict['No Params [Million]'] = round(sum(p.numel() for p in model.parameters()) / 1000000, 2)
dict['CPU: Prediction Time - Mean [s]'] = round(pd.DataFrame(pred_list_cpu)['time_for_pred'].mean(), 4)
dict['CPU: Predicted Images per Sec [fps]'] = round(1.0 / dict['CPU: Prediction Time - Mean [s]'], 4)
dict['GPU: Prediction Time - Mean [s]'] = round(pd.DataFrame(pred_list_gpu)['time_for_pred'].mean(), 4)
dict['GPU: Predicted Images per Sec [fps]'] = round(1.0 / dict['GPU: Prediction Time - Mean [s]'], 4)
dict['GPU: Training time'] = sec_to_min_sec(results["train_time [s]"].mean() + results["test_time [s]"].mean())

# Convert to dataframe
df = pd.DataFrame.from_dict(dict, orient="index").reset_index()
df.columns = ['Metric', 'Value']
df

Unnamed: 0,Metric,Value
0,Model Name,vitb224_101_2025-01-19.pth
1,Test Acc,0.8806
2,Test los,1.1962
3,Model Size [MB],327
4,No Params [Million],85.88
5,CPU: Prediction Time - Mean [s],0.118
6,CPU: Predicted Images per Sec [fps],8.4746
7,GPU: Prediction Time - Mean [s],0.0219
8,GPU: Predicted Images per Sec [fps],45.6621
9,GPU: Training time,7m33s
