"""
This script performs multiclass image classification on the iFood-2019 dataset using a custom CNN with residual blocks and CBAM attention.

Main components:
1. Reproducibility setup with fixed seeds
2. Dataset extraction, transformation, and preparation
3. Custom dataset class and data loaders
4. Definition of a residual CNN with CBAM attention
5. Training with label smoothing, cosine LR scheduling, and early stopping
6. Hyperparameter grid search over learning rate and weight decay
7. Model evaluation with classification report and confusion matrix
8. Plots for accuracy, losses, errors, and time over epochs

Output:
- Best model checkpoint
- Training and evaluation logs
- Accuracy and performance metrics
- Plots to support them
"""


In [None]:
#pip install tqdm

In [None]:
# Import libraries
import zipfile
import os
import shutil
from torchvision import transforms
from torch.utils.data import Dataset
import pandas as pd
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import numpy as np
import time
import os, pandas as pd, zipfile, pathlib
import torch, random, numpy as np
from pathlib import Path
import random
from tqdm import tqdm
import multiprocessing
from torch.cuda.amp import autocast, GradScaler
import torch
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt




In [None]:
# Fix the random seeds - this is necessary for the reproducibility of the results.
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with ``seed``, and configures cuDNN for deterministic behaviour with
    benchmark mode switched off.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)


# We use multiple DataLoader workers, and each worker needs its own seed.
# Without seeding the workers, NumPy / ``random`` based augmentations could
# follow an uncontrolled sequence in every worker, which would defeat the
# purpose of fixing the seed above.
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current PyTorch initial seed.

    Intended to be passed as ``worker_init_fn`` to a ``DataLoader``.

    Args:
        worker_id: Worker index supplied by the caller; not used here.
    """
    worker_seed = torch.initial_seed() % (1 << 32)  # reduce to a 32-bit seed
    random.seed(worker_seed)
    np.random.seed(worker_seed)


set_seed(42)



In [None]:
# Dataset input directory, working (output) directory, and the training labels file
data_dir, working_dir = Path('/kaggle/input/ifood-2019-fgvc6'), Path('/kaggle/working')
TRAIN_CSV = data_dir.joinpath('train_labels.csv')

In [None]:
# Since the dataset is zipped and contains an extra internal folder, we flatten it. This was necessary to allow PyTorch to read the actual image folder.

def extract_and_flatten(zip_filename, folder_name, internal_folder_name):
    """Extract a dataset archive and move its nested files into one flat folder.

    The archive ``data_dir / zip_filename`` is unpacked into a temporary
    directory ``working_dir / (folder_name + "_temp")``. Every entry found in
    the archive's ``internal_folder_name`` sub-folder is then moved into
    ``working_dir / folder_name``. The temporary directory is always removed,
    including when extraction or moving fails, so a failed run does not leave
    a partially extracted copy behind.

    Args:
        zip_filename: Name of the zip file inside ``data_dir``.
        folder_name: Name of the destination folder inside ``working_dir``.
        internal_folder_name: Name of the folder inside the archive that
            holds the files.

    Raises:
        FileNotFoundError: If the archive does not exist, or if it does not
            contain ``internal_folder_name``.
    """
    zip_path = data_dir / zip_filename
    extract_path = working_dir / (folder_name + "_temp")
    final_path = working_dir / folder_name

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        nested = extract_path / internal_folder_name
        if not nested.is_dir():
            raise FileNotFoundError(
                f"Folder '{internal_folder_name}' not found inside {zip_path}"
            )

        # Create the destination only once we know there is something to move
        final_path.mkdir(parents=True, exist_ok=True)
        for entry in nested.iterdir():
            shutil.move(str(entry), str(final_path / entry.name))
    finally:
        # Clean up on success and on failure; the directory may not exist
        # if opening the archive itself failed.
        shutil.rmtree(extract_path, ignore_errors=True)

# Apply the extraction to the three subsets: train, validation and test.
# Each entry is (archive name, destination folder, folder inside the archive).
for archive_name, target_folder, inner_folder in (
    ("train_set.zip", "train", "train_set"),
    ("val_set.zip", "val", "val_set"),
    ("test_set.zip", "test", "test_set"),
):
    extract_and_flatten(archive_name, target_folder, inner_folder)

In [None]:
# Sanity check: report how many files ended up in each extracted folder
for caption, split_folder in (
    ("Train images:", "train"),
    ("Validation images:", "val"),
    ("Test images:", "test"),
):
    print(caption, len(os.listdir(os.path.join(working_dir, split_folder))))


In [None]:
# Load the training labels. `TRAIN_CSV` is the path defined together with the
# other dataset paths; the name `labels_csv` used here before was never defined.
labels_df = pd.read_csv(TRAIN_CSV)


In the cell below we describe the transformations applied in the preprocessing step. First, the mean and standard deviation used for normalisation are not the default ImageNet values (mean [0.485, 0.456, 0.406], standard deviation [0.229, 0.224, 0.225]). Instead, we calculated the mean and standard deviation of our training set. The same values were also used to normalise the validation set. The values obtained from the training set were mean [0.6388, 0.5444, 0.4448] and std [0.2229, 0.2414, 0.2638], and these are the ones used below.

In [None]:
# def compute_mean_std(image_dir):
#     from tqdm import tqdm

#     transform = transforms.Compose([
#         transforms.Resize((224, 224)),
#         transforms.ToTensor()
#     ])

#     image_paths = list(Path(image_dir).glob("*.jpg"))

#     mean = torch.zeros(3)
#     std = torch.zeros(3)
#     total_images = 0

#     for img_path in tqdm(image_paths, desc="Computing mean/std"):
#         image = Image.open(img_path).convert("RGB")
#         tensor = transform(image)
#         mean += tensor.mean(dim=(1, 2))
#         std += tensor.std(dim=(1, 2))
#         total_images += 1

#     mean /= total_images
#     std /= total_images

#     print("Mean:", mean.tolist())
#     print("Std:", std.tolist())
#     return mean.tolist(), std.tolist()

Secondly, in our initial experiments, we observed that the training accuracy was significantly higher than the validation and test accuracy. To address this issue, we implemented augmentations such as random rotations, flips, and brightness changes. These were applied only to the training set, not to the test set.


In [None]:
# Transforms (with dataset-specific mean and std)
# Normalisation statistics shared by the training and validation pipelines
_norm_stats = dict(mean=[0.6388, 0.5444, 0.4448], std=[0.2229, 0.2414, 0.2638])

# Random augmentations, used for the training pipeline only
_train_augmentations = [
    # horizontal flip with a probability of 50 percent
    transforms.RandomHorizontalFlip(),
    # slight random rotation in the range of +/- 15 degrees
    transforms.RandomRotation(15),
    # random changes of brightness, contrast, saturation and hue
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02),
    # slight random crop, resized back to 224x224
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
]

# Training: resize -> augment -> tensor -> normalise
train_transform = transforms.Compose(
    [transforms.Resize((224, 224))]
    + _train_augmentations
    + [transforms.ToTensor(), transforms.Normalize(**_norm_stats)]
)

# Validation: the same resize and normalisation, without augmentation
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(**_norm_stats),
])

In [None]:
# Dataset
class FoodDataset(Dataset):
    """Dataset that pairs image files with numeric class indices.

    Every row of ``labels_df`` provides an ``img_name`` (file name inside
    ``image_dir``) and a ``label``; the label is translated to an index
    through ``class_to_idx``.
    """

    def __init__(self, image_dir, labels_df, transform, class_to_idx):
        """Store the inputs; images are only opened in ``__getitem__``.

        Args:
            image_dir: Directory that contains the image files.
            labels_df: DataFrame with ``img_name`` and ``label`` columns.
            transform: Callable applied to each RGB PIL image.
            class_to_idx: Dictionary mapping class labels to numeric indices.
        """
        self.image_dir, self.labels_df = image_dir, labels_df
        self.transform, self.class_to_idx = transform, class_to_idx

    def __len__(self):
        """Return the number of rows in ``labels_df``."""
        return len(self.labels_df.index)

    def __getitem__(self, idx):
        """Return ``(transformed_image, class_index)`` for the row at position ``idx``."""
        record = self.labels_df.iloc[idx]
        path = os.path.join(self.image_dir, record['img_name'])
        # Force three channels regardless of how the file is stored
        rgb_image = Image.open(path).convert('RGB')
        target = self.class_to_idx[record['label']]
        return self.transform(rgb_image), target


In the Kaggle dataset we used, the test images were not accompanied by ground truth labels. Therefore, we did not use the test set. Instead, we split the original training set into training and validation subsets (with stratification). As a result, we trained the model using only 80% of the original training set, while the remaining 20% was used for validation.


In [None]:
# Load the training labels and build the label -> index mapping
labels_df = pd.read_csv(TRAIN_CSV)
classes = sorted(labels_df['label'].unique())
class_to_idx = {}
for position, cls_name in enumerate(classes):
    class_to_idx[cls_name] = position
labels = labels_df['label'].map(class_to_idx)

# Stratified 80/20 split over row positions, so that both subsets keep the
# class proportions of the full training set
row_positions = np.arange(len(labels_df))
train_idx, val_idx = train_test_split(
    row_positions,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Materialise both subsets with a fresh 0..n-1 index
train_df, val_df = (
    labels_df.iloc[positions].reset_index(drop=True)
    for positions in (train_idx, val_idx)
)



A high number of workers can speed up the data loading process. However, using too many workers may lead to system overload. To avoid this, we capped the number of workers at 6, or at the number of available CPU cores when fewer than 6 are available (`min(6, cpu_count)`). For reproducibility, every loader uses `seed_worker` as its `worker_init_fn`. Shuffling was applied only during training; for validation and testing, shuffling was disabled. Additionally, we enabled `pin_memory=True` to allow direct memory access transfers to the GPU.



In [None]:
# Number of loader workers: at most 6, and never more than the available CPU cores
cpu_count = multiprocessing.cpu_count()
num_workers = min(6, cpu_count)

# Datasets for the stratified split. Both subsets are rows of the original
# training CSV, so both read their images from the extracted 'train' folder.
# Only the training subset gets the augmenting transform.
train_dataset = FoodDataset(working_dir / 'train', train_df, train_transform, class_to_idx)
val_dataset = FoodDataset(working_dir / 'train', val_df, val_transform, class_to_idx)

# Dedicated, seeded generator so the shuffling order of the training loader
# does not depend on how much of the global RNG stream was consumed before
loader_generator = torch.Generator()
loader_generator.manual_seed(42)

# Loaders (shuffling only for training)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                          num_workers=num_workers, pin_memory=True,
                          persistent_workers=True, worker_init_fn=seed_worker,
                          generator=loader_generator)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False,
                        num_workers=num_workers, pin_memory=True,
                        persistent_workers=True, worker_init_fn=seed_worker)

# The labelled official validation set serves as the held-out test set
test_csv = data_dir / 'val_labels.csv'
test_labels_df = pd.read_csv(test_csv)
test_dataset = FoodDataset(working_dir / 'val', test_labels_df, val_transform, class_to_idx)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False,
                         num_workers=num_workers, pin_memory=True,
                         persistent_workers=True, worker_init_fn=seed_worker)

The network is a ResNet-inspired convolutional neural network with under 5 million parameters. It uses stacked `BasicBlock` residual units to preserve gradient flow. The final feature map is refined with a CBAM attention module that applies both channel and spatial attention. A global average pooling layer, followed by dropout and a two-layer MLP, produces the final class logits for 251 food categories. The model employs `SiLU` activations and `AdamW` optimization with cosine learning rate scheduling.


In [None]:
# Basic residual block used in the CNN architecture
class BasicBlock(nn.Module):
    """Residual unit: two 3x3 conv + batch-norm layers plus a skip connection."""

    # Ratio between a block's output channels and its `out_channels` argument
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        """Build the two conv/batch-norm pairs.

        Args:
            in_channels: Channels of the incoming feature map.
            out_channels: Channels produced by both convolutions.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input on the skip
                path; used when input and output dimensions do not match.
        """
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``act(main_path(x) + skip_path(x))``."""
        if self.downsample is None:
            shortcut = x
        else:
            shortcut = self.downsample(x)

        main = self.conv1(x)
        main = self.act(self.bn1(main))
        main = self.conv2(main)
        main = self.bn2(main)
        main += shortcut  # residual connection
        return self.act(main)

# Convolutional Block Attention Module. Aim: to improve feature focus
class CBAMBlock(nn.Module):
    """Channel attention followed by spatial attention over a feature map."""

    def __init__(self, channels, reduction=16):
        """Create the attention sub-modules.

        Args:
            channels: Number of channels of the input feature map.
            reduction: Divisor for the hidden width of the channel MLP
                (``channels // reduction``).
        """
        super().__init__()
        hidden = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        # Shared bottleneck MLP (as 1x1 convolutions) for both pooled descriptors
        self.channel_fc = nn.Sequential(
            nn.Conv2d(channels, hidden, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden, channels, 1, bias=False),
        )
        # 7x7 convolution over the stacked [mean, max] maps -> one mask in (0, 1)
        self.spatial = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3, bias=False),
            nn.Sigmoid(),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` rescaled by the channel gate and then by the spatial mask."""
        # Channel attention: one gate value per channel
        from_avg = self.channel_fc(self.avg_pool(x))
        from_max = self.channel_fc(self.max_pool(x))
        x = x * self.sigmoid(from_avg + from_max)

        # Spatial attention: one gate value per spatial location
        mean_map = x.mean(dim=1, keepdim=True)
        max_map = x.max(dim=1, keepdim=True).values
        mask = self.spatial(torch.cat((mean_map, max_map), dim=1))
        return x * mask

# Custom CNN architecture with residual blocks and CBAM attention
class CustomCNN(nn.Module):
    """Residual CNN: conv stem, four residual stages, CBAM attention, MLP head."""

    def __init__(self, block, layers, num_classes=251, base_width=32):
        """Assemble the network and initialise its weights.

        Args:
            block: Residual block class; must expose an ``expansion`` attribute.
            layers: Number of blocks in each of the four stages
                (``layers[0]`` .. ``layers[3]``).
            num_classes: Size of the output layer.
            base_width: Channel count of the stem and the first stage; the
                later stages use 2x, 4x and 8x this width.
        """
        super().__init__()
        # Running channel count; _make_layer reads it and updates it per stage
        self.in_channels = base_width

        # Stem: initial convolution + normalisation + activation
        self.conv1 = nn.Conv2d(3, base_width, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(base_width)
        self.act = nn.SiLU(inplace=True)

        # Residual stages; every stage after the first uses stride 2
        stage_widths = [base_width * factor for factor in (1, 2, 4, 8)]
        self.layer1 = self._make_layer(block, stage_widths[0], layers[0])
        self.layer2 = self._make_layer(block, stage_widths[1], layers[1], stride=2)
        self.layer3 = self._make_layer(block, stage_widths[2], layers[2], stride=2)
        self.layer4 = self._make_layer(block, stage_widths[3], layers[3], stride=2)
        head_width = stage_widths[3]

        # Attention over the final feature map
        self.cbam = CBAMBlock(head_width)

        # Classification head: global pooling, dropout, two-layer MLP
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Sequential(
            nn.Linear(head_width, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

        self._init_weights()

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one residual stage made of ``blocks`` blocks.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 conv + batch-norm projection for its
        skip path.
        """
        stage_channels = out_channels * block.expansion
        projection = None
        if not (stride == 1 and self.in_channels == stage_channels):
            projection = nn.Sequential(
                nn.Conv2d(self.in_channels, stage_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(stage_channels),
            )

        stage = [block(self.in_channels, out_channels, stride, projection)]
        self.in_channels = stage_channels
        for _ in range(blocks - 1):
            stage.append(block(self.in_channels, out_channels))
        return nn.Sequential(*stage)

    def _init_weights(self):
        """Initialise conv (Kaiming normal), batch-norm (1 / 0) and linear (N(0, 0.01) / 0) layers."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Run the full network and return the class logits."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(self.cbam(x))
        x = self.dropout(torch.flatten(x, 1))
        return self.fc(x)


The instantiation of the network goes as follows
     - BasicBlock as the residual block
     - [2, 2, 2, 2] blocks per layer (ResNet-18-like depth)
     - 40 base channels (slightly narrower than standard)
     - num_classes as output layer size (default: 251 for iFood).
Here we also ensured that our network is below the maximum threshold of 5M params.

In [None]:
# Factory for the custom CNN used throughout the notebook
def build_custom_cnn(num_classes=251):
    """Create the CNN (BasicBlock, [2, 2, 2, 2], base width 40) and print its size.

    Args:
        num_classes: Size of the output layer.

    Returns:
        The freshly initialised ``CustomCNN`` instance.
    """
    model = CustomCNN(BasicBlock, [2, 2, 2, 2], num_classes=num_classes, base_width=40)

    # Count only the parameters that receive gradients
    trainable = filter(lambda param: param.requires_grad, model.parameters())
    n_params = sum(param.numel() for param in trainable)
    print(f"Trainable parameters: {n_params:,} ({n_params / 1e6:.2f}M)")

    return model


# Instantiate the model
model = build_custom_cnn(num_classes=251)


Trainable parameters: 4,671,493 (4.67M)


Cross-entropy loss is used for multi-class classification. Since we dealt with a large dataset, it is important to reduce sharp probability peaks and discourage the model from being too confident. That is why, instead of using hard one-hot targets, we used `label_smoothing=0.1`: the target becomes a mixture of the one-hot target (weight 0.9) and a uniform distribution over all classes (weight 0.1). This can help against overfitting.

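AdamW (Adam with decoupled weight decay) is the optimizer that we decided on. It is a variation of the Adam optimizer in which the weight decay is applied directly to the weights in the update step, instead of being added to the gradient as an L2 penalty. The reason we used it is to reduce overfitting.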

To adjust the learning rate dynamically, we used CosineAnnealingLR. The idea is to start with a higher learning rate and decrease it over time to avoid overshooting a minimum point. We applied this scheduler to the AdamW optimizer. Note that T_max represents the number of epochs over which the learning rate decreases. Here, it is set to 50. If a different number of epochs is used, the T_max parameter should be adjusted accordingly.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model = model.to(device)

# Loss with label smoothing, AdamW optimizer, cosine learning-rate schedule
# (T_max is the number of scheduler steps over which the rate is annealed)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.05)
scheduler = CosineAnnealingLR(optimizer, T_max=50)



cuda


Since we conducted this project with limited computational resources, we aimed to avoid training models that failed to improve over time. To address this, we implemented early stopping. We set a patience value, which we varied between 6 and 10 depending on whether we were still in the experimental phase for a specific architecture.

In [None]:
#  EarlyStopping class
class EarlyStopping:
    """Track a validation score (higher is better) and flag when to stop.

    Every strict improvement of the score resets the counter and writes the
    model's ``state_dict`` to ``save_path``. After ``patience`` consecutive
    calls without improvement, ``early_stop`` is set to ``True``.
    """

    def __init__(self, patience=5, verbose=False, save_path='best_model10.pt'):
        """Configure the stopper.

        Args:
            patience: Number of non-improving calls tolerated before stopping.
            verbose: Whether to print a message on every call.
            save_path: File that receives the best model's ``state_dict``.
        """
        self.patience, self.verbose, self.save_path = patience, verbose, save_path
        self.counter = 0          # calls since the last improvement
        self.best_score = None    # best score seen so far
        self.early_stop = False   # becomes True once patience is exhausted

    def __call__(self, val_score, model):
        """Record ``val_score`` and checkpoint ``model`` when it is a new best."""
        is_new_best = self.best_score is None or val_score > self.best_score

        if not is_new_best:
            self.counter += 1
            if self.verbose:
                print(f"No improvement in validation score for {self.counter} epochs.")
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.best_score = val_score
        self.counter = 0
        if self.verbose:
            print(f"Validation score improved. Saving model to {self.save_path}")
        torch.save(model.state_dict(), self.save_path)


The grid search is not extensive, since we got the impression that the actual difference between "good performing" neural networks actually shows after 20–25, maybe even after 30 epochs. This is the phase when validation accuracy increases above 30%, a point above which not every neural network can reach. Training neural networks with around 5 million parameters for that many epochs requires resources that we didn’t have.

In [None]:
# Candidate values for the grid search; every lr x weight_decay pair is tried
search_space = dict(
    lr=[1e-4, 1e-3],
    weight_decay=[0.01, 0.001],
)

In [None]:
# Best result seen so far across all configurations:
# validation accuracy, its hyperparameters, and the matching model weights
best_val_acc, best_config, best_model_state = 0.0, None, None

In [None]:
# Device and training budget
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Upper bound on epochs per configuration, and early-stopping patience:
# training stops if the validation score does not improve for 6 epochs
num_epochs, patience = 60, 6


We performed a grid search over learning rate and weight decay combinations. For each configuration, the model was trained using AdamW with label smoothing, cosine annealing learning rate scheduling, and early stopping. Metrics and runtime were logged per epoch to identify the best-performing setup.

In [None]:
# Grid search over hyperparameter combinations.
#
# For every (lr, weight_decay) pair a fresh model is trained with AdamW,
# label smoothing, cosine annealing and early stopping on validation accuracy.
# The configuration with the highest validation accuracy wins. Its weights are
# taken from the checkpoint that EarlyStopping wrote at the best epoch, not
# from the live model, which holds the weights of the last epoch trained.
best_history = None  # per-epoch logs of the best configuration so far

for lr in search_space['lr']:
    for wd in search_space['weight_decay']:
        print(f"Training with lr={lr}, weight_decay={wd}")

        # Build model and training utilities
        model = build_custom_cnn(num_classes=251).to(device)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
        early_stopper = EarlyStopping(patience=patience, verbose=False)

        # Logs for analysis (fresh lists for every configuration)
        train_losses = []
        val_losses = []
        train_accuracies = []
        val_accuracies = []
        epoch_times = []
        total_start_time = time.time()

        for epoch in range(num_epochs):
            epoch_start_time = time.time()
            model.train()
            train_loss, correct, total = 0.0, 0, 0
            loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]", leave=False)

            # Training loop. The batch labels are called `targets` so that the
            # module-level `labels` series is not overwritten.
            for images, targets in loop:
                images, targets = images.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Weight the batch-mean loss by the batch size, so the epoch
                # average stays correct when the last batch is smaller
                train_loss += loss.item() * images.size(0)
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()
                loop.set_postfix(loss=loss.item())

            scheduler.step()  # update learning rate once per epoch

            # Compute average training metrics
            avg_train_loss = train_loss / total
            train_accuracy = 100 * correct / total
            train_losses.append(avg_train_loss)
            train_accuracies.append(train_accuracy)

            # Validation loop
            model.eval()
            val_loss, val_correct, val_total = 0.0, 0, 0
            with torch.no_grad():
                for images, targets in val_loader:
                    images, targets = images.to(device), targets.to(device)
                    outputs = model(images)
                    loss = criterion(outputs, targets)

                    val_loss += loss.item() * images.size(0)
                    _, predicted = torch.max(outputs, 1)
                    val_total += targets.size(0)
                    val_correct += (predicted == targets).sum().item()

            # Compute average validation metrics
            avg_val_loss = val_loss / val_total
            val_accuracy = 100 * val_correct / val_total
            val_losses.append(avg_val_loss)
            val_accuracies.append(val_accuracy)

            epoch_duration = time.time() - epoch_start_time
            epoch_times.append(epoch_duration)

            print(f"Epoch {epoch+1}/{num_epochs} | "
                  f"Train Loss: {avg_train_loss:.4f}, Acc: {train_accuracy:.2f}% | "
                  f"Val Loss: {avg_val_loss:.4f}, Acc: {val_accuracy:.2f}% | "
                  f"Time: {epoch_duration:.2f}s")

            # Checkpoints the model whenever the validation accuracy improves
            early_stopper(val_accuracy, model)
            if early_stopper.early_stop:
                print("→ Early stopping triggered.")
                break

        total_training_time = time.time() - total_start_time
        print(f"Total training time for config (lr={lr}, wd={wd}): {total_training_time:.2f}s")

        # best_score stays None if no epoch ran (num_epochs == 0)
        run_best = early_stopper.best_score
        if run_best is not None and run_best > best_val_acc:
            best_val_acc = run_best
            best_config = {'lr': lr, 'weight_decay': wd}
            # Reload the best-epoch weights from the early-stopping checkpoint.
            # The file is overwritten by the next configuration, so it has to
            # be read now.
            best_model_state = torch.load(early_stopper.save_path, map_location='cpu')
            best_history = {
                'train_losses': train_losses,
                'val_losses': val_losses,
                'train_accuracies': train_accuracies,
                'val_accuracies': val_accuracies,
                'epoch_times': epoch_times,
                'total_training_time': total_training_time,
            }

        # Free up memory after each run
        del model, optimizer, scheduler
        torch.cuda.empty_cache()

# Save and report the best configuration
print(f"Best Val Acc: {best_val_acc:.2f}% with config: {best_config}")
if best_model_state is None:
    raise RuntimeError("Grid search finished without any configuration improving on 0% validation accuracy.")
torch.save(best_model_state, "best_model.pt")
print("Saved best model to best_model.pt")

# Expose the logs of the best configuration under the names that the plotting
# and timing cells read, so the curves describe the model that was saved
train_losses = best_history['train_losses']
val_losses = best_history['val_losses']
train_accuracies = best_history['train_accuracies']
val_accuracies = best_history['val_accuracies']
epoch_times = best_history['epoch_times']
total_training_time = best_history['total_training_time']




In [None]:
# Rebuild the architecture and load the best-performing checkpoint into it.
# The grid search deletes `model` after every configuration (`del model`), so
# no model object exists at this point and a new one has to be created first.
model = build_custom_cnn(num_classes=251)
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()
model.to(device)



Evaluation was based on average cross-entropy loss, overall accuracy, per-class metrics via the classification report, and a confusion matrix visualization using the test set. Note that since the number of classes was very large, the confusion matrix was not really informative: it is basically a gray square with a white diagonal. This is why I did not include it in the actual report.


In [None]:
def evaluate_model(model, data_loader, criterion, device, class_names):
    """Evaluate ``model`` on ``data_loader`` and print/plot the results.

    Prints the average loss, the overall accuracy and a per-class
    classification report, then shows the confusion matrix as a heatmap.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        data_loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.
        class_names: Display name per class. Assumes the label index ``i``
            produced by the loader corresponds to ``class_names[i]``.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    # Above this many classes the per-cell counts are not drawn: they would be
    # unreadable and slow to render.
    max_annotated_classes = 30

    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_preds = []
    all_labels = []

    print("→ Starting evaluation...")

    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Weight the batch-mean loss by the batch size
            total_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Store predictions and ground truths for analysis
            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        raise ValueError("data_loader yielded no samples; nothing to evaluate.")

    avg_loss = total_loss / total
    accuracy = 100 * correct / total

    print(f"Evaluation Loss: {avg_loss:.4f}")
    print(f"Evaluation Accuracy: {accuracy:.2f}%\n")

    # Pass the full list of class indices explicitly. Otherwise the label set
    # is inferred from the data, and a class that is neither present nor
    # predicted makes it disagree with `class_names`.
    class_ids = list(range(len(class_names)))

    # Detailed metrics per class
    print("→ Classification Report:")
    print(classification_report(all_labels, all_preds, labels=class_ids,
                                target_names=class_names, zero_division=0))

    # Confusion matrix visualization
    print("→ Generating Confusion Matrix...")
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=len(class_names) <= max_annotated_classes, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()


In [None]:
# Display names "Class <label>", one per unique label value in the dataset, in sorted label order
unique_labels = sorted(labels_df['label'].unique())
class_names = [f"Class {label_value}" for label_value in unique_labels]

In [None]:
# Evaluate the model on `test_loader`: prints loss, accuracy and the per-class report, and shows the confusion matrix
evaluate_model(model, test_loader, criterion, device, class_names)

Plotting the curves for training and validation losses, accuracies, and validation errors.

In [None]:

# Training curves side by side: losses on the left, accuracies on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Plot losses
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss over Epochs')
loss_ax.legend()

# Plot accuracies
acc_ax.plot(train_accuracies, label='Train Accuracy')
acc_ax.plot(val_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_title('Accuracy over Epochs')
acc_ax.legend()

plt.show()


In [None]:
# Validation error is the complement of the validation accuracy (in percent)
val_errors = [100 - accuracy for accuracy in val_accuracies]

fig, error_ax = plt.subplots()
error_ax.plot(val_errors, label='Validation Error')
error_ax.set_xlabel('Epoch')
error_ax.set_ylabel('Error (%)')
error_ax.set_title('Validation Error over Epochs')
error_ax.legend()
plt.show()


Time analysis

In [None]:
# Timing summary for the run whose per-epoch durations are stored in `epoch_times`.
# The total is the sum of the recorded epoch durations. Measuring
# `time.time() - total_start_time` in this cell would also count everything
# executed after training finished (evaluation, plotting, idle time).
total_training_time = float(np.sum(epoch_times))
print(f"\nTotal Training Time: {total_training_time / 60:.2f} minutes")
print(f"Average Epoch Time: {np.mean(epoch_times):.2f} seconds")
print(f"Fastest Epoch: {np.min(epoch_times):.2f} s | Slowest Epoch: {np.max(epoch_times):.2f} s")

In [None]:
import matplotlib.pyplot as plt

# Duration of every epoch, with epochs numbered from 1
epoch_numbers = range(1, len(epoch_times) + 1)

fig, time_ax = plt.subplots()
time_ax.plot(epoch_numbers, epoch_times, marker='o')
time_ax.set_title("Epoch Duration Over Time")
time_ax.set_xlabel("Epoch")
time_ax.set_ylabel("Time (seconds)")
time_ax.grid(True)
plt.show()
