# ResNet50 Training on ImageNet

This notebook trains ResNet50 on ImageNet with the following configuration:
- Model: ResNet50 (imported from `models/model_resnet50.py`)
- Batch size: 128 (increased from ResNet152's 48)
- Model name: `resnet_50_sgd1`
- Architecture: ResNet50 with [3, 4, 6, 3] residual blocks across its four stages

## Import Libraries

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import time

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

print("Libraries imported - ready to use PyTorch", torch.__version__)

## Setup Parameters and Device

In [None]:
# Helper function
def show_image(image, label):
    """Display ``image`` with matplotlib, using ``label`` in the plot title.

    The tensor's axes are reordered with ``permute(1, 2, 0)`` before
    plotting, i.e. the first axis is moved to the end.
    """
    # assumes image is channel-first (C, H, W) and matplotlib wants (H, W, C) — TODO confirm
    channels_last = image.permute(1, 2, 0)
    # squeeze() drops size-1 axes (e.g. a single-channel image)
    plt.imshow(channels_last.squeeze())
    plt.title(f'Label: {label}')
    plt.show()

# Device setup: prefer CUDA, then Apple MPS, and fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Resume training option: when True, a later cell restores state from an
# existing checkpoint file if one is found
resume_training = True

In [None]:
# Training parameters
class Params:
    """Hyper-parameters for the ResNet50 training run.

    An instance is stored inside every checkpoint and compared against the
    live configuration when resuming, so equality is defined as
    attribute-by-attribute equality.
    """

    def __init__(self):
        self.batch_size = 128  # Changed from 48 to 128 for ResNet50
        self.name = "resnet_50_sgd1"  # Changed from resnet_152_sgd1
        self.workers = 4  # passed to the DataLoaders as num_workers
        self.lr = 0.002
        self.momentum = 0.9
        self.weight_decay = 1e-4
        # NOTE(review): lr_step_size / lr_gamma are not read by any cell
        # visible in this notebook — presumably left over from a StepLR setup.
        self.lr_step_size = 30
        self.lr_gamma = 0.1

    def __repr__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(self.__dict__)

    def __eq__(self, other):
        """Compare two ``Params`` by their attribute dictionaries.

        Returns ``NotImplemented`` for foreign types so that ``params == 5``
        evaluates to ``False`` instead of raising ``AttributeError``.
        """
        if not isinstance(other, Params):
            return NotImplemented
        return self.__dict__ == other.__dict__

    # Instances are mutable and define __eq__, so they must not be hashable.
    __hash__ = None


params = Params()
print(params)

## Data Loading

In [None]:
# Data paths: the train and val splits sit side by side under CLS-LOC
imagenet_data_root = '/home/xpz1/Downloads/imagenet-object-localization-challenge/ILSVRC/Data/CLS-LOC'
training_folder_name = f'{imagenet_data_root}/train'
val_folder_name = f'{imagenet_data_root}/val'

In [None]:
# Training data loader
# Augmentation: random resized crop to 224 px plus horizontal flip, followed
# by per-channel normalisation with the standard ImageNet RGB statistics.
train_transformation = transforms.Compose([
    # Convert to a tensor first; the crop/flip below then run on tensors
    transforms.ToTensor(),
    transforms.RandomResizedCrop(224, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
    transforms.RandomHorizontalFlip(0.5),
    # Green-channel mean is 0.456 (it was mistyped as a second 0.485)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = torchvision.datasets.ImageFolder(
    root=training_folder_name,
    transform=train_transformation
)
# A RandomSampler draws the samples in a fresh random order each epoch
train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=params.batch_size,
    sampler=train_sampler,
    num_workers=params.workers,
    pin_memory=True,
)

In [None]:
# Validation data loader
# Deterministic preprocessing: resize the short side to 256 px, centre-crop
# to 224 px, then normalise with the standard ImageNet RGB statistics.
val_transformation = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(size=256, antialias=True),
    transforms.CenterCrop(224),
    # Green-channel mean is 0.456 (it was mistyped as a second 0.485)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# NOTE(review): ImageFolder expects one sub-directory per class whose sorted
# names match the training folder — presumably the val images have been
# arranged that way; verify, otherwise labels will not line up.
val_dataset = torchvision.datasets.ImageFolder(
    root=val_folder_name,
    transform=val_transformation
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=params.batch_size,  # Kept in sync with the training batch size
    num_workers=params.workers,
    shuffle=False,
    pin_memory=True
)

## Define Training and Testing Functions

In [None]:
from math import sqrt

def train(dataloader, model, loss_fn, optimizer, epoch, writer):
    """Run one training epoch and return ``(avg_loss, accuracy)``.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches that also
            exposes a sized ``dataset`` attribute.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used to compute the logging step.
        writer: Object with an ``add_scalar(tag, value, step)`` method, or
            ``None`` to skip scalar logging.

    Returns:
        Tuple of the mean per-batch loss and the top-1 accuracy in percent,
        measured over ``len(dataloader.dataset)`` samples.
    """
    size = len(dataloader.dataset)
    model.train()
    epoch_start = time.time()
    window_start = epoch_start

    total_loss = 0
    correct = 0
    seen = 0  # samples processed so far in this epoch

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        batch_size = len(X)
        seen += batch_size

        # Clear gradients before the backward pass so anything left in .grad
        # by earlier code cannot leak into the first update of the epoch.
        optimizer.zero_grad()

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Track metrics (loss.item() is read once and kept as a plain float,
        # leaving the `loss` tensor name untouched)
        loss_value = loss.item()
        total_loss += loss_value
        correct += (pred.argmax(1) == y).sum().item()

        if batch % 100 == 0:
            # Use the real sample count so a short final batch is reported
            # correctly
            current = seen
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}], {(current/size * 100):>4f}%")
            if writer is not None:
                # Global step counts samples seen across all epochs
                writer.add_scalar('training loss', loss_value, epoch * size + current)
            now = time.time()
            delta = now - window_start
            window_start = now
            if batch != 0:
                print("Done in ", delta, " seconds")
                # Throughput estimate over the last 100 batches, in samples/s
                remaining_samples = size - current
                speed = 100 * batch_size / delta
                print("Remaining time (seconds): ", remaining_samples / speed)

    print("Entire epoch done in ", time.time() - epoch_start, " seconds")
    # Return metrics
    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / size
    return avg_loss, accuracy

def test(dataloader, model, loss_fn, epoch, writer, train_dataloader, calc_acc5=False):
    """Evaluate ``model`` on ``dataloader`` and report loss and accuracy.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches that also
            exposes a sized ``dataset`` attribute.
        model: Module to evaluate; it is switched to eval mode.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        epoch: Epoch index used to place the point on the logging axis.
        writer: Object with an ``add_scalar(tag, value, step)`` method, or
            ``None`` to skip scalar logging.
        train_dataloader: Training loader; its dataset length converts
            ``epoch`` into the same sample-count step used for the training
            loss curve. Only read when ``writer`` is given.
        calc_acc5: When true, top-5 accuracy is computed, logged and printed.

    Returns:
        Tuple ``(mean per-batch loss, top-1 accuracy %, top-5 accuracy %)``.
        The top-5 value is 0 when ``calc_acc5`` is false.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, correct_top5 = 0, 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            if calc_acc5:
                # A sample counts as a hit if its target appears anywhere in
                # the five highest-scoring classes
                _, pred_top5 = pred.topk(5, 1, largest=True, sorted=True)
                correct_top5 += pred_top5.eq(y.view(-1, 1).expand_as(pred_top5)).sum().item()

    # Turn running sums into a mean loss and accuracy fractions
    test_loss /= num_batches
    correct /= size
    correct_top5 /= size

    if writer is not None:
        step = epoch * len(train_dataloader.dataset)
        writer.add_scalar('test loss', test_loss, step)
        writer.add_scalar('test accuracy', 100*correct, step)
        if calc_acc5:
            writer.add_scalar('test accuracy5', 100*correct_top5, step)

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    if calc_acc5:
        print(f"Test Error: \n Accuracy-5: {(100*correct_top5):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    return test_loss, correct * 100, correct_top5 * 100  # Return as percentages

## Import ResNet50 Model

In [None]:
# Import ResNet50 from our model file
# The `models` package is looked up via '..', which Python resolves relative
# to the current working directory, so the path must be extended before the
# import below runs.
import sys
sys.path.append('..')  # Add parent directory to path
from models.model_resnet50 import resnet50

# Only reached when the import above succeeded
print("ResNet50 model imported successfully!")

## Create ResNet50 Model

In [None]:
# Clear GPU memory left over from earlier runs in this session
import gc
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

# Create ResNet50 model using our imported function.
# The model is placed on the `device` chosen in the setup cell rather than a
# hard-coded "cuda", so the notebook also runs on MPS or CPU-only machines and
# the model sits on the same device that train()/test() move each batch to.
model = resnet50(num_classes=1000).to(device)

print("Model: ResNet50")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"\nUsing batch size: {params.batch_size}")
print("ResNet50 uses ~3x less memory than ResNet152, allowing larger batch sizes")

## Learning Rate Finder

In [None]:
# Use the same LR finder code as ResNet152
from torch_lr_finder import LRFinder
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import gaussian_filter1d

# Run LR finder: sweep the learning rate from 1e-7 up to end_lr over
# num_iter training batches while recording the loss.
# NOTE(review): this probe optimizer has no momentum, unlike the training
# optimizer — confirm that is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-7, weight_decay=params.weight_decay)
# Run on the device selected in the setup cell instead of a hard-coded "cuda"
lr_finder = LRFinder(model, optimizer, criterion, device=device)
lr_finder.range_test(train_loader, end_lr=0.02, num_iter=5000, smooth_f=0.1)

# Extract and analyze: smooth the loss curve, then pick the learning rate at
# which the loss falls fastest with respect to log(lr)
lrs = lr_finder.history["lr"]
losses = lr_finder.history["loss"]
smoothed_losses = gaussian_filter1d(losses, sigma=50)
gradients = np.gradient(smoothed_losses, np.log(lrs))
min_grad_idx = np.argmin(gradients)
optimal_lr = lrs[min_grad_idx]

# Plot
plt.figure(figsize=(10, 6))
plt.plot(lrs, losses, label="Original Loss", alpha=0.5)
plt.plot(lrs, smoothed_losses, label="Smoothed Loss", color="red")
plt.scatter(optimal_lr, smoothed_losses[min_grad_idx], color="blue",
            label=f"Steepest Drop LR: {optimal_lr:.2e}", zorder=5)
plt.xscale("log")
plt.xlabel("Learning Rate")
plt.ylabel("Loss")
plt.title("Learning Rate Finder with Steepest Drop Marked")
plt.legend()
plt.grid(True)
plt.show()

print(f"Learning rate with steepest drop in loss: {optimal_lr:.2e}")
# Restore the model and optimizer to their state from before the sweep
lr_finder.reset()

## Training Setup

In [None]:
# Set learning rate and create optimizer/scheduler
params.lr = optimal_lr # 0.0001  # Or use optimal_lr from LR finder

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),
                            lr=params.lr, momentum=params.momentum, weight_decay=params.weight_decay)

# OneCycleLR scheduler
# Note: steps_per_epoch will be different with batch_size=128
steps_per_epoch = len(train_loader)
print(f"Steps per epoch with batch_size={params.batch_size}: {steps_per_epoch}")

# The training loop calls lr_scheduler.step() once per epoch, so the cycle has
# to span exactly `num_epochs` scheduler steps. Sizing it as
# epochs * steps_per_epoch while stepping per epoch would leave the learning
# rate essentially at its starting value for the whole run.
# NOTE(review): OneCycleLR overwrites the optimizer's learning rate with
# max_lr / div_factor at construction, so params.lr does not drive the
# schedule — confirm whether max_lr should be derived from params.lr.
num_epochs = 100
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.002, total_steps=num_epochs,
    pct_start=0.3, anneal_strategy='cos',
    cycle_momentum=True, base_momentum=0.85, max_momentum=0.95,
    div_factor=10.0, final_div_factor=1000.0,
    three_phase=False,
    # `verbose` is intentionally not passed: it only echoed a placeholder
    # default and ties the call to PyTorch versions that accept the argument.
)

## Resume Training Setup

In [None]:
# Resume from checkpoint if available
start_epoch = 0
checkpoint_path = os.path.join("checkpoints", params.name, "checkpoint.pth")

if resume_training and os.path.exists(checkpoint_path):
    # The checkpoint pickles a Params instance next to the tensors, so it
    # cannot be read in weights-only mode. Full unpickling executes arbitrary
    # code: only load checkpoints written by this notebook.
    # map_location places the tensors on the device selected for this session.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Validate before touching any state, with an explicit exception rather
    # than an assert (asserts are stripped under `python -O`).
    # NOTE(review): params.lr is overwritten with the LR-finder result in the
    # setup cell, so this check also requires that value to match the one
    # stored in the checkpoint — confirm that is intended.
    if params != checkpoint["params"]:
        raise ValueError(
            f"Checkpoint parameters {checkpoint['params']} do not match "
            f"the current parameters {params}"
        )

    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
    # The stored epoch is the last one that finished; continue with the next
    start_epoch = checkpoint["epoch"] + 1
    print(f"Resumed from epoch {start_epoch}")

## Training Loop

In [None]:
## Initialize Training History
# Training history for plotting (one entry per epoch run in this session)
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': [],
    'val_acc_top5': [],
    'lr': []
}

# Best model tracking
best_val_acc = 0
best_epoch = 0

# Setup tensorboard
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path

num_epochs = 100
checkpoint_dir = os.path.join("checkpoints", params.name)
Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter('runs/' + params.name)

# Initial validation, logged at the step training (re)starts from so that a
# resumed run does not write a trained model's score at step 0
test(val_loader, model, loss_fn, epoch=start_epoch, writer=writer, train_dataloader=train_loader, calc_acc5=True)

# Training loop
for epoch in range(start_epoch, num_epochs):
    print(f"Running Epoch:{epoch} ")
    # Learning rate in effect while this epoch trains (read before the
    # scheduler advances, so history/plots show the rate actually used)
    current_lr = optimizer.param_groups[0]['lr']

    # Get training metrics
    train_loss, train_acc = train(train_loader, model, loss_fn, optimizer, epoch=epoch, writer=writer)

    # Advance the schedule BEFORE checkpointing: a resumed run starts at
    # epoch + 1 and must pick up the next epoch's learning rate instead of
    # repeating this one.
    lr_scheduler.step()

    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "lr_scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
        "params": params
    }
    # Keep a per-epoch snapshot plus a rolling "latest" file used for resuming
    torch.save(checkpoint, os.path.join(checkpoint_dir, f"model_{epoch}.pth"))
    torch.save(checkpoint, os.path.join(checkpoint_dir, "checkpoint.pth"))

    # Get validation metrics
    val_loss, val_acc, val_acc_top5 = test(val_loader, model, loss_fn, epoch + 1, writer,
                                           train_dataloader=train_loader, calc_acc5=True)

    # Update history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_acc_top5'].append(val_acc_top5)
    history['lr'].append(current_lr)

    # Track best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_epoch = epoch
        torch.save(checkpoint, os.path.join(checkpoint_dir, "best_model.pth"))

    # Print epoch summary
    print(f"\nEpoch {epoch+1}/{num_epochs} Summary:")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
    print(f"  Val Top-5 Acc: {val_acc_top5:.2f}%")
    print(f"  Learning Rate: {current_lr:.2e}")
    print(f"  Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch+1})")
    print("=" * 50)

# Flush pending TensorBoard events to disk and release the event file
writer.close()

In [None]:
## Plot Training Progress
# Plot training curves on a 2x2 grid: loss, accuracy, learning rate, summary
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

epochs = range(1, len(history['train_loss']) + 1)


def _final_pct(key):
    """Format the last value of ``history[key]`` as a percentage, or 'n/a' if no epoch ran."""
    values = history[key]
    return f"{values[-1]:.2f}%" if values else "n/a"


# Loss plot
axes[0, 0].plot(epochs, history['train_loss'], 'b-', label='Train')
axes[0, 0].plot(epochs, history['val_loss'], 'r-', label='Val')
axes[0, 0].set_title('Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True)

# Accuracy plot
axes[0, 1].plot(epochs, history['train_acc'], 'b-', label='Train')
axes[0, 1].plot(epochs, history['val_acc'], 'r-', label='Val Top-1')
axes[0, 1].plot(epochs, history['val_acc_top5'], 'g-', label='Val Top-5')
axes[0, 1].set_title('Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy (%)')
axes[0, 1].legend()
axes[0, 1].grid(True)

# Learning rate plot (log scale)
axes[1, 0].plot(epochs, history['lr'], 'orange')
axes[1, 0].set_title('Learning Rate')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('LR')
axes[1, 0].set_yscale('log')
axes[1, 0].grid(True)

# Summary text. The "Final ..." rows go through _final_pct so that an empty
# history (e.g. a resumed run with no epochs left) does not raise IndexError.
axes[1, 1].axis('off')
summary_text = f"""Training Summary:

Model: ResNet50
Batch Size: {params.batch_size}
Epochs Trained: {len(history['train_loss'])}

Best Val Acc: {best_val_acc:.2f}%
Best Epoch: {best_epoch + 1}

Final Train Acc: {_final_pct('train_acc')}
Final Val Acc: {_final_pct('val_acc')}
Final Val Top-5: {_final_pct('val_acc_top5')}
"""
axes[1, 1].text(0.1, 0.5, summary_text, fontsize=12,
                verticalalignment='center', fontfamily='monospace')

plt.suptitle('ResNet50 Training Progress', fontsize=16)
plt.tight_layout()

# Save the plot
Path("plots").mkdir(exist_ok=True)
plt.savefig("plots/resnet50_training_curves.png", dpi=100)
plt.show()

In [None]:
## Save Training History
# Persist the per-epoch metrics and the run configuration as JSON under logs/
Path("logs").mkdir(exist_ok=True)

# Per-epoch metrics
history_path = "logs/resnet50_training_history.json"
with open(history_path, 'w') as history_file:
    history_file.write(json.dumps(history, indent=4))
print(f"Training history saved to {history_path}")

# Run configuration together with the headline results
config_dict = dict(
    model='ResNet50',
    batch_size=params.batch_size,
    initial_lr=params.lr,
    momentum=params.momentum,
    weight_decay=params.weight_decay,
    epochs_trained=len(history['train_loss']),
    best_val_acc=best_val_acc,
    best_epoch=best_epoch + 1,  # stored 1-based
)

config_path = "logs/resnet50_config.json"
with open(config_path, 'w') as config_file:
    config_file.write(json.dumps(config_dict, indent=4))
print(f"Configuration saved to {config_path}")