In [1]:
from torchvision import datasets, transforms

In [2]:
# Step 1: Get the Data
transform = transforms.Compose([
 transforms.ToTensor(),
 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    
])

In [3]:
# Load dataset
train_dataset = datasets.CIFAR10(root='./data',train=True,download=True,transform=transform)
test_dataset = datasets.CIFAR10(root='./data',train=False,download=True,transform=transform)

100%|██████████| 170M/170M [00:02<00:00, 82.9MB/s] 


In [4]:
from torch.utils.data import random_split,DataLoader

In [5]:
test_size = int(0.5 * len(test_dataset))   # 50% for testing
val_size = int(0.5 * len(test_dataset))  # 50% for validation

In [7]:
# Dividing test data to test data and val data
test_dataset, val_dataset = random_split(test_dataset, [test_size, val_size])

In [8]:
train_data_loader = DataLoader(train_dataset,batch_size=32,shuffle=True,num_workers=4)# change number of workers to 4 when we ran in cuda env
test_data_loader = DataLoader(test_dataset,batch_size=32,shuffle=False,num_workers=4)
val_data_loader = DataLoader(val_dataset,batch_size=32,shuffle=False,num_workers=4)

In [10]:
# Model Creation
import torchvision.models as models
import torch

model = models.resnet18(weights=True)  

In [11]:
import torch.nn as nn

num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)

In [12]:
# Make Infra ready by setting loss,optinizer and scheduler
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)
# Scheduler: ReduceLROnPlateau
# - mode='min' → expects validation loss to decrease
# - factor=0.1 → multiply LR by 0.1 when triggered
# - patience=3 → wait 3 epochs with no improvement before reducing LR
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=3,
    verbose=True
)

num_epochs = 30

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else 
                      "mps" if torch.backends.mps.is_available() else 
                      "cpu")

print("Using device:", device)
model.to(device)

Using device: cuda




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
import os

# Make sure the checkpoints directory exists
os.makedirs("checkpoints", exist_ok=True)

best_acc = 0.0
for epoch in range(num_epochs):  
    model.train()
    running_loss = 0.0
    for images, labels in train_data_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss+=loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
    model.eval()
    val_loss = 0
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            val_loss+=loss         
    acc = 100 * correct / total
    print(f"Validation Accuracy: {100 * correct / total:.2f}%")

    avg_val_loss = val_loss / len(val_data_loader)
    
    # Step the scheduler with validation loss
    scheduler.step(avg_val_loss)
    
    print(f"Epoch {epoch+1}, Current LR: {scheduler.optimizer.param_groups[0]['lr']}, "
          f"Val Loss: {avg_val_loss:.4f}")

    # --- Checkpointing ---
    checkpoint = {
        "epoch": epoch + 1,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "accuracy": acc,
        "loss": loss.item()
    }
    torch.save(checkpoint, f"checkpoints/epoch_{epoch+1}.pth")
    
    # Save best model separately
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "checkpoints/best_model.pth")
        print("Best model updated and saved!")

Epoch 1, Loss: 0.6815
Validation Accuracy: 72.88%
Epoch 1, Current LR: 0.001, Val Loss: 0.8087
Best model updated and saved!
Epoch 2, Loss: 0.9311
Validation Accuracy: 73.96%
Epoch 2, Current LR: 0.001, Val Loss: 0.7977
Best model updated and saved!
Epoch 3, Loss: 1.2481
Validation Accuracy: 75.26%
Epoch 3, Current LR: 0.001, Val Loss: 0.7722
Best model updated and saved!
Epoch 4, Loss: 0.2942
Validation Accuracy: 78.32%
Epoch 4, Current LR: 0.001, Val Loss: 0.6592
Best model updated and saved!
Epoch 5, Loss: 0.3481
Validation Accuracy: 75.04%
Epoch 5, Current LR: 0.001, Val Loss: 1.1445
Epoch 6, Loss: 0.3048
Validation Accuracy: 80.52%
Epoch 6, Current LR: 0.001, Val Loss: 0.6069
Best model updated and saved!
Epoch 7, Loss: 0.0711
Validation Accuracy: 80.50%
Epoch 7, Current LR: 0.001, Val Loss: 0.6495
Epoch 8, Loss: 0.3039
Validation Accuracy: 80.20%
Epoch 8, Current LR: 0.001, Val Loss: 0.6917
Epoch 9, Loss: 0.4269
Validation Accuracy: 80.20%
Epoch 9, Current LR: 0.001, Val Loss: 0.

In [16]:
import torch.optim as optim

# Optimizer (same as before)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Scheduler: CosineAnnealingLR
# - T_max = number of epochs (or cycle length)
# - eta_min = minimum LR at the end of the cosine curve
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=30,       # full training run length
    eta_min=1e-6    # minimum LR
)

# Training loop snippet
for epoch in range(30):
    model.train()
    running_loss = 0.0
    
    for images, labels in train_data_loader:
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Validation step
    model.eval()
    val_loss = 0.0
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    val_accuracy = 100 * correct / total
    
    # Step the scheduler
    scheduler.step()
    
    print(f"Epoch {epoch+1}, LR: {scheduler.get_last_lr()}, "
          f"Train Loss: {running_loss/len(train_data_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_data_loader):.4f}, "
          f"Val Acc: {val_accuracy:.2f}%")

Epoch 1, LR: [0.0009972636867364526], Train Loss: 0.1263, Val Loss: 0.8491, Val Acc: 80.00%
Epoch 2, LR: [0.000989084726566536], Train Loss: 0.1069, Val Loss: 0.8544, Val Acc: 79.20%
Epoch 3, LR: [0.0009755527298894295], Train Loss: 0.0952, Val Loss: 0.9939, Val Acc: 77.34%
Epoch 4, LR: [0.0009568159560924792], Train Loss: 0.0881, Val Loss: 0.8727, Val Acc: 79.50%
Epoch 5, LR: [0.0009330796891903274], Train Loss: 0.0793, Val Loss: 0.9362, Val Acc: 79.56%
Epoch 6, LR: [0.0009046039886902865], Train Loss: 0.0680, Val Loss: 0.9701, Val Acc: 78.24%
Epoch 7, LR: [0.0008717008403259585], Train Loss: 0.0662, Val Loss: 0.9020, Val Acc: 80.30%
Epoch 8, LR: [0.0008347307378762498], Train Loss: 0.0551, Val Loss: 0.9419, Val Acc: 80.30%
Epoch 9, LR: [0.0007940987335200904], Train Loss: 0.0522, Val Loss: 0.9360, Val Acc: 79.84%
Epoch 10, LR: [0.00075025], Train Loss: 0.0403, Val Loss: 1.0679, Val Acc: 78.74%
Epoch 11, LR: [0.0007036649532163624], Train Loss: 0.0371, Val Loss: 1.0568, Val Acc: 79.94