In [1]:
from torchvision import datasets, transforms

In [3]:
# Step 1: Get the Data
transform = transforms.Compose([
 transforms.ToTensor(),
 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    
])

In [12]:
# Load dataset
train_dataset = datasets.CIFAR10(root='./data',train=True,download=True,transform=transform)
test_dataset = datasets.CIFAR10(root='./data',train=False,download=True,transform=transform)

In [13]:
len(test_dataset)

10000

In [14]:
len(train_dataset)

50000

In [16]:
from torch.utils.data import random_split,DataLoader

In [17]:
test_size = int(0.5 * len(test_dataset))   # 50% for testing
val_size = int(0.5 * len(test_dataset))  # 50% for validation

In [18]:
# Dividing test data to test data and val data
test_dataset, val_dataset = random_split(test_dataset, [test_size, val_size])

In [19]:
len(test_dataset)

5000

In [21]:
train_data_loader = DataLoader(train_dataset,batch_size=32,shuffle=True,num_workers=0)# change number of workers to 4 when we ran in cuda env
test_data_loader = DataLoader(test_dataset,batch_size=32,shuffle=False,num_workers=0)
val_data_loader = DataLoader(val_dataset,batch_size=32,shuffle=False,num_workers=0)

In [24]:
# Model Creation
import torchvision.models as models
import torch

model = models.resnet18(weights=None)  # no auto-download
state_dict = torch.load("../pytorch-dataloading/resnet18-5c106cde.pth",weights_only=False)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [26]:
import torch.nn as nn

num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)

In [27]:
# Make Infra ready by setting loss,optinizer and scheduler
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)
# Scheduler: StepLR
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
num_epochs = 30

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else 
                      "mps" if torch.backends.mps.is_available() else 
                      "cpu")

print("Using device:", device)
model.to(device)

Using device: mps


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [29]:
import os

# Make sure the checkpoints directory exists
os.makedirs("checkpoints", exist_ok=True)

best_acc = 0.0
for epoch in range(num_epochs):  
    model.train()
    running_loss = 0.0
    for images, labels in train_data_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss+=loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
    model.eval()
    val_loss = 0
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            val_loss+=loss         
    acc = 100 * correct / total
    print(f"Validation Accuracy: {100 * correct / total:.2f}%")

    # --- Checkpointing ---
    checkpoint = {
        "epoch": epoch + 1,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "accuracy": acc,
        "loss": loss.item()
    }
    torch.save(checkpoint, f"checkpoints/epoch_{epoch+1}.pth")
    
    # Save best model separately
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "checkpoints/best_model.pth")
        print("Best model updated and saved!")

    # Step the scheduler
    scheduler.step()        
    print(f"Epoch {epoch+1}, LR: {scheduler.get_last_lr()}, "
          f"Train Loss: {running_loss/len(train_data_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_data_loader):.4f}, "
          f"Val Acc: {acc:.2f}%")

Epoch 1, Loss: 0.9107
Validation Accuracy: 72.46%
Best model updated and saved!
Epoch 1, LR: [0.001], Train Loss: 1.0723, Val Loss: 0.8040, Val Acc: 72.46%
Epoch 2, Loss: 0.7060
Validation Accuracy: 76.80%
Best model updated and saved!
Epoch 2, LR: [0.001], Train Loss: 0.7545, Val Loss: 0.6923, Val Acc: 76.80%
Epoch 3, Loss: 0.4130
Validation Accuracy: 79.14%
Best model updated and saved!
Epoch 3, LR: [0.001], Train Loss: 0.6085, Val Loss: 0.6233, Val Acc: 79.14%
Epoch 4, Loss: 0.4066
Validation Accuracy: 79.98%
Best model updated and saved!
Epoch 4, LR: [0.001], Train Loss: 0.4935, Val Loss: 0.5882, Val Acc: 79.98%
Epoch 5, Loss: 1.1649
Validation Accuracy: 77.72%
Epoch 5, LR: [0.001], Train Loss: 0.3949, Val Loss: 0.7002, Val Acc: 77.72%
Epoch 6, Loss: 0.8591
Validation Accuracy: 79.32%
Epoch 6, LR: [0.001], Train Loss: 0.3288, Val Loss: 0.6553, Val Acc: 79.32%
Epoch 7, Loss: 0.8972
Validation Accuracy: 79.12%
Epoch 7, LR: [0.001], Train Loss: 0.2604, Val Loss: 0.7013, Val Acc: 79.12