<a href="https://colab.research.google.com/github/rkrmishra/pytorch-work/blob/main/multiclass-classification/MLOps3_G24AIT042.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

### Link to Project MLOps2025_G24AIT042 Artefacts

Link to Project: https://wandb.ai/g24ait042-iit-j/MLOps2025_G24AIT042?nw=nwuserg24ait042

Link to Project Report: https://wandb.ai/g24ait042-iit-j/MLOps2025_G24AIT042/reports/Report_MLOps2025_G24AIT042--VmlldzoxMTMzODU5Ng


In [32]:
!pip install wandb -qU

In [33]:
# Import the W&B client and standard-library helpers
import wandb
import random
import math

In [34]:
# Authenticate this notebook session with W&B before any run is created.
wandb.login()

True

# Q1.
Dataset and Model Preparation

In [35]:
#@title
import torch, torchvision
import torch.nn as nn
from torchvision.datasets import MNIST
import torchvision.transforms as T

# Pick the first CUDA GPU when PyTorch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

def get_dataloader(is_train, batch_size, slice=5):
    """Build a DataLoader over a strided subset of MNIST.

    Args:
        is_train: Passed to ``MNIST(train=...)`` to choose the split; batches
            are shuffled only when this is truthy.
        batch_size: Number of samples per batch.
        slice: Stride between kept samples; every ``slice``-th example of the
            full dataset is used.

    Returns:
        A ``torch.utils.data.DataLoader`` over the subsampled dataset.
    """
    # Downloads MNIST into the current directory if it is not already there.
    mnist = MNIST(root=".", train=is_train, transform=T.ToTensor(), download=True)

    # Keep indices 0, slice, 2*slice, ... to shrink the dataset.
    kept_indices = range(0, len(mnist), slice)
    subset = torch.utils.data.Subset(mnist, indices=kept_indices)

    return torch.utils.data.DataLoader(
        dataset=subset,
        batch_size=batch_size,
        shuffle=bool(is_train),
        pin_memory=True,
        num_workers=2,
    )

def get_model(units):
    """Create a one-hidden-layer MLP and move it to the module-level ``device``.

    Args:
        units: Width of the hidden linear layer.

    Returns:
        An ``nn.Sequential`` mapping flattened 28*28 inputs to 10 outputs.
    """
    layers = [
        nn.Flatten(),
        nn.Linear(28 * 28, units),
        nn.ReLU(),
        nn.Linear(units, 10),
    ]
    return nn.Sequential(*layers).to(device)

def validate_model(model, valid_dl, loss_func, log_images=False, batch_idx=0):
    """Evaluate ``model`` on ``valid_dl`` and optionally log one batch as a wandb.Table.

    Args:
        model: Module to evaluate; it is switched to eval mode and left there.
        valid_dl: Iterable of ``(images, labels)`` batches that also exposes
            ``.dataset``, whose length normalises the totals.
        loss_func: Callable ``(outputs, labels) -> scalar tensor``. The result
            is re-weighted by the batch size, so it is assumed to be a
            per-batch mean (e.g. ``nn.CrossEntropyLoss()``) -- confirm if a
            different reduction is used.
        log_images: When true, the batch at position ``batch_idx`` is passed
            to ``log_image_table``.
        batch_idx: Index of the batch to log, so the same batch is shown
            every time.

    Returns:
        ``(mean_loss, accuracy)`` over the whole dataset, both Python floats.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    with torch.inference_mode():
        for i, (images, labels) in enumerate(valid_dl):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)

            # .item() keeps the accumulator a plain float, so the function
            # never returns a device tensor to the caller.
            total_loss += loss_func(outputs, labels).item() * labels.size(0)

            # Predicted class is the index of the largest output per sample.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()

            # Log one batch of images to the dashboard, always same batch_idx.
            if log_images and i == batch_idx:
                log_image_table(images, predicted, labels, outputs.softmax(dim=1))

    n_samples = len(valid_dl.dataset)
    return total_loss / n_samples, correct / n_samples

# Q2.
Setting Up the Project & Logging Hyperparameters
- Created new WandB accounts using iitj email id
- Created a new organization and enabled public viewing
- Created a project

Create a table to compare the predicted values with the true values

The following cell is unique to W&B, so let's go over it.

In the cell we define a function called `log_image_table`. Though technically optional, this function creates a W&B Table object. We will use the table object to create a table that shows what the model predicted for each image.

More specifically, each row will consist of the image fed to the model, along with the predicted value and the actual value (label).

In [36]:
def log_image_table(images, predicted, labels, probs):
    """Log a wandb.Table with one row per sample: image, pred, target, scores.

    Args:
        images: Batch of image tensors; only index 0 along each image's first
            axis is rendered (presumably the single grayscale channel).
        predicted: Predicted class per sample.
        labels: Ground-truth class per sample.
        probs: Per-sample scores; ten values per row are expected to match the
            ``score_0`` .. ``score_9`` columns.
    """
    score_columns = [f"score_{i}" for i in range(10)]
    table = wandb.Table(columns=["image", "pred", "target"] + score_columns)

    # Move everything to the CPU first so the rows can be converted to numpy.
    rows = zip(images.to("cpu"), predicted.to("cpu"), labels.to("cpu"), probs.to("cpu"))
    for img, pred, targ, prob in rows:
        pixels = img[0].numpy() * 255
        table.add_data(wandb.Image(pixels), pred, targ, *prob.numpy())

    # commit=False attaches the table to the step that the next wandb.log call commits.
    wandb.log({"predictions_table": table}, commit=False)

# Q3.
Training and Validation

Train your model and upload checkpoints

The following code trains and saves model checkpoints to your project. Use model checkpoints like you normally would to assess how the model performed during training.

In [37]:
# Launch a single experiment (increase the range to repeat the run)
for _ in range(1):
    # Initialise a wandb run; the config dict holds this run's hyperparameters
    wandb.init(
        project="MLOps2025_G24AIT042",
        config={
            "epochs": 5,
            "batch_size": 128,
            "lr": 1e-3,
            "model": 256  # number of hidden units passed to get_model
            })

    # Copy your config
    config = wandb.config

    # Get the data (validation uses a batch twice as large as training)
    train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
    valid_dl = get_dataloader(is_train=False, batch_size=2*config.batch_size)
    n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

    # A simple MLP model
    model = get_model(config.model)

    # Make the loss and optimizer
    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    # Training
    example_ct = 0
    step_ct = 0
    for epoch in range(config.epochs):
        model.train()
        for step, (images, labels) in enumerate(train_dl):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            train_loss = loss_func(outputs, labels)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            example_ct += len(images)
            # Log a plain float rather than the live loss tensor, matching
            # train_and_evaluate and keeping the autograd graph out of `metrics`.
            metrics = {"train/train_loss": train_loss.item(),
                       "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch,
                       "train/example_ct": example_ct}

            # The last step of the epoch is not logged here; its metrics are
            # merged with the validation metrics below so they share one step.
            if step + 1 < n_steps_per_epoch:
                # Log train metrics to wandb
                wandb.log(metrics)

            step_ct += 1

        # Only the final epoch logs the image/prediction table
        val_loss, accuracy = validate_model(model, valid_dl, loss_func, log_images=(epoch==(config.epochs-1)))

        # Log train and validation metrics to wandb
        val_metrics = {"val/val_loss": val_loss,
                       "val/val_accuracy": accuracy}
        wandb.log({**metrics, **val_metrics})

        # Save the model checkpoint to wandb (the local file is overwritten each epoch)
        torch.save(model, "my_model.pt")
        wandb.log_model("./my_model.pt", "my_mnist_model", aliases=[f"epoch-{epoch+1}_linear_units-{round(wandb.config.model, 4)}"])

        # NOTE(review): "{val_loss:3f}" is a width-3 spec (six decimals), probably
        # meant to be ".3f"; left unchanged so the recorded output still matches.
        print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:3f}, Accuracy: {accuracy:.2f}")

    # If you had a test set, this is how you could log it as a Summary metric
    # (0.8 is a hard-coded placeholder, not a measured value)
    wandb.summary['test_accuracy'] = 0.8

    # Close your wandb run
    wandb.finish()

Epoch: 1, Train Loss: 0.418, Valid Loss: 0.360714, Accuracy: 0.89
Epoch: 2, Train Loss: 0.192, Valid Loss: 0.308578, Accuracy: 0.91
Epoch: 3, Train Loss: 0.246, Valid Loss: 0.279094, Accuracy: 0.92
Epoch: 4, Train Loss: 0.504, Valid Loss: 0.257133, Accuracy: 0.92
Epoch: 5, Train Loss: 0.097, Valid Loss: 0.233352, Accuracy: 0.93


0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/example_ct,▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇██
train/train_loss,█▂▃▃▃▂▄▂▃▂▂▂▄▁▂▁▃▂▂▂▁▃▂▁▂▁▁▁▂▂▂▂▂▂▃▁▂▁▂▁
val/val_accuracy,▁▄▆▆█
val/val_loss,█▅▄▂▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/train_loss,0.09663
val/val_accuracy,0.9305
val/val_loss,0.23335


# Q4.
Hyperparameter Exploration (Sweeps)

In [41]:
# Grid sweep that maximises validation accuracy. Only the hidden-layer width
# ("model") takes more than one value, so the grid has three runs.
sweep_config = {
    "method": "grid",
    "metric": {"name": "val/val_accuracy", "goal": "maximize"},
    "parameters": {
        name: {"values": choices}
        for name, choices in (
            ("epochs", [5]),
            ("batch_size", [64]),
            ("lr", [0.001]),
            ("model", [128, 256, 512]),
        )
    },
}

In [42]:
def train_and_evaluate(config=None):
    """Train and validate one MLP as a single W&B run (used as the sweep target).

    Args:
        config: Optional hyperparameter dict forwarded to ``wandb.init``. The
            values actually used are read back from ``wandb.config``:
            ``epochs``, ``batch_size``, ``lr`` and ``model`` (hidden units).
    """
    # The context manager closes the run on exit, so no explicit
    # wandb.finish() call is needed at the end.
    with wandb.init(config=config):
        config = wandb.config

        # Get the data (validation uses a batch twice as large as training)
        train_dl = get_dataloader(is_train=True, batch_size=config.batch_size)
        valid_dl = get_dataloader(is_train=False, batch_size=2*config.batch_size)
        n_steps_per_epoch = math.ceil(len(train_dl.dataset) / config.batch_size)

        # A simple MLP model
        model = get_model(config.model)

        # Make the loss and optimizer
        loss_func = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

        # Training
        example_ct = 0
        step_ct = 0
        for epoch in range(config.epochs):
            model.train()
            for step, (images, labels) in enumerate(train_dl):
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                train_loss = loss_func(outputs, labels)
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                example_ct += len(images)
                metrics = {"train/train_loss": train_loss.item(),
                           "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch,
                           "train/example_ct": example_ct,
                           "train/step": step_ct}

                # Skip the epoch's last step here: it is logged once below,
                # merged with the validation metrics, instead of twice.
                if step + 1 < n_steps_per_epoch:
                    wandb.log(metrics)
                step_ct += 1

            # Only the final epoch logs the image/prediction table
            val_loss, accuracy = validate_model(model, valid_dl, loss_func, log_images=(epoch==(config.epochs-1)))
            val_metrics = {"val/val_loss": val_loss,
                           "val/val_accuracy": accuracy}
            wandb.log({**metrics, **val_metrics})

            # Save the model checkpoint to wandb (the local file is overwritten each epoch)
            torch.save(model, "my_model.pt")
            wandb.log_model("./my_model.pt", "my_mnist_model", aliases=[f"epoch-{epoch+1}_linear_units-{round(wandb.config.model, 4)}"])

            print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Valid Loss: {val_loss:3f}, Accuracy: {accuracy:.2f}")

        # Hard-coded placeholder showing how a test metric could be stored in the summary
        wandb.summary['test_accuracy'] = 0.8

In [43]:
# Register the sweep under the project, then let an agent in this process
# execute train_and_evaluate for up to three of its runs.
sweep_id = wandb.sweep(sweep=sweep_config, project="MLOps2025_G24AIT042")
wandb.agent(sweep_id, function=train_and_evaluate, count=3)

Create sweep with ID: ewlgi6q4
Sweep URL: https://wandb.ai/g24ait042-iit-j/MLOps2025_G24AIT042/sweeps/ewlgi6q4


[34m[1mwandb[0m: Agent Starting Run: 9dzu2f2c with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	model: 128


Epoch: 1, Train Loss: 0.346, Valid Loss: 0.369918, Accuracy: 0.89
Epoch: 2, Train Loss: 0.189, Valid Loss: 0.316781, Accuracy: 0.90
Epoch: 3, Train Loss: 0.124, Valid Loss: 0.271605, Accuracy: 0.92
Epoch: 4, Train Loss: 0.145, Valid Loss: 0.245622, Accuracy: 0.93
Epoch: 5, Train Loss: 0.093, Valid Loss: 0.224655, Accuracy: 0.94


0,1
train/epoch,▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█████
train/example_ct,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/step,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███
train/train_loss,█▇▆▅▄▄▃▂▄▂▃▂▂▃▃▃▄▃▃▂▂▃▃▂▂▂▃▂▂▁▃▂▃▁▂▁▃▁▂▁
val/val_accuracy,▁▃▅▇█
val/val_loss,█▅▃▂▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/step,939.0
train/train_loss,0.09326
val/val_accuracy,0.9355
val/val_loss,0.22466


[34m[1mwandb[0m: Agent Starting Run: y3o1e1ca with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	model: 256


Epoch: 1, Train Loss: 0.388, Valid Loss: 0.322507, Accuracy: 0.90
Epoch: 2, Train Loss: 0.308, Valid Loss: 0.277811, Accuracy: 0.92
Epoch: 3, Train Loss: 0.534, Valid Loss: 0.237170, Accuracy: 0.93
Epoch: 4, Train Loss: 0.342, Valid Loss: 0.209885, Accuracy: 0.94
Epoch: 5, Train Loss: 0.251, Valid Loss: 0.189221, Accuracy: 0.94


0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇██
train/example_ct,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/step,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/train_loss,█▇▄▃▃▂▂▂▂▂▁▂▂▂▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁
val/val_accuracy,▁▃▅▇█
val/val_loss,█▆▄▂▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/step,939.0
train/train_loss,0.25053
val/val_accuracy,0.9435
val/val_loss,0.18922


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: o1t2pqje with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	model: 512


Epoch: 1, Train Loss: 0.334, Valid Loss: 0.317736, Accuracy: 0.90
Epoch: 2, Train Loss: 0.056, Valid Loss: 0.237817, Accuracy: 0.93
Epoch: 3, Train Loss: 0.179, Valid Loss: 0.194244, Accuracy: 0.94
Epoch: 4, Train Loss: 0.041, Valid Loss: 0.176693, Accuracy: 0.94
Epoch: 5, Train Loss: 0.073, Valid Loss: 0.169724, Accuracy: 0.95


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇███
train/example_ct,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
train/step,▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train/train_loss,█▅▇█▆▆▃▅▄▅▄▆▄▃▅▃▄▄▂▂▂▄▂▃▁▂▂▃▃▅▄▂▄▄▃▂▁▁▁▂
val/val_accuracy,▁▅▇▇█
val/val_loss,█▄▂▁▁

0,1
test_accuracy,0.8
train/epoch,5.0
train/example_ct,60000.0
train/step,939.0
train/train_loss,0.07318
val/val_accuracy,0.9505
val/val_loss,0.16972
