In [None]:
# import library
import os
import torch
from torchvision.datasets import VOCSegmentation
import random
import numpy as np

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): Value handed unchanged to Python's ``random`` module,
            NumPy's legacy global generator, the PyTorch CPU generator and
            the PyTorch generators of all CUDA devices.
    """
    seeders = (
        random.seed,                 # Python standard library
        np.random.seed,              # NumPy global generator
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed_all,  # PyTorch generators on every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Run on the GPU when CUDA reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Fix every RNG seed to 42. For reproducibility purposes, please do not modify this.
set_seed(42)

## Helper functions and dataset setup

## 1. Download dataset
Please refer to [this function](https://docs.pytorch.org/vision/main/generated/torchvision.datasets.VOCSegmentation.html) from TorchVision to download the Pascal VOC Segmentation Dataset.

Note that you can change the arguments of the provided code to match your requirements.

The Pascal VOC 2012 Segmentation Dataset only provides a `train` set and a `val` set, so you are required to train on the `train` set only and then test the model on the `val` set.

**Note:** The dataset contains a void class with index 255. You can either treat the pixels with this label as background or simply ignore them when calculating the loss value. [Refer to this post for suggestions](https://discuss.pytorch.org/t/having-trouble-with-voc-2012-segmentation-with-the-void-255-label/46486/7)

In [None]:
# Root folder that receives the downloaded archive and the extracted dataset files.
voc_dir = './data'
os.makedirs(voc_dir, exist_ok=True)

# download=True fetches and extracts the VOC 2012 archive into voc_dir.
train_dataset = VOCSegmentation(root=voc_dir, year="2012", image_set="train", download=True)
# The val split lives in the same archive that the line above has just put in
# place, so download=False avoids verifying and extracting ~2 GB a second time.
val_dataset = VOCSegmentation(root=voc_dir, year="2012", image_set="val", download=False)

# Names of the 21 Pascal VOC categories. Presumably a name's position in the
# list is its label index in the segmentation masks (0 = background) - confirm.
VOC_CLASSES = [
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "potted plant",
    "sheep",
    "sofa",
    "train",
    "tv/monitor",
]

# 21 three-integer entries, parallel to VOC_CLASSES by position. Presumably the
# [R, G, B] color used to draw each class - confirm against the mask palette.
VOC_COLORMAP = [
    [0, 0, 0],      [128, 0, 0],     [0, 128, 0],   [128, 128, 0],
    [0, 0, 128],    [128, 0, 128],   [0, 128, 128], [128, 128, 128],
    [64, 0, 0],     [192, 0, 0],     [64, 128, 0],  [192, 128, 0],
    [64, 0, 128],   [192, 0, 128],   [64, 128, 128], [192, 128, 128],
    [0, 64, 0],     [128, 64, 0],    [0, 192, 0],   [128, 192, 0],
    [0, 64, 128],
]

100%|██████████| 2.00G/2.00G [00:51<00:00, 38.6MB/s]


## 2. Helper function
You are required to use this helper function to calculate the mean IoU score.

In [None]:
# Provided meanIoU score
import numpy as np
from sklearn.metrics import confusion_matrix

def calculate_segmentation_metrics(preds, masks, num_classes, ignore_index=0):
    """
    Compute mean Precision, Recall, IoU and Dice, plus overall pixel accuracy.

    Both inputs are flattened, then every pixel whose ground-truth label equals
    ``ignore_index`` is dropped from ``preds`` and ``masks`` before counting.

    Args:
        preds (Tensor): Predicted class indices, e.g. shape (B, H, W).
        masks (Tensor): Ground-truth class indices with the same number of
            elements as ``preds``.
        num_classes (int): Classes ``0 .. num_classes - 1`` are evaluated.
        ignore_index (int): Ground-truth label excluded from evaluation
            (e.g. the index of the background). Defaults to 0.

    Returns:
        dict: Flat dictionary of floats with the keys
            - 'precision', 'recall', 'iou', 'dice': unweighted mean over the
              evaluated classes,
            - 'pixel_accuracy': summed true positives divided by the number of
              kept (non-ignored) pixels.

    Notes:
        - A class is skipped when, after filtering, it occurs in neither the
          predictions nor the ground truth.
        - A class that is predicted but absent from the ground truth is still
          evaluated and contributes 0 to every mean. This includes the
          ``ignore_index`` class when it is predicted on a kept pixel.
        - Only ``ignore_index`` is filtered. Any other ground-truth label
          outside ``range(num_classes)`` (e.g. a void label 255) stays in the
          pixel-accuracy denominator and counts as a false positive for the
          class predicted there; remap such labels beforehand if unwanted.
        - If no class is evaluated (e.g. every pixel is ignored) all means are
          0.0 rather than raising ``ZeroDivisionError``.
    """
    eps = 1e-6  # for numerical stability

    # reshape (unlike view) also accepts non-contiguous tensors, e.g. the
    # result of a transpose or permute.
    preds = preds.reshape(-1)
    masks = masks.reshape(-1)
    valid = masks != ignore_index

    preds = preds[valid]
    masks = masks[valid]

    total_correct = 0
    total_pixels = valid.sum().item()

    precision_list = []
    recall_list = []
    iou_list = []
    dice_list = []

    for cls in range(num_classes):
        pred_inds = preds == cls
        target_inds = masks == cls

        pred_sum = pred_inds.sum().item()
        target_sum = target_inds.sum().item()

        # Class absent from both prediction and ground truth: not evaluated.
        if target_sum == 0 and pred_sum == 0:
            continue

        TP = (pred_inds & target_inds).sum().item()
        FP = pred_sum - TP    # predicted as cls, labelled otherwise
        FN = target_sum - TP  # labelled cls, predicted otherwise

        precision_list.append(TP / (TP + FP + eps))
        recall_list.append(TP / (TP + FN + eps))
        iou_list.append(TP / (TP + FP + FN + eps))
        dice_list.append((2 * TP) / (pred_sum + target_sum + eps))

        total_correct += TP

    def _mean(values):
        # Average of a list, defined as 0.0 for an empty list.
        return sum(values) / len(values) if values else 0.0

    return {
        "precision": _mean(precision_list),
        "recall": _mean(recall_list),
        "iou": _mean(iou_list),
        "dice": _mean(dice_list),
        "pixel_accuracy": total_correct / (total_pixels + eps),
    }

# Task 1: Build a baseline Fully Convolutional Network (FCN) model for semantic segmentation (5 marks)

In [None]:
# Note: You can modify this code to load the backbone, just make sure you use model and weights from Nvidia
# Fetches the EfficientNet-B0 entry point, with pretrained weights, from NVIDIA's torch.hub repository.
backbone_efficientnet = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_efficientnet_b0",
    pretrained=True,
)

Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /root/.cache/torch/hub/torchhub.zip
Downloading: "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b0_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-b0_210412.pth" to /root/.cache/torch/hub/checkpoints/nvidia_efficientnet-b0_210412.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 128MB/s] 


In [None]:
# Your code starts from here

# Task 2: Improve the baseline FCN model (8 marks)

In [None]:
# Your code starts from here