In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from pathlib import Path
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from torchvision import transforms, models
from torchvision.datasets import VOCSegmentation
from PIL import Image

In [2]:
# Configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print_freq = 10

In [3]:
data_dir = "./VOC2012"  # root folder containing VOCdevkit/
full_dataset = VOCSegmentation(root=data_dir, image_set='train', download=False)
print(f"Total images: {len(full_dataset)}")

Total images: 1464


In [4]:
# Subset
subset_size = 15
random.seed(42)  # for reproducibility
subset_indices = random.sample(range(len(full_dataset)), subset_size)
subset_dataset = Subset(full_dataset, subset_indices)


## Spliting into 80/20% train and test datasets

In [5]:
train_size = int(0.8 * subset_size)
test_size = subset_size - train_size

train_indices = subset_indices[:train_size]
test_indices = subset_indices[train_size:]

train_dataset = Subset(full_dataset, train_indices)
test_dataset = Subset(full_dataset, test_indices)

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")


Train size: 12, Test size: 3


In [6]:
class VOCSubsetDataset(torch.utils.data.Dataset):
    def __init__(self, subset, size=224):
        self.subset = subset
        self.size = size
        # transformations excluding resizing
        self.img_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
        
    def __len__(self):
        return len(self.subset)
    
    def __getitem__(self, idx):
        img, mask = self.subset[idx]  # PIL images
        
        # Resize PIL images first
        img = TF.resize(img, (self.size, self.size))
        mask = TF.resize(mask, (self.size, self.size), interpolation=TF.InterpolationMode.NEAREST)
        
        # Apply tensor conversion and normalization
        img = self.img_transform(img)
        mask = torch.as_tensor(np.array(mask), dtype=torch.long)
        
        return img, mask


In [7]:
#Wrap subsets with your transform
train_data = VOCSubsetDataset(train_dataset, size=256)
test_data  = VOCSubsetDataset(test_dataset, size=256)



In [8]:
train_loader = DataLoader(train_data, batch_size=2, shuffle=True)
test_loader  = DataLoader(test_data, batch_size=2, shuffle=False)


## Preprocessing

In [9]:
# Convert mask (PIL Image) to class indices
def mask_to_class(mask):
    mask = np.array(mask)
    # VOC has 21 classes, background=0
    return torch.from_numpy(mask).long()

In [10]:
import torch
import numpy as np

def mask_to_class(mask):
    """
    Converts VOC mask (PIL Image or numpy array) to class indices tensor
    """
    if isinstance(mask, np.ndarray):
        mask_np = mask
    else:
        mask_np = np.array(mask)
    # If mask has RGB, map colors to class indices (VOC has 21 classes)
    # Here, assume mask already has class indices (0-20)
    return torch.as_tensor(mask_np, dtype=torch.long)


## Model Implementation

In [11]:
class FCN32s(nn.Module):
    def __init__(self, num_classes=21, pretrained=True, upsample_method='bilinear'):
        super().__init__()
        vgg = models.vgg16(pretrained=pretrained)
        self.features = vgg.features  # convolutional backbone
        
        # Replace FC layers with conv layers
        self.conv6 = nn.Conv2d(512, 4096, kernel_size=7)
        self.relu6 = nn.ReLU(inplace=True)
        self.drop6 = nn.Dropout2d()
        self.conv7 = nn.Conv2d(4096, 4096, kernel_size=1)
        self.relu7 = nn.ReLU(inplace=True)
        self.drop7 = nn.Dropout2d()
        self.score = nn.Conv2d(4096, num_classes, kernel_size=1)
        
        self.upsample_method = upsample_method

    def forward(self, x):
        x = self.features(x)
        x = self.relu6(self.conv6(x))
        x = self.drop6(x)
        x = self.relu7(self.conv7(x))
        x = self.drop7(x)
        x = self.score(x)
        
        if self.upsample_method == 'bilinear':
            x = F.interpolate(x, size=(256,256), mode='bilinear', align_corners=False)
        else:
            x = nn.ConvTranspose2d(21, 21, kernel_size=64, stride=32, padding=16, bias=False)(x)
        return x

# Instantiate model
num_classes = 21  # Pascal VOC
device = "cuda" if torch.cuda.is_available() else "cpu"
model = FCN32s(num_classes=num_classes).to(device)




## Loss and optimizer

In [16]:
criterion = nn.CrossEntropyLoss(ignore_index=255)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## Metrics
(a) Pixel Accuracy

Fraction of correctly classified pixels:

In [17]:
import torch
import torch.nn.functional as F

def pixel_accuracy(outputs, masks):
    """
    outputs: [B, C, H, W] raw logits
    masks:   [B, H, W] ground truth class index (0â€“20)
    """
    preds = outputs.argmax(dim=1)              # [B, H, W]
    correct = (preds == masks).float()
    acc = correct.sum() / correct.numel()
    return acc.item()


def mean_iou(pred, mask, num_classes=21):
    pred = pred.argmax(1).cpu().numpy()
    mask = mask.cpu().numpy()

    ious = []

    for cls in range(num_classes):
        pred_cls = (pred == cls)
        mask_cls = (mask == cls)

        intersection = (pred_cls & mask_cls).sum()
        union = (pred_cls | mask_cls).sum()

        if union == 0:
            continue

        iou = intersection / union
        ious.append(iou)

    return np.mean(ious) if len(ious) > 0 else 0

In [None]:
from torchvision.transforms import functional as TF
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    epoch_pixacc = 0
    epoch_miou = 0

    for imgs, masks in train_loader:
        imgs, masks = imgs.to(device), masks.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)

        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_pixacc += pixel_accuracy(outputs, masks)
        epoch_miou += mean_iou(outputs, masks)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Loss: {epoch_loss/len(train_loader):.4f}")
    print(f"  Pixel Acc: {epoch_pixacc/len(train_loader):.4f}")
    print(f"  mIoU: {epoch_miou/len(train_loader):.4f}")


Epoch 1/20
  Loss: 2.1349
  Pixel Acc: 0.4307
  mIoU: 0.1304
Epoch 2/20
  Loss: 1.2064
  Pixel Acc: 0.6117
  mIoU: 0.2066
Epoch 3/20
  Loss: 0.9855
  Pixel Acc: 0.6434
  mIoU: 0.2292
Epoch 4/20
  Loss: 1.0854
  Pixel Acc: 0.6325
  mIoU: 0.2429
Epoch 5/20
  Loss: 0.9568
  Pixel Acc: 0.6236
  mIoU: 0.2440
Epoch 6/20
  Loss: 0.9151
  Pixel Acc: 0.6306
  mIoU: 0.2070


In [1]:
plt.figure(figsize=(10,4))

# Loss curve
plt.subplot(1,2,1)
plt.plot(range(1, num_epochs+1), train_losses, marker='o', label='Train Loss')
plt.title("Training Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.legend()

# Pixel accuracy curve
plt.subplot(1,2,2)
plt.plot(range(1, num_epochs+1), train_pixel_acc, marker='o', color='green', label='Train Pixel Acc')
plt.title("Training Pixel Accuracy Curve")
plt.xlabel("Epoch")
plt.ylabel("Pixel Accuracy")
plt.grid(True)
plt.legend()

plt.tight_layout()
plt.show()


NameError: name 'plt' is not defined