In [None]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# import library
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from timm.models.layers import DropPath, Mlp, PatchEmbed
from torch.utils.data import DataLoader, random_split
from torchinfo import summary
from torchvision import datasets
from torchvision.datasets import OxfordIIITPet, wrap_dataset_for_transforms_v2
from torchvision.transforms import v2
from torchvision.transforms.functional import InterpolationMode

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): value handed, in order, to Python's ``random`` module,
            NumPy's global generator, PyTorch's default generator and the
            generators of all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_seed(42) # For reproducibility purposes, please do not modify this.



# Helper functions

In [None]:
# You can change input params of this function if needed
def load_pet_data(data_dir, batch_size=64, img_height=224, img_width=224, transform=None):
    """Build train/validation dataloaders for the Oxford-IIIT Pet dataset.

    Args:
        data_dir: root directory of the dataset; the data is downloaded there
            when it is not already present.
        batch_size (int): number of samples per batch for both loaders.
        img_height (int): target image height used by the default transform.
        img_width (int): target image width used by the default transform.
        transform: optional callable applied to every image of both splits.
            When None, a default pipeline is used: convert to an image tensor,
            bicubic resize to (img_height, img_width), then convert to float32
            with value scaling. When a custom transform is given,
            img_height / img_width are not used.

    Returns:
        tuple: ``(train_loader, val_loader, num_classes)``. The training
        loader reads the "trainval" split and is shuffled; the validation
        loader reads the "test" split without shuffling. ``num_classes`` is
        the number of categories reported by the dataset.
    """
    # Define transforms
    if transform is None:
        transform = v2.Compose([
            v2.ToImage(),
            v2.Resize((img_height, img_width), interpolation=InterpolationMode.BICUBIC),
            v2.ToDtype(torch.float32, scale=True),
        ])

    train_ds = OxfordIIITPet(
        root=data_dir,
        split="trainval",
        download=True,
        transform=transform,
    )

    # The dataset's "test" split serves as the validation set in this notebook.
    val_ds = OxfordIIITPet(
        root=data_dir,
        split="test",
        download=True,
        transform=transform,
    )

    print(f"Number of training samples: {len(train_ds)}")
    print(f"Number of validation samples: {len(val_ds)}")

    # Create DataLoader instances
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Read the class count from the dataset instead of hard-coding it, so the
    # returned value always matches the labels the loaders actually produce.
    num_classes = len(train_ds.classes)

    return train_loader, val_loader, num_classes


# Vision Transformer (ViT)


In [None]:
def swish(x, beta):
    """Swish activation: ``x`` multiplied by ``sigmoid(x * beta)``.

    Args:
        x (torch.Tensor): input values.
        beta: scale applied to ``x`` inside the sigmoid gate.

    Returns:
        torch.Tensor: the gated input, element-wise.
    """
    gate = torch.sigmoid(x * beta)
    return x * gate


class Attention(nn.Module):
    """Multi-head self-attention over a batch of token sequences.

    Args:
        dim (int): channel width of each token; must be divisible by
            ``num_heads``.
        num_heads (int): number of attention heads.
        qkv_bias (bool): whether the joint query/key/value projection has a bias.
        attn_drop (float): dropout probability applied to the attention weights.
        proj_drop (float): dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        # Attention scores are multiplied by 1 / sqrt(per-head width).
        self.scale = (dim // num_heads) ** -0.5

        # A single linear layer produces queries, keys and values together.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, tokens, channels).

        Returns a tensor of the same shape.
        """
        batch, tokens, channels = x.shape
        per_head = channels // self.num_heads

        # (batch, tokens, 3 * channels) -> (3, batch, heads, tokens, per_head)
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, per_head)
        queries, keys, values = packed.permute(2, 0, 3, 1, 4).unbind(0)

        # Scaled dot-product scores, normalised over the key axis.
        weights = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(weights.softmax(dim=-1))

        # Merge the heads back into the channel axis before the output projection.
        merged = (weights @ values).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(merged))


class ViTLayer(nn.Module):
    """Transformer encoder block: self-attention followed by a feed-forward MLP.

    Each sub-layer runs on a normalised copy of its input and its result is
    added back to that input (residual connection), optionally passing through
    stochastic depth first.

    Args:
        dim (int): channel width of each token.
        num_heads (int): number of attention heads.
        mlp_ratio (float): hidden width of the MLP as a multiple of ``dim``.
        qkv_bias (bool): whether the attention's qkv projection has a bias.
        drop (float): dropout rate for the attention output projection and the MLP.
        attn_drop (float): dropout rate for the attention weights.
        drop_path (float): stochastic depth rate; 0 disables it.
        act_layer: activation class passed to the MLP.
        norm_layer: normalisation class instantiated with ``dim``.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attention = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        hidden_width = int(dim * mlp_ratio)
        self.ffn = Mlp(
            in_features=dim,
            hidden_features=hidden_width,
            act_layer=act_layer,
            drop=drop,
        )

    def forward(self, x):
        """Run the attention and MLP sub-layers, each wrapped in a residual connection."""
        attn_branch = self.attention(self.norm1(x))
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.ffn(self.norm2(x))
        return x + self.drop_path(mlp_branch)


class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    The image is split into patches by ``embed_layer``; a class token (and,
    when ``distilled`` is set, a distillation token) is prepended; a learned
    positional embedding is added; and the sequence passes through ``depth``
    ``ViTLayer`` blocks. The patch tokens are then mean-pooled, normalised and
    fed to a linear classification head.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4.0,
        qkv_bias=True,
        distilled=False,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        embed_layer=PatchEmbed,
        num_classes=1000,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            distilled (bool): add a distillation token next to the class token
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            num_classes (int): number of classes for classification head
                (defaults to 1000, the size the head previously always had)
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = (
            embed_dim  # num_features for consistency with other models
        )
        # Number of extra (non-patch) tokens prepended to the sequence.
        self.num_tokens = 2 if distilled else 1
        norm_layer = nn.LayerNorm
        act_layer = nn.GELU

        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.dist_token = (
            nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
        )
        # One positional vector per patch plus one per extra token.
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + self.num_tokens, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=drop_rate)

        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate

        # Stochastic depth decay rule: the drop-path rate grows linearly from 0
        # in the first block to drop_path_rate in the last one.
        dpr = torch.linspace(0, drop_path_rate, depth).tolist()
        self.layers = nn.Sequential(
            *[
                ViTLayer(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    act_layer=act_layer,
                )
                for i in range(depth)
            ]
        )
        self.norm = norm_layer(embed_dim)

        # Classifier head(s)
        self.head = nn.Linear(self.num_features, num_classes)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the names of the token and positional-embedding parameters.

        Judging by the method name these are meant to be exempted from weight
        decay by the optimizer setup; nothing in this class enforces that.
        """
        return {"pos_embed", "cls_token", "dist_token"}

    def forward(self, x):
        """Compute classification logits for a batch of images.

        Args:
            x (torch.Tensor): image batch accepted by ``self.patch_embed``.

        Returns:
            torch.Tensor: the output of ``self.head`` for each image.
        """
        B = x.shape[0]
        x = self.patch_embed(x)
        # Broadcast the single learned token to every sample of the batch.
        cls_token = self.cls_token.expand(B, -1, -1)

        if self.dist_token is None:
            x = torch.cat((cls_token, x), dim=1)
        else:
            x = torch.cat((cls_token, self.dist_token.expand(B, -1, -1), x), dim=1)
        x = self.pos_drop(x + self.pos_embed)

        x = self.layers(x)

        # Average over the patch tokens only; the class/distillation tokens
        # at the front of the sequence are left out of the pooled feature.
        x = x[:, self.num_tokens :].mean(dim=1)

        x = self.norm(x)
        out = self.head(x)
        return out

# Task 3: Fine-tuning (a part of) the ViT Model (3 marks)

In [None]:
batch_size = 32 # Set this to your preferred values.

small_vit = VisionTransformer()

# Load the pretrained ImageNet weight.
# map_location="cpu" lets the checkpoint load on machines without a GPU (the model is moved
# to `device` at the end of this cell); weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state dict needs.
state_dict = torch.load("./pretrained_vit_small.pth", map_location="cpu", weights_only=True)
missing, unexpected = small_vit.load_state_dict(state_dict)

print("Missing:", missing)
print("Unexpected:", unexpected)

# Replace with a new head for transfer learning.
# The input width is read from the model (384 with the default settings);
# 37 classes for the OxfordIIITPet dataset
small_vit.head = nn.Linear(small_vit.num_features, 37)
small_vit.to(device)

Missing: []
Unexpected: []


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (layers): Sequential(
    (0): ViTLayer(
      (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attention): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ffn): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
 

In [None]:
# Show a per-layer table of output shapes and parameter counts for one batch of
# 3-channel 224x224 inputs.
summary(small_vit, (batch_size, 3, 224, 224), device=device)

Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [32, 37]                  76,032
├─PatchEmbed: 1-1                        [32, 196, 384]            --
│    └─Conv2d: 2-1                       [32, 384, 14, 14]         295,296
│    └─Identity: 2-2                     [32, 196, 384]            --
├─Dropout: 1-2                           [32, 197, 384]            --
├─Sequential: 1-3                        [32, 197, 384]            --
│    └─ViTLayer: 2-3                     [32, 197, 384]            --
│    │    └─LayerNorm: 3-1               [32, 197, 384]            768
│    │    └─Attention: 3-2               [32, 197, 384]            591,360
│    │    └─Identity: 3-3                [32, 197, 384]            --
│    │    └─LayerNorm: 3-4               [32, 197, 384]            768
│    │    └─Mlp: 3-5                     [32, 197, 384]            1,181,568
│    │    └─Identity: 3-6                [32, 197, 384]       

In [None]:
# Get the dataloaders for training and validation, plus the number of classes.
# The dataset is stored under the current directory (load_pet_data requests a download).
train_dataloader, val_dataloader, num_classes = load_pet_data(data_dir="./", batch_size=batch_size)

100%|██████████| 792M/792M [00:43<00:00, 18.3MB/s]
100%|██████████| 19.2M/19.2M [00:01<00:00, 11.5MB/s]


Number of training samples: 3680
Number of validation samples: 3669


In [None]:
# Your code starts from here

# Task 4: Explore and Implement a Parameter-Efficient Transfer Learning (PETL) Technique (4 marks)

In [None]:
# Your code starts from here