In [None]:
!pip install lightning av pandas tensorboard

In [2]:
!wget https://pithos.okeanos.grnet.gr/public/FiBAB54cGZuQfXrQL7ylK -O ucf101_top5.tar.gz
!mkdir data
!tar xf ucf101_top5.tar.gz --directory data

In [4]:
import os
import pickle
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_video
from torchvision.models import resnet18
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.transforms import ConvertImageDtype, Normalize, Resize

# Define data root directory: it must hold train.csv / test.csv and the
# train/ and test/ video folders read by the UCF101 dataset class.
# Set the UCF101_DATA_ROOT environment variable to override the default path.
# NOTE(review): the download cell extracts the archive into ./data -- confirm
# that this default really points at the extracted dataset on your machine.
DATA_ROOT = os.environ.get("UCF101_DATA_ROOT", "/workspace/lab4_data/")

# Define a Video-transform type: a callable mapping one tensor to another
VTransform = Callable[[torch.Tensor], torch.Tensor]


class UCF101(Dataset):
    """
    Video dataset driven by a ``<mode>.csv`` index file.

    Each sample is a ``(video, label)`` tuple: the video tensor after every
    entry of ``video_transforms`` has been applied, and the integer index of
    its category. With ``use_precomputed`` enabled, transformed samples are
    pickled under ``<data_root>/precomp`` and re-used on later accesses.

    NOTE: cached samples are keyed by split and video name only, so the cache
    must be deleted by hand whenever the transforms change.
    """

    def __init__(
        self,
        data_root,
        mode="train",
        video_transforms: Optional[List[VTransform]] = None,
        use_precomputed=True,
    ):
        """
        Return a UCF101 Dataset instance

        Arguments:
            data_root: directory containing ``<mode>.csv`` and a ``<mode>``
                folder with the video files
            mode: split to load, either ``"train"`` or ``"test"``
            video_transforms: callables applied in order to each loaded video;
                ``None`` (default) applies no transform
            use_precomputed: when true, cache transformed samples on disk and
                load them from there on subsequent accesses
        """
        super().__init__()
        assert mode in ["train", "test"]

        self.root = data_root
        self.mode = mode
        # None instead of a mutable default so instances never share one list
        self.v_transforms = [] if video_transforms is None else video_transforms

        # Build database of samples
        self._build_db()

        # Features precompute functionality
        self.pre = use_precomputed
        self.pre_root = os.path.join(self.root, "precomp")
        if self.pre:
            os.makedirs(self.pre_root, exist_ok=True)

    def _build_db(self):
        """
        Parse train/test csv containing paths to videos and corresponding labels.
        Also, assign a unique index to each category

        The first csv row is treated as a header; the second column holds the
        category name. Indices follow the sorted order of the category names
        found in this split's csv.
        """
        csv_file = os.path.join(self.root, self.mode + ".csv")
        self.db: np.ndarray = pd.read_csv(csv_file, header=0).values

        # np.unique already returns its result sorted
        unique_categories = np.unique(self.db[:, 1])
        self.categories = {
            c_name: c_idx for c_idx, c_name in enumerate(unique_categories)
        }

    def compute_sample(self, video_name, category):
        """
        For a specific video, read data into memory, permute data to NumFrames x Channels x Height x Width format.
        Also, transform data according to list of transforms

        Returns:
            tuple of (transformed video tensor, integer category index)
        """

        # Load video (only the first element returned by read_video is used)
        V, *_ = read_video(os.path.join(self.root, self.mode, video_name))
        # Permute data to NxCxHxW from NxHxWxC
        V = V.permute(0, 3, 1, 2)

        for T in self.v_transforms:
            V = T(V)

        return V, self.categories[category]

    def __getitem__(self, index):
        """
        Retrieve a specific sample from the dataset

        When caching is disabled the sample is always recomputed and the cache
        directory is never touched. Otherwise the sample is loaded from its
        cache file if present, or computed and then stored there.
        """
        video_name, category = self.db[index]

        if not self.pre:
            return self.compute_sample(video_name, category)

        cache_path = os.path.join(self.pre_root, f"{self.mode}_{video_name}.tmp")

        if os.path.exists(cache_path):
            # pickle can execute arbitrary code: only load cache files that
            # were written by this class on a trusted machine
            with open(cache_path, "rb") as f:
                return pickle.load(f)

        sample = self.compute_sample(video_name, category)

        # Write to a per-process scratch file and rename it into place, so an
        # interrupted run never leaves a truncated pickle behind
        partial_path = f"{cache_path}.{os.getpid()}.part"
        try:
            with open(partial_path, "wb") as f:
                pickle.dump(sample, f)
            os.replace(partial_path, cache_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

        return sample

    def __len__(self):
        """
        Returns the number of samples in the dataset
        """
        return self.db.shape[0]


def compute_features() -> VTransform:
    """
    Build a VTransform that turns a batch of images into CNN feature vectors.

    The returned callable runs a pretrained ResNet-18 (``weights="DEFAULT"``)
    in eval mode with gradient tracking disabled, and returns the output of
    its ``avgpool`` node flattened from dimension 1 onwards.
    """
    # Pretrained backbone, truncated at the global average-pooling node
    backbone = resnet18(weights="DEFAULT", progress=False)
    extractor = create_feature_extractor(backbone, ["avgpool"])
    extractor.eval()

    @torch.no_grad()
    def extract(v: torch.Tensor) -> torch.Tensor:
        pooled = extractor(v)["avgpool"]
        return pooled.flatten(1)

    return extract

In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Define train/val datasets

# Per-channel statistics handed to Normalize
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Both splits use identical preprocessing, so the pipeline is built once and
# shared: compute_features() then loads the pretrained CNN a single time.
feature_pipeline = [
    ConvertImageDtype(torch.float32),
    Resize((224, 224)),
    Normalize(mean=NORM_MEAN, std=NORM_STD),
    compute_features(),
]

train_dset = UCF101(DATA_ROOT, "train", video_transforms=feature_pipeline)
val_dset = UCF101(DATA_ROOT, "test", video_transforms=feature_pipeline)


def pad_sequences_collate_fn(samples: List[tuple]) -> tuple:
    """
    Zero-pad each sample at the END of its sequence to enable batching.
    The longest sequence defines the sequence length for the batch.

    Arguments:
        samples: list of ``(sequence, label)`` tuples, where ``sequence`` is a
            tensor whose first dimension is the sequence length

    Returns:
        data: padded sequences, batch first
        labels: 1-D tensor with one label per sample
        key_mask: bool tensor of shape ``[batch, max_len]`` that is ``True``
            exactly at the padded positions
    """
    sequences = [v[0] for v in samples]

    labels = torch.tensor([v[1] for v in samples])
    data = pad_sequence(sequences, batch_first=True)

    # A position is padding when its index is >= the true sequence length
    lengths = torch.tensor([seq.shape[0] for seq in sequences])
    key_mask = torch.arange(data.shape[1]).unsqueeze(0) >= lengths.unsqueeze(1)

    return data, labels, key_mask


# Define train/val dataloaders
# Options common to both loaders; only the dataset and the shuffling differ
loader_options = dict(
    batch_size=32,
    collate_fn=pad_sequences_collate_fn,
    num_workers=4,
)
train_dloader = DataLoader(train_dset, shuffle=True, **loader_options)
val_dloader = DataLoader(val_dset, shuffle=False, **loader_options)

torch.Size([32, 300, 512]) torch.Size([32, 300])


In [35]:
import lightning as L
from torch import nn
import torch.nn.functional as F
import math


class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal positional encodings to a batch-first sequence and
    then applies dropout.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Arguments:
            d_model: size of the embedding dimension (even or odd)
            dropout: dropout probability applied after adding the encoding
            max_len: number of positions in the precomputed encoding table
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines
        pe[0, :, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        # pe has a leading dimension of 1, so broadcasting covers the batch
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


class TransformerCLS(L.LightningModule):
    """
    Sequence classifier: a linear embedding, positional encoding, a learnable
    CLS token and a Transformer encoder, followed by an MLP head that reads
    the encoder output at the CLS position.
    """

    def __init__(self, num_classes: int, d_model: int = 128, in_features: int = 512):
        """
        Arguments:
            num_classes: number of output classes
            d_model: width of the Transformer; the encoder layers use 8
                attention heads
            in_features: size of each input feature vector (default 512, the
                value that was previously hard-coded)
        """
        super().__init__()

        self.embedding_layer = nn.Linear(in_features, d_model)

        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=d_model, nhead=8, batch_first=True
            ),
            num_layers=3,
        )

        # Learnable token prepended to every sequence; its output is classified
        self.cls_tkn = nn.Parameter(torch.rand(1, 1, d_model), requires_grad=True)

        self.pos_emb = PositionalEncoding(d_model)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x: torch.Tensor, key_mask: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, in_features]``
            key_mask: bool Tensor, shape ``[batch_size, seq_len]``; it is
                passed to the encoder as ``src_key_padding_mask``

        Returns:
            Tensor of class logits, shape ``[batch_size, num_classes]``
        """
        # BS x T x in_features
        x = self.embedding_layer(x)  # BS x T x d_model
        x = self.pos_emb(x)  # BS x T x d_model

        cls_tkn = self.cls_tkn.expand(x.shape[0], -1, -1)  # BS x 1 x d_model
        x = torch.cat([cls_tkn, x], dim=1)  # BS x (T+1) x d_model

        # The CLS token is never padding: prepend an all-False mask column
        key_mask = torch.cat(
            [
                torch.zeros(x.shape[0], 1, dtype=torch.bool, device=key_mask.device),
                key_mask,
            ],
            dim=1,
        )
        x = self.encoder(x, src_key_padding_mask=key_mask)  # BS x (T+1) x d_model

        x = x[:, 0, :]  # BS x d_model
        return self.classifier(x)

    def _shared_step(self, batch, log_name: str) -> torch.Tensor:
        """Run the model on one batch, log the loss under ``log_name`` and return it."""
        data, labels, key_mask = batch

        logits = self(data, key_mask)

        # cross_entropy fuses log_softmax and nll_loss into one call
        loss = F.cross_entropy(logits, labels)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch; returns the loss."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [None]:
# Keep a handle on the model so it stays usable after training, and size the
# classifier head from the categories found in the training split
model = TransformerCLS(num_classes=len(train_dset.categories))

# accelerator="auto" lets Lightning choose among the available hardware
# instead of failing on machines without a GPU
trainer = L.Trainer(
    max_epochs=50, devices=1, accelerator="auto", default_root_dir="/workspace/lab4"
)

trainer.fit(
    model=model,
    train_dataloaders=train_dloader,
    val_dataloaders=val_dloader,
)