In [1]:
import torch
from torchvision import transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
import torchmetrics
from torchinfo import summary

import numpy as np
import os
import pandas as pd
import cv2
import math
from sklearn.model_selection import train_test_split

seed_everything(1337)

Global seed set to 1337


1337

In [2]:
class C3D(pl.LightningModule):
    """
    https://arxiv.org/abs/1412.0767
    """
    def __init__(self, num_classes=487):
        super().__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.flatten = nn.Flatten(start_dim=1)

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, num_classes)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        self.acc = torchmetrics.Accuracy()
        self.loss = nn.CrossEntropyLoss()


    def init_weight(self):
        for name, para in self.named_parameters():
            if name.find('weight') != -1:
                nn.init.xavier_normal_(para.data)
            else:
                nn.init.constant_(para.data, 0)


    def forward(self, x):

        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        return x


    def training_step(self, batch, batch_idx):
        x, y = batch
        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        train_loss = self.loss(y, x)
        acc = self.acc(y, x)
        self.log_dict({"acc": acc, "train_loss": train_loss}, prog_bar=True)
        return train_loss


    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        val_loss = self.loss(y, x)
        mae = self.mae(y*6, x*6)
        self.log_dict({"mae": mae, "val_loss": val_loss}, prog_bar=True)
        return val_loss


    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.003)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)
        return {
            "optimizer":optimizer,
            "lr_scheduler" : {
                "scheduler" : scheduler,
                "monitor" : "train_loss",
            }
        }

In [4]:
model = C3D()

In [5]:
print(summary(model, input_size=(1, 3, 16, 112, 112)))

Layer (type:depth-idx)                   Output Shape              Param #
C3D                                      [1, 487]                  18,776,551
├─Conv3d: 1-1                            [1, 64, 16, 112, 112]     5,248
├─ReLU: 1-2                              [1, 64, 16, 112, 112]     --
├─MaxPool3d: 1-3                         [1, 64, 16, 56, 56]       --
├─Conv3d: 1-4                            [1, 128, 16, 56, 56]      221,312
├─ReLU: 1-5                              [1, 128, 16, 56, 56]      --
├─MaxPool3d: 1-6                         [1, 128, 8, 28, 28]       --
├─Conv3d: 1-7                            [1, 256, 8, 28, 28]       884,992
├─ReLU: 1-8                              [1, 256, 8, 28, 28]       --
├─Conv3d: 1-9                            [1, 256, 8, 28, 28]       1,769,728
├─ReLU: 1-10                             [1, 256, 8, 28, 28]       --
├─MaxPool3d: 1-11                        [1, 256, 4, 14, 14]       --
├─Conv3d: 1-12                           [1, 512, 4, 14, 

  x = self.softmax(x)


In [2]:
# short ucf101 downloaded from here- https://www.kaggle.com/datasets/nguyntindng0506/ucf101

In [35]:
class VideoDataset(Dataset):
    """
    Args:
        dataset (str): Name of dataset. Defaults to 'ucf101'.
        split (str): Determines which folder of the directory the dataset will read from. Defaults to 'train'.
    """

    def __init__(self, dataset='data', split='train'):
        self.root_dir = dataset
        self.split = split

        # Section 3.2. "Training"
        self.resize_height = 128
        self.resize_width = 171
        self.crop_size = 112

        if split == 'train':
            self.h_flip = transforms.RandomHorizontalFlip(p=0.5)
        self.img_tfs = transforms.Compose([
                    transforms.ToPILImage(),
                    transforms.Resize((128, 171)),
                    transforms.CenterCrop((112, 112)),
                ])

        self.fnames, labels = [], []
        for fname in sorted(os.listdir(os.path.join(self.root_dir, split))):
            self.fnames.append(os.path.join(self.root_dir, split, fname))
            labels.append(fname.split("_")[1])

        print(f'Number of {split}: {len(self.fnames)}')
        print(f'Available label classes: {sorted(list(set(labels)))}')

        # Prepare a mapping between the label names (strings) and indices (ints)
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}
        # Convert the list of label names into an array of label indices
        self.label_array = np.array([self.label2index[label] for label in labels], dtype=int)
        assert len(self.label_array) == len(self.fnames)

    def __len__(self):
        return len(self.fnames)


    def __getitem__(self, index):
        subsampled_frames = [] # 16, 112, 112
        for frame in self.process_video(self.fnames[index]):
            frame = self.img_tfs(frame)
            if self.split == 'train':
                frame = self.h_flip(frame)
            frame = np.array(frame)
            frame = frame / 255. # paper doesn't say anything about normalization
            subsampled_frames.append(frame)

        subsampled_frames = np.array(subsampled_frames)
        labels = np.array(self.label_array[index])
        return torch.from_numpy(subsampled_frames), torch.from_numpy(labels)


    def process_video(self, fname):
        subsampled_frames = []
        capture = cv2.VideoCapture(fname)
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        count = 0
        frames_to_keep = self.frames_splits(frame_count, 16)
        while count < frame_count:
            ret, frame = capture.read()
            if frame is None or not ret:
                continue
            count += 1
            # Section 3.1. "Common network settings" 
            if count in frames_to_keep:
                subsampled_frames.append(frame)
        capture.release()
        return subsampled_frames


    @staticmethod
    def frames_splits(a, n):
        k, m = divmod(a, n)
        return [(i+1)*k+min(i+1, m) for i in range(n)]


    def to_tensor(self, buffer):
        return buffer.transpose((3, 0, 1, 2))


In [36]:
dataset = VideoDataset('data')
x, y = next(iter(dataset))

Number of train: 594
Available label classes: ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
