In [1]:
import torch
from torchvision import transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
import torchmetrics
from torchinfo import summary

import numpy as np
import os
import cv2
from tqdm.notebook import tqdm

seed_everything(1337)

Global seed set to 1337


1337

In [2]:
class C3D(pl.LightningModule):
    """
    https://arxiv.org/abs/1412.0767
    """
    def __init__(self, num_classes=487):
        super().__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.flatten = nn.Flatten(start_dim=1)

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, num_classes)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        self.acc = torchmetrics.Accuracy()
        self.loss = nn.CrossEntropyLoss()


    def init_weight(self):
        for name, para in self.named_parameters():
            if name.find('weight') != -1:
                nn.init.xavier_normal_(para.data)
            else:
                nn.init.constant_(para.data, 0)


    def forward(self, x):

        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        return x


    def training_step(self, batch, batch_idx):
        x, y = batch
        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        train_loss = self.loss(x, y)
        train_acc = self.acc(x, y)
        self.log_dict({"train_acc": train_acc, "train_loss": train_loss}, prog_bar=True)
        return train_loss


    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = self.relu(self.conv1(x))
        x = self.pool1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        x = self.flatten(x)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        x = self.fc8(x)
        x = self.softmax(x)

        val_loss = self.loss(x, y)
        val_acc = self.acc(x, y)
        self.log_dict({"val_acc": val_acc, "val_loss": val_loss}, prog_bar=True)
        return val_loss


    def configure_optimizers(self):
        # lr as written in paper
        optimizer = torch.optim.SGD(self.parameters(), lr=0.003)
        # not super sure about the step_size
        # correct according to Section 3.1. "Common network settings" 
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)
        return {
            "optimizer":optimizer,
            "lr_scheduler" : {
                "scheduler" : scheduler,
                "monitor" : "train_loss",
            }
        }

In [3]:
# short ucf101 downloaded from here- https://www.kaggle.com/datasets/nguyntindng0506/ucf101

class VideoDataset(Dataset):
    """
    https://www.kaggle.com/datasets/nguyntindng0506/ucf101
    Args:
        dataset (str): Name of dataset. Defaults to 'ucf101'.
        split (str): Determines which folder of the directory the dataset will read from. Defaults to 'train'.
    """

    def __init__(self, dataset='data', split='train'):
        self.root_dir = dataset
        self.split = split

        # Section 3.2. "Training"
        self.resize_height = 128
        self.resize_width = 171
        self.crop_size = 112

        if split == 'train':
            self.h_flip = transforms.RandomHorizontalFlip(p=0.5)
        self.img_tfs = transforms.Compose([
                    transforms.ToPILImage(),
                    transforms.Resize((128, 171)),
                    transforms.CenterCrop((112, 112)),
                ])

        print('Loading and processing videos...')
        self.fnames, labels = [], []
        # load videos now - loading/processing vids during training ate up a lot of time
        # unsure if loading/processing everything NOW makes sense either
        # especially when dataset size is big
        for fname in tqdm(sorted(os.listdir(os.path.join(self.root_dir, split)))):
            video_frames_chunk = self.process_video(os.path.join(self.root_dir, split, fname))
            # ignoring videos which have missing frames
            if len(video_frames_chunk) == 16: 
                self.fnames.append(video_frames_chunk)
                labels.append(fname.split("_")[1])

        print(f'Number of {split}: {len(self.fnames)}')
        print(f'Available label classes: {sorted(list(set(labels)))}')

        # Prepare a mapping between the label names (strings) and indices (ints)
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}
        # Convert the list of label names into an array of label indices
        self.label_array = np.array([self.label2index[label] for label in labels], dtype=int)
        assert len(self.label_array) == len(self.fnames)



    def num_classes(self):
        return len(list(set(self.label_array)))


    def __len__(self):
        return len(self.fnames)


    def __getitem__(self, index):
        subsampled_frames = [] # 16, 112, 112
        for frame in self.fnames[index]:
            frame = self.img_tfs(frame)
            if self.split == 'train':
                frame = self.h_flip(frame)
            frame = np.array(frame)
            frame = frame / 255. # paper doesn't say anything about normalization
            subsampled_frames.append(frame)
        # reshape into (C, D, H, W) for easier convolutions
        subsampled_frames = np.array(subsampled_frames).transpose((3, 0, 1, 2)) 
        labels = np.array(self.label_array[index])
        return torch.from_numpy(subsampled_frames).float(), torch.from_numpy(labels).type(torch.LongTensor)


    def process_video(self, fname):
        subsampled_frames = []
        capture = cv2.VideoCapture(fname)
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        count = 0
        frames_to_keep = self.frames_splits(frame_count, 16)
        while capture.isOpened():
            ret, frame = capture.read()
            if not ret:
                break
            count += 1
            # Section 3.1. "Common network settings" 
            if count in frames_to_keep:
                subsampled_frames.append(frame)
        capture.release()
        return subsampled_frames


    @staticmethod
    def frames_splits(a, n):
        k, m = divmod(a, n)
        return [(i+1)*k+min(i+1, m) for i in range(n)]

In [4]:
val_set = VideoDataset('data', 'test')

Loading and processing videos...


  0%|          | 0/224 [00:00<?, ?it/s]

Number of test: 218
Available label classes: ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [5]:
train_set = VideoDataset('data', 'train')

Loading and processing videos...


  0%|          | 0/594 [00:00<?, ?it/s]

Number of train: 575
Available label classes: ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [6]:
len(train_set), len(val_set), train_set.num_classes(), val_set.num_classes()

(575, 218, 5, 5)

In [7]:
model = C3D(num_classes=train_set.num_classes())
print(summary(model, input_size=(10, 3, 16, 112, 112)))

Layer (type:depth-idx)                   Output Shape              Param #
C3D                                      [10, 5]                   16,801,797
├─Conv3d: 1-1                            [10, 64, 16, 112, 112]    5,248
├─ReLU: 1-2                              [10, 64, 16, 112, 112]    --
├─MaxPool3d: 1-3                         [10, 64, 16, 56, 56]      --
├─Conv3d: 1-4                            [10, 128, 16, 56, 56]     221,312
├─ReLU: 1-5                              [10, 128, 16, 56, 56]     --
├─MaxPool3d: 1-6                         [10, 128, 8, 28, 28]      --
├─Conv3d: 1-7                            [10, 256, 8, 28, 28]      884,992
├─ReLU: 1-8                              [10, 256, 8, 28, 28]      --
├─Conv3d: 1-9                            [10, 256, 8, 28, 28]      1,769,728
├─ReLU: 1-10                             [10, 256, 8, 28, 28]      --
├─MaxPool3d: 1-11                        [10, 256, 4, 14, 14]      --
├─Conv3d: 1-12                           [10, 512, 4, 14,

In [8]:
train_loader = DataLoader(train_set, batch_size=4, shuffle=True, num_workers=16)
val_loader = DataLoader(val_set, batch_size=4, num_workers=16)

In [9]:
# x, y = next(iter(train_loader))
# x.shape, y.shape, y.type(torch.LongTensor)

In [10]:
trainer = pl.Trainer(limit_train_batches=100, max_epochs=16,accelerator='gpu', devices=1, log_every_n_steps=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
!rm -rf lightning_logs 

In [12]:
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)
# run 'tensorboard --logdir .' on terminal for logs

Missing logger folder: /home/rishab/Documents/codes/model-arch-implementation/quick-model-archs/c3d/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name    | Type             | Params
----------------------------------------------
0  | conv1   | Conv3d           | 5.2 K 
1  | pool1   | MaxPool3d        | 0     
2  | conv2   | Conv3d           | 221 K 
3  | pool2   | MaxPool3d        | 0     
4  | conv3a  | Conv3d           | 884 K 
5  | conv3b  | Conv3d           | 1.8 M 
6  | pool3   | MaxPool3d        | 0     
7  | conv4a  | Conv3d           | 3.5 M 
8  | conv4b  | Conv3d           | 7.1 M 
9  | pool4   | MaxPool3d        | 0     
10 | conv5a  | Conv3d           | 7.1 M 
11 | conv5b  | Conv3d           | 7.1 M 
12 | pool5   | MaxPool3d        | 0     
13 | flatten | Flatten          | 0     
14 | fc6     | Linear           | 33.6 M
15 | fc7     | Linear           | 16.8 M
16 | fc8     | Linear           | 20.5 K
17 | dropout | Dropout          | 0     
18 | relu    | R

Epoch 15: 100%|██████████| 155/155 [00:15<00:00, 10.00it/s, loss=1.61, v_num=0, train_acc=0.250, train_loss=1.610, val_acc=0.202, val_loss=1.610]

`Trainer.fit` stopped: `max_epochs=16` reached.


Epoch 15: 100%|██████████| 155/155 [00:16<00:00,  9.68it/s, loss=1.61, v_num=0, train_acc=0.250, train_loss=1.610, val_acc=0.202, val_loss=1.610]
