In [1]:
import torch
from torchvision import transforms
import torchvision.models as models
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
import torchmetrics
from torchinfo import summary

import numpy as np
import os
import cv2
from tqdm.notebook import tqdm

seed_everything(1337)

Global seed set to 1337


1337

In [2]:
L_consecutive_frames = 10

class TwoStreamActionNet(pl.LightningModule):
    '''
    https://arxiv.org/abs/1406.2199
    '''
    def __init__(self, num_classes=101):
        super().__init__()
        # for single RGB frame RGB
        self.spatial_stream_net = models.resnet18(weights='IMAGENET1K_V1') # any convnet works
        self.spatial_stream_net.fc = nn.Linear(in_features=512, out_features=num_classes)
        # for multi-frame optical flow, hard coding to 10 according to paper examples
        self.temporal_stream_net = models.resnet18()
        self.temporal_stream_net.conv1 = nn.Conv2d(L_consecutive_frames*2, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.temporal_stream_net.fc = nn.Linear(in_features=512, out_features=num_classes)
        self.softmax = nn.Softmax(dim=1)

        self.acc = torchmetrics.Accuracy()
        self.loss = nn.CrossEntropyLoss()

    
    def forward(self, x_rgb, x_optical_flow):
        x_rgb = self.spatial_stream_net(x_rgb)
        x_optical_flow = self.temporal_stream_net(x_optical_flow)
        y_hat = self.softmax(x_rgb + x_optical_flow)
        return y_hat


    def training_step(self, batch, batch_idx):
        (x_rgb, x_optical_flow), y = batch
        x_rgb = self.spatial_stream_net(x_rgb)
        x_optical_flow = self.temporal_stream_net(x_optical_flow)
        y_hat = self.softmax(x_rgb + x_optical_flow)
        train_loss = self.loss(y_hat, y)
        train_acc = self.acc(y_hat, y)
        self.log_dict({"train_acc": train_acc, "train_loss": train_loss}, prog_bar=True)
        return train_loss


    def validation_step(self, batch, batch_idx):
        (x_rgb, x_optical_flow), y = batch
        x_rgb = self.spatial_stream_net(x_rgb)
        x_optical_flow = self.temporal_stream_net(x_optical_flow)
        y_hat = self.softmax(x_rgb + x_optical_flow)
        val_loss = self.loss(y_hat, y)
        val_acc = self.acc(y_hat, y)
        self.log_dict({"val_acc": val_acc, "val_loss": val_loss}, prog_bar=True)
        return val_loss


    def configure_optimizers(self):
        # lr and schedular as written in paper
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-02, momentum=0.9)
        # wasn't sure which lr scheduler would be best fit, so picked steplr
        # what should be according to paper: 1e-03 from 50k iters, 1e-04 from 70k iters
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)
        return {
            "optimizer":optimizer,
            "lr_scheduler" : {
                "scheduler" : scheduler,
                "monitor" : "train_loss",
            }
        }

In [3]:
# short ucf101 downloaded from here- https://www.kaggle.com/datasets/nguyntindng0506/ucf101

class TwoStreamDataset(Dataset):
    """
    https://www.kaggle.com/datasets/nguyntindng0506/ucf101
    Args:
        dataset (str): Name of dataset. Defaults to 'ucf101'.
        split (str): Determines which folder of the directory the dataset will read from. Defaults to 'train'.
    """

    def __init__(self, dataset='data', split='train'):
        self.root_dir = dataset
        self.split = split

        # Section 5 "Implementation details"
        rescale_size = 256
        crop_size = 224

        if split == 'train':
            self.h_flip = transforms.RandomHorizontalFlip(p=.5)
            self.color_jitter = transforms.ColorJitter(brightness=.5, hue=.5, \
                    contrast=.5, saturation=.5)
        self.spatial_img_tfs = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((rescale_size, rescale_size)),
                    transforms.RandomCrop((crop_size, crop_size)),
                ])

        self.temporal_tfs = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((rescale_size, rescale_size)), # not mentioned but few vids h,w < 224
                    transforms.RandomCrop((crop_size, crop_size)),
                ])

        print('Loading and processing videos...')
        self.rgb_frames, self.optical_flow_stacks, labels = [], [], []
        # load videos now - loading/processing vids during training ate up a lot of time
        # unsure if loading/processing everything NOW makes sense either
        # especially when dataset size is big
        for fname in tqdm(sorted(os.listdir(os.path.join(self.root_dir, split)))):
            rgb_frame, optical_flow_stacked = self.process_video(os.path.join(self.root_dir, split, fname))
            if optical_flow_stacked.shape[-1] == 20 and rgb_frame is not None: 
                self.rgb_frames.append(rgb_frame)
                self.optical_flow_stacks.append(optical_flow_stacked)
                labels.append(fname.split("_")[1])

        print(f'Size of {split}: {len(self.rgb_frames)}')
        print(f'Available label classes: {sorted(list(set(labels)))}')

        # Prepare a mapping between the label names (strings) and indices (ints)
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}
        # Convert the list of label names into an array of label indices
        self.label_array = np.array([self.label2index[label] for label in labels], dtype=int)
        assert len(self.label_array) == len(self.rgb_frames) == len(self.optical_flow_stacks)


    def num_classes(self):
        return len(list(set(self.label_array)))


    def __len__(self):
        return len(self.rgb_frames)


    def __getitem__(self, index):
        rgb = self.spatial_img_tfs(self.rgb_frames[index])
        optical_stacks = self.temporal_tfs(self.optical_flow_stacks[index])
        if self.split == 'train':
            rgb = self.h_flip(rgb)
            rgb = self.color_jitter(rgb)
            optical_stacks = self.h_flip(optical_stacks)
        # paper doesn't say anything about normalization
        # RGB frame gets this op automatically
        optical_stacks = optical_stacks / 255. # definitely not correct number
        labels = np.array(self.label_array[index])
        return (rgb, optical_stacks), torch.from_numpy(labels).type(torch.LongTensor)


    def process_video(self, fname):
        optical_flow_frames = []
        capture = cv2.VideoCapture(fname)
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        count = 0
        random_frame_rgb_idx = np.random.randint(1, frame_count)
        random_optical_start_idx = np.random.randint(0, frame_count - L_consecutive_frames)
        prev_frame = None
        rgb_frame = None

        while capture.isOpened():
            ret, frame = capture.read()
            if not ret:
                break
            count += 1
            if count == random_frame_rgb_idx:
                rgb_frame = frame
            if count >= random_optical_start_idx and len(optical_flow_frames) < 2*L_consecutive_frames:
                if prev_frame is None:
                    prev_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                else:
                    # converting to gray should make optical flow calc cheaper
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    flow = cv2.calcOpticalFlowFarneback(prev_frame, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                    prev_frame = frame
                    # 3.1 "ConvNet input configurations"
                    # Not 100% sure if this is "Optical flow stacking"
                    optical_flow_frames.append(flow[:, :, 0])
                    optical_flow_frames.append(flow[:, :, 1])
        capture.release()
        # technically stack size is the channel for model 
        # transposing now for cleaner code?
        optical_flow_frames = np.array(optical_flow_frames).transpose((1, 2, 0)) 
        return rgb_frame, optical_flow_frames

In [4]:
val_set = TwoStreamDataset('data', 'test')

Loading and processing videos...


  0%|          | 0/224 [00:00<?, ?it/s]

Size of test: 224
Available label classes: ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [5]:
train_set = TwoStreamDataset('data', 'train')

Loading and processing videos...


  0%|          | 0/594 [00:00<?, ?it/s]

Size of train: 594
Available label classes: ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [6]:
len(train_set), len(val_set), train_set.num_classes(), val_set.num_classes()

(594, 224, 5, 5)

In [7]:
model = TwoStreamActionNet(num_classes=train_set.num_classes())

In [8]:
summary(model, [(10, 3, 224, 224),(10, 20, 224, 224)])

Layer (type:depth-idx)                        Output Shape              Param #
TwoStreamActionNet                            [10, 5]                   --
├─ResNet: 1-1                                 [10, 5]                   --
│    └─Conv2d: 2-1                            [10, 64, 112, 112]        9,408
│    └─BatchNorm2d: 2-2                       [10, 64, 112, 112]        128
│    └─ReLU: 2-3                              [10, 64, 112, 112]        --
│    └─MaxPool2d: 2-4                         [10, 64, 56, 56]          --
│    └─Sequential: 2-5                        [10, 64, 56, 56]          --
│    │    └─BasicBlock: 3-1                   [10, 64, 56, 56]          73,984
│    │    └─BasicBlock: 3-2                   [10, 64, 56, 56]          73,984
│    └─Sequential: 2-6                        [10, 128, 28, 28]         --
│    │    └─BasicBlock: 3-3                   [10, 128, 28, 28]         230,144
│    │    └─BasicBlock: 3-4                   [10, 128, 28, 28]         295,42

In [9]:
train_loader = DataLoader(train_set, batch_size=4, shuffle=True, num_workers=16)
val_loader = DataLoader(val_set, batch_size=4, num_workers=16)

In [11]:
x, y = next(iter(train_loader))
x[0].shape, x[1].shape, y.shape, y.type(torch.LongTensor)

(torch.Size([4, 3, 224, 224]),
 torch.Size([4, 20, 224, 224]),
 torch.Size([4]),
 tensor([2, 1, 4, 0]))

In [12]:
trainer = pl.Trainer(limit_train_batches=100, max_epochs=4,accelerator='gpu', devices=1, log_every_n_steps=10)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
!rm -rf lightning_logs 

In [14]:
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)
# run 'tensorboard --logdir .' on terminal for logs

Missing logger folder: /home/rishab/Documents/codes/model-arch-implementation/quick-model-archs/two-stream-action-recog/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type             | Params
---------------------------------------------------------
0 | spatial_stream_net  | ResNet           | 11.2 M
1 | temporal_stream_net | ResNet           | 11.2 M
2 | softmax             | Softmax          | 0     
3 | acc                 | Accuracy         | 0     
4 | loss                | CrossEntropyLoss | 0     
---------------------------------------------------------
22.4 M    Trainable params
0         Non-trainable params
22.4 M    Total params
89.646    Total estimated model params size (MB)


Epoch 3: 100%|██████████| 156/156 [00:09<00:00, 17.08it/s, loss=1.73, v_num=0, train_acc=0.250, train_loss=1.650, val_acc=0.219, val_loss=1.690]

`Trainer.fit` stopped: `max_epochs=4` reached.


Epoch 3: 100%|██████████| 156/156 [00:09<00:00, 16.38it/s, loss=1.73, v_num=0, train_acc=0.250, train_loss=1.650, val_acc=0.219, val_loss=1.690]
