## Initial Imports

In [None]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from torchvision.datasets.kinetics import Kinetics400

from IPython.display import Video
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Path.ls = lambda x: [o.name for o in x.iterdir()]

### Video Transformations from [`pytorch/vision/references/video_classification/transforms.py` ](https://github.com/pytorch/vision/blob/master/references/video_classification/transforms.py)

In [None]:
import torch
import random


def crop(vid, i, j, h, w):
    """Cut a window out of the last two axes of ``vid``.

    ``i`` and ``j`` are the start offsets along the second-to-last and
    last axis; ``h`` and ``w`` are the window extents along those axes.
    All leading axes are kept whole.
    """
    rows = slice(i, i + h)
    cols = slice(j, j + w)
    return vid[..., rows, cols]


def center_crop(vid, output_size):
    """Crop a centred window of ``output_size`` from the last two axes of ``vid``.

    ``output_size`` must unpack into two values (height, width). Each
    offset is half of the leftover margin, rounded to an integer.
    """
    full_h, full_w = vid.shape[-2:]
    target_h, target_w = output_size

    top = int(round((full_h - target_h) / 2.))
    left = int(round((full_w - target_w) / 2.))
    return crop(vid, top, left, target_h, target_w)


def hflip(vid):
    """Return ``vid`` with the order of its last axis reversed."""
    return torch.flip(vid, dims=(-1,))


# NOTE: for those functions, which generally expect mini-batches, we keep them
# as non-minibatch so that they are applied as if they were 4d (thus image).
# this way, we only apply the transformation in the spatial domain
def resize(vid, size, interpolation='bilinear'):
    """Resample ``vid`` with ``torch.nn.functional.interpolate``.

    An ``int`` ``size`` is turned into a scale factor of ``size`` divided by
    the smaller of the last two dimensions of ``vid``. Any other ``size`` is
    passed to ``interpolate`` unchanged as the target size.
    """
    # NOTE: using bilinear interpolation because we don't work on minibatches
    # at this level
    if isinstance(size, int):
        target = {
            'size': None,
            'scale_factor': float(size) / min(vid.shape[-2:]),
        }
    else:
        target = {'size': size, 'scale_factor': None}
    return torch.nn.functional.interpolate(
        vid, mode=interpolation, align_corners=False, **target)


def pad(vid, padding, fill=0, padding_mode="constant"):
    """Pad ``vid`` via ``torch.nn.functional.pad``.

    ``padding`` is forwarded unchanged, ``fill`` is passed as ``value``
    and ``padding_mode`` as ``mode``.
    """
    # NOTE: don't want to pad on temporal dimension, so let as non-batch
    # (4d) before padding. This works as expected
    pad_fn = torch.nn.functional.pad
    return pad_fn(vid, padding, mode=padding_mode, value=fill)


def to_normalized_float_tensor(vid):
    """Move axis 3 of ``vid`` to the front, cast to float32 and divide by 255."""
    channels_first = vid.permute(3, 0, 1, 2)
    return channels_first.to(torch.float32) / 255


def normalize(vid, mean, std):
    """Standardise ``vid`` along its first axis: ``(vid - mean) / std``.

    Args:
        vid: tensor whose first axis lines up with the entries of
            ``mean`` and ``std``.
        mean: sequence (or tensor) of values subtracted per first-axis entry.
        std: sequence (or tensor) of divisors per first-axis entry.

    Returns:
        The standardised tensor, same shape as ``vid``.
    """
    # One value per first-axis entry, broadcast over every remaining axis.
    shape = (-1,) + (1,) * (vid.dim() - 1)
    # Create the statistics on the video's own device; without this a
    # non-CPU `vid` fails with a device-mismatch error.
    mean = torch.as_tensor(mean, device=vid.device).reshape(shape)
    std = torch.as_tensor(std, device=vid.device).reshape(shape)
    return (vid - mean) / std


# Class interface

class RandomCrop(object):
    """Crop a randomly positioned window from the last two axes of a video.

    Args:
        size: ``(height, width)`` of the crop, or a single ``int`` for a
            square crop.
    """

    def __init__(self, size):
        # A bare int cannot be unpacked into (height, width); expand it to a
        # square pair. Anything else is stored exactly as given.
        self.size = (size, size) if isinstance(size, int) else size

    @staticmethod
    def get_params(vid, output_size):
        """Get parameters for ``crop`` for a random crop.

        Args:
            vid: tensor whose last two axes are the ones being cropped.
            output_size: ``(height, width)`` pair, or an ``int`` for a
                square window.

        Returns:
            ``(i, j, th, tw)``: start offsets and extents to pass to ``crop``.

        Raises:
            ValueError: if the requested window is larger than the input.
        """
        if isinstance(output_size, int):
            output_size = (output_size, output_size)
        h, w = vid.shape[-2:]
        th, tw = output_size
        if w == tw and h == th:
            return 0, 0, h, w
        # Fail before drawing any random numbers, with a message that names
        # the actual problem instead of randint's "empty range" error.
        if th > h or tw > w:
            raise ValueError(
                f'Crop size {(th, tw)} is larger than input size {(h, w)}')
        i = random.randint(0, h - th)
        j = random.randint(0, w - tw)
        return i, j, th, tw

    def __call__(self, vid):
        i, j, h, w = self.get_params(vid, self.size)
        return crop(vid, i, j, h, w)


class CenterCrop(object):
    """Crop the central window of a video via ``center_crop``.

    Args:
        size: ``(height, width)`` of the crop, or a single ``int`` for a
            square crop.
    """

    def __init__(self, size):
        # `center_crop` unpacks the size into two values, so a bare int is
        # expanded to a square pair here. Anything else is stored as given.
        self.size = (size, size) if isinstance(size, int) else size

    def __call__(self, vid):
        return center_crop(vid, self.size)


class Resize:
    """Callable that forwards a video to ``resize`` with a stored ``size``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, vid):
        target = self.size
        return resize(vid, target)


class ToFloatTensorInZeroOne:
    """Callable form of ``to_normalized_float_tensor``; holds no state."""

    def __call__(self, vid):
        converted = to_normalized_float_tensor(vid)
        return converted


class Normalize:
    """Callable that forwards a video to ``normalize`` with stored statistics."""

    def __init__(self, mean, std):
        self.mean, self.std = mean, std

    def __call__(self, vid):
        return normalize(vid, mean=self.mean, std=self.std)


class RandomHorizontalFlip:
    """Apply ``hflip`` to a video with probability ``p``, else return it as is."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, vid):
        # A single draw decides for the whole video.
        return hflip(vid) if random.random() < self.p else vid


class Pad:
    """Callable that forwards a video to ``pad`` with stored ``padding`` and ``fill``."""

    def __init__(self, padding, fill=0):
        self.padding, self.fill = padding, fill

    def __call__(self, vid):
        return pad(vid, self.padding, fill=self.fill)


### .

## Data Loading

In [None]:
base_dir = Path('/Users/rahulsomani/01_github_projects/video-classification/')
data_dir = base_dir/'data'

In [None]:
!tree {data_dir/'train'}

In [None]:
data = Kinetics400(data_dir/'train',
                   step_between_clips = 1,
                   extensions         = ('mp4',),
                   frames_per_clip    = 32,
                   frame_rate = None)

In [None]:
data[0][0].shape

## Seeing the Data to Understand What Exactly is in the Training Data

### Helper Functions

In [None]:
def get_metadata(data):
    """
    Takes in a `Kinetics400` dataset, computes the no. of frames in each data point
    and returns in a DataFrame

    Reads `data.metadata['video_paths']`, `['video_pts']` and `['video_fps']`
    and returns one row per video with the columns `Filename`, `# Frames`
    and `FPS`.
    """
    meta = data.metadata
    rows = [
        # `Path(...).name` honours the OS-native separator, unlike a manual
        # split on '/'.
        (Path(path).name, len(pts), fps)
        for path, pts, fps in zip(
            meta['video_paths'], meta['video_pts'], meta['video_fps'])
    ]
    return pd.DataFrame(rows, columns=['Filename', '# Frames', 'FPS'])

In [None]:
def plot_img(sub_plt, x, i):
    """
    Draw entry `i` of `x` on `sub_plt` (an `AxesSubplot`).
    `x` is one data sample of shape `(T, H, W, C)`, so `i` indexes `T`.
    The subplot is titled with the 1-based frame number and its
    axes are switched off.
    """
    frame, title = x[i], f'Frame #{i+1}'
    sub_plt.imshow(frame)
    sub_plt.set_title(title)
    sub_plt.axis('off')

In [None]:
def plot_adjacent(x, figsize=(8,5), i1=0, i2=31):
    """
    Plot entries `i1` and `i2` of `x` side by side in one figure.

    `x` is indexed along its first axis by `plot_img`, so both indices must
    exist in `x` (the default `i2=31` needs at least 32 entries).
    `figsize` is passed on to `plt.subplots`.
    """
    f, plots = plt.subplots(1, 2, figsize=figsize)
    # `plot_img` draws in place and returns None, so its result must not be
    # written back into the axes array.
    for sub_plt, idx in zip(plots, (i1, i2)):
        plot_img(sub_plt, x, idx)

    f.tight_layout()

### Seeing the Training Data

<br>
When the dataset of class `Kinetics400` is constructed with `frames_per_clip = 32`, it ignores any video that has fewer than 32 frames. <br>

As seen in the DataFrame below, `c2-sample1` and `c2-sample2` have fewer than 32 frames, and `[len(x) for x in data.video_clips.clips]` reveals that 0 clips were taken from both of these videos.

For `c1-sample1`, which has exactly 32 frames, one clip gets created, whereas for `c1-sample2`, 16 clips get created, leading to a dataset of `len = 17`. The dataset has 1 sample from `c1-sample1` and 16 samples from `c1-sample2`.

<br>

In [None]:
get_metadata(data)

In [None]:
len(data)

In [None]:
[len(x) for x in data.video_clips.clips]

---
<h3>Below are the actual videos</h3>

In [None]:
Video('../data/train/class1/c1-sample1.mp4', width=400)

In [None]:
Video('../data/train/class1/c1-sample2.mp4', width=400)

In [None]:
Video('../data/train/class2/c2-sample1.mp4', width=400)

In [None]:
Video('../data/train/class2/c2-sample2.mp4', width=400)

<br>

To fully understand what's happening, look at the first and last frames of selected data samples (indices 0, 1, 2 and 16) below.

<br>

In [None]:
plot_adjacent(data[0][0])

In [None]:
plot_adjacent(data[1][0])

In [None]:
plot_adjacent(data[2][0])

In [None]:
plot_adjacent(data[16][0])

<br>

---

## Data Augmentations -- Albumentations

In [None]:
import os
import numpy as np
from skimage.color import label2rgb

import albumentations as A
import random

### Read Video as `Torch Tensor`

This isn't nearly as efficient as `torchvision`'s video reader but is useful to quickly read in a video as a `torch.Tensor` for experiments.

In [None]:
file = '/Users/rahulsomani/01_github_projects/video-classification/data/train/class1/c1-sample1.mp4'

In [None]:
def read_video_tensor(file):
    """Decode `file` with ffmpeg and return all of its frames as a `torch.Tensor`.

    Frames are requested as raw `rgb24` bytes and reshaped to
    `(frames, height, width, 3)` with dtype `uint8`.

    Raises:
        ValueError: if `file` contains no video stream.
    """
    import ffmpeg

    # Select the video stream explicitly: the first stream of a container may
    # be audio, which carries no 'height'/'width' entries.
    streams = ffmpeg.probe(file)['streams']
    meta = next((s for s in streams if s.get('codec_type') == 'video'), None)
    if meta is None:
        raise ValueError(f'No video stream found in {file}')
    height, width = meta['height'], meta['width']

    out, _ = (
        ffmpeg
        .input(file)
        .output('pipe:', format='rawvideo', pix_fmt='rgb24')
        .run(capture_stdout=True)
    )

    vid = (
        np
        .frombuffer(out, np.uint8)
        .reshape([-1, height, width, 3])
    )

    return torch.from_numpy(vid)

In [None]:
vid = read_video_tensor(file)
vid.shape

### Albumentations on a Single Frame

<br>

`x` is one data sample i.e. a `tensor` of 32 frames/images. To apply albumentations, we'll first apply it on one single frame, then loop and apply over all the frames. As seen below, _not all_ `albumentations` can be applied to `torch.Tensor`s, so they first need to be converted to `np.array`. However, this is a good thing because the `albumentations` always work faster on `np.array` vs. `torch.Tensor`

In [None]:
x = data[3][0]
x.shape

In [None]:
plt.imshow(x[0])

In [None]:
def show(aug, img):
    """Apply `aug` to `img` and display the result in a new 6x6 figure.

    `aug` is called as `aug(image=img)` and must return a mapping with an
    `'image'` entry.
    """
    augmented = aug(image = img)['image']
    plt.figure(figsize=(6,6))
    plt.imshow(augmented)

In [None]:
%%time

random.seed(42)
show(A.ChannelShuffle(p=1), np.asarray(x[0]))

In [None]:
for i in x: print(i.shape)

In [None]:
%%time

random.seed(42)
show(A.ChannelShuffle(p=1), x[0])

In [None]:
%%time

random.seed(42)
show(A.ChannelShuffle(p=1), np.asarray(x[0]))

In [None]:
%%time

random.seed(42)
show(A.ToGray(p=1), x[0])

In [None]:
%%time

random.seed(42)
show(A.ToGray(p=1), np.asarray(x[0]))

### Albumentations on a Video i.e. List of Frames

<br>

Here, I define a function `aug_video` which applies a list of `albumentations` to a video. It ensures that the exact same transformation is applied to each frame of the video. This is a must because, for instance, you wouldn't want one frame of the video to be horizontally flipped while the next is not. This is done by calling `random.seed()` with the same value before each frame.

In [None]:
file = '/Users/rahulsomani/Desktop/tennis-dataset-1-minimal/train/forehand/point#1_shot#2_David_Ferrer_-_Best-Ever_Roadrunner_Points.mp4'

In [None]:
# `ffmpeg` is otherwise only imported inside `read_video_tensor`, so it has to
# be imported here for this cell to run on its own.
import ffmpeg

# Decode the whole file into raw RGB24 bytes captured from stdout.
out, _ = (
    ffmpeg
    .input(file)
    .output('pipe:', format='rawvideo', pix_fmt='rgb24')
    .run(capture_stdout=True)
)

# Take the frame size from the video stream instead of a fixed stream index,
# which depends on how the container happens to order audio and video.
meta = next(s for s in ffmpeg.probe(file)['streams'] if s.get('codec_type') == 'video')
height, width = meta['height'], meta['width']

vid = (
    np
    .frombuffer(out, np.uint8)
    .reshape([-1, height, width, 3])
)

In [None]:
vid.shape

In [None]:
tfms = A.Compose([
    A.HorizontalFlip(p=0.0),
    #A.ToGray(p=1),
    #A.CLAHE(p=1),
    A.Cutout(p=1),
    A.RandomRain(p=1),
    #A.ChannelDropout(p=1),
    #A.ChannelShuffle(p=1),
    #A.InvertImg(p=1)
])

In [None]:
def aug_video(vid, tfms):
    """Apply `tfms` to every frame of `vid` using the same random seed.

    Args:
        vid: iterable of frames; each frame is converted with `np.asarray`.
        tfms: callable invoked as `tfms(image=frame)` that returns a mapping
            with an `'image'` entry (e.g. an `albumentations` `Compose`).

    Returns:
        `torch.Tensor` with the augmented frames stacked along a new first axis.
    """
    seed = random.randint(0,99999)
    # Re-seeding below would otherwise leave the global RNG in a state that
    # depends only on `seed`, so later seeds would follow a short
    # deterministic chain. Snapshot the state and restore it afterwards.
    rng_state = random.getstate()
    aug_vid = []
    try:
        for x in vid:
            # Same seed before every frame -> same draws from `random`.
            random.seed(seed)
            aug_vid.append((tfms(image = np.asarray(x)))['image'])
    finally:
        random.setstate(rng_state)
    # NOTE(review): only Python's `random` is synchronised here; transforms
    # that draw from another RNG may still differ per frame -- confirm for
    # the albumentations version in use.
    return torch.from_numpy(np.stack(aug_vid))

In [None]:
plot_adjacent(vid, figsize=(16, 9), i1=10, i2=20)

In [None]:
vidaug = (aug_video(vid, tfms))
vidaug.shape
plot_adjacent(vidaug, figsize=(16, 9), i1=10, i2=20)

In [None]:
vid.shape

In [None]:
ToFloatTensorInZeroOne

### Testing to see if the `ToFloatTensorInZeroOne` and `Normalize` from the torchvision Video Transforms can be incorporated in the same `Compose` list as `albumentations`


In [None]:
tmp = torch.from_numpy(vid)
tmp.shape

In [None]:
to_normalized_float_tensor(tmp).shape

In [None]:
tmp.shape[-2:]

In [None]:
def normalize(vid, mean, std):
    """Standardise `vid` along its first axis: `(vid - mean) / std`.

    Args:
        vid: tensor whose first axis lines up with the entries of
            `mean` and `std`.
        mean: sequence (or tensor) of values subtracted per first-axis entry.
        std: sequence (or tensor) of divisors per first-axis entry.

    Returns:
        The standardised tensor, same shape as `vid`.
    """
    # One value per first-axis entry, broadcast over every remaining axis.
    shape = (-1,) + (1,) * (vid.dim() - 1)
    # Create the statistics on the video's own device; without this a
    # non-CPU `vid` fails with a device-mismatch error.
    mean = torch.as_tensor(mean, device=vid.device).reshape(shape)
    std = torch.as_tensor(std, device=vid.device).reshape(shape)
    return (vid - mean) / std

mean = [0.43216, 0.394666, 0.37645]
std  = [0.22803, 0.22145, 0.216989]

In [None]:
shape = (-1,) + (1,) * (tmp.dim() - 1)
shape

In [None]:
torch.as_tensor(mean).shape

In [None]:
# in the official code
(torch.as_tensor(mean).reshape(shape)).shape

In [None]:
# recreating to see if same result can be obtained with `C` axis at end instead of beginning
torch.as_tensor(mean)[..., None, None, None].shape

In [None]:
torch.as_tensor(mean)[None, None, None].shape

In [None]:
raw_shape = (tmp/255. - torch.as_tensor(mean)[None, None, None])
raw_shape.shape

In [None]:
test_target = to_normalized_float_tensor(tmp) - torch.as_tensor(mean).reshape(shape)
test_target.shape

In [None]:
to_normalized_float_tensor(raw_shape).shape == test_target.shape

In [None]:
to_normalized_float_tensor(raw_shape) == test_target

---

### Combining Torchvision Normalising and Albumentations

<br>

**They can't** be combined in one `Compose` (or it's too painful to make it happen). <br>
Instead, in the following chunk of `__getitem__` function of `Kinetics400`...

```python
if self.transform is not None:
    video = self.transform(video) 
```

... we can pass in only the normalisation functions, and then add another line to call the `albumentations` transformations, like so:

```python
if self.transform is not None:
    video = self.transform(video) # Torchvision `ToFloatTensorInZeroOne` and `Normalize`. Returns shape (C,T,H,W)
    video = self.tfms_albumentations(video) # Albumentations Transforms
```

In order to do so, we'll need to tweak the `aug_video` function defined in the above section to `permute` to the appropriate shape `(T,H,W,C)`, and after the `albumentations` transforms are done, `permute` back to shape `(C,T,H,W)`

In [None]:
tfms_torch = torchvision.transforms.Compose([
    ToFloatTensorInZeroOne(),
    Normalize(mean=[0.43216, 0.394666, 0.37645],
              std=[0.22803, 0.22145, 0.216989])
])

In [None]:
tfms_torch(vidaug).shape

In [None]:
tfms_torch(vidaug).permute(1,2,3,0).shape

Redefine `aug_video` for combining both kinds of transformations -- accommodate changing sizes

In [None]:
def aug_video(vid, tfms):
    """Apply `tfms` to every frame of a `(C,T,H,W)` video using the same seed.

    Args:
        vid: `torch.Tensor`; it is permuted with `(1,2,3,0)` so that
            iterating yields one frame at a time.
        tfms: callable invoked as `tfms(image=frame)` that returns a mapping
            with an `'image'` entry (e.g. an `albumentations` `Compose`).

    Returns:
        `torch.Tensor` of the stacked augmented frames, permuted back with
        `(3,0,1,2)`.
    """
    seed = random.randint(0,99999)
    # Re-seeding below would otherwise leave the global RNG in a state that
    # depends only on `seed`, so later seeds would follow a short
    # deterministic chain. Snapshot the state and restore it afterwards.
    rng_state = random.getstate()
    vid = vid.permute(1,2,3,0) # added line of code
    aug_vid = []
    try:
        for x in vid:
            # Same seed before every frame -> same draws from `random`.
            random.seed(seed)
            aug_vid.append((tfms(image = np.asarray(x)))['image'])
    finally:
        random.setstate(rng_state)
    # NOTE(review): only Python's `random` is synchronised here; transforms
    # that draw from another RNG may still differ per frame -- confirm for
    # the albumentations version in use.
    return torch.from_numpy(np.stack(aug_vid)).permute(3,0,1,2)

In [None]:
def combine_tfms(vid, tfms_torch, tfms_albu):
    """Run `tfms_torch` on `vid`, then `aug_video` with `tfms_albu` on the result.

    A `vid` that is not already a `torch.Tensor` is first converted with
    `torch.from_numpy`.
    """
    tensor = vid if isinstance(vid, torch.Tensor) else torch.from_numpy(vid)
    normalised = tfms_torch(tensor)
    return aug_video(normalised, tfms_albu)

In [None]:
tfms_albu = A.Compose([
    A.HorizontalFlip(p=1),
    A.ChannelShuffle(p=1)
])

In [None]:
vid.shape

In [None]:
combined = combine_tfms(vid, tfms_torch, tfms_albu)
combined.shape

In [None]:
plot_adjacent(combined.permute(1,2,3,0), figsize=(16,10))