## PyTorchVideo - Getting started

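This notebook works through the PyTorchVideo quickstart tutorial (https://pytorchvideo.org/#quickstart): it loads a pretrained video classification model from Torch Hub and predicts Kinetics labels for a local video clip.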

In [None]:
%matplotlib inline

import json
import os
import pathlib
import random

import torch
from torchvision.transforms import Compose, Lambda
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

In [None]:
# Directory holding the video clips; one is picked at random further down.
DATA_DIR = pathlib.Path('/data/autotrim/dev')
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines
# without CUDA (a hard-coded "cuda" makes model.to(DEVICE) fail there).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Torch Hub entry point name passed to torch.hub.load().
MODEL_NAME = "slowfast_r50"

In [None]:
!wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json

--2023-06-24 17:22:01--  https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.202.10, 18.164.202.62, 18.164.202.120, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.202.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10326 (10K) [text/plain]
Saving to: ‘kinetics_classnames.json.1’


2023-06-24 17:22:01 (419 MB/s) - ‘kinetics_classnames.json.1’ saved [10326/10326]



In [None]:
# Parse the downloaded class-name file.
with open("kinetics_classnames.json", "r") as fh:
    kinetics_classnames = json.load(fh)

# Invert the loaded mapping: each value becomes a key that points at the original
# key, converted to str with any double-quote characters removed.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

In [None]:
# Fetch the pretrained MODEL_NAME entry point from the pytorchvideo hub repo,
# move it to DEVICE and switch it to evaluation mode.
model = (
    torch.hub.load('facebookresearch/pytorchvideo', MODEL_NAME, pretrained=True)
    .to(DEVICE)
    .eval()
)

Using cache found in /home/shikhar/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [None]:
# Pick one entry of DATA_DIR at random to run the model on.
clips = os.listdir(DATA_DIR)
clip_path = DATA_DIR.joinpath(random.choice(clips))

In [None]:
# Open the randomly chosen clip as an EncodedVideo; later cells pull frame
# ranges out of it with video.get_clip().
video = EncodedVideo.from_path(clip_path)

In [None]:
####################
# SlowFast transform
####################

# Spatial preprocessing: short-side length for ShortSideScale and the size
# handed to CenterCropVideo.
side_size = 256
crop_size = 256

# Per-channel statistics handed to NormalizeVideo.
mean = [0.45] * 3
std = [0.225] * 3

# Temporal sampling: frames kept by UniformTemporalSubsample, plus the values
# used to derive clip_duration.
num_frames = 32
sampling_rate = 2
frames_per_second = 30

# Fast-to-slow pathway frame ratio (the slow pathway keeps num_frames // alpha frames).
alpha = 4

class PackPathway(torch.nn.Module):
    """
    Split a video tensor into the ``[slow_pathway, fast_pathway]`` list.

    The fast pathway is the input unchanged. The slow pathway keeps
    ``frames.shape[1] // alpha`` frames, picked at evenly spaced positions
    along dim 1 of the input.

    Args:
        alpha: ratio between the number of fast-pathway and slow-pathway
            frames. When omitted (``None``), the notebook-level ``alpha``
            constant is read at call time, exactly as before, so existing
            ``PackPathway()`` calls keep their behaviour.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """Return ``[slow_pathway, fast_pathway]`` for ``frames`` (sampled along dim 1)."""
        # Explicit constructor value wins; otherwise fall back to the global `alpha`.
        ratio = self.alpha if self.alpha is not None else alpha
        fast_pathway = frames
        # Evenly spaced frame indices for the slow pathway. The index tensor is built
        # on the same device as `frames`: index_select requires both on one device,
        # and a CPU-only index fails once the frames live on another device.
        slow_indices = torch.linspace(
            0, frames.shape[1] - 1, frames.shape[1] // ratio, device=frames.device
        ).long()
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        return [slow_pathway, fast_pathway]

# Preprocessing steps, applied in order to the tensor stored under the "video" key.
video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),
]
transform = ApplyTransformToKey(key="video", transform=Compose(video_steps))

# Clip length in seconds that the model expects: the number of sampled frames
# times the sampling rate, divided by the assumed frames per second.
clip_duration = num_frames * sampling_rate / frames_per_second

In [None]:
clip_start_sec = 0.0  # secs
clip_end_sec = clip_start_sec + clip_duration

# Cut the clip out of the video and run the preprocessing transform over it.
video_data = transform(video.get_clip(start_sec=clip_start_sec, end_sec=clip_end_sec))

# Move every pathway tensor to DEVICE and prepend a batch dimension.
inputs = [pathway.to(DEVICE).unsqueeze(0) for pathway in video_data["video"]]

In [None]:
# Inference only: torch.no_grad() stops autograd from recording the forward pass,
# so no graph or intermediate activations are kept alive for a backward pass
# that never happens.
with torch.no_grad():
    logits = model(inputs)
    # Turn the raw scores into probabilities along dim 1.
    preds = torch.nn.functional.softmax(logits, dim=1)

# Indices of the five highest-probability classes for each batch entry.
pred_class_ids = preds.topk(k=5).indices

In [None]:
# Map the top class ids of the first (only) batch entry to their label names.
top_ids = pred_class_ids[0].tolist()
pred_class_names = [kinetics_id_to_classname[class_id] for class_id in top_ids]
print("Predicted labels: %s" % ", ".join(pred_class_names))

Predicted labels: archery, throwing axe, playing paintball, playing didgeridoo, tai chi


In [None]:
# Exploratory: list the attributes of the loaded video object
# (the output includes `duration`, `get_clip` and `name`).
dir(video)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_audio',
 '_audio_start_pts',
 '_audio_time_base',
 '_container',
 '_decode_audio',
 '_duration',
 '_has_audio',
 '_pyav_decode_video',
 '_selective_decoding',
 '_video',
 '_video_name',
 '_video_start_pts',
 '_video_time_base',
 'close',
 'duration',
 'from_path',
 'get_clip',
 'name']

In [None]:
# Decode the first ten seconds of the video, without the model transform.
clip = video.get_clip(start_sec=0, end_sec=10)

In [None]:
# Inspect the decoded tensor's dimensions.
# presumably (channels, frames, height, width) — TODO confirm against the pytorchvideo docs
clip['video'].shape

torch.Size([3, 600, 1440, 1920])

In [None]:
# Take the first frame (index 0 on dim 1) as pixel data. The previous version stored
# `.shape` here, so `frame` was a torch.Size and Image.fromarray() raised AttributeError.
# permute(1, 2, 0) moves the leading size-3 dim last, giving the (H, W, 3) layout that
# PIL's 'RGB' mode expects; the cast to uint8 is what fromarray() needs for 'RGB'.
# assumes raw values are in the 0..255 range (the transform cell divides by 255.0) — TODO confirm
frame = (
    clip['video'][:, 0, :, :]
    .permute(1, 2, 0)
    .clamp(0, 255)
    .to(torch.uint8)
    .numpy()
)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Build a PIL image from `frame`. fromarray() needs array-like pixel data; the run
# recorded below failed because `frame` held a torch.Size rather than pixels.
img = Image.fromarray(frame, 'RGB')

AttributeError: 'torch.Size' object has no attribute '__array_interface__'