# Infer using a Transformer

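> Module for training on a dataset of embeddings. This notebook prepares those embeddings: it runs the pretrained `slow_r50` video model on SSBD clips and caches the intermediate features to disk.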

In [None]:
%reload_ext nb_black
%reload_ext autoreload
%autoreload 2

from nbdev.showdoc import *
import sys

# Add the directory two levels up to the module search path so that project
# code can be imported from this notebook. The path is relative, so it
# presumably relies on the kernel's working directory being this notebook's
# folder — verify if imports fail.
__root = "../../"
sys.path.append(__root)

In [None]:
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo,
)
import pandas as pd

In [None]:
# Spatial settings: short side is resized to `side_size`, then a square
# `crop_size` x `crop_size` centre crop is taken.
side_size = 256
crop_size = 256

# Per-channel normalisation statistics applied after scaling pixels by 1/255.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]

# Temporal settings.
num_frames = 64
# sampling_rate = 2
frames_per_second = 30
alpha = 4

# model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)

# Pipeline applied to the tensor stored under the "video" key of a clip dict.
# The steps run in the order listed.
transform = ApplyTransformToKey(
    "video",
    Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda clip: clip / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(side_size),
            CenterCropVideo((crop_size, crop_size)),
        ]
    ),
)

In [None]:
from torch_snippets import *

import os

# Root folder of the SSBD dataset. The default is the original hard-coded mount
# point; set the SSBD_ROOT environment variable to run the notebook on another
# machine without editing this cell.
root = P(os.environ.get("SSBD_ROOT", "/mnt/347832F37832B388/ml-datasets/ssbd"))

# Clip annotations; other cells read the `video`, `start` and `end` columns.
annotations = pd.read_csv(root / "annotations.csv")

In [None]:
# Preprocessing settings used by `transform` below.
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
alpha = 4

# Pretrained slow_r50 from the PyTorchVideo hub. torch.hub.load does not switch
# the module to inference mode (nn.Module defaults to training=True), so call
# eval() here; otherwise BatchNorm layers keep using per-batch statistics in
# the inference and feature-extraction cells.
model = torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True)
model = model.eval()


def transform(num_frames=num_frames):
    """Build the preprocessing pipeline for the "video" entry of a clip dict.

    Args:
        num_frames: number of frames kept by the uniform temporal subsample.
            Defaults to the value of the module-level `num_frames` at the time
            this cell is run.

    Returns:
        An `ApplyTransformToKey` that, in order, subsamples frames, divides
        pixel values by 255, normalises with `mean`/`std`, rescales the short
        side to `side_size` and centre-crops to `crop_size` x `crop_size`.
    """
    return ApplyTransformToKey(
        key="video",
        transform=Compose(
            [
                UniformTemporalSubsample(num_frames),
                Lambda(lambda x: x / 255.0),
                NormalizeVideo(mean, std),
                ShortSideScale(size=side_size),
                CenterCropVideo(crop_size=(crop_size, crop_size)),
            ]
        ),
    )

In [None]:
# Restrict to annotations whose clip runs from second 3 to second 7, pick one
# of them with `choose`, and display it as a single-row table.
_clips_3_to_7 = annotations.query("start == 3 and end == 7")
row = choose(_clips_3_to_7)
show(row.to_frame().T)

Model from Hub

```python
import json

# Load the pretrained network, switch it to inference mode (torch.hub.load
# leaves the module in its default training mode, which changes BatchNorm
# behaviour), then move it to the GPU.
model = (
    torch.hub.load("facebookresearch/pytorchvideo", "slow_r50", pretrained=True)
    .eval()
    .cuda()
)

with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames = json.load(f)

# The JSON maps class name -> class id. Invert it so predictions (ids) can be
# turned into names, stripping any double quotes embedded in the names.
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}
```

```python
# Decode the annotated segment of the selected video.
raw_videos_folder = root / "ssbd-raw-videos"
video = EncodedVideo.from_path(raw_videos_folder / f"{row.video}.mp4")
video_data = video.get_clip(start_sec=row.start, end_sec=row.end)
Info(video_data)

# Preprocess with the default pipeline.
# Untested alternative kept from an earlier draft:
# num_frames = (row.end - row.start) * 10
# video_data = transform_num_frames(num_frames)(video_data)
video_data = transform()(video_data)
Info(video_data)

# Prepend an axis of size 1 before handing the clip to the model.
inputs = video_data["video"].unsqueeze(0)
Info(inputs)

# Generate top 5 predictions
with torch.no_grad():
    preds = model(inputs.cuda())

preds = preds.softmax(dim=-1)
print(preds)
pred_class_ids = torch.topk(preds, k=5).indices

pred_class_names = []
for class_id in pred_class_ids[0]:
    pred_class_names.append(kinetics_id_to_classname[int(class_id)])
print("Predicted labels: %s" % ", ".join(pred_class_names))
```
---

```python
# Folder of pre-extracted frame tensors (one dill file per video).
frames_folder = root / "ssbd-frames/10fps"

# Same pipeline as `transform`, minus the division by 255.
mean_transform = ApplyTransformToKey(
    "video",
    Compose(
        [
            UniformTemporalSubsample(num_frames),
            NormalizeVideo(mean, std),
            ShortSideScale(side_size),
            CenterCropVideo((crop_size, crop_size)),
        ]
    ),
)

for frames_path in Glob(frames_folder):
    # Swap the first two axes of the stored tensor before preprocessing.
    frames = loaddill(frames_path).permute(1, 0, 2, 3)
    # frames = {"video": loaddill(frames_folder / "198.frames.tensor").permute(1, 0, 2, 3)}
    frames = {"video": frames}
    Info(frames)
    frames = mean_transform(frames)["video"].unsqueeze(0)
    Info(frames)

    with torch.no_grad():
        preds = model(frames.cuda())

    preds = preds.softmax(dim=-1)
    print(preds)
    pred_class_ids = torch.topk(preds, k=5).indices

    pred_class_names = []
    for class_id in pred_class_ids[0]:
        pred_class_names.append(kinetics_id_to_classname[int(class_id)])
    print("Predicted labels: %s" % ", ".join(pred_class_names))
```

In [None]:
# Folder of pre-extracted frame tensors (one dill file per video).
frames_folder = root / "ssbd-frames/10fps"

# Only mean/std normalisation is applied here: no temporal subsampling, resize
# or crop. NOTE(review): there is no division by 255, so this assumes the
# stored frames are already scaled to [0, 1] — confirm against the extraction step.
mean_transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            NormalizeVideo(mean, std),
        ]
    ),
)

# Use the first four blocks of the model as a feature extractor on the CPU.
# eval() is required: under torch.no_grad() BatchNorm layers in training mode
# still normalise with per-clip statistics and update their running stats,
# which would make the cached features depend on processing order.
# Note that .cpu()/.eval() act in place on modules shared with `model`.
feature_extractor = nn.Sequential(*model.blocks[:4]).cpu().eval()
features_folder = root / "ssbd-frames-features/10fps/slow_r50/"
makedir(features_folder)

for frames_path in (tracker := track2(Glob(frames_folder))):
    item = stem(frames_path)
    # These two items are skipped explicitly; the reason is not recorded in
    # this notebook.
    if item in ["477.frames", "407.frames"]:
        continue
    to = features_folder / f"{item}.features.tensor"
    # Resume support: never recompute features that are already on disk.
    if exists(to):
        continue
    frames = loaddill(frames_path).permute(1, 0, 2, 3)
    frames = {"video": frames}
    frames = mean_transform(frames)["video"][None]
    # Report the tensor shape rather than its full contents in the progress text.
    tracker.send(f"processing {item} @ {tuple(frames.shape)}")
    with torch.no_grad():
        try:
            # The extractor lives on the CPU, so its output is already a CPU tensor.
            preds = feature_extractor(frames.cpu())
            dumpdill(preds, to, silent=True)
        except Exception as e:
            # Best effort: log the failure for this item and keep going.
            Warn(f"{e} @ {item}")

In [None]:
# Show the loaded model's repr as the cell output.
model