In [1]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
# Filesystem locations used throughout the notebook.
model_path = "/home/bhaswata08/Documents/ISLV2/model_chkpt_450"
dataset_root_path = "/home/bhaswata08/Documents/ISLV2/Datav1"

# The image processor and the classifier are both restored from the same
# `model_path` directory.
image_processor = VideoMAEImageProcessor.from_pretrained(model_path)
trained_model = VideoMAEForVideoClassification.from_pretrained(model_path)

In [2]:
import torch

In [3]:
import os
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

# Normalisation statistics are taken from the loaded image processor.
mean, std = image_processor.image_mean, image_processor.image_std

# The processor's `size` dict holds either a single "shortest_edge" entry or
# explicit "height" / "width" entries; handle both.
if "shortest_edge" not in image_processor.size:
    height, width = image_processor.size["height"], image_processor.size["width"]
else:
    height = width = image_processor.size["shortest_edge"]
resize_to = (height, width)

# clip_duration = frames the model expects * sample_rate / fps.
# NOTE(review): sample_rate=4 and fps=30 are hard-coded — confirm they match
# the source videos.
num_frames_to_sample = trained_model.config.num_frames
sample_rate, fps = 4, 30
clip_duration = num_frames_to_sample * sample_rate / fps


# Preprocessing applied to the "video" entry of every sample, in order:
#   1. temporally subsample to `num_frames_to_sample` frames
#   2. divide by 255 (presumably pixel values arrive in the 0-255 range)
#   3. normalise with the processor's mean / std
#   4. resize to `resize_to`
val_transform = Compose([
    ApplyTransformToKey(
        key="video",
        transform=Compose([
            UniformTemporalSubsample(num_frames_to_sample),
            Lambda(lambda frames: frames / 255.0),
            Normalize(mean, std),
            Resize(resize_to),
        ]),
    ),
])

# Build the test dataset with pytorchvideo's `Ucf101` loader over the "test"
# subfolder of `dataset_root_path`. Clips come from a "uniform" clip sampler
# configured with `clip_duration`, audio decoding is disabled, and every
# sample is passed through `val_transform`.
test_dataset = pytorchvideo.data.Ucf101(
    data_path=os.path.join(dataset_root_path, "test"),
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)


In [4]:
# Sanity check: show how many videos the test dataset reports.
test_dataset.num_videos

14

In [5]:
all_video_file_paths = dataset_root_path

# Class names are the sub-directory names of the "train" folder, sorted so
# that the integer ids are assigned in a stable, alphabetical order.
class_labels = sorted(
    entry.name
    for entry in os.scandir(os.path.join(dataset_root_path, "train"))
    if entry.is_dir()
)

# Forward (name -> id) and reverse (id -> name) lookup tables.
label2id = {name: idx for idx, name in enumerate(class_labels)}
id2label = {idx: name for name, idx in label2id.items()}


In [10]:
# Display the id -> class-name mapping built from the "train" sub-directories.
id2label

{0: 'Beautiful',
 1: 'Blind',
 2: 'Deaf',
 3: 'Ugly',
 4: 'happy',
 5: 'loud',
 6: 'quiet',
 7: 'sad'}

In [7]:
def investigate_video(sample_video, label_names=None):
    """Print every field of a single video sample, then its class name.

    Args:
        sample_video: Mapping describing one clip. Its "video" entry must
            expose a ``.shape`` attribute and its "label" entry must be a
            key of the label mapping.
        label_names: Optional mapping from label id to class name. When
            omitted, the notebook-level ``id2label`` mapping is used.

    Raises:
        KeyError: If ``sample_video`` has no "label" entry, or if its label
            id is missing from the label mapping.
    """
    if label_names is None:
        label_names = id2label

    for key in sample_video:
        if key == "video":
            # Only the shape is printed for the video entry, not its contents.
            print(key, sample_video["video"].shape)
        else:
            print(key, sample_video[key])

    # Read the label by name. Reusing the loop variable after the loop would
    # only work when "label" happens to be the last key of the sample.
    label_id = sample_video["label"]
    print(f"Video label: {label_names[label_id]}")


# investigate_video(sample_video)

In [20]:
# Take the first sample yielded by the test dataset and print its fields.
sample_test_video = next(iter(test_dataset))
investigate_video(sample_test_video)


video torch.Size([3, 16, 224, 224])
video_name MVI_9568.mp4
video_index 13
clip_index 0
aug_index 0
label 7
Video label: sad


In [21]:
def run_inference(model, video, label=None):
    """Run a forward pass on one preprocessed video and return the logits.

    The video is assumed to be preprocessed already.

    Args:
        model: Module invoked as ``model(pixel_values=..., labels=...)`` whose
            output exposes a ``.logits`` attribute. It is moved to the
            selected device as a side effect.
        video: Tensor laid out as (num_channels, num_frames, height, width).
        label: Optional class id for this video. When given it is forwarded
            to the model as ``labels``; when omitted no ``labels`` entry is
            sent. Either way only the logits are returned.

    Returns:
        The ``logits`` of the model output for a batch of size one.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # (num_channels, num_frames, H, W) -> (num_frames, num_channels, H, W),
    # then add a leading batch dimension.
    pixel_values = video.permute(1, 0, 2, 3).unsqueeze(0)

    inputs = {"pixel_values": pixel_values}
    if label is not None:
        # The label is an explicit argument, so the function never reads
        # notebook globals and cannot attach another video's label.
        inputs["labels"] = torch.tensor([label])

    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    model = model.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.logits


In [22]:
# Run the loaded model on the "video" tensor of the sampled test clip.
logits = run_inference(trained_model, sample_test_video["video"])

In [23]:
# Index of the largest logit along the last dimension, as a plain Python number.
predicted_class_idx = logits.argmax(-1).item()
# Translate the index using the id2label mapping stored in the model config.
print("Predicted class:", trained_model.config.id2label[predicted_class_idx])

Predicted class: sad
