## Clip Splicing Example

In [None]:
from slowfast.config.defaults import get_cfg
from slowfast.datasets.dataset_utils import RandomKinetics, RandomSsv2, RandomEpickitchens, SameClassKinetics, SameClassSsv2, SameClassEpickitchens, KLSimilarityEpickitchens
from slowfast.datasets.utils import revert_tensor_normalize
from slowfast.utils.checkpoint import load_test_checkpoint
from slowfast.models.build import MODEL_REGISTRY

import torch
import numpy as np

import tome

import random

In [None]:
# Start from the project's default config and overlay the experiment's
# config file on top of it.
cfg = get_cfg()
cfg.merge_from_file('<CONFIG-PATH>')

# Notebook-specific overrides, applied after the merge so they take effect
# regardless of what the config file specifies.
cfg.NUM_GPUS = 1
cfg.SUBSAMPLER.DISTRIBUTION_PATH = "<DISTRIBUTION-PATH>"

In [None]:
# Resolve the model class registered under the configured name, build it
# from the config, and move it to the CPU.
name = cfg.MODEL.MODEL_NAME
model_cls = MODEL_REGISTRY.get(name)
model = model_cls(cfg)
model = model.to('cpu')

# Load the checkpoint referenced by the config into the freshly built model.
load_test_checkpoint(cfg, model)

In [None]:
# Per-architecture settings used by the rest of the notebook:
#   patch_attr     - name of the function in `tome.patch` that patches the model
#   class_token    - forwarded to the visualisation helpers
#   patch_depth    - third element of the visualisation `patch_size`, also
#                    passed to the splicing datasets
#   nrows          - stride used when sub-sampling frames for display
#   r              - value assigned to `model.r` before inference
#   num_to_replace - passed to the splicing datasets
MODEL_SETTINGS = {
    'TimeSformer': dict(patch_attr='timesformer', class_token=False,
                        patch_depth=1, nrows=1, r=18, num_to_replace=4),
    'Motionformer': dict(patch_attr='motionformer', class_token=False,
                         patch_depth=2, nrows=2, r=18, num_to_replace=4),
    'ViViT': dict(patch_attr='vivit', class_token=True,
                  patch_depth=2, nrows=4, r=300, num_to_replace=4),
    'VideoMAE': dict(patch_attr='videomae', class_token=False,
                     patch_depth=2, nrows=2, r=150, num_to_replace=4),
}

# Fail loudly on an unsupported model instead of leaving the settings
# undefined (or stale from a previous run of this cell).
if cfg.MODEL.MODEL_NAME not in MODEL_SETTINGS:
    raise ValueError(
        f"Unsupported model {cfg.MODEL.MODEL_NAME!r}; "
        f"expected one of {sorted(MODEL_SETTINGS)}"
    )

model_settings = MODEL_SETTINGS[cfg.MODEL.MODEL_NAME]

# Look the patch function up lazily so that only the selected architecture's
# patch needs to exist in `tome.patch`.
patch_func = getattr(tome.patch, model_settings['patch_attr'])
class_token = model_settings['class_token']
patch_depth = model_settings['patch_depth']
nrows = model_settings['nrows']
r = model_settings['r']
num_to_replace = model_settings['num_to_replace']

# trace_source=True is what makes `model._tome_info['source']` available
# for the visualisations further down - TODO confirm against tome.
patch_func(model, trace_source=True, verbose=True)

In [None]:
splice_type = 'kl_similarity'
match_type = 'noun'

# Supported (dataset, splice type) combinations and the dataset class that
# implements each of them.
SPLICED_DATASETS = {
    ('kinetics', 'random'): RandomKinetics,
    ('kinetics', 'same_class'): SameClassKinetics,
    ('ssv2', 'random'): RandomSsv2,
    ('ssv2', 'same_class'): SameClassSsv2,
    ('epickitchens', 'random'): RandomEpickitchens,
    ('epickitchens', 'same_class'): SameClassEpickitchens,
    ('epickitchens', 'kl_similarity'): KLSimilarityEpickitchens,
}

dataset_key = (cfg.TEST.DATASET, splice_type)

# Fail loudly on an unsupported combination instead of silently leaving
# `dataset` undefined (or stale from a previous run of this cell).
if dataset_key not in SPLICED_DATASETS:
    raise ValueError(
        f"Unsupported dataset/splice_type combination {dataset_key!r}; "
        f"expected one of {sorted(SPLICED_DATASETS)}"
    )

dataset_kwargs = dict(
    cfg=cfg,
    mode='test',
    patch_depth=patch_depth,
    num_to_replace=num_to_replace,
)

# Only the EPIC-Kitchens datasets take a `match_type` argument.
if cfg.TEST.DATASET == 'epickitchens':
    dataset_kwargs['match_type'] = match_type

# NOTE(review): the KL-similarity dataset has always been built with a
# hard-coded patch_depth of 4 rather than the per-model `patch_depth`.
# Preserved as-is; confirm whether this is intentional.
if splice_type == 'kl_similarity':
    dataset_kwargs['patch_depth'] = 4

dataset = SPLICED_DATASETS[dataset_key](**dataset_kwargs)

In [None]:
# Fetch one spliced sample; its first two entries are the video and the label.
clip = dataset[1]
video = clip[0]
label = clip[1]

# Undo the mean/std normalisation on the first entry of `video`.
# The tensor's leading axis is moved to the end for the call and moved back
# afterwards (presumably channels-first <-> channels-last - TODO confirm).
trailing_axis_view = video[0].permute(1, 2, 3, 0)
denormalised = revert_tensor_normalize(trailing_axis_view, cfg.DATA.MEAN, cfg.DATA.STD)
video[0] = denormalised.permute(3, 0, 1, 2)

In [None]:
# Apply the per-model `r` setting chosen earlier before running the model.
model.r = r

# Run one forward pass on the CPU with autograd tracking disabled.
with torch.inference_mode():
    # Prepend a batch axis of size 1 to every tensor in `video`.
    batch = [stream[None].to('cpu') for stream in video]
    output = model(batch)

In [None]:
# Token source information recorded by the patched model during the forward pass.
source = model._tome_info['source']

# Arguments shared by both visualisation helpers.
swapped_axes_video = video[0].permute(1, 0, 2, 3)  # swap the first two axes
vis_kwargs = dict(
    patch_size=(16, 16, patch_depth),
    class_token=class_token,
    average_colour=True,
)

if cfg.MODEL.MODEL_NAME in ['TimeSformer', 'Motionformer']:
    visualisation = tome.make_spatial_video_visualization(
        swapped_axes_video, source, **vis_kwargs
    )
elif cfg.MODEL.MODEL_NAME in ['ViViT', 'VideoMAE']:
    # With separate=True the helper returns a second value alongside the visualisation.
    visualisation, separate_tokens = tome.make_spatiotemporal_video_visualization(
        swapped_axes_video, source, separate=True, **vis_kwargs
    )

In [None]:
# Move the leading axis of the video to the end, scale by 255 and cast to
# uint8 so the frames can be shown as images.
trailing_axis_frames = video[0].permute(1, 2, 3, 0)
scaled_frames = trailing_axis_frames.numpy() * 255
cast_video = np.uint8(scaled_frames)

# Show every `nrows`-th frame in a single row (left as a bare expression so
# the notebook renders the result).
tome.concatenate_images(cast_video[::nrows], ncols=8, nrows=1)

In [None]:
# Show every `nrows`-th frame of the merged-token visualisation in a single
# row, using the same frame stride as the original frames displayed earlier.
tome.concatenate_images(visualisation[::nrows], ncols=8, nrows=1)

In [None]:
import os

# Fixed seed so the same sample(s) are chosen on every run.
random.seed(13)

indices = random.sample(range(len(dataset)), k=1)

# Output location: ../paper/images/<model>/<model>_spliced_supplementary[...].png
model_name = cfg.MODEL.MODEL_NAME
file_stem = model_name.lower()
output_dir = f'../paper/images/{file_stem}'

# Create the output directory up front; saving fails if it does not exist.
os.makedirs(output_dir, exist_ok=True)

for i, index in enumerate(indices):
    clip = dataset[index]
    video, label = clip[0], clip[1]

    # Undo the mean/std normalisation on the first entry of `video`
    # (leading axis moved to the end for the call, then moved back).
    video[0] = revert_tensor_normalize(
        video[0].permute(1, 2, 3, 0), cfg.DATA.MEAN, cfg.DATA.STD
    ).permute(3, 0, 1, 2)

    # Forward pass on the CPU without autograd; the patched model records
    # the token source information read below.
    with torch.inference_mode():
        output = model([x[None].to('cpu') for x in video])

    source = model._tome_info['source']

    if model_name in ['TimeSformer', 'Motionformer']:
        visualisation = tome.make_spatial_video_visualization(
            video[0].permute(1, 0, 2, 3),
            source,
            patch_size=(16, 16, patch_depth),
            class_token=class_token,
            average_colour=True,
        )
    elif model_name in ['ViViT', 'VideoMAE']:
        visualisation, separate_tokens = tome.make_spatiotemporal_video_visualization(
            video[0].permute(1, 0, 2, 3),
            source,
            patch_size=(16, 16, patch_depth),
            class_token=class_token,
            average_colour=True,
            separate=True,
        )
    else:
        # Without this, a stale `visualisation` from an earlier cell would
        # be saved under the wrong name.
        raise ValueError(f"Unsupported model {model_name!r} for visualisation")

    cast_video = np.uint8(torch.permute(video[0], (1, 2, 3, 0)).numpy() * 255)

    # Original (spliced) frames, sub-sampled with stride `nrows`.
    tome.concatenate_images(cast_video[::nrows], ncols=8, nrows=1).save(
        f'{output_dir}/{file_stem}_spliced_supplementary_{i}.png'
    )

    # Matching merged-token visualisation for the same frames.
    tome.concatenate_images(visualisation[::nrows], ncols=8, nrows=1).save(
        f'{output_dir}/{file_stem}_spliced_supplementary_merged_{i}.png'
    )