## Layer Duplication Example

In [None]:
from slowfast.config.defaults import get_cfg
from slowfast.datasets.build import build_dataset
from slowfast.datasets.utils import revert_tensor_normalize
from slowfast.utils.checkpoint import load_test_checkpoint
from slowfast.models.build import MODEL_REGISTRY

import torch
import numpy as np

import tome

import random

In [None]:
# Start from the config object returned by get_cfg(), merge in the settings
# from the chosen config file, then set NUM_GPUS to 1.
cfg = get_cfg()
# '<CONFIG-PATH>' is a placeholder: replace it with the path to a real config file.
cfg.merge_from_file('<CONFIG-PATH>')
cfg.NUM_GPUS = 1

In [None]:
# Fetch the entry registered under the configured model name, call it with the
# config to build the model, and place the model on the CPU.
name = cfg.MODEL.MODEL_NAME
model = MODEL_REGISTRY.get(name)(cfg).to('cpu')
# NOTE(review): presumably loads the test-time checkpoint referenced by cfg
# into `model` in place (the return value is ignored here) - confirm against
# slowfast.utils.checkpoint.
load_test_checkpoint(cfg, model)

In [None]:
# Index of the layer that gets duplicated. The same value is used further down
# to build the per-layer reduction schedule assigned to `model.r`.
layer_to_duplicate = 11

# Per-model settings consumed by this cell and the cells that follow:
#   patch_func   - tome patching function applied to the model below
#   class_token  - forwarded to the visualisation helpers as `class_token=`
#   patch_depth  - third component of `patch_size=(16, 16, patch_depth)`
#   nrows        - stride used when sub-sampling frames for display (`[::nrows]`)
#   r            - value placed in the `model.r` schedule for the duplicated layers
# NOTE(review): the literal 12 passed to the duplicate_* helpers is presumably
# the number of copies; it has to stay in sync with the `[r] * 12` term of the
# `model.r` schedule - confirm against tome.patch.
if cfg.MODEL.MODEL_NAME == 'TimeSformer':
    patch_func = tome.patch.timesformer
    class_token = False
    patch_depth = 1
    nrows = 1
    r = 18
    tome.patch.duplicate_timesformer(model, layer_to_duplicate, 12)
elif cfg.MODEL.MODEL_NAME == 'Motionformer':
    patch_func = tome.patch.motionformer
    class_token = False
    patch_depth = 2
    nrows = 2
    r = 18
    tome.patch.duplicate_motionformer(model, layer_to_duplicate, 12)
elif cfg.MODEL.MODEL_NAME == 'ViViT':
    patch_func = tome.patch.vivit
    class_token = True
    patch_depth = 2
    nrows = 4
    r = 300
    tome.patch.duplicate_vivit(model, layer_to_duplicate, 12)
elif cfg.MODEL.MODEL_NAME == 'VideoMAE':
    patch_func = tome.patch.videomae
    class_token = False
    patch_depth = 2
    nrows = 2
    r = 150
    tome.patch.duplicate_videomae(model, layer_to_duplicate, 12)
else:
    # Without this branch an unsupported model name would fall through and the
    # call below would fail with an unrelated NameError on `patch_func`.
    raise ValueError(
        f"Unsupported model {cfg.MODEL.MODEL_NAME!r}; expected one of "
        "'TimeSformer', 'Motionformer', 'ViViT' or 'VideoMAE'."
    )

# Apply the model-specific patch. A later cell reads `model._tome_info['source']`,
# which is presumably what `trace_source=True` enables - confirm in tome.patch.
patch_func(model, trace_source=True, head_aggregation='mean', verbose=True)

In [None]:
# Build the dataset named by cfg.TEST.DATASET.
# NOTE(review): the 'test' argument presumably selects the test split/mode -
# confirm against slowfast.datasets.build.
dataset = build_dataset(cfg.TEST.DATASET, cfg, 'test')

In [None]:
# Take a single sample; index 1001 is hard-coded, so the dataset must contain
# at least 1002 items.
clip = dataset[1001]
# Only the first two fields of the sample are used: the video input(s) and the label.
video, label = clip[0], clip[1]
# Replace the first video tensor with the result of revert_tensor_normalize,
# called with cfg.DATA.MEAN / cfg.DATA.STD. The tensor's first axis is moved
# last for the call and moved back to the front afterwards.
# NOTE(review): presumably (C, T, H, W) -> (T, H, W, C) and back, yielding
# un-normalised pixel values - confirm against slowfast.datasets.utils.
video[0] = revert_tensor_normalize(video[0].permute(1, 2, 3, 0), cfg.DATA.MEAN, cfg.DATA.STD).permute(3, 0, 1, 2)

In [None]:
# Per-layer schedule: 0 for the `layer_to_duplicate` layers before the
# duplicated one, `r` for each of 12 entries, then 0 for the remaining
# `11 - layer_to_duplicate` entries (23 entries in total when
# layer_to_duplicate <= 11).
# NOTE(review): the literals 12 and 11 presumably mirror the copy count passed
# to tome.patch.duplicate_* and (original depth - 1) - keep them in sync.
model.r = [0] * layer_to_duplicate + [r] * 12 + [0] * (11 - layer_to_duplicate)

print(model.r)

# Run one forward pass under inference mode; each tensor in `video` gets a new
# leading axis via x[None] and is placed on the CPU.
# `output` is not used by the cells visible in this notebook.
with torch.inference_mode():
    output = model([x[None].to('cpu') for x in video])

In [None]:
# Token-source information stored on the model under `_tome_info['source']`
# (presumably filled in by the forward pass because the model was patched with
# trace_source=True - confirm in tome).
source = model._tome_info['source']

# Choose the visualisation helper by model family. Both helpers receive
# video[0] with its first two axes swapped.
if cfg.MODEL.MODEL_NAME in ('TimeSformer', 'Motionformer'):
    visualisation = tome.make_spatial_video_visualization(
        video[0].permute(1, 0, 2, 3),
        source,
        patch_size=(16, 16, patch_depth),
        class_token=class_token,
        average_colour=True,
    )
elif cfg.MODEL.MODEL_NAME in ('ViViT', 'VideoMAE'):
    # With separate=True this helper returns a second value alongside the
    # visualisation.
    visualisation, separate_tokens = tome.make_spatiotemporal_video_visualization(
        video[0].permute(1, 0, 2, 3),
        source,
        patch_size=(16, 16, patch_depth),
        class_token=class_token,
        average_colour=True,
        separate=True,
    )

In [None]:
# Move the first axis of video[0] to the end, convert to NumPy, scale by 255
# and cast to uint8.
# NOTE(review): assumes the values are already in [0, 1]; anything outside that
# range will not map cleanly onto 0..255 in the uint8 cast - confirm.
cast_video = np.uint8(torch.permute(video[0], (1, 2, 3, 0)).numpy() * 255)
# Pass every `nrows`-th frame to concatenate_images (ncols=8, nrows=1). As the
# last expression in the cell, its result is what the notebook displays.
tome.concatenate_images(cast_video[::nrows], ncols=8, nrows=1)

In [None]:
# Same frame sub-sampling and layout as the raw-video cell (every `nrows`-th
# frame, ncols=8, nrows=1), applied to the visualisation frames.
tome.concatenate_images(visualisation[::nrows], ncols=8, nrows=1)