
Add support of MViTv2 video variants #6373

Merged: 21 commits into pytorch:main on Aug 10, 2022
Conversation

@datumbox (Contributor) commented on Aug 5, 2022

This is the continuation of the work from #6198 to finalize the API of the MViT class. The PR extends TorchVision's existing MViT architecture to support the v2 variants.

The newly introduced mvit_v2_s variant is canonical, and its weights are ported from the paper. This is based on the work of @lyttonhao, @haooooooqi and @feichtenhofer on SlowFast.
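With this PR, the new variant and the ported weights can be used directly from TorchVision. Below is a minimal usage sketch (the input clip is illustrative, and the preprocessing call assumes the weights enum exposes a transforms() preset like the other video weights):

import torch
from torchvision.models.video import mvit_v2_s, MViT_V2_S_Weights

weights = MViT_V2_S_Weights.DEFAULT  # Kinetics-400 weights ported from the paper
model = mvit_v2_s(weights=weights).eval()

# Illustrative clip of 16 RGB frames in (T, C, H, W) layout; the preset transform
# resizes/crops to 224x224 and permutes the clip to (C, T, H, W).
clip = torch.rand(16, 3, 256, 256)
batch = weights.transforms()(clip).unsqueeze(0)  # -> (1, 3, 16, 224, 224)

with torch.no_grad():
    scores = model(batch).softmax(dim=1)
print(scores.argmax(dim=1))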

Verification process

Comparing outputs

To confirm that the implementation is compatible with the original from SlowFast, we create a weight converter, load the same weights into both implementations, and compare their outputs on the same input:

import collections
import tempfile
import torch
from torchvision.models.video import mvit_v2_s
from slowfast.config.defaults import assert_and_infer_cfg, get_cfg
from slowfast.models.video_model_builder import MViT
from slowfast.utils.parser import load_config


def mvit_v2_s_slowfast():
    config = """
    DATA:
      NUM_FRAMES: 16
      SAMPLING_RATE: 4
      TRAIN_CROP_SIZE: 224
      TEST_CROP_SIZE: 224
      INPUT_CHANNEL_NUM: [3]
    MVIT:
      ZERO_DECAY_POS_CLS: False
      USE_ABS_POS: False
      REL_POS_SPATIAL: True
      REL_POS_TEMPORAL: True
      DEPTH: 16
      NUM_HEADS: 1
      EMBED_DIM: 96
      PATCH_KERNEL: (3, 7, 7)
      PATCH_STRIDE: (2, 4, 4)
      PATCH_PADDING: (1, 3, 3)
      MLP_RATIO: 4.0
      QKV_BIAS: True
      DROPPATH_RATE: 0.2
      NORM: "layernorm"
      MODE: "conv"
      CLS_EMBED_ON: True
      DIM_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
      HEAD_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
      POOL_KVQ_KERNEL: [3, 3, 3]
      POOL_KV_STRIDE_ADAPTIVE: [1, 8, 8]
      POOL_Q_STRIDE: [[0, 1, 1, 1], [1, 1, 2, 2], [2, 1, 1, 1], [3, 1, 2, 2], [4, 1, 1, 1], [5, 1, 1, 1], [6, 1, 1, 1], [7, 1, 1, 1], [8, 1, 1, 1], [9, 1, 1, 1], [10, 1, 1, 1], [11, 1, 1, 1], [12, 1, 1, 1], [13, 1, 1, 1], [14, 1, 2, 2], [15, 1, 1, 1]]
      DROPOUT_RATE: 0.0
      DIM_MUL_IN_ATT: True
      RESIDUAL_POOLING: True
    """
    temp = tempfile.NamedTemporaryFile(mode="w+", delete=False)
    try:
        temp.write(config)
    finally:
        temp.close()

    cfg = get_cfg()
    cfg.merge_from_file(temp.name)
    cfg = assert_and_infer_cfg(cfg)

    cfg.NUM_GPUS = 0
    model = MViT(cfg)
    model.head.act = torch.nn.Identity()  # bypass the head activation (softmax) so both models return raw logits
    return model


def slowfast_to_tv_weights(state_dict):
    d = dict(state_dict)

    # remapping keys
    mapping = collections.OrderedDict(
        [
            ("patch_embed.project.weight", "conv_proj.weight"),
            ("patch_embed.project.bias", "conv_proj.bias"),
            ("cls_token", "pos_encoding.class_token"),
            ("pos_embed_spatial", "pos_encoding.spatial_pos"),
            ("pos_embed_temporal", "pos_encoding.temporal_pos"),
            ("pos_embed_class", "pos_encoding.class_pos"),
            ("attn.proj.weight", "attn.project.0.weight"),
            ("attn.proj.bias", "attn.project.0.bias"),
            ("attn.pool_q.weight", "attn.pool_q.pool.weight"),
            ("attn.norm_q.weight", "attn.pool_q.norm_act.0.weight"),
            ("attn.norm_q.bias", "attn.pool_q.norm_act.0.bias"),
            ("attn.pool_k.weight", "attn.pool_k.pool.weight"),
            ("attn.norm_k.weight", "attn.pool_k.norm_act.0.weight"),
            ("attn.norm_k.bias", "attn.pool_k.norm_act.0.bias"),
            ("attn.pool_v.weight", "attn.pool_v.pool.weight"),
            ("attn.norm_v.weight", "attn.pool_v.norm_act.0.weight"),
            ("attn.norm_v.bias", "attn.pool_v.norm_act.0.bias"),
            ("mlp.fc1.weight", "mlp.0.weight"),
            ("mlp.fc1.bias", "mlp.0.bias"),
            ("mlp.fc2.weight", "mlp.3.weight"),
            ("mlp.fc2.bias", "mlp.3.bias"),
            ("head.projection.weight", "head.1.weight"),
            ("head.projection.bias", "head.1.bias"),
            ("patch_embed.proj.weight", "conv_proj.weight"),
            ("patch_embed.proj.bias", "conv_proj.bias"),
            ("norm.weight", "norm.weight"),
            ("norm.bias", "norm.bias"),
            ("proj.weight", "project.weight"),
            ("proj.bias", "project.bias"),
        ]
    )
    for k in list(d.keys()):
        for pattern, replacement in mapping.items():
            if pattern in k:
                new_key = k.replace(pattern, replacement)
                d[new_key] = d.pop(k)
                break

    # matching dimensions
    d["pos_encoding.class_token"] = d["pos_encoding.class_token"][0, 0, :]
    if "pos_encoding.spatial_pos" in d:
        d["pos_encoding.spatial_pos"] = d["pos_encoding.spatial_pos"][0, :]
        d["pos_encoding.temporal_pos"] = d["pos_encoding.temporal_pos"][0, :]
        d["pos_encoding.class_pos"] = d["pos_encoding.class_pos"][0, 0, :]

    return d


def compare_models(sf_model_fn, tv_model_fn, input_shape):
    print(tv_model_fn.__name__)
    x = torch.randn(input_shape)

    sf_m = sf_model_fn().eval()
    exp_result = sf_m([x]).sum()

    d = sf_m.state_dict()
    d = slowfast_to_tv_weights(d)

    tv_m = tv_model_fn().eval()
    tv_m.load_state_dict(d)
    result = tv_m(x).sum()

    torch.testing.assert_close(result, exp_result)
    print("OK")


compare_models(mvit_v2_s_slowfast, mvit_v2_s, (1, 3, 16, 224, 224))
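
For convenience, the converted state_dict can also be saved to disk and reloaded into the TorchVision model later, without rebuilding the SlowFast model (a sketch with an arbitrary file name; in practice the SlowFast model would first be loaded with the original checkpoint rather than random weights):

# Convert once and persist the TorchVision-compatible checkpoint.
sf_state = mvit_v2_s_slowfast().state_dict()
torch.save(slowfast_to_tv_weights(sf_state), "mvit_v2_s_tv.pth")

# Later, load it directly into the TorchVision model.
model = mvit_v2_s()
model.load_state_dict(torch.load("mvit_v2_s_tv.pth"))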

Benchmarks

To ensure that we don't introduce any speed regression, we test the speed as follows:

import time


def benchmark(model_fn, input_shape, device, put_in_list, n=5, warmup=0.1):
    torch.manual_seed(42)
    m = model_fn().to(device).eval()
    x = torch.randn(input_shape).to(device)
    if put_in_list:
        x = [x]

    s = []
    for i in range(n):
        start = time.time()
        m(x)
        t = time.time() - start
        if i > n * warmup:
            s.append(t)

    print(model_fn.__name__, torch.tensor(s).median())


device = "cuda"
batch_size = 4
n = 100

print(f"device={device}, batch_size={batch_size}, n={n}")
for name, fn, put_in_list in [("TorchVision", mvit_v2_s, False), ("SlowFast", mvit_v2_s_slowfast, True)]:
    print(name)
    benchmark(fn, (batch_size, 3, 16, 224, 224), device, put_in_list, n=n)

This was tested on an A100 and, as shown below, the TorchVision implementation is about 5% faster than the original:

device=cuda, batch_size=4, n=100
TorchVision
mvit_v2_s tensor(0.0492)
SlowFast
mvit_v2_s_slowfast tensor(0.0520)
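
Note that the timer above measures wall-clock time around the forward call; CUDA kernels run asynchronously, so the per-iteration numbers mainly reflect throughput across iterations. If per-call latency is needed, one can synchronize before stopping the clock (an illustrative variation, not how the figures above were produced):

def benchmark_sync(model_fn, input_shape, device, put_in_list, n=100, warmup=0.1):
    torch.manual_seed(42)
    m = model_fn().to(device).eval()
    x = torch.randn(input_shape).to(device)
    if put_in_list:
        x = [x]

    s = []
    with torch.no_grad():
        for i in range(n):
            torch.cuda.synchronize()
            start = time.time()
            m(x)
            torch.cuda.synchronize()  # wait for all kernels before stopping the clock
            t = time.time() - start
            if i > n * warmup:
                s.append(t)
    print(model_fn.__name__, torch.tensor(s).median())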

Accuracy

To verify the accuracy of the model, we run the following:

torchrun --nproc_per_node=8 train.py --data-path="/datasets/clean_kinetics_400/" \
  --batch-size=16 --test-only --cache-dataset \
  --clip-len 16 --frame-rate 8 --clips-per-video 5 \
  --model mvit_v2_s --weights="MViT_V2_S_Weights.DEFAULT"
 * Clip Acc@1 72.914 Clip Acc@5 89.507
 * Video Acc@1 80.757 Video Acc@5 94.665

Note that the reported Acc@1 is a bit lower than the one in the paper, but this is due to the version of the dataset that we use to assess the model (some corrupted videos are removed). To ensure that the accuracy of TorchVision's implementation is not lagging, we test the same data and weights using SlowFast's reference scripts:

INFO:slowfast.utils.logging:json_stats: {"split": "test_final", "top1_acc": "80.79", "top5_acc": "94.66"}

As we can see, the accuracies are practically the same, with minor differences caused by differences in the video clip sampling mechanism.

@datumbox marked this pull request as ready for review on August 9, 2022
@datumbox changed the title from "[WIP] Add support for MViTv2" to "Add support for MViTv2" on Aug 9, 2022
@datumbox requested a review from jdsgomes on August 9, 2022
@datumbox changed the title from "Add support for MViTv2" to "Add support of MViTv2 video variants" on Aug 10, 2022
@jdsgomes (Contributor) left a comment

LGTM, thanks

@datumbox merged commit 7e8186e into pytorch:main on Aug 10, 2022
Batteries Included - Phase 3 automation moved this from In progress to Done on Aug 10, 2022
@datumbox deleted the models/mvitv2 branch on August 10, 2022
pytorchmergebot pushed a commit to pytorch/pytorch that referenced this pull request Aug 11, 2022
Addresses some [breakages](https://github.com/pytorch/pytorch/runs/7782559841?check_suite_focus=true) from #82560

Context: The tests are breaking because a new architecture was added in TorchVision (see pytorch/vision#6373) that requires a different input size. This PR addresses it by using the right size for the `mvit_v2_s` architecture.
Pull Request resolved: #83242
Approved by: https://github.com/ezyang
facebook-github-bot pushed a commit that referenced this pull request Aug 23, 2022
Summary:
* Extending to support MViTv2

* Fix docs, mypy and linter

* Refactor the relative positional code.

* Code refactoring.

* Rename vars.

* Update docs.

* Replace assert with exception.

* Update docs.

* Minor refactoring.

* Remove the square input limitation.

* Moving methods around.

* Modify the shortcut in the attention layer.

* Add ported weights.

* Introduce a `residual_cls` config on the attention layer.

* Make the patch_embed kernel/padding/stride configurable.

* Apply changes from code-review.

* Remove stale todo.

Reviewed By: datumbox

Differential Revision: D38824226

fbshipit-source-id: 2950997bb37e431d76a0480b5b938b15b1d5eeaf