In [1]:
from operator import itemgetter

print("hello")

hello


In [8]:
!pip install moviepy

Defaulting to user installation because normal site-packages is not writeable
Collecting moviepy
  Using cached moviepy-1.0.3.tar.gz (388 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Using cached decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Using cached proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.35.1-py3-none-any.whl.metadata (4.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Using cached imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl.metadata (1.6 kB)
Using cached decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Downloading imageio-2.35.1-py3-none-any.whl (315 kB)
Using cached imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl (22.6 MB)
Using cached proglog-0.1.10-py3-none-any.whl (6.1 kB)
Building wheels for collected packages: moviepy
  Building whee



In [10]:
from mmaction.utils import register_all_modules
register_all_modules(init_default_scope=True)

In [None]:
import mmcv
import decord
import numpy as np
from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor
from mmaction.structures import ActionDataSample

@TRANSFORMS.register_module()
class VideoInit(BaseTransform):
    def transform(self, results):
        container = decord.VideoReader(results['filename'])
        results['total_frames'] = len(container)
        results['video_reader'] = container
        return results
    
@TRANSFORMS.register_module()
class VideoSample(BaseTransform):
    def __init__(self, clip_len, num_clips, test_mode=False):
        self.clip_len = clip_len
        self.num_clips = num_clips
        self.test_mode = test_mode

    def transform(self, results):
        total_frames = results['total_frames']
        interval = total_frames // self.clip_len

        if self.test_mode:
            # Make the sampling during testing deterministic
            np.random.seed(42)

        inds_of_all_clips = []
        for i in range(self.num_clips):
            bids = np.arange(self.clip_len) * interval
            offset = np.random.randint(interval, size=bids.shape)
            inds = bids + offset
            inds_of_all_clips.append(inds)

        results['frame_inds'] = np.concatenate(inds_of_all_clips)
        results['clip_len'] = self.clip_len
        results['num_clips'] = self.num_clips
        return results
    
@TRANSFORMS.register_module()
class VideoDecode(BaseTransform):
    def transform(self, results):
        frame_inds = results['frame_inds']
        container = results['video_reader']

        imgs = container.get_batch(frame_inds).asnumpy()
        imgs = list(imgs)

        results['video_reader'] = None
        del container

        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoResize(BaseTransform):
    def __init__(self, r_size):
        self.r_size = (np.inf, r_size)

    def transform(self, results):
        img_h, img_w = results['img_shape']
        new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size)

        imgs = [mmcv.imresize(img, (new_w, new_h))
                for img in results['imgs']]
        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoCrop(BaseTransform):
    def __init__(self, c_size):
        self.c_size = c_size

    def transform(self, results):
        img_h, img_w = results['img_shape']
        center_x, center_y = img_w // 2, img_h // 2
        x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2
        y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2
        imgs = [img[y1:y2, x1:x2] for img in results['imgs']]
        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoFormat(BaseTransform):
    def transform(self, results):
        num_clips = results['num_clips']
        clip_len = results['clip_len']
        imgs = results['imgs']

        # [num_clips*clip_len, H, W, C]
        imgs = np.array(imgs)
        # [num_clips, clip_len, H, W, C]
        imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:])
        # [num_clips, C, clip_len, H, W]
        imgs = imgs.transpose(0, 4, 1, 2, 3)

        results['imgs'] = imgs
        return results
    
@TRANSFORMS.register_module()
class VideoPack(BaseTransform):
    def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')):
        self.meta_keys = meta_keys

    def transform(self, results):
        packed_results = dict()
        inputs = to_tensor(results['imgs'])
        data_sample = ActionDataSample()
        data_sample.set_gt_label(results['label'])
        metainfo = {k: results[k] for k in self.meta_keys if k in results}
        data_sample.set_metainfo(metainfo)
        packed_results['inputs'] = inputs
        packed_results['data_samples'] = data_sample
         return packed_results

In [12]:
@TRANSFORMS.register_module()
class VideoSample(BaseTransform):
    def __init__(self, clip_len, num_clips, test_mode=False):
        self.clip_len = clip_len
        self.num_clips = num_clips
        self.test_mode = test_mode

    def transform(self, results):
        total_frames = results['total_frames']
        interval = total_frames // self.clip_len

        if self.test_mode:
            # Make the sampling during testing deterministic
            np.random.seed(42)

        inds_of_all_clips = []
        for i in range(self.num_clips):
            bids = np.arange(self.clip_len) * interval
            offset = np.random.randint(interval, size=bids.shape)
            inds = bids + offset
            inds_of_all_clips.append(inds)

        results['frame_inds'] = np.concatenate(inds_of_all_clips)
        results['clip_len'] = self.clip_len
        results['num_clips'] = self.num_clips
        return results
    
@TRANSFORMS.register_module()
class VideoDecode(BaseTransform):
    def transform(self, results):
        frame_inds = results['frame_inds']
        container = results['video_reader']

        imgs = container.get_batch(frame_inds).asnumpy()
        imgs = list(imgs)

        results['video_reader'] = None
        del container

        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoResize(BaseTransform):
    def __init__(self, r_size):
        self.r_size = (np.inf, r_size)

    def transform(self, results):
        img_h, img_w = results['img_shape']
        new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size)

        imgs = [mmcv.imresize(img, (new_w, new_h))
                for img in results['imgs']]
        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoCrop(BaseTransform):
    def __init__(self, c_size):
        self.c_size = c_size

    def transform(self, results):
        img_h, img_w = results['img_shape']
        center_x, center_y = img_w // 2, img_h // 2
        x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2
        y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2
        imgs = [img[y1:y2, x1:x2] for img in results['imgs']]
        results['imgs'] = imgs
        results['img_shape'] = imgs[0].shape[:2]
        return results
    
@TRANSFORMS.register_module()
class VideoFormat(BaseTransform):
    def transform(self, results):
        num_clips = results['num_clips']
        clip_len = results['clip_len']
        imgs = results['imgs']

        # [num_clips*clip_len, H, W, C]
        imgs = np.array(imgs)
        # [num_clips, clip_len, H, W, C]
        imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:])
        # [num_clips, C, clip_len, H, W]
        imgs = imgs.transpose(0, 4, 1, 2, 3)

        results['imgs'] = imgs
        return results
    
@TRANSFORMS.register_module()
class VideoPack(BaseTransform):
    def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')):
        self.meta_keys = meta_keys

    def transform(self, results):
        packed_results = dict()
        inputs = to_tensor(results['imgs'])
        data_sample = ActionDataSample()
        data_sample.set_gt_label(results['label'])
        metainfo = {k: results[k] for k in self.meta_keys if k in results}
        data_sample.set_metainfo(metainfo)
        packed_results['inputs'] = inputs
        packed_results['data_samples'] = data_sample
        return packed_results

KeyError: 'VideoSample is already registered in transform at __main__'

In [13]:
import os.path as osp
from mmengine.dataset import Compose

pipeline_cfg = [
    dict(type='VideoInit'),
    dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
    dict(type='VideoDecode'),
    dict(type='VideoResize', r_size=256),
    dict(type='VideoCrop', c_size=224),
    dict(type='VideoFormat'),
    dict(type='VideoPack')
]

pipeline = Compose(pipeline_cfg)


data_prefix = "C:\\Users\\rodol\\mmaction2\\data\\kinetics400_tiny\\train"
results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0)
packed_results = pipeline(results)

inputs = packed_results['inputs']
data_sample = packed_results['data_samples']

print('shape of the inputs: ', inputs.shape)

# Get metainfo of the inputs
print('image_shape: ', data_sample.img_shape)
print('num_clips: ', data_sample.num_clips)
print('clip_len: ', data_sample.clip_len)

# Get label of the inputs
print('label: ', data_sample.gt_label)

shape of the inputs:  torch.Size([1, 3, 16, 224, 224])
image_shape:  (224, 224)
num_clips:  1
clip_len:  16
label:  tensor([0])


In [14]:
import os.path as osp
from mmengine.fileio import list_from_file
from mmengine.dataset import BaseDataset
from mmaction.registry import DATASETS

@DATASETS.register_module()
class DatasetZelda(BaseDataset):
    def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''),
                 test_mode=False, modality='RGB', **kwargs):
        self.modality = modality
        super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root,
                                           data_prefix=data_prefix, test_mode=test_mode,
                                           **kwargs)

    def load_data_list(self):
        data_list = []
        fin = list_from_file(self.ann_file)
        for line in fin:
            line_split = line.strip().split()
            filename, label = line_split
            label = int(label)
            filename = osp.join(self.data_prefix['video'], filename)
            data_list.append(dict(filename=filename, label=label))
        return data_list

    def get_data_info(self, idx: int) -> dict:
        data_info = super().get_data_info(idx)
        data_info['modality'] = self.modality
        return data_info

In [15]:
root_dir = "C:\\Users\\rodol\\mmaction2\\data\\kinetics400_tiny\\"

In [16]:
from mmaction.registry import DATASETS

train_pipeline_cfg = [
    dict(type='VideoInit'),
    dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
    dict(type='VideoDecode'),
    dict(type='VideoResize', r_size=256),
    dict(type='VideoCrop', c_size=224),
    dict(type='VideoFormat'),
    dict(type='VideoPack')
]

val_pipeline_cfg = [
    dict(type='VideoInit'),
    dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True),
    dict(type='VideoDecode'),
    dict(type='VideoResize', r_size=256),
    dict(type='VideoCrop', c_size=224),
    dict(type='VideoFormat'),
    dict(type='VideoPack')
]

train_dataset_cfg = dict(
    type='DatasetZelda',
    ann_file='kinetics_tiny_train_video.txt',
    pipeline=train_pipeline_cfg,
    data_root= root_dir,
    data_prefix=dict(video='train'))

val_dataset_cfg = dict(
    type='DatasetZelda',
    ann_file='kinetics_tiny_val_video.txt',
    pipeline=val_pipeline_cfg,
    data_root= root_dir,
    data_prefix=dict(video='val'))

train_dataset = DATASETS.build(train_dataset_cfg)

packed_results = train_dataset[0]

inputs = packed_results['inputs']
data_sample = packed_results['data_samples']

print('shape of the inputs: ', inputs.shape)

# Get metainfo of the inputs
print('image_shape: ', data_sample.img_shape)
print('num_clips: ', data_sample.num_clips)
print('clip_len: ', data_sample.clip_len)

# Get label of the inputs
print('label: ', data_sample.gt_label)



from mmengine.runner import Runner

BATCH_SIZE = 2

train_dataloader_cfg = dict(
    batch_size=BATCH_SIZE,
    num_workers=0,
    persistent_workers=False,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=train_dataset_cfg)

val_dataloader_cfg = dict(
    batch_size=BATCH_SIZE,
    num_workers=0,
    persistent_workers=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=val_dataset_cfg)

train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg)
val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg)

batched_packed_results = next(iter(train_data_loader))

batched_inputs = batched_packed_results['inputs']
batched_data_sample = batched_packed_results['data_samples']

assert len(batched_inputs) == BATCH_SIZE
assert len(batched_data_sample) == BATCH_SIZE

shape of the inputs:  torch.Size([1, 3, 16, 224, 224])
image_shape:  (224, 224)
num_clips:  1
clip_len:  16
label:  tensor([0])


In [17]:
import torch
from mmengine.model import BaseDataPreprocessor, stack_batch
from mmaction.registry import MODELS

@MODELS.register_module()
class DataPreprocessorZelda(BaseDataPreprocessor):
    def __init__(self, mean, std):
        super().__init__()

        self.register_buffer(
            'mean',
            torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1),
            False)
        self.register_buffer(
            'std',
            torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1),
            False)

    def forward(self, data, training=False):
        data = self.cast_data(data)
        inputs = data['inputs']
        batch_inputs = stack_batch(inputs)  # Batching
        batch_inputs = (batch_inputs - self.mean) / self.std  # Normalization
        data['inputs'] = batch_inputs
        return data

In [18]:
from mmaction.registry import MODELS

data_preprocessor_cfg = dict(
    type='DataPreprocessorZelda',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375])

data_preprocessor = MODELS.build(data_preprocessor_cfg)

preprocessed_inputs = data_preprocessor(batched_packed_results)
print(preprocessed_inputs['inputs'].shape)

torch.Size([2, 1, 3, 16, 224, 224])


In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import BaseModel, BaseModule, Sequential
from mmengine.structures import LabelData
from mmaction.registry import MODELS

@MODELS.register_module()
class BackBoneZelda(BaseModule):
    def __init__(self, init_cfg=None):
        if init_cfg is None:
            init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"),
                        dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)]

        super(BackBoneZelda, self).__init__(init_cfg=init_cfg)

        self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7),
                                          stride=(1, 2, 2), padding=(1, 3, 3)),
                                nn.BatchNorm3d(64), nn.ReLU())
        self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2),
                                    padding=(0, 1, 1))

        self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1),
                               nn.BatchNorm3d(128), nn.ReLU())

    def forward(self, imgs):
        # imgs: [batch_size*num_views, 3, T, H, W]
        # features: [batch_size*num_views, 128, T/2, H//8, W//8]
        features = self.conv(self.maxpool(self.conv1(imgs)))
        return features


@MODELS.register_module()
class ClsHeadZelda(BaseModule):
    def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None):
        if init_cfg is None:
            init_cfg = dict(type='Normal', layer='Linear', std=0.01)

        super(ClsHeadZelda, self).__init__(init_cfg=init_cfg)

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.average_clips = average_clips

        if dropout != 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

        self.fc = nn.Linear(self.in_channels, self.num_classes)
        self.pool = nn.AdaptiveAvgPool3d(1)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        N, C, T, H, W = x.shape
        x = self.pool(x)
        x = x.view(N, C)
        assert x.shape[1] == self.in_channels

        if self.dropout is not None:
            x = self.dropout(x)

        cls_scores = self.fc(x)
        return cls_scores

    def loss(self, feats, data_samples):
        cls_scores = self(feats)
        labels = torch.stack([x.gt_label for x in data_samples])
        labels = labels.squeeze()

        if labels.shape == torch.Size([]):
            labels = labels.unsqueeze(0)

        loss_cls = self.loss_fn(cls_scores, labels)
        return dict(loss_cls=loss_cls)

    def predict(self, feats, data_samples):
        cls_scores = self(feats)
        num_views = cls_scores.shape[0] // len(data_samples)
        # assert num_views == data_samples[0].num_clips
        cls_scores = self.average_clip(cls_scores, num_views)

        for ds, sc in zip(data_samples, cls_scores):
            pred = LabelData(item=sc)
            ds.pred_scores = pred
        return data_samples

    def average_clip(self, cls_scores, num_views):
          if self.average_clips not in ['score', 'prob', None]:
            raise ValueError(f'{self.average_clips} is not supported. '
                             f'Currently supported ones are '
                             f'["score", "prob", None]')

          total_views = cls_scores.shape[0]
          cls_scores = cls_scores.view(total_views // num_views, num_views, -1)

          if self.average_clips is None:
              return cls_scores
          elif self.average_clips == 'prob':
              cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1)
          elif self.average_clips == 'score':
              cls_scores = cls_scores.mean(dim=1)

          return cls_scores


@MODELS.register_module()
class RecognizerZelda(BaseModel):
    def __init__(self, backbone, cls_head, data_preprocessor):
        super().__init__(data_preprocessor=data_preprocessor)

        self.backbone = MODELS.build(backbone)
        self.cls_head = MODELS.build(cls_head)

    def extract_feat(self, inputs):
        inputs = inputs.view((-1, ) + inputs.shape[2:])
        return self.backbone(inputs)

    def loss(self, inputs, data_samples):
        feats = self.extract_feat(inputs)
        loss = self.cls_head.loss(feats, data_samples)
        return loss

    def predict(self, inputs, data_samples):
        feats = self.extract_feat(inputs)
        predictions = self.cls_head.predict(feats, data_samples)
        return predictions

    def forward(self, inputs, data_samples=None, mode='tensor'):
        if mode == 'tensor':
            return self.extract_feat(inputs)
        elif mode == 'loss':
            return self.loss(inputs, data_samples)
        elif mode == 'predict':
            return self.predict(inputs, data_samples)
        else:
            raise RuntimeError(f'Invalid mode: {mode}')

In [20]:
import torch
import copy
from mmaction.registry import MODELS

model_cfg = dict(
    type='RecognizerZelda',
    backbone=dict(type='BackBoneZelda'),
    cls_head=dict(
        type='ClsHeadZelda',
        num_classes=2,
        in_channels=128,
        average_clips='prob'),
    data_preprocessor = dict(
        type='DataPreprocessorZelda',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375]))

model = MODELS.build(model_cfg)

# Train
model.train()
model.init_weights()
data_batch_train = copy.deepcopy(batched_packed_results)
data = model.data_preprocessor(data_batch_train, training=True)
loss = model(**data, mode='loss')
print('loss dict: ', loss)

# Test
with torch.no_grad():
    model.eval()
    data_batch_test = copy.deepcopy(batched_packed_results)
    data = model.data_preprocessor(data_batch_test, training=False)
    predictions = model(**data, mode='predict')
print('Label of Sample[0]', predictions[0].gt_label)
print('Scores of Sample[0]', predictions[0].to_dict()["pred_scores"]["item"])

09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): 
KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 
 
09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv1.0.bias - torch.Size([64]): 
KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 
 
09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv1.1.weight - torch.Size([64]): 
The value is the same before and after calling `init_weights` of RecognizerZelda  
 
09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv1.1.bias - torch.Size([64]): 
The value is the same before and after calling `init_weights` of RecognizerZelda  
 
09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): 
KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 
 
09/06 12:54:11 - mmengine - [4m[97mINFO[0m - 
backbone.conv.0.bias - torch.Size([128]):

### Ignore the following cells (START)
The following cells are simply to get to know mmaction2

In [33]:
dir(predictions[0])

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_data_fields',
 '_metainfo_fields',
 'all_items',
 'all_keys',
 'all_values',
 'clip_len',
 'clone',
 'cpu',
 'cuda',
 'detach',
 'features',
 'get',
 'gt_instances',
 'gt_label',
 'img_shape',
 'items',
 'keys',
 'metainfo',
 'metainfo_items',
 'metainfo_keys',
 'metainfo_values',
 'mlu',
 'musa',
 'new',
 'npu',
 'num_clips',
 'numpy',
 'pop',
 'pred_scores',
 'proposals',
 'set_data',
 'set_field',
 'set_gt_label',
 'set_metainfo',
 'set_pred_label',
 'set_pred_score',
 'to',
 'to_dict',
 'to_tensor',
 'update',
 'values']

In [34]:
# copy the predictions[0] variable to temp
temp = copy.deepcopy(predictions[0])
temp.to_dict()

{'clip_len': 16,
 'num_clips': 1,
 'img_shape': (224, 224),
 'gt_label': tensor([1]),
 'pred_scores': {'item': tensor([0.4975, 0.5025])}}

In [39]:
temp.to_dict()["pred_scores"]["item"]

tensor([0.4975, 0.5025])

In [55]:
temp.to_dict()["pred_scores"]["item"].numpy()

array([0.49752143, 0.5024786 ], dtype=float32)

### Ignore the following cells (END)

In [21]:
import copy 
from collections import OrderedDict
from mmengine.evaluator import BaseMetric
from mmaction.evaluation import top_k_accuracy
from mmaction.registry import METRICS

@METRICS.register_module()
class AccuracyMetric_c(BaseMetric):
    def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'):
        super().__init__(collect_device=collect_device, prefix=prefix)
        self.topk = topk

    def process(self, data_batch, data_samples):
        data_samples = copy.deepcopy(data_samples)
        for data_sample in data_samples:
            result = dict()
            scores = data_sample['pred_scores']['item'].cpu().numpy() # if you get errors downstream of this cell, try changine 'pred_score' to 'pred_scores'
            label = data_sample['gt_label'].item()
            result['scores'] = scores
            result['label'] = label
            self.results.append(result)

    def compute_metrics(self, results: list) -> dict:
        eval_results = OrderedDict()
        labels = [res['label'] for res in results]
        scores = [res['scores'] for res in results]
        topk_acc = top_k_accuracy(scores, labels, self.topk)
        for k, acc in zip(self.topk, topk_acc):
            eval_results[f'topk{k}'] = acc
        return eval_results

In [22]:
from mmaction.registry import METRICS

metric_cfg = dict(type='AccuracyMetric_c', topk=(1, 5))

metric = METRICS.build(metric_cfg)

data_samples = [d.to_dict() for d in predictions]

metric.process(batched_packed_results, data_samples)
acc = metric.compute_metrics(metric.results)
print(acc)

OrderedDict([('topk1', 1.0), ('topk5', 1.0)])


In [23]:
save_directory = "C:\\Users\\rodol\\Desktop\\mmaction2_test"

In [24]:
from mmengine.runner import Runner

train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1)
val_cfg = dict(type='ValLoop')

optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01))

runner = Runner(model=model_cfg, work_dir= save_directory,
                train_dataloader=train_dataloader_cfg,
                train_cfg=train_cfg,
                val_dataloader=val_dataloader_cfg,
                val_cfg=val_cfg,
                optim_wrapper=optim_wrapper,
                val_evaluator=[metric_cfg],
                default_scope='mmaction')
runner.train()

09/06 12:54:48 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: win32
    Python: 3.8.19 (default, Mar 20 2024, 19:55:45) [MSC v.1916 64 bit (AMD64)]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 449743703
    GPU 0: NVIDIA GeForce RTX 4070 Ti SUPER
    CUDA_HOME: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8
    NVCC: Cuda compilation tools, release 11.8, V11.8.89
    MSVC: n/a, reason: fileno
    PyTorch: 2.0.1
    PyTorch compiling details: PyTorch built with:
  - C++ Version: 199711
  - MSVC 193431937
  - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)
  - OpenMP 2019
  - LAPACK is enabled (usually provided by MKL)
  - CPU capability usage: AVX2
  - CUDA Runtime 11.8
  - NVCC architecture flags: -gencode;arch=compute_37,

RecognizerZelda(
  (data_preprocessor): DataPreprocessorZelda()
  (backbone): BackBoneZelda(
    (conv1): Sequential(
      (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3))
      (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (maxpool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
    (conv): Sequential(
      (0): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
      (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  init_cfg=[{'type': 'Kaiming', 'layer': 'Conv3d', 'mode': 'fan_out', 'nonlinearity': 'relu'}, {'type': 'Constant', 'layer': 'BatchNorm3d', 'val': 1, 'bias': 0}]
  (cls_head): ClsHeadZelda(
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=128, out_features=2, bias=True)
    (pool): AdaptiveAvgPool3d(output_size

In [71]:
from mmaction.apis import inference_recognizer, init_recognizer
config_path = r"C:\Users\rodol\mmaction2\configs\recognition\tsn\tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py"
checkpoint_path = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth'
# img_path = "C:/Users/rodol/Downloads/mmaction2/mmaction2/demo/demo.mp4"
img_path = r"C:\Users\rodol\Desktop\E2E_Animation\mp4_videos\myle_salsa_dance_right_1.mp4"



# build the model from a config file and a checkpoint file
model = init_recognizer(config_path, checkpoint_path, device="cuda:0")  # device can be 'cuda:0'
# test a single image
result = inference_recognizer(model, img_path)

Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth


In [72]:
result

<ActionDataSample(

    META INFORMATION
    img_shape: (224, 224)
    num_classes: 400

    DATA FIELDS
    gt_label: tensor([-1], device='cuda:0')
    pred_score: tensor([2.1894e-05, 6.0890e-05, 9.0677e-07, 1.8772e-06, 2.6567e-07, 8.4372e-06,
                1.1853e-07, 6.1647e-07, 3.2694e-09, 1.9830e-06, 2.5955e-08, 3.3746e-07,
                1.0267e-05, 3.7873e-08, 2.6045e-08, 3.7008e-07, 7.6685e-04, 1.0057e-08,
                4.8864e-04, 1.7828e-07, 1.7355e-04, 7.2847e-08, 1.4406e-08, 1.7917e-08,
                1.8651e-06, 1.5357e-06, 1.0317e-06, 1.1868e-07, 1.1899e-08, 4.8263e-09,
                9.0950e-08, 8.8833e-04, 2.3220e-08, 8.9272e-08, 7.0239e-03, 3.1559e-08,
                1.6723e-07, 1.5605e-07, 4.7817e-07, 5.9423e-08, 1.1296e-09, 2.6783e-04,
                1.5621e-07, 8.5514e-04, 1.0160e-05, 7.0143e-04, 3.5789e-07, 9.3122e-08,
                4.1398e-05, 1.4509e-03, 6.7694e-04, 3.6674e-07, 1.5850e-08, 3.6335e-08,
                1.8279e-08, 6.7639e-06, 1.3834e-08,

In [67]:
from operator import itemgetter
label = r"C:\Users\rodol\mmaction2\tools\data\kinetics\label_map_k400.txt"

pred_scores = result.pred_score.tolist()
score_tuples = tuple(zip(range(len(pred_scores)), pred_scores))
score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
top5_label = score_sorted[:5]

labels = open(label).readlines()
labels = [x.strip() for x in labels]
results = [(labels[x[0]], x[1]) for x in top5_label]

print('The top-5 labels with corresponding scores are:')
for result in results:
    print(f'{result[0]}: ', result[1])

The top-5 labels with corresponding scores are:
robot dancing:  0.32952478528022766
juggling balls:  0.26003003120422363
country line dancing:  0.15764248371124268
contact juggling:  0.09355822205543518
jumpstyle dancing:  0.0630040094256401


In [70]:
results

[('robot dancing', 0.32952478528022766),
 ('juggling balls', 0.26003003120422363),
 ('country line dancing', 0.15764248371124268),
 ('contact juggling', 0.09355822205543518),
 ('jumpstyle dancing', 0.0630040094256401)]

In [4]:
img_path2 = "C:\\Users\\rodol\\Pictures\\Screenshots\\rando.png"
img_test = mmcv.imread(img_path2)


In [8]:
import numpy as np
import mmcv
img = np.random.rand(100, 100, 3)
mmcv.imshow(img)

In [31]:
path = r"C:\Users\rodol\Desktop\E2E_Animation\mp4_videos\myle_salsa_dance_right_1.mp4"
mmcv.imshow(img_path2)# 

In [22]:
import cv2

# load image at path image_path2, and show the image
image = cv2.imread(img_path2)
cv2.imshow('image', image)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [45]:
video = mmcv.VideoReader(img_path)
# obtain basic information
print(len(video))
print(video.width, video.height, video.resolution, video.fps)

260
3840 2160 (3840, 2160) 60.0


In [61]:
label = r"C:\Users\rodol\mmaction2\tools\data\kinetics\label_map_k400.txt"
# open the file, go through each line, and append it to a variable called label_list
label_list = []
with open(label) as f:
    for line in f:
        label_list.append(line.strip())

# print the first 10 labels
print(label_list[:10])

['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning']


In [53]:
mmcv.imshow(img, "check_it")

In [74]:
import torch
import decord
from pathlib import Path
from mmaction.structures import ActionDataSample
from mmaction.visualization import ActionVisualizer
from mmengine.structures import LabelData
path = r"C:\Users\rodol\Desktop\E2E_Animation\mp4_videos\myle_salsa_dance_right_1.mp4"
video = decord.VideoReader(path)
video = video.get_batch(range(32)).asnumpy()
# data_sample is analogous to results in the previous example
vis = ActionVisualizer(
    save_dir=Path("C:\\Users\\rodol\\Desktop\\mmaction2_test\\outputs"),
    vis_backends= [dict(type='LocalVisBackend')]
)
vis.dataset_meta = {'classes': label_list}
vis.add_datasample("check_it_out", video, data_sample = result)
assert Path("C:\\Users\\rodol\\Desktop\\mmaction2_test\\outputs\\vis_data\\check_it_out\\frames_0\\1.png").exists()
assert Path("C:\\Users\\rodol\\Desktop\\mmaction2_test\\outputs\\vis_data\\check_it_out\\frames_0\\2.png").exists()
vis.add_datasample("check_it_out", video, step=1)
assert Path("C:\\Users\\rodol\\Desktop\\mmaction2_test\\outputs\\vis_data\\check_it_out\\frames_1\\1.png").exists()

In [66]:
label_list[0]

'abseiling'

In [56]:
dir(video)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_avg_fps',
 '_frame_pts',
 '_handle',
 '_key_indices',
 '_num_frame',
 '_validate_indices',
 'get_avg_fps',
 'get_batch',
 'get_frame_timestamp',
 'get_key_indices',
 'next',
 'seek',
 'seek_accurate',
 'skip_frames']

In [57]:
torch.tensor([2])

tensor([2])