# **Packages**

In [None]:
!pip install decord
!pip install gluoncv
!pip install mxnet

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model
from gluoncv.utils.filesystem import try_import_decord
import ffmpeg
import glob
import cv2

# **Utils**

In [3]:
def getInputID3Models(path):
    """Read a video file and build the clip tensor fed to the I3D models.

    Every second frame among the first 64 frames (indices 0, 2, ..., 62, i.e.
    32 frames) is decoded with decord and passed through
    ``VideoGroupValTransform`` (size 224, with the mean/std given below).

    Parameters
    ----------
    path : str
        Path of the video file to read.

    Returns
    -------
    numpy.ndarray
        The transformed frames reshaped to ``(-1, 32, 3, 224, 224)`` and then
        transposed to ``(N, 3, 32, 224, 224)``. N is presumably 1 for a single
        video (assumes the transform yields one 3x224x224 array per frame).

    Raises
    ------
    ValueError
        If the video contains no frames.
    """
    decord = try_import_decord()
    vr = decord.VideoReader(path)
    num_frames = len(vr)
    if num_frames == 0:
        raise ValueError(f'Video {path} contains no frames.')
    # Indices past the end are clamped to the last frame, so a clip shorter
    # than 64 frames repeats its final frame instead of requesting frames
    # that do not exist. Clips with >= 64 frames are sampled exactly as before.
    frame_id_list = [min(idx, num_frames - 1) for idx in range(0, 64, 2)]
    video_data = vr.get_batch(frame_id_list).asnumpy()
    # One array per sampled frame, as expected by the group transform.
    clip_input = list(video_data)
    transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    clip_input = transform_fn(clip_input)
    clip_input = np.stack(clip_input, axis=0)
    # Group the frames into a clip axis, then swap the frame and channel axes.
    clip_input = clip_input.reshape((-1, len(frame_id_list), 3, 224, 224))
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    name_video = path.split('/')[-1].split('.')[0]
    print(f'Video data {name_video} is downloaded and preprocessed.')
    return clip_input


def getInputSlowFastModels(path):
    """Read a video file and build the clip tensor fed to the SlowFast models.

    Two sets of frames are taken from the first 64 frames of the video: a
    "fast" set with every 2nd frame (32 frames) and a "slow" set with every
    16th frame (4 frames). The fast frames come first, followed by the slow
    frames (36 frames in total), and all of them go through
    ``VideoGroupValTransform`` (size 224, with the mean/std given below).

    Parameters
    ----------
    path : str
        Path of the video file to read.

    Returns
    -------
    numpy.ndarray
        The transformed frames reshaped to ``(-1, 36, 3, 224, 224)`` and then
        transposed to ``(N, 3, 36, 224, 224)``. N is presumably 1 for a single
        video (assumes the transform yields one 3x224x224 array per frame).

    Raises
    ------
    ValueError
        If the video contains no frames.
    """
    decord = try_import_decord()
    vr = decord.VideoReader(path)
    num_frames = len(vr)
    if num_frames == 0:
        raise ValueError(f'Video {path} contains no frames.')
    last_frame = num_frames - 1
    # Indices past the end are clamped to the last frame, so a clip shorter
    # than 64 frames repeats its final frame instead of requesting frames
    # that do not exist. Clips with >= 64 frames are sampled exactly as before.
    fast_frame_id_list = [min(idx, last_frame) for idx in range(0, 64, 2)]
    slow_frame_id_list = [min(idx, last_frame) for idx in range(0, 64, 16)]
    frame_id_list = fast_frame_id_list + slow_frame_id_list
    video_data = vr.get_batch(frame_id_list).asnumpy()
    # One array per sampled frame, as expected by the group transform.
    clip_input = list(video_data)
    transform_fn = video.VideoGroupValTransform(size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    clip_input = transform_fn(clip_input)
    clip_input = np.stack(clip_input, axis=0)
    # Group the frames into a clip axis, then swap the frame and channel axes.
    clip_input = clip_input.reshape((-1, len(frame_id_list), 3, 224, 224))
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    name_video = path.split('/')[-1].split('.')[0]
    print(f'Video data {name_video} is downloaded and preprocessed.')
    return clip_input

In [4]:
import os

# Gather the full path of every file found under the Kaggle input directory,
# in the order os.walk yields them; later cells iterate over `lst`.
lst = [
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]

# Show each discovered path on its own line, then the whole list.
for found_path in lst:
    print(found_path)
print(lst)

/kaggle/input/actio-clips/video_100_archery.mp4
/kaggle/input/actio-clips/video_902_handshake.mp4
/kaggle/input/actio-clips/video-kiss_or_hug.mp4
['/kaggle/input/actio-clips/video_100_archery.mp4', '/kaggle/input/actio-clips/video_902_handshake.mp4', '/kaggle/input/actio-clips/video-kiss_or_hug.mp4']


# **Experiment 1: Model i3d_nl10_resnet50_v1_kinetics400**

In [5]:
# Fetch the pretrained network for Experiment 1 from the GluonCV model zoo;
# `net` and `model_name` are used by the inference cell that follows.
model_name = 'i3d_nl10_resnet50_v1_kinetics400'
net = get_model(
    model_name,
    pretrained=True,
    nclass=400,
)
print('%s model is successfully loaded.' % (model_name,))

i3d_nl10_resnet50_v1_kinetics400 model is successfully loaded.


In [6]:
# Run the currently loaded network on every discovered clip and print the
# top-K predicted labels with their softmax probabilities.
topK = 5
classes = net.classes
for path in lst:
    clip_input = getInputID3Models(path)
    pred = net(nd.array(clip_input))
    # Compute the softmax once per clip rather than once per printed label.
    probs = nd.softmax(pred)[0]
    ind = nd.topk(pred, k=topK)[0].astype('int')
    print('The input video clip is classified to be')
    for i in range(topK):
        print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))
    print('')

Video data video_100_archery is downloaded and preprocessed.
The input video clip is classified to be
	[archery], with probability 1.000.
	[throwing_axe], with probability 0.000.
	[using_remote_controller_-not_gaming-], with probability 0.000.
	[golf_driving], with probability 0.000.
	[paragliding], with probability 0.000.

Video data video_902_handshake is downloaded and preprocessed.
The input video clip is classified to be
	[washing_hands], with probability 0.691.
	[shaking_hands], with probability 0.116.
	[drumming_fingers], with probability 0.039.
	[applying_cream], with probability 0.039.
	[tapping_pen], with probability 0.020.

Video data video-kiss_or_hug is downloaded and preprocessed.
The input video clip is classified to be
	[water_sliding], with probability 0.351.
	[parasailing], with probability 0.182.
	[flying_kite], with probability 0.106.
	[skydiving], with probability 0.054.
	[snorkeling], with probability 0.037.



**Observations:**
This model classifies the archery video correctly and nearly gets the handshake video right (`shaking_hands` is ranked second, behind `washing_hands`). However, the hugging video is completely misclassified.

# **Experiment 2: Model i3d_nl10_resnet101_v1_kinetics400**

In [7]:
# Fetch the pretrained network for Experiment 2 from the GluonCV model zoo;
# `net` and `model_name` are used by the inference cell that follows.
model_name = 'i3d_nl10_resnet101_v1_kinetics400'
net = get_model(
    model_name,
    pretrained=True,
    nclass=400,
)
print('%s model is successfully loaded.' % (model_name,))

i3d_nl10_resnet101_v1_kinetics400 model is successfully loaded.


In [8]:
# Run the currently loaded network on every discovered clip and print the
# top-K predicted labels with their softmax probabilities.
topK = 5
classes = net.classes
for path in lst:
    clip_input = getInputID3Models(path)
    pred = net(nd.array(clip_input))
    # Compute the softmax once per clip rather than once per printed label.
    probs = nd.softmax(pred)[0]
    ind = nd.topk(pred, k=topK)[0].astype('int')
    print('The input video clip is classified to be')
    for i in range(topK):
        print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))
    print('')

Video data video_100_archery is downloaded and preprocessed.
The input video clip is classified to be
	[archery], with probability 1.000.
	[throwing_axe], with probability 0.000.
	[javelin_throw], with probability 0.000.
	[riding_or_walking_with_horse], with probability 0.000.
	[golf_driving], with probability 0.000.

Video data video_902_handshake is downloaded and preprocessed.
The input video clip is classified to be
	[shaking_hands], with probability 0.379.
	[washing_hands], with probability 0.092.
	[rock_scissors_paper], with probability 0.057.
	[riding_camel], with probability 0.039.
	[arm_wrestling], with probability 0.035.

Video data video-kiss_or_hug is downloaded and preprocessed.
The input video clip is classified to be
	[water_sliding], with probability 0.452.
	[plastering], with probability 0.161.
	[flying_kite], with probability 0.035.
	[yoga], with probability 0.027.
	[diving_cliff], with probability 0.017.



**Observations:**
This model correctly classifies both the archery and the handshake videos. However, the hugging video is misclassified again, with labels similar to those in Experiment 1 (e.g. `water_sliding`, `flying_kite`).

# **Experiment 3: Model slowfast_4x16_resnet50_kinetics400**

In [11]:
# Fetch the pretrained network for Experiment 3 from the GluonCV model zoo;
# `net` and `model_name` are used by the inference cell that follows.
model_name = 'slowfast_4x16_resnet50_kinetics400'
net = get_model(
    model_name,
    pretrained=True,
    nclass=400,
)
print('%s model is successfully loaded.' % (model_name,))

slowfast_4x16_resnet50_kinetics400 model is successfully loaded.


In [12]:
# Run the currently loaded SlowFast network on every discovered clip and print
# the top-K predicted labels with their softmax probabilities.
topK = 3
classes = net.classes
for path in lst:
    clip_input = getInputSlowFastModels(path)
    pred = net(nd.array(clip_input))
    # Compute the softmax once per clip rather than once per printed label.
    probs = nd.softmax(pred)[0]
    ind = nd.topk(pred, k=topK)[0].astype('int')
    print('The input video clip is classified to be')
    for i in range(topK):
        print('\t[%s], with probability %.3f.'%
          (classes[ind[i].asscalar()], probs[ind[i]].asscalar()))
    print('')

Video data video_100_archery is downloaded and preprocessed.
The input video clip is classified to be
	[archery], with probability 1.000.
	[throwing_axe], with probability 0.000.
	[riding_or_walking_with_horse], with probability 0.000.

Video data video_902_handshake is downloaded and preprocessed.
The input video clip is classified to be
	[riding_camel], with probability 0.995.
	[flying_kite], with probability 0.001.
	[jogging], with probability 0.000.

Video data video-kiss_or_hug is downloaded and preprocessed.
The input video clip is classified to be
	[water_sliding], with probability 0.639.
	[bungee_jumping], with probability 0.041.
	[plastering], with probability 0.037.



**Observations:**
This model classifies the archery video correctly but misclassifies the other two videos.

# **CONCLUSION**

Experiment 2 produced the best results. However, none of the experiments classified the hugging video correctly.