## Optional pip installation (for Windows, uncomment to run)

In [1]:
#%pip install matplotlib torch==1.8.0 torchvision==0.9.0 gluoncv decord

In [2]:
#%pip uninstall Pillow

In [3]:
#%pip install Pillow

In [4]:
#%matplotlib inline

## Optional pip installation (for Mac)

1. Use python 3.8
2. Run `pip install torch==1.6.0 torchvision==0.7.0 gluoncv decord`
3. Run `pip uninstall Pillow`
4. Run `pip install Pillow==9.5.0`
5. (Optional) install Jupyter lab to run example notebook linked in tutorial `pip install jupyterlab`
6. Download the model configuration file; it is used to fetch the pretrained model in the tutorial: https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/action-recognition/configuration/resnet50_v1b_kinetics400.yaml. The code cells below load the config from `./i3d_resnet50_v1_kinetics400.yaml`, so edit the `config_file` path in the model-loading cells to point to wherever the file is stored on your system.
7. Run the notebook and check if class 0 (abseiling) is the final output.

In [6]:
import numpy as np
import decord
import torch

from gluoncv.torch.utils.model_utils import download
from gluoncv.torch.data.transforms.videotransforms import video_transforms, volume_transforms
from gluoncv.torch.engine.config import get_cfg_defaults
from gluoncv.torch.model_zoo import get_model

Download Kinetics data for processing. (TODO: right now we download only a single video.)

In [3]:
# Fetch the sample clip; the value returned by download() is passed straight to decord.VideoReader.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'  # contains 250 frames
video_fname = download(url)
vr = decord.VideoReader(video_fname)
# Use five consecutive frames (indices 5-9) as the input clip.
frame_id_list = [5, 6, 7, 8, 9]
# Read the selected frames and convert them to a NumPy array for the transforms in the next cell.
video_data = vr.get_batch(frame_id_list).asnumpy()

Now we define transformations for the video clip.
This transformation function does four things:
(1) resize the shorter side of video clip to short_side_size,
(2) center crop the video clip to crop_size x crop_size,
(3) transpose the video clip to ``num_channels*num_frames*height*width``,
and (4) normalize it with mean and standard deviation calculated across all ImageNet images.



In [4]:
# Spatial sizes: the shorter side is resized to 256 before a 224x224 centre crop.
crop_size = 224
short_side_size = 256

# Per-channel normalisation statistics (ImageNet mean / standard deviation).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Steps are applied in order: resize -> centre crop -> tensor conversion -> normalise.
preprocessing_steps = [
    video_transforms.Resize(short_side_size, interpolation='bilinear'),
    video_transforms.CenterCrop(size=(crop_size, crop_size)),
    volume_transforms.ClipToTensor(),
    video_transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
]
transform_fn = video_transforms.Compose(preprocessing_steps)

clip_input = transform_fn(video_data)
print('Video data is downloaded and preprocessed.')

Video data is downloaded and preprocessed.


Next, we load a pre-trained I3D model. Make sure to set the ``pretrained`` option in the configuration file to True.



In [5]:
# Path to the model's YAML configuration; adjust it to where the file lives on your system.
config_file = './i3d_resnet50_v1_kinetics400.yaml'
# Start from the default configuration object, then merge in the settings from the YAML file.
cfg = get_cfg_defaults()
cfg.merge_from_file(config_file)
# Build the model described by the merged configuration.
model = get_model(cfg)
# Put the model in evaluation mode before running inference.
model.eval()
print('%s model is successfully loaded.' % cfg.CONFIG.MODEL.NAME)

i3d_resnet50_v1_kinetics400 model is successfully loaded.


Finally, we prepare the video clip and feed it to the model.



In [6]:
# Add a leading batch dimension, then run a forward pass without tracking gradients.
batched_clip = clip_input.unsqueeze(0)
with torch.no_grad():
    logits = model(batched_clip)
    pred = logits.numpy()
    # Softmax over the class axis (dim=1) turns the raw logits into probabilities.
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()

# Index of the most probable class, and the spread between the largest and
# smallest class probability (reported below as the "confidence interval").
top_class = probs.argmax()
confidence_interval = probs.max() - probs.min()

print(f'The input video clip is classified as class {top_class} with confidence interval {confidence_interval}')

The input video clip is classified as class 0 with confidence interval 0.7715632319450378


## Calculate confidence of frame windows
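##### Adjust `N` to set the spacing between the five frames sampled for each window (frames `i-2N, i-N, i, i+N, i+2N`); the window centre `i` advances 2 frames at a time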

In [8]:
# Spacing (in frames) between the five frames sampled for each window.
N = 2
# Number of frames the window centre advances per iteration.
WINDOW_STEP = 2

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = download(url)
vr = decord.VideoReader(video_fname)
config_file = './i3d_resnet50_v1_kinetics400.yaml'
cfg = get_cfg_defaults()
cfg.merge_from_file(config_file)
model = get_model(cfg)
model.eval()

# The preprocessing pipeline does not depend on the window, so build it once
# instead of re-creating it on every loop iteration.
crop_size = 224
short_side_size = 256
transform_fn = video_transforms.Compose([
    video_transforms.Resize(short_side_size, interpolation='bilinear'),
    video_transforms.CenterCrop(size=(crop_size, crop_size)),
    volume_transforms.ClipToTensor(),
    video_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Centres start at 2*N and stop before len(vr) - 2*N so that every sampled
# index i - 2*N ... i + 2*N stays inside the video.
for i in range(2 * N, len(vr) - (2 * N), WINDOW_STEP):
    # Five frames centred on i, spaced N frames apart.
    frame_id_list = list(range(i - (2 * N), i + (2 * N) + 1, N))
    video_data = vr.get_batch(frame_id_list).asnumpy()

    clip_input = transform_fn(video_data)
    with torch.no_grad():
        pred = model(torch.unsqueeze(clip_input, dim=0)).numpy()

    # Convert raw logits to probabilities using softmax
    probs = torch.nn.functional.softmax(torch.tensor(pred), dim=1).numpy()

    # Top predicted class, and the spread between the largest and smallest
    # class probability (reported as the "confidence interval").
    top_class = np.argmax(probs)
    confidence_interval = np.max(probs) - np.min(probs)

    print(f'The input video clip is classified as class {top_class} with confidence interval {confidence_interval} for frame window {i}')


The input video clip is classified as class 0 with confidence interval 0.9443978071212769 for frame window 4
The input video clip is classified as class 0 with confidence interval 0.928821861743927 for frame window 6
The input video clip is classified as class 0 with confidence interval 0.9487524032592773 for frame window 8
The input video clip is classified as class 0 with confidence interval 0.8573701977729797 for frame window 10
The input video clip is classified as class 0 with confidence interval 0.8656694293022156 for frame window 12
The input video clip is classified as class 0 with confidence interval 0.9996094107627869 for frame window 14
The input video clip is classified as class 0 with confidence interval 0.9967054724693298 for frame window 16
The input video clip is classified as class 0 with confidence interval 0.9980601668357849 for frame window 18
The input video clip is classified as class 0 with confidence interval 0.9998185038566589 for frame window 20
The input vide

##### Function to measure the average confidence and accuracy for a given half-window size `N` (each window contains `2N + 1` consecutive frames)

In [10]:
def frame_window_confidence(N, vr, model, true_class):
    """Classify every (2*N + 1)-frame window of a video and summarise the results.

    A window of consecutive frames ``i - N ... i + N`` is built around every
    centre ``i`` for which the whole window fits inside the video, which gives
    ``len(vr) - 2*N`` windows in total.

    Args:
        N: Half-window size; each window holds ``2*N + 1`` consecutive frames.
        vr: Video reader supporting ``len(vr)`` and
            ``vr.get_batch(indices).asnumpy()`` (e.g. ``decord.VideoReader``).
        model: Callable that takes the batched clip tensor and returns a
            logits tensor; softmax is applied over dim 1.
        true_class: Class index a window must be assigned to count as correct.

    Returns:
        A tuple ``(average_confidence, accuracy)``.

        * ``average_confidence`` is the sum of ``max(probs) - min(probs)`` over
          the correctly classified windows, divided by the total number of
          windows (misclassified windows contribute 0).
        * ``accuracy`` is the fraction of windows whose top class equals
          ``true_class``.

        ``(0.0, 0.0)`` is returned when the video is too short to hold a
        single window.
    """
    num_windows = len(vr) - 2 * N
    if num_windows <= 0:
        # No window fits: avoid dividing by zero (or by a negative count).
        return 0.0, 0.0

    # The preprocessing pipeline is identical for every window, so build it once.
    crop_size = 224
    short_side_size = 256
    transform_fn = video_transforms.Compose([
        video_transforms.Resize(short_side_size, interpolation='bilinear'),
        video_transforms.CenterCrop(size=(crop_size, crop_size)),
        volume_transforms.ClipToTensor(),
        video_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    sum_confidence = 0
    sum_class = 0
    for i in range(N, len(vr) - N):
        frame_id_list = list(range(i - N, i + N + 1))
        video_data = vr.get_batch(frame_id_list).asnumpy()
        clip_input = transform_fn(video_data)
        with torch.no_grad():
            logits = model(torch.unsqueeze(clip_input, dim=0))
            # Softmax over the class axis turns the logits into probabilities.
            probs = torch.nn.functional.softmax(logits, dim=1).numpy()
        top_class = np.argmax(probs)
        confidence_interval = np.max(probs) - np.min(probs)
        # Only correctly classified windows add to the confidence total.
        if top_class == true_class:
            sum_class += 1
            sum_confidence += confidence_interval
    return sum_confidence / num_windows, sum_class / num_windows


In [None]:
# Re-create the video reader and the model used by the sweep below.
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4' 
video_fname = download(url)
vr = decord.VideoReader(video_fname)
config_file = './i3d_resnet50_v1_kinetics400.yaml'
cfg = get_cfg_defaults()
cfg.merge_from_file(config_file)
model = get_model(cfg)
model.eval()

# Sweep the half-window size i from 2 to 9 (windows of 5 to 19 frames).
# The last argument, 0, is the class index treated as the correct label.
for i in range(2, 10):
    confidence, accuracy = frame_window_confidence(i, vr, model, 0)
    print(f'{i*2+1} frames has average confidence of {confidence} with an accuracy of {accuracy}')