In [1]:
import torch
import torchvision
import cv2
from torch import nn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
import torchvision.transforms as tt
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torchvision.models.video import r2plus1d_18, R2Plus1D_18_Weights
from sklearn.metrics import accuracy_score

## Loading data

In [12]:
device="cuda"

In [292]:
seed_constant = 27
np.random.seed(seed_constant)
random.seed(seed_constant)
torch.random.seed= seed_constant

BATCH_SIZE=10

# Specify the height and width to which each video frame will be resized in our dataset.
IMAGE_HEIGHT , IMAGE_WIDTH = 224,224

# Specify the number of frames of a video that will be fed to the model as one sequence.
SEQUENCE_LENGTH = 20

# Specify the directory containing the UCF50 dataset.
DATASET_DIR = "UCF50"

# Specify the list containing the names of the classes used for training. Feel free to choose any set of classes.
CLASSES_LIST = ["Diving", "Fencing", "GolfSwing", "PlayingGuitar","HighJump",
                "HorseRiding", "Swing", "Punch","PlayingViolin","JumpRope",
                "Basketball","Shooting"]


In [3]:
%%capture

# Downlaod the UCF50 Dataset
!wget --no-check-certificate https://www.crcv.ucf.edu/data/UCF50.rar

#Extract the Dataset
!unrar x UCF50.rar

In [4]:
# all_classes_names = os.listdir('UCF50')
# all_classes_names

In [4]:
# transforms required by r2plus1d model
r21d_trans=tt.Compose([R2Plus1D_18_Weights.KINETICS400_V1.transforms()])

In [5]:
# functions to apply transforms to each frame
def apply_tansforms(feat):

  # list to store transformed frames
  feats=[]
  for i in range(len(feat)):

      #converting to array and reshaping in required format
      x=np.transpose(np.array(feat[i]), (0,3,1,2))
      # convertin to tensor to apply transforms
      a=torch.Tensor(x)
      # apply transforms and append to the list
      feats.append(r21d_trans(a))
  return feats

In [6]:
# keeping default weights i.e. for Kinetics dataset
weights=R2Plus1D_18_Weights.DEFAULT
# Initializing the model
r21d=r2plus1d_18(weights=weights,progress=True)
# getting number of input features in the last layer
r21d.fc


Downloading: "https://download.pytorch.org/models/r2plus1d_18-91a641e6.pth" to /root/.cache/torch/hub/checkpoints/r2plus1d_18-91a641e6.pth
100%|██████████| 120M/120M [00:01<00:00, 82.5MB/s]


Linear(in_features=512, out_features=400, bias=True)

In [13]:
# putting model on gpu
model=r21d.to(device)

## Predictions

In [14]:
def val_frames_extraction(video_path,SEQUENCE_LENGTH=18,TIME_SECODNS=3):
    '''
    This function will extract the required frames from a video after resizing and normalizing them.
    Args:
        video_path: The path of the video in the disk, whose frames are to be extracted.
        SEQUENCE_LENGTH: Nu,ber of frames per sub-video
        TIME_SECODNS: video to be divided into sub-videos of what duration
    Returns:
        vid_list: A list of list of sub-videos containing the resized and normalized frames.
    '''

    # Declare a list to store sub-videos and their frames.
    vid_list=[]
    # Read the Video File using the VideoCapture object.
    video_reader = cv2.VideoCapture(video_path)
    # Get Frame counts
    frame_count=video_reader.get(cv2.CAP_PROP_FRAME_COUNT)
    # Get FPS
    FPS=video_reader.get(cv2.CAP_PROP_FPS)
    # Find video length
    vid_len=frame_count/FPS
    # Finding frames in 3 seconds window
    thresh_frames=int(3*FPS)

    # Calculate the the interval after which frames will be added to the list.
    skip_frames_window = int(thresh_frames/SEQUENCE_LENGTH)

    # Iterate through the Video Frames.
    n_videos=int(vid_len/3)
    print("Print division by three gives : ",round(vid_len/3,2),f"so dividing the full video into {n_videos} sub-videos")

    # Loop to dividide videos into subvideos and append to vid_list
    for i in range(n_videos):

      # counter for skipping window
      count=i*SEQUENCE_LENGTH

      # Initialize a list to store frames of sub videos
      frames_list=[]

      # split each sub video into frames equal to sequence length
      for frame_counter in range(SEQUENCE_LENGTH):

          # Set the current frame position of the video. and keep skipping to required frames
          video_reader.set(cv2.CAP_PROP_POS_FRAMES, (count + frame_counter) * skip_frames_window)

          # Reading the frame from the video.
          success, frame = video_reader.read()

          # Check if Video frame is not successfully read then break the loop
          if not success:
              break

          # Resize the Frame to fixed height and width.
          resized_frame = cv2.resize(frame, (IMAGE_HEIGHT, IMAGE_WIDTH))

          # Normalize the resized frame by dividing it with 255 so that each pixel value then lies between 0 and 1
          normalized_frame = resized_frame / 255

          # Append the normalized frame into the frames list
          frames_list.append(normalized_frame)

      # append the list of collected frames from sub-video to vid_list
      vid_list.append(frames_list)

    # Release the VideoCapture object.
    video_reader.release()

    # Return the frames list.
    return vid_list

In [15]:

def preprocess_pred(video_file_path="/content/video (1080p).mp4",SEQUENCE_LENGTH=6):
    '''
    This function will call extract_frames function and apply transforms on each sub-video
    Args:
        video_file_path: The path of the video in the disk, whose frames are to be extracted.
    Returns:
        vid_list: A list of transformed sub-videos containing the required preprocessed frames.
    '''

    # Extract the list of sub-videos and required frames.
    vid_list = val_frames_extraction(video_file_path,SEQUENCE_LENGTH=SEQUENCE_LENGTH)

    #loop over each sub_video and apply required transformation
    for i in range(len(vid_list)):

      #converting to numpy array
      vid_list[i]=np.asarray([vid_list[i]])
      # applying the r21d's specific transformations
      vid_list[i]=apply_tansforms(vid_list[i])[0]

    # Return processed vid_list
    return vid_list

In [16]:

import pandas as pd
df=pd.read_csv("/content/kinetics_400_labels.csv")
class_names=np.array(df["name"])

In [503]:
# calling preprocess_pred()
video_list=preprocess_pred("/content/UCF50/PlayingGuitar/v_PlayingGuitar_g01_c01.avi",18)

Print division by three gives :  3.33 so dividing the full video into 3 sub-videos


In [504]:
###-------------------
# Making Predictions
###-------------------

with torch.no_grad():
    # setting model mode to evaluation
    model.eval()
    # lists to store predictions and softmax probabilities for each sub-video
    y_preds=[]
    y_probas=[]

    #pass each sub video to model and store the predictions
    for i in video_list:
      # giving batch of 1 and getting predictions
      logits=model(i.unsqueeze(0).to(device))
      # getting softmax probabilities
      probas=(F.softmax(logits[0],dim=0)).to("cpu")
      # getting class with highest logit value
      pred=torch.argmax(logits, dim = 1).to("cpu").numpy()
      print(pred.shape)
      # append the predictions and probabilities to resp. lists
      y_preds.append(pred)
      y_probas.append(probas)


(1,)
(1,)
(1,)


In [505]:
y_preds,max(y_probas[0])#,max(y_probas[1])

([array([232]), array([232]), array([232])], tensor(0.8307))

In [506]:
for j in range(len(y_probas)):
  print(f"----For sub video {j} --------")
  print(np.sort(y_probas[j])[-3:])
  print([class_names[i] for i in np.argsort(y_probas[j])[-3:] ])

----For sub video 0 --------
[0.01667413 0.05712632 0.83069927]
['finger snapping', 'playing violin', 'playing guitar']
----For sub video 1 --------
[0.05401805 0.36987865 0.4236617 ]
['finger snapping', 'playing violin', 'playing guitar']
----For sub video 2 --------
[0.02555264 0.09929601 0.77150047]
['strumming guitar', 'playing violin', 'playing guitar']


In [482]:
class_predictions=[class_names[i] for i in y_preds]
for i,j in zip(y_probas,class_predictions):
  print(max(i),j)

tensor(0.2459) ['passing American football (not in game)']


In [None]:
playing guitar and basket ball
both show better results with lower FPS (6-10 sec)

# inference:-
## seems like the videos which have high speed we need more frames and for slower we require less
## for videos with more static or less movement lesser frames work better and videos with lots of movements more FPS needed

In [None]:
Swing
tried with custom video as well as dataset videos(which are not at all that good)
10-12 (88,57) (67,81) sec gives best
13 sec onwards model starts getting bit confused  (53,61)  second class is real low for all 10,12,13 secs

In [None]:
diving
g01_c01
confused as needed at 15 sec  - 13 sec fine for 1st pred but 2nd pred 74 hammer throw

g03_c04
confused as needed at 15 sec  - 13 sec fine for 2nd pred but 1st pred 64 hammer throw  (bcauze person jsut moves up-down on board)

In [None]:
punch - overall 13 sec works fine
g01_c01
70 - 13 sec best

g03_c01
90 - 8 sec
85 - 10 sec   drop kick 8
87 - 12 sec   drop kick 6
60 - 13 sec   wrestling 30  - best
45 - 14 sec   wrestling 28
55 - 15 sec   wrestling 25

g03_c01
73 - 13 sec  drop kick 23 - best



In [None]:
4.
35X,30X - 2 sec
42,27X  - 3 sec
48,47X  - 4 sec
90,41X  - 5 sec
96,27X  - 6 sec
97,39X  - 7 sec
99,42X  - 8 sec
97,31X  - 9 sec
99,35X  - 10 sec
97,21  -  11 sec
97,28  -  12 sec
90,20X  - 13 sec
78,13X   - 14 sec
95,12X  - 15 sec - best
86,12X  - 16 sec
93,08X  - 17 sec
94,13X  - 18 sec
92,09   -19 sec
97,19   - 20 sec
97,34   - 21 sec
95,36   - 22 sec
95,36   - 22 sec
96,47   - 23 sec
93,53   - 24 sec
91,54   - 25 sec

g02C04:

31, 80  -  8 sec  x
36x,54 -  10 sec x
50X,74 - 12 sec  x
77,57X - 13 sec
72,42X - 14 sec  - best
66,51X - 15 sec
72,56X - 16 sec
51,44  - 18 sec
57,57 - 20 sec

60, 33X - 25 sec
31,31   - 30 sec