**** Execute the following cell if running on Google Colab ****

In [1]:
!pip install torcheval
!pip install av

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7
Collecting av
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-11.0.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries

In [3]:
import os, math
import pandas as pd
import numpy as np

from torchvision.io.video import read_video
from torchvision.models.video import s3d, S3D_Weights#r3d_18, R3D_18_Weights #swin3d_b, Swin3D_B_Weights
import torch
import torch.nn.functional as F2
import torchvision.transforms.functional as F
from torchvision.io.video import read_video
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader
from torchvision import models, utils
from torcheval.metrics import MulticlassAccuracy, MulticlassConfusionMatrix

device = "cuda" if torch.cuda.is_available() else "cpu"


# Creating Data Loader

In [4]:
class GestureDataset(Dataset):
    def __init__(self, root_dir, transform=None, device='cpu'):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.device = device
        self.annotations = pd.read_csv(os.path.join(self.root_dir,'annotations.csv'))

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        path_to_video, label = self.annotations.iloc[idx]
        frames, _, _ = read_video(os.path.join(self.root_dir, path_to_video), output_format="TCHW")
        total_frames = len(frames)
        desired_num_frames = 128
        if (total_frames - desired_num_frames) == 1:
            frames = frames[1:]
        elif (total_frames - desired_num_frames) > 1:
            start_offset, end_offset = self.more_frames(total_frames, desired_num_frames)
            frames = frames[start_offset:-end_offset]
        elif total_frames < desired_num_frames:
            print("not enough frames")
            return

        if self.transform:
            #
            frames = self.transform(frames)

        return {'video_frames': frames, 'label': label}

    def more_frames(self, total_frames, desired_num_frames):
        start_offset = math.ceil((total_frames - desired_num_frames)/2)
        end_offset = math.floor((total_frames - desired_num_frames)/2)
        return start_offset, end_offset

    def collate_fn(self, batch):
        videos = []
        targets = []

        for b in batch:
            videos.append(b['video_frames'])
            targets.append(b['label'])


        videos = torch.stack(videos, dim=0).type(torch.float32)

        return {'videos': videos, 'labels': targets}


# Defining the model architecture

In [5]:
import torch
from torch import nn

class s3d_Gestures(nn.Module):
    def __init__(self, pretrained_model):
        super(s3d_Gestures, self).__init__()
        self.pretrained_model = nn.Sequential(*(list(pretrained_model.children())))
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(6000, 64),
            nn.ReLU(),
            # nn.Linear(64, 64),
            # nn.ReLU(),
        )
        self.final_classifier = nn.Sequential(
            nn.Linear(64, 3),
            nn.Softmax(),
        )

    def forward(self, x):
        x = self.pretrained_model(x)
        x = self.fc1(x)
        x = self.final_classifier(x)
        return x


## Loading the pre-trained Separable 3D CNN model from PyTorch

In [6]:
# Step 1: Initialize model with the best available weights
weights = S3D_Weights.DEFAULT
pretrained_model = s3d(weights=weights)
print("model loading done")
#model.eval()

Downloading: "https://download.pytorch.org/models/s3d-d76dad2f.pth" to /root/.cache/torch/hub/checkpoints/s3d-d76dad2f.pth
100%|██████████| 32.0M/32.0M [00:00<00:00, 97.9MB/s]


model loading done


# Training the model

*** Only run this section to train the model. Otherwise skip to next section to test the model performance on test data and get predictions from the model. ***

## Setting up DataLoaders and Dataset for training the model

### *******Set path of the the root directory containing train, test and eval folders*******

In [None]:
root_dir = "/content/drive/MyDrive/gestures_dataset_new/"

### Initializing DataLoader for training the deep learning model

In [None]:
################# PLEASE SET THE BATCH SIZE ##################
batch_size = 8
##############################################################

gesture_dataset_train = GestureDataset(root_dir=os.path.join(root_dir, 'train'), transform=weights.transforms(), device=device)
train_dataloader = DataLoader(gesture_dataset_train, batch_size=batch_size, collate_fn=gesture_dataset_train.collate_fn, shuffle=True)

## Initializing Model

In [None]:
gesture_model = s3d_Gestures(pretrained_model).to(device)
for param in gesture_model.pretrained_model.parameters():
    param.requires_grad = False

In [None]:
gesture_model.train()
for param in gesture_model.parameters():
    print(param.requires_grad)

## Traning the model

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
max_epochs = 25
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW([p for p in gesture_model.parameters() if p.requires_grad], lr=1e-3)

scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=max_epochs+25,
    T_mult=1,
    verbose=True
)
gesture_model.train()
training_losses = []
for epoch in tqdm(range(max_epochs)):
    train_loss = 0.0
    training_outputs = []
    training_labels = []
    for i, data in enumerate(train_dataloader, 0):
        videos, labels = data['videos'], data['labels']
        labels = torch.tensor(F2.one_hot(torch.tensor(labels, dtype=torch.long), num_classes=3), dtype=torch.float32)

        optimizer.zero_grad()

        outputs = gesture_model(videos.to(device))
        loss = criterion(outputs, labels.to(device))
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        print("Loss: ", loss.item())
        training_labels.append(labels)
        training_outputs.append(outputs)

    scheduler.step()

    training_losses.append(train_loss/len(train_dataloader))
    print("Epoch: {}/{} | Training Loss: {}".format(epoch+1,max_epochs, train_loss/len(train_dataloader)))

## Saving the trained model

In [None]:
# you can change the name of model and save path of the model as required
################# PLEASE SET THE PATH WHERE TO SAVE THE MODEL ##################
torch.save(gesture_model, os.path.join(root_dir,"gesture_model_v3.pt"))

# Testing the trained model

### Loading the trained model

In [None]:
# Update the path of the model as required
################# SET THE PATH TO THE SAVED MODEL ##################
trained_model = torch.load(os.path.join(root_dir,"gesture_model_v3.pt"), map_location=torch.device('cpu')) # Use 'cuda' instead of 'cpu' if you saved the model on GPU

In [None]:
trained_model = trained_model.to(device)
trained_model = trained_model.eval()

### Creatingt the test dataloader

In [None]:
batch_size = 8

gesture_dataset_test = GestureDataset(root_dir=os.path.join(root_dir, 'test'), transform=weights.transforms(), device=device)
test_dataloader = DataLoader(gesture_dataset_test, batch_size=batch_size, collate_fn=gesture_dataset_train.collate_fn, shuffle=True)

### Observing performance of the model on batches of test data

In [None]:
import warnings
warnings.filterwarnings("ignore")
testing_outputs = []
testing_labels = []
for i, data in enumerate(train_dataloader, 0):
  print("\t>>>>>> Progress: {}/{} >>>>>>".format(i+1,len(train_dataloader)))
  videos, labels = data['videos'], data['labels']
  testing_labels.extend(labels)

  labels = torch.tensor(F2.one_hot(torch.tensor(labels, dtype=torch.long), num_classes=3), dtype=torch.float32)

  test_output = trained_model(videos.to(device))
  testing_outputs.append(test_output)
  metric = MulticlassAccuracy(num_classes=3)
  metric.update(test_output.argmax(dim=1), labels.argmax(dim=1))
  print("Accuracy: {}%".format(metric.compute()*100))

# testing_outputs = torch.tensor([out.argmax().item() for out in testing_outputs])
# testing_labels = torch.tensor([out[0] for out in testing_labels])

	>>>>>> Progress: 1/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 2/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 3/28 >>>>>>
Accuracy: 62.5%
	>>>>>> Progress: 4/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progress: 5/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 6/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 7/28 >>>>>>
Accuracy: 87.5%
	>>>>>> Progress: 8/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progress: 9/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progress: 10/28 >>>>>>
Accuracy: 87.5%
	>>>>>> Progress: 11/28 >>>>>>
Accuracy: 87.5%
	>>>>>> Progress: 12/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progress: 13/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progress: 14/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 15/28 >>>>>>
Accuracy: 87.5%
	>>>>>> Progress: 16/28 >>>>>>
Accuracy: 50.0%
	>>>>>> Progress: 17/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 18/28 >>>>>>
Accuracy: 62.5%
	>>>>>> Progress: 19/28 >>>>>>
Accuracy: 87.5%
	>>>>>> Progress: 20/28 >>>>>>
Accuracy: 100.0%
	>>>>>> Progress: 21/28 >>>>>>
Accuracy: 75.0%
	>>>>>> Progres

### Computing accuracy of the model on the whole testing dataset

In [None]:
argmaxed_outputs = []
for expanded_output in testing_outputs:
  argmaxed_outputs.append(expanded_output.argmax(dim=1))

In [None]:
argmaxed_outputs = torch.cat(argmaxed_outputs)

In [None]:
testing_labels = torch.tensor(testing_labels)

In [None]:
metric = MulticlassAccuracy(num_classes=3)
metric.update(argmaxed_outputs, testing_labels)
print("Accuracy on the whole test set: ", metric.compute())

metric2 = MulticlassConfusionMatrix(3)
metric2.update(argmaxed_outputs, testing_labels)
print("Confusion Matrxi: ", metric2.compute())

tensor(0.8349)
tensor([[74.,  0.,  0.],
        [ 9., 61.,  0.],
        [17., 10., 47.]])


### Saving the predictions and true labels

In [None]:
# saving the predictions
# the path and the name of the file can be changed as required
torch.save(argmaxed_outputs,'argmaxed_outputs.pt')

In [None]:
# saving the true labels
# the path and the name of the file can be changed as required
torch.save(testing_labels,'testing_labels.pt')

# Getting model prediction on a gesture video

In [16]:
def more_frames(total_frames, desired_num_frames):
        start_offset = math.ceil((total_frames - desired_num_frames)/2)
        end_offset = math.floor((total_frames - desired_num_frames)/2)
        return start_offset, end_offset

### Loading the model

In [7]:
################# SET THE PATH TO THE SAVED MODEL ##################
root_dir = "/content/drive/MyDrive/gestures_dataset_new/"
path_to_model = os.path.join(root_dir,"gesture_model_v3.pt")
##############################################################

In [8]:
# Update the path of the model as required
trained_model = torch.load(path_to_model, map_location=torch.device('cpu')) # Use 'cuda' instead of 'cpu' if you saved the model on GPU
trained_model = trained_model.to(device)
trained_model = trained_model.eval()

### Reading the gesture video

In [26]:

################# SET THE PATH TO THE GESTURE VIDEO ##################
root_dir = "/content/drive/MyDrive/gestures_dataset_new/"
path_to_video = os.path.join(root_dir, "train/hand_waving/Wave_20231211_part_1.mp4")
##############################################################

frames, _, _ = read_video(path_to_video, output_format="TCHW")
total_frames = len(frames)
desired_num_frames = 128
if (total_frames - desired_num_frames) == 1:
    frames = frames[1:]
elif (total_frames - desired_num_frames) > 1:
    start_offset, end_offset = more_frames(total_frames, 128)
    frames = frames[start_offset:-end_offset]
elif total_frames < desired_num_frames:
    print("not enough frames")
    exit(0)

# pre-processing the video for the
preprocess = weights.transforms()
video = preprocess(torch.unsqueeze(frames, 0))



### Getting the prediction

In [27]:
prediction = trained_model(video)

  return self._call_impl(*args, **kwargs)


In [35]:
idx_to_class = {
    0: 'hand_waving',
    1: 'pointing',
    2: 'other'
}
print("Predicted Gesture: ", idx_to_class[prediction.argmax(dim=1).item()])

Predicted Gesture:  hand_waving
