In [2]:
import cv2
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as pretrained

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from PIL import Image
from tqdm import tqdm


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device}')

Using cuda


In [4]:
class VideoDataset(Dataset):

    def __init__(self, video_dir_path):

        self.resnet = pretrained.resnet50(pretrained=True).to(device)
        self.resnet.eval()
        self.layer = self.resnet.avgpool
        self.video_embeddings = []
        self.labels = []

        self.transforms = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
                0.229, 0.224, 0.255]),
        ])

        self.label_map = {
            'CricketShot': torch.tensor(0),
            'PlayingCello': torch.tensor(1),
            'Punch': torch.tensor(2),
            'ShavingBeard': torch.tensor(3),
            'TennisSwing': torch.tensor(4)
        }

        def hook(module, inputs, outputs):
            self.video_embeddings[-1].append(outputs.detach().cpu().squeeze())

        self.handle = self.layer.register_forward_hook(hook)

        for video in tqdm(os.listdir(video_dir_path)):
            frames = self.get_frames(os.path.join(video_dir_path, video))
            self.video_embeddings.append([])
            
            for frame in frames:
                inp = self.transforms(Image.fromarray(frame)).to(device).unsqueeze(0)
                self.resnet(inp)

            action = video.split('_')[1]
            self.labels.append(action)

        self.handle.remove()
        del self.handle
        del self.resnet
        del self.layer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.vstack(self.video_embeddings[idx]), self.label_map[self.labels[idx]]

    def get_frames(self, path, max_frames=20):
        vidObj = cv2.VideoCapture(path)
        success = 1
        frames = []
        count = 0
        while success:
            success, image = vidObj.read()
            count+=1
            if not success or count > max_frames:
                break
            frames.append(image)
        return frames

    def unregister_hook(self):
        self.handle.remove()
        del self.handle


In [5]:
dataset = VideoDataset('./train')

100%|██████████| 594/594 [03:11<00:00,  3.10it/s]


In [6]:
with open('video_dataset.pkl', 'wb') as f:
    pickle.dump(dataset, f)

In [7]:
class Attention(nn.Module):
  def __init__(self, embedding_dim, n_hidden):
    super().__init__()

    self.embedding_dim = embedding_dim
    self.n_hidden = n_hidden

    self.wx = nn.Linear(self.embedding_dim, self.embedding_dim)
    self.wh = nn.Linear(self.n_hidden, self.embedding_dim)
    self.sigmoid = nn.Sigmoid()

  def forward(self, X, h):
    out1 = self.wx(X)
    out2 = self.wh(h)
    a = self.sigmoid(out1+out2)
    
    return torch.mul(a,X)

In [8]:

class EleAttG_GRU(nn.Module):
  def __init__(self, embedding_dim, n_hidden=128, n_classes=None):
    super().__init__()

    assert n_classes is not None

    self.embedding_dim = embedding_dim
    self.n_hidden = n_hidden
    self.n_classes = n_classes

    self.attention = Attention(self.embedding_dim, self.n_hidden)
    self.grucell = nn.GRUCell(self.embedding_dim, self.n_hidden)
    self.fc = nn.Sequential(
        nn.Linear(self.n_hidden, self.n_hidden),
        nn.ReLU(),
        nn.Linear(self.n_hidden, self.n_classes),
        nn.Softmax(dim=1)
    )

  def forward(self, X):
    '''
      x = batch_size * frames * embedding_dim

    '''
    h = torch.zeros(X.shape[0], self.n_hidden).to(device)
    for i in range(X.shape[1]):
      X[:, i, :] = self.attention(X[:, i, :].clone(), h)
      h = self.grucell(X[:, i, :].clone(), h)

    return self.fc(h)


In [9]:
class Vanilla_GRU(nn.Module):
    def __init__(self, embedding_dim,  n_hidden, n_classes=None):
        super().__init__()

        assert n_classes is not None
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_classes = n_classes

        self.grucell = nn.GRUCell(self.embedding_dim, self.n_hidden)
        self.fc = nn.Sequential(
            nn.Linear(self.n_hidden, self.n_hidden),
            nn.ReLU(),
            nn.Linear(self.n_hidden, self.n_classes),
            nn.Softmax(dim=1)
        )
    
    def forward(self,X):
        h = torch.zeros(X.shape[0], self.n_hidden).to(device)
        for i in range(X.shape[1]):
            h = self.grucell(X[:, i, :].clone(), h)
        
        return self.fc(h)

In [10]:
model = EleAttG_GRU(2048, 256, 5)
model.to(device)

EleAttG_GRU(
  (attention): Attention(
    (wx): Linear(in_features=2048, out_features=2048, bias=True)
    (wh): Linear(in_features=256, out_features=2048, bias=True)
    (sigmoid): Sigmoid()
  )
  (grucell): GRUCell(2048, 256)
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=5, bias=True)
    (3): Softmax(dim=1)
  )
)

In [11]:
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [12]:
def train(model, dataloader, device, n_epochs=10):
    optimizer = torch.optim.AdamW(model.parameters())
    criterion = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        loss_val = 0
        for i, (X, y) in enumerate(tqdm(dataloader)):
            optimizer.zero_grad()
            X = X.to(device)
            y = y.to(device)
            y_pred = model(X)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            loss_val += loss.item()

        print(f'Epoch:{epoch} Loss:{loss_val}')


In [13]:
train(model, train_dataloader, device, 10)

100%|██████████| 75/75 [00:04<00:00, 16.85it/s]


Epoch:0 Loss:78.94987326860428


100%|██████████| 75/75 [00:03<00:00, 23.80it/s]


Epoch:1 Loss:68.47460919618607


100%|██████████| 75/75 [00:03<00:00, 22.65it/s]


Epoch:2 Loss:69.91883206367493


100%|██████████| 75/75 [00:03<00:00, 22.53it/s]


Epoch:3 Loss:69.39174294471741


100%|██████████| 75/75 [00:03<00:00, 22.94it/s]


Epoch:4 Loss:80.0042245388031


100%|██████████| 75/75 [00:03<00:00, 22.86it/s]


Epoch:5 Loss:74.26113003492355


100%|██████████| 75/75 [00:03<00:00, 23.16it/s]


Epoch:6 Loss:67.91187655925751


100%|██████████| 75/75 [00:03<00:00, 21.69it/s]


Epoch:7 Loss:69.02254790067673


100%|██████████| 75/75 [00:03<00:00, 21.87it/s]


Epoch:8 Loss:67.86471486091614


100%|██████████| 75/75 [00:03<00:00, 23.62it/s]

Epoch:9 Loss:67.8636080622673





In [14]:
vanilla_model = Vanilla_GRU(2048, 256, 5)
vanilla_model.to(device)

Vanilla_GRU(
  (grucell): GRUCell(2048, 256)
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=5, bias=True)
    (3): Softmax(dim=1)
  )
)

In [15]:
train(vanilla_model, train_dataloader, device, 10)

100%|██████████| 75/75 [00:01<00:00, 62.56it/s]


Epoch:0 Loss:85.52383834123611


100%|██████████| 75/75 [00:01<00:00, 57.86it/s]


Epoch:1 Loss:70.38166725635529


100%|██████████| 75/75 [00:01<00:00, 61.76it/s]


Epoch:2 Loss:68.73321413993835


100%|██████████| 75/75 [00:01<00:00, 63.63it/s]


Epoch:3 Loss:69.00575160980225


100%|██████████| 75/75 [00:01<00:00, 57.97it/s]


Epoch:4 Loss:68.57186931371689


100%|██████████| 75/75 [00:01<00:00, 64.02it/s]


Epoch:5 Loss:68.45581543445587


100%|██████████| 75/75 [00:01<00:00, 60.25it/s]


Epoch:6 Loss:67.87236303091049


100%|██████████| 75/75 [00:01<00:00, 58.55it/s]


Epoch:7 Loss:67.86842328310013


100%|██████████| 75/75 [00:01<00:00, 63.94it/s]


Epoch:8 Loss:67.86680608987808


100%|██████████| 75/75 [00:01<00:00, 64.17it/s]

Epoch:9 Loss:67.86595809459686





In [16]:
test_dataset = VideoDataset('./test')

100%|██████████| 224/224 [01:25<00:00,  2.61it/s]


In [17]:
with open('test_dataset.pkl', 'wb') as f:
    pickle.dump(test_dataset, f)

In [18]:
preds = list()
y_true = list()

for (X, y) in tqdm(test_dataset):
    X = X.unsqueeze(0).to(device)
    out = model(X).squeeze(0)
    y_pred = out.argmax()
    preds.append(y_pred.item())
    y_true.append(y.item())

100%|██████████| 224/224 [00:01<00:00, 134.12it/s]


In [19]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score

In [23]:
att_gru_accu = accuracy_score(y_true, preds)
print(classification_report(y_true, preds))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96        49
           1       0.98      1.00      0.99        44
           2       1.00      0.92      0.96        39
           3       0.98      1.00      0.99        43
           4       0.98      0.96      0.97        49

    accuracy                           0.97       224
   macro avg       0.98      0.97      0.97       224
weighted avg       0.97      0.97      0.97       224



In [24]:
preds = list()
y_true = list()

for (X, y) in tqdm(test_dataset):
    X = X.unsqueeze(0).to(device)
    out = vanilla_model(X).squeeze(0)
    y_pred = out.argmax()
    preds.append(y_pred.item())
    y_true.append(y.item())


100%|██████████| 224/224 [00:00<00:00, 368.55it/s]


In [25]:
vanilla_accu = accuracy_score(y_true, preds)
print(classification_report(y_true, preds))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        49
           1       1.00      0.98      0.99        44
           2       1.00      0.95      0.97        39
           3       0.93      1.00      0.97        43
           4       0.92      1.00      0.96        49

    accuracy                           0.97       224
   macro avg       0.97      0.97      0.97       224
weighted avg       0.97      0.97      0.97       224



In [28]:
print(vanilla_accu)
print(att_gru_accu)

0.96875
0.9732142857142857
