# Setup

In [None]:
# Mount Google Drive inside the Colab runtime; the project files are read from
# the mounted tree under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/EECS_442 Final Proj/WLASL/

/content/drive/.shortcut-targets-by-id/1Sc7VSl-7PU4L0hxklxePzLVSFNQ7pnLE/EECS_442 Final Proj/WLASL


In [None]:
import os
import torch
import tensorflow as tf

import numpy as np
import pandas as pd
import numbers
import random
import math
import cv2
import json
import glob
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models as torch_models
import torch.nn as nn
from torch import optim
from torchsummary import summary

In [None]:
# Release cached CUDA memory held by PyTorch's allocator, then run a full
# Python garbage-collection pass (gc.collect() returns the number of
# unreachable objects it found, which the notebook echoes as the cell output).
torch.cuda.empty_cache()
import gc
gc.collect()

57

# Split

### Data Process 3 (final)

In [None]:
#IF MODEL USES DATA DIRECTLY
class VideoDataset(Dataset):
  """Video clips listed in a WLASL split file, paired with one-hot labels.

  Each item is ``(frames, label)``:
    * ``frames`` - tensor stacked as (channel, time, height, width). Frames are
      scaled to [-1, 1] as float32 before ``transform`` is applied. OpenCV
      decodes to BGR, so channel 0 is blue.
    * ``label`` - one-hot float64 tensor of shape (num_class, 1).

  Args:
    task: Subset to keep; compared against each entry's ``'subset'`` field
      (e.g. 'train', 'val', 'test').
    video_root_dir: Directory containing ``<video_id>.mp4`` files.
    json_file: Split file (nslt_100.json / nslt_300.json, found in
      WLASL/code/I3D/preprocess) mapping video id to
      ``{'subset': ..., 'action': [class_id, start_frame, ...]}``.
    num_desired_frames: Number of frames every clip is sampled/looped to.
    transform: Callable applied to the (time, height, width, channel) array.
  """

  def __init__(self, task, video_root_dir, json_file, num_desired_frames, transform):
    super().__init__()
    self.task = task
    self.num_desired_frames = num_desired_frames
    # num_class must be known before extract_info_from_json builds the labels.
    self.num_class = self.get_num_class(json_file)
    self.info = self.extract_info_from_json(json_file, video_root_dir)
    self.transform = transform

  def get_num_class(self, json_file):
    """Return the number of distinct class ids across ALL subsets of the split file."""
    with open(json_file, 'r') as f:
      data = json.load(f)
    return len({entry['action'][0] for entry in data.values()})

  def __getitem__(self, idx):
    vid_path, label, start = self.info[idx]
    frames = torch.from_numpy(self.extract_frames(vid_path, start))
    return frames, torch.from_numpy(label)

  def __len__(self):
    return len(self.info)

  def extract_info_from_json(self, split_file, video_root_dir):
    """Collect ``[video_path, one_hot_label, start_frame]`` for this subset.

    Entries whose subset differs from ``self.task`` or whose video file is not
    present in ``video_root_dir`` are skipped (some videos may be missing).
    """
    # os.path.join makes the pattern work with or without a trailing slash;
    # a set gives O(1) membership tests instead of scanning a list per entry.
    available_ids = {
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(os.path.join(video_root_dir, '*.mp4'))
    }
    with open(split_file, 'r') as f:
      data = json.load(f)

    combined_info = []
    for vid_id, entry in data.items():
      if entry['subset'] != self.task:  # train, val, test?
        continue
      if vid_id not in available_ids:
        continue
      class_id = entry['action'][0]
      start = entry['action'][1]
      # one-hot column vector, e.g. [0,0,0,...,1,0,...]
      label = np.zeros((self.num_class, 1))
      label[class_id] = 1
      vid_path = os.path.join(video_root_dir, vid_id + '.mp4')
      combined_info.append([vid_path, label, start])  # [/videos/00623.mp4, [0,0,1,0,...], 1]
    return combined_info

  @staticmethod
  def _preprocess_frame(frame):
    """Resize one decoded frame and scale its pixel values to [-1, 1]."""
    height, width = frame.shape[:2]  # OpenCV frames are (height, width, channel)
    if height < 226 or width < 226:
      # Upscale so that the shorter side becomes 226 pixels.
      scale = 1 + (226. - min(height, width)) / min(height, width)
      frame = cv2.resize(frame, dsize=(0, 0), fx=scale, fy=scale)
    # As in the code this was adapted from, the check uses the ORIGINAL
    # dimensions and the frame is squashed to 256x256 (aspect ratio not kept).
    if height > 256 or width > 256:
      frame = cv2.resize(frame, (256, 256))
    return (frame / 255.) * 2 - 1

  def extract_frames(self, video_filepath, start_frame):
    """Decode, sample, pad and transform one clip.

    Starting at ``start_frame``, every ``stepsize``-th frame is kept (with
    ``stepsize = total_frames // num_desired_frames``, at least 1) until
    ``num_desired_frames`` frames are collected or the video ends. A clip that
    comes out shorter is looped from its beginning until it is long enough.

    Returns:
      Array stacked as (3, time, height, width) after ``self.transform``.

    Raises:
      RuntimeError: if not a single frame could be decoded.
    """
    vidcap = cv2.VideoCapture(video_filepath)
    frames = []
    try:
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
      vidcap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
      stepsize = max(total_frames // self.num_desired_frames, 1)

      for offset in range(max(total_frames - int(start_frame), 0)):
        if len(frames) == self.num_desired_frames:
          break
        if offset % stepsize:
          # Skipped frame: grab() advances the stream without decoding it.
          if not vidcap.grab():
            break
          continue
        ret, frame = vidcap.read()
        if not ret:
          print("Can't receive frame (stream end?). Exiting ...")
          break
        frames.append(self._preprocess_frame(frame))
    finally:
      # Always free the decoder handle, even if a frame fails to process.
      vidcap.release()

    if not frames:
      raise RuntimeError('No frames could be read from ' + str(video_filepath))

    frames = np.asarray(frames, dtype=np.float32)

    # If shorter than requested, loop the clip and cut to the exact length.
    if len(frames) < self.num_desired_frames:
      repeats = math.ceil(self.num_desired_frames / len(frames))
      frames = np.concatenate([frames] * repeats, axis=0)[:self.num_desired_frames]

    transformed = self.transform(frames)
    # (time, height, width, channel) -> (channel, time, height, width)
    return np.stack([transformed[:, :, :, ch] for ch in range(3)], axis=0)

import numbers
import random

class RandomCrop(object):
    """Cut the same randomly placed spatial window out of every frame of a clip.

    The clip is indexed as (t, h, w, c); only the h and w axes are cropped.

    Args:
        size (sequence or int): Crop size. A single number produces a square
            (size, size) window; any other value is stored unchanged and is
            later unpacked as (height, width).
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    @staticmethod
    def get_params(img, output_size):
        """Pick the top-left corner of a random crop window.

        Args:
            img: Clip whose ``shape`` unpacks as (t, h, w, c).
            output_size (tuple): Requested (height, width) of the window.
        Returns:
            tuple: (top, left, height, width) describing the window. When the
            clip already has the requested size the full frame is returned and
            no random number is drawn.
        """
        _, height, width, _ = img.shape
        crop_h, crop_w = output_size
        if (height, width) == (crop_h, crop_w):
            return 0, 0, height, width

        # An axis that already matches is pinned to offset 0 (no draw for it).
        top = 0 if height == crop_h else random.randint(0, height - crop_h)
        left = 0 if width == crop_w else random.randint(0, width - crop_w)
        return top, left, crop_h, crop_w

    def __call__(self, imgs):
        top, left, crop_h, crop_w = self.get_params(imgs, self.size)
        return imgs[:, top:top + crop_h, left:left + crop_w, :]

    def __repr__(self):
        return '{}(size={})'.format(type(self).__name__, self.size)

class CenterCrop(object):
    """Cut the central spatial window out of every frame of a clip.

    The clip is indexed as (t, h, w, c); only the h and w axes are cropped.

    Args:
        size (sequence or int): Crop size. A single number produces a square
            (size, size) window; any other value is stored unchanged and is
            later unpacked as (height, width).
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, imgs):
        """
        Args:
            imgs: Clip whose ``shape`` unpacks as (t, h, w, c).
        Returns:
            The clip restricted to the centred (height, width) window.
        """
        _, height, width, _ = imgs.shape
        crop_h, crop_w = self.size
        # Half of the leftover margin on each axis, rounded with np.round.
        top, left = (
            int(np.round((full - part) / 2.))
            for full, part in ((height, crop_h), (width, crop_w))
        )
        return imgs[:, top:top + crop_h, left:left + crop_w, :]

    def __repr__(self):
        return '{}(size={})'.format(type(self).__name__, self.size)


class RandomHorizontalFlip(object):
    """Mirror a whole clip left-to-right with a given probability.

    Args:
        p (float): probability that the clip is flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, imgs):
        """
        Args:
            imgs: Clip array; axis 2 is the one that gets reversed.
        Returns:
            A flipped copy of the clip, or the input object itself when no
            flip is drawn.
        """
        should_flip = random.random() < self.p
        # .copy() turns the reversed view into a standalone array.
        return np.flip(imgs, axis=2).copy() if should_flip else imgs

    def __repr__(self):
        return '{}(p={})'.format(type(self).__name__, self.p)

In [None]:
# Split definition and video location, relative to the WLASL working directory.
split_file = './splits/nslt_100.json'
video_dir = './videos/'
# The same centre crop is used for every subset (no random augmentation here).
transform = CenterCrop(224)
# Number of frames each clip is sampled/looped to.
num_desired_frame = 10
# One dataset per subset; the first argument is matched against the 'subset'
# field of each entry in the split file.
train = VideoDataset('train', video_dir, split_file, num_desired_frame, transform)
val = VideoDataset('val', video_dir, split_file, num_desired_frame, transform)
test = VideoDataset('test', video_dir, split_file, num_desired_frame, transform)

['66538', '53428', '53429', '53431', '53434', '53552', '53544', '53545', '53546', '53547', '53548', '54080', '54081', '54082', '54083', '54074', '54087', '54076', '54077', '54213', '66551', '54205', '54206', '54207', '54209', '54886', '66560', '54878', '54879', '54880', '54882', '54883', '55003', '66561', '54995', '54998', '54999', '55000', '69491', '55077', '66564', '55070', '55071', '55072', '55074', '55157', '55150', '55151', '55152', '55154', '55184', '66570', '55176', '55177', '55178', '55179', '55181', '55182', '55206', '66571', '55198', '55199', '55200', '55202', '69493', '66572', '55276', '55286', '55277', '55278', '55279', '55283', '55658', '55668', '55659', '55660', '55661', '55663', '55664', '69495', '55738', '55731', '55732', '55733', '55735', '56167', '56158', '56159', '56160', '56161', '56163', '56305', '56295', '56296', '56297', '56298', '56300', '57139', '57141', '57132', '57133', '57134', '57136', '57137', '57138', '57677', '57670', '57671', '57765', '57768', '57758', 

# I3D (With Inception Model)
Original Paper: https://arxiv.org/pdf/1705.07750.pdf <br>
Github Implementation: https://github.com/deepmind/kinetics-i3d/blob/master/i3d.py

## Attempt \#1
Use a model pretrained on `ImageNet` and fine-tuned on `Kinetics` (instead of training from scratch) to get better results, then do a final fine-tune on `WLASL`.

---

Useful Resource:
* An example of finetuning using `UCF101`: https://github.com/USTC-Video-Understanding/I3D_Finetune

## Attempt \#2
Implement I3D model from scratch, and train from scratch using `WLASL`

**NOTE:** the original implementation used `snt.AbstractModule`; to stay consistent with the rest of our code, I will implement it using `nn.Module`. <br>
**NOTE:** original implementation also used `end_points` to allow building up to a specific layer - not sure what that's used for, so I'm ignoring it for now.

---

**Useful Resource:**
* An implementation of I3D using `Pytorch`: https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py

---

**TODO:** <br>
* Implement Training

### The Model

In [None]:
#################################################################################################
# This section is copied from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py#
#################################################################################################
class MaxPool3dSamePadding(nn.MaxPool3d):
  """nn.MaxPool3d that zero-pads its input so the output is ceil(size / stride)."""

  def compute_pad(self, dim, s):
    """Total padding needed along axis ``dim`` (0=t, 1=h, 2=w) for extent ``s``."""
    stride = self.stride[dim]
    remainder = s % stride
    # When the extent divides evenly the last window overlaps by a full stride,
    # otherwise only by the remainder.
    covered = stride if remainder == 0 else remainder
    return max(self.kernel_size[dim] - covered, 0)

  def forward(self, x):
    # x is unpacked as (batch, channel, t, h, w)
    _, _, t, h, w = x.size()

    # F.pad takes (front, back) pairs starting from the LAST dimension, so the
    # axes are visited as w, h, t. Any odd leftover goes to the back.
    pad = []
    for dim, extent in ((2, w), (1, h), (0, t)):
      total = self.compute_pad(dim, extent)
      front = total // 2
      pad.extend((front, total - front))

    x = F.pad(x, tuple(pad))
    return super().forward(x)
#################################################################################################
#                                          End Copy                                             #
################################################################################################# 

In [None]:
#################################################################################################
# This section is copied from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py#
#################################################################################################
class Conv3dSamePadding(nn.Conv3d):
  """nn.Conv3d that zero-pads its input so the output is ceil(size / stride)."""

  def compute_pad(self, dim, s):
    """Total padding needed along axis ``dim`` (0=t, 1=h, 2=w) for extent ``s``."""
    stride = self.stride[dim]
    remainder = s % stride
    # When the extent divides evenly the last window overlaps by a full stride,
    # otherwise only by the remainder.
    covered = stride if remainder == 0 else remainder
    return max(self.kernel_size[dim] - covered, 0)

  def forward(self, x):
    # x is unpacked as (batch, channel, t, h, w)
    _, _, t, h, w = x.size()

    # F.pad takes (front, back) pairs starting from the LAST dimension, so the
    # axes are visited as w, h, t. Any odd leftover goes to the back.
    pad = []
    for dim, extent in ((2, w), (1, h), (0, t)):
      total = self.compute_pad(dim, extent)
      front = total // 2
      pad.extend((front, total - front))

    x = F.pad(x, tuple(pad))
    return super().forward(x)
#################################################################################################
#                                          End Copy                                             #
################################################################################################# 

In [None]:
# This is Conv3D + Same Padding + ReLu
class UnitConv3D(nn.Module):
  """Conv3dSamePadding followed by ReLU.

  Args:
    in_channels: Channels of the input volume.
    out_channels: Channels produced by the convolution.
    kernel_size: (t, h, w) kernel size.
    stride: (t, h, w) stride.
    final: When True the block is a bare convolution (no batch-norm layer is
      created and no ReLU is applied), for use as the classification head.
  """

  def __init__(self, in_channels, out_channels, kernel_size=(1,1,1), stride=(1,1,1), final=False):
    super().__init__()
    self._in_channels = in_channels
    self._output_channels = out_channels
    self._kernel_size = kernel_size
    self._stride = stride
    self.final = final

    self.Conv3d = Conv3dSamePadding(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=0)

    ########### Can Tune Here ############
    # The layer is registered (so its parameters are part of the model) but
    # forward() currently does not apply it.
    if not final:
      self.batchnorm = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)
    ######################################

  def forward(self, x):
    # Follow the device the weights live on instead of a notebook-level
    # `device` variable, so the module works wherever the model was moved.
    x = x.to(self.Conv3d.weight.device)
    x = self.Conv3d(x)
    if not self.final:
      #x = self.batchnorm(x)
      x = F.relu(x)
    return x

In [None]:
class Inception(nn.Module):
  """Four-branch 3D Inception block whose branch outputs are concatenated on the channel axis.

  Args:
    in_channels: Channels of the input volume.
    out_channels: Six channel counts, in order:
      [0] branch 1 1x1 conv,
      [1] branch 2 1x1 conv, [2] branch 2 3x3 conv,
      [3] branch 3 1x1 conv, [4] branch 3 3x3 conv,
      [5] branch 4 1x1 conv after max-pooling.
      The block outputs out_channels[0] + [2] + [4] + [5] channels.
  """

  def __init__(self, in_channels, out_channels):
    super().__init__()
    self.out_channels = out_channels

    self.Conv_1a_1x1 = UnitConv3D(in_channels=in_channels, out_channels=out_channels[0])
    self.Conv_2a_1x1 = UnitConv3D(in_channels=in_channels, out_channels=out_channels[1])
    self.Conv_2b_3x3 = UnitConv3D(in_channels=out_channels[1], out_channels=out_channels[2], kernel_size=[3,3,3])
    self.Conv_3a_1x1 = UnitConv3D(in_channels=in_channels, out_channels=out_channels[3])
    self.Conv_3b_3x3 = UnitConv3D(in_channels=out_channels[3], out_channels=out_channels[4], kernel_size=[3,3,3])
    self.Maxpool_4a_3x3 = MaxPool3dSamePadding(kernel_size=[3,3,3], stride=[1,1,1], padding=0)
    self.Conv_4b_1x1 = UnitConv3D(in_channels=in_channels, out_channels=out_channels[5])

  def forward(self, x):
    # Follow the device the weights live on instead of a notebook-level
    # `device` variable.
    x = x.to(next(self.parameters()).device)
    branch1 = self.Conv_1a_1x1(x)
    branch2 = self.Conv_2b_3x3(self.Conv_2a_1x1(x))
    branch3 = self.Conv_3b_3x3(self.Conv_3a_1x1(x))
    branch4 = self.Conv_4b_1x1(self.Maxpool_4a_3x3(x))
    # All branches use stride 1 with 'same' padding, so only channels differ.
    return torch.cat([branch1, branch2, branch3, branch4], dim=1)

In [None]:
from torch.nn.modules.pooling import MaxPool3d
class I3D(nn.Module):
  """Inflated 3D Inception network for clip classification.

  Args:
    num_class: Number of output classes (channels of the final 1x1x1 conv).
    drop_out_prob: Probability for the dropout layer. The layer is created but
      forward() currently does not apply it.

  forward() takes a (batch, 3, t, h, w) volume and returns the logits with the
  two trailing spatial axes squeezed away, i.e. (batch, num_class, t').
  NOTE(review): the [2,7,7] average pool presumably expects 224x224 frames and
  a temporal extent of at least 2 at that point - confirm against the input
  size actually used.
  """

  def __init__(self, num_class=100, drop_out_prob=0.5):
    super().__init__()
    self.num_class = num_class

    self.Conv_1a_7x7 = UnitConv3D(in_channels=3, out_channels=64, kernel_size=[7,7,7], stride=[2,2,2])
    self.Maxpool_2a_3x3 = MaxPool3dSamePadding(kernel_size=[1,3,3], stride=[1,2,2])
    self.Conv_2b_1x1 = UnitConv3D(in_channels=64, out_channels=64)
    self.Conv_2c_3x3 = UnitConv3D(in_channels=64, out_channels=192, kernel_size=[3,3,3])
    self.Maxpool_3a_3x3 = MaxPool3dSamePadding(kernel_size=[1,3,3], stride=[1,2,2])
    # Each Inception block outputs out_channels[0] + [2] + [4] + [5] channels,
    # which is the in_channels of the block that follows it.
    self.Inc_3b = Inception(in_channels=192, out_channels=[64,96,128,16,32,32])
    self.Inc_3c = Inception(in_channels=256, out_channels=[128,128,192,32,96,64])
    self.Maxpool_4a_3x3 = MaxPool3dSamePadding(kernel_size=[3,3,3], stride=[2,2,2])
    self.Inc_4b = Inception(in_channels=480, out_channels=[192,96,208,16,48,64])
    self.Inc_4c = Inception(in_channels=512, out_channels=[160,112,224,24,64,64])
    self.Inc_4d = Inception(in_channels=512, out_channels=[128,128,256,24,64,64])
    self.Inc_4e = Inception(in_channels=512, out_channels=[112,144,288,32,64,64])
    self.Inc_4f = Inception(in_channels=528, out_channels=[256,160,320,32,128,128])
    self.Maxpool_5a_2x2 = MaxPool3dSamePadding(kernel_size=[2,2,2], stride=[2,2,2])
    self.Inc_5b = Inception(in_channels=832, out_channels=[256,160,320,32,128,128])
    self.Inc_5c = Inception(in_channels=832, out_channels=[384,192,384,48,128,128])
    self.Avgpool = nn.AvgPool3d(kernel_size=[2,7,7], stride=[1,1,1])
    self.Drop = nn.Dropout(drop_out_prob)
    self.Final = UnitConv3D(in_channels=1024, out_channels=num_class, final=True)

  def forward(self,x):
    # Follow the device the weights live on instead of a notebook-level
    # `device` variable.
    x = x.to(next(self.parameters()).device)
    x = self.Conv_1a_7x7(x)
    x = self.Maxpool_2a_3x3(x)
    x = self.Conv_2b_1x1(x)
    x = self.Conv_2c_3x3(x)
    x = self.Maxpool_3a_3x3(x)
    x = self.Inc_3b(x)
    x = self.Inc_3c(x)
    x = self.Maxpool_4a_3x3(x)
    x = self.Inc_4b(x)
    x = self.Inc_4c(x)
    x = self.Inc_4d(x)
    x = self.Inc_4e(x)
    x = self.Inc_4f(x)
    x = self.Maxpool_5a_2x2(x)
    x = self.Inc_5b(x)
    x = self.Inc_5c(x)
    x = self.Avgpool(x)
    #x = self.Drop(x)
    x = self.Final(x)
    # Drop the h and w axes (after the first squeeze, w sits at index 3).
    result = x.squeeze(3).squeeze(3)
    return result

In [None]:
# Use the GPU when one is visible; fall back to the CPU so the notebook still
# runs on a runtime without CUDA instead of failing at model creation.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = I3D().to(device)
criterion = nn.CrossEntropyLoss().to(device) # Specify the loss layer
#summary(model, (3, 10,224,224),device=device)

# Training

In [None]:
############### Can Tune Here #################
learning_rate = 1e-2
weight_decay = 1e-7
epoch = 500
batch_size = 32
# weight_decay was defined above but never reached the optimizer; pass it
# through so tuning the value actually has an effect.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
################################################

In [None]:
# Only the training set needs a fresh random order each epoch; validation and
# test are iterated in a fixed order so evaluation runs are reproducible.
trainloader = DataLoader(train, batch_size=batch_size, shuffle=True)
valloader = DataLoader(val, batch_size=batch_size, shuffle=False)
testloader = DataLoader(test, batch_size=batch_size, shuffle=False)

In [None]:
from tqdm import tqdm

In [None]:
%%time
def trainer(model, trainloader, valloader, num_epoch=10):  # Train the model
    """Train ``model`` and record loss / accuracy after every epoch.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Network to optimise.
        trainloader: Loader yielding (batch, label) pairs for training.
        valloader: Loader yielding (batch, label) pairs for validation.
        num_epoch: Number of passes over ``trainloader``.
    Returns:
        (trn_loss_hist, trn_acc_hist, val_acc_hist): three lists with one
        entry per epoch - mean training loss, training accuracy and
        validation accuracy.
    """
    print("Start training...")
    trn_loss_hist = []
    trn_acc_hist = []
    val_acc_hist = []
    for i in range(num_epoch):
        # evaluate() switches the model to eval mode at the end of every epoch,
        # so training mode must be restored here and not just once up front.
        model.train()
        running_loss = []
        print('-----------------Epoch = %d-----------------' % (i+1))
        for batch, label in tqdm(trainloader):
            batch = batch.to(device)
            label = label.to(device)
            # Labels are handed to the criterion as loaded (not converted to
            # class indices).
            #label = torch.argmax(label, dim=1)
            optimizer.zero_grad()  # Clear gradients from the previous iteration
            pred = model(batch)  # Calls the model's forward()
            loss = criterion(pred, label)  # Calculate the loss
            running_loss.append(loss.item())
            loss.backward()  # Backprop gradients to all tensors in the network
            optimizer.step()  # Update trainable weights
        epoch_loss = np.mean(running_loss)
        print("\n Epoch {} loss:{}".format(i+1, epoch_loss))

        # Keep track of training loss, training accuracy and validation accuracy
        trn_loss_hist.append(epoch_loss)
        trn_acc_hist.append(evaluate(model, trainloader))
        print("\n Evaluate on validation set...")
        val_acc_hist.append(evaluate(model, valloader))
    print("Done!")
    return trn_loss_hist, trn_acc_hist, val_acc_hist

def evaluate(model, loader):  # Evaluate accuracy on validation / test set
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Both the label and the prediction are reduced with argmax over dim 1 and
    compared element-wise. The model is left in evaluation mode. Uses the
    notebook-level ``device``.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    with torch.no_grad():  # No gradients needed, which speeds up computation
        for batch, label in tqdm(loader):
            target = torch.argmax(label.to(device), dim=1).cpu().detach().numpy()
            guess = torch.argmax(model(batch.to(device)), dim=1).cpu().detach().numpy()
            # 1.0 for every matching position, 0.0 otherwise
            correct += np.where(guess == target, 1., 0.).sum()
        acc = correct / len(loader.dataset)
        print("\n Evaluation accuracy: {}".format(acc))
        return acc

# Run the full training loop for `epoch` epochs and keep the per-epoch
# histories for plotting.
trn_loss_hist, trn_acc_hist, val_acc_hist = trainer(model, trainloader, valloader, epoch)

NameError: ignored

In [None]:
import matplotlib.pyplot as plt
# visualize the training / validation accuracies
# Index by the number of epochs actually recorded rather than the configured
# `epoch`, so the plot cannot fail on a length mismatch.
x = np.arange(len(trn_acc_hist))
# train/val accuracies for the I3D model
plt.figure()
plt.plot(x, trn_acc_hist)
plt.plot(x, val_acc_hist)
plt.legend(['Training', 'Validation'])
# Show roughly ten tick labels; one per epoch is unreadable for long runs.
plt.xticks(x[::max(len(x) // 10, 1)])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('WLASL - I3D')
plt.gcf().set_size_inches(10, 5)
plt.savefig('part1.png', dpi=300)
plt.show()