In [1]:
!pip install pytorchvideo

Collecting pytorchvideo
  Downloading pytorchvideo-0.1.5.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 KB[0m [31m966.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting fvcore
  Downloading fvcore-0.1.5.post20220512.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting av
  Downloading av-9.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.2/28.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting parameterized
  Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Collecting iopath
  Downloading iopath-0.1.9-py3-none-any.whl (27 kB)
Building wheels for collected packages: pytorchvideo, fvcore
  Building wheel for

In [2]:
import torch
import numpy as np
from torch.utils.data import (
    Dataset,
    DataLoader,
) 
import pickle

import os

import math
import torch.nn as nn
import torch.nn.functional as F

from torchvision.transforms import Compose, Lambda, Grayscale,Normalize
from torchvision.transforms._transforms_video import CenterCropVideo
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    UniformTemporalSubsample,
)

from tqdm import tqdm

from collections import OrderedDict

import torch.optim as optim



class SignDataset(Dataset):
    """Map-style dataset that pairs pre-computed samples with their labels.

    ``x`` and ``y`` are kept by reference (nothing is copied). Item ``i`` is
    the tuple ``(x[i], y[i])`` and the dataset length is ``len(y)``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The label sequence alone decides how many items the dataset reports.
        return len(self.y)

    def __getitem__(self, index):
        sample = self.x[index]
        label = self.y[index]
        return sample, label
        

  "The _functional_video module is deprecated. Please use the functional module instead."
  "The _transforms_video module is deprecated. Please use the transforms module instead."


In [3]:
# Notebook-wide constants (names suggest clip size, channel count, frames per clip
# and number of classes; NUM_FRAMES is the temporal subsample size in transform_data).
IMAGE_HEIGHT, IMAGE_WIDTH = 720, 800
IMAGE_CHANNEL = 1
NUM_FRAMES = 25
NUM_CLASSES = 60

# Module-level sample/label lists populated by load_dataloader().
inputs = list()   # x
classes = list()  # y

def transform_data(x, mean, std):
    """Subsample, rescale, normalise and centre-crop the clip stored under ``x["video"]``.

    Args:
        x: Mapping with a ``"video"`` entry. The permutes below treat it as
            (frames, channel, height, width) on entry.
        mean: Mean handed to ``Normalize``; it is applied after the ``/255``
            rescaling, so it must be expressed on that scale.
        std: Standard deviation handed to ``Normalize`` (same scale as ``mean``).

    Returns:
        The value returned by ``ApplyTransformToKey`` for ``x``; the caller
        reads the transformed clip back from its ``"video"`` entry, laid out
        as (channel, frames, height, width) after the final permute.
    """

    def swap_first_two_axes(clip):
        # (frames, channel, H, W) <-> (channel, frames, H, W)
        return clip.permute(1, 0, 2, 3)

    transform = ApplyTransformToKey(
        key="video",
        transform=Compose(
            [
                # (frames, channel, H, W) -> (channel, frames, H, W) so the
                # temporal subsampler sees frames on its default temporal axis.
                Lambda(swap_first_two_axes),
                UniformTemporalSubsample(NUM_FRAMES),
                # Back to (frames, channel, H, W) for the per-channel Normalize.
                Lambda(swap_first_two_axes),
                Lambda(lambda clip: clip / 255.0),
                Normalize((mean,), (std,)),
                # Crop size comes from the notebook constants instead of a
                # second hard-coded copy of 720x800.
                CenterCropVideo([IMAGE_HEIGHT, IMAGE_WIDTH]),
                # Final layout: (channel, frames, H, W).
                Lambda(swap_first_two_axes),
            ]
        ),
    )

    return transform(x)



    

def get_data_info(f):
    """Lazily yield every line of ``f`` split on commas.

    ``f`` may be any iterable of strings. Lines are not stripped, so a
    trailing newline stays attached to the last field of each record.
    """
    yield from (record.split(',') for record in f)
        


def load_dataloader(batch_size, file_slice=slice(21, 24)):
    """Decode the annotated clips of a few videos and wrap them in a shuffled ``DataLoader``.

    Each call builds fresh sample/label lists, so calling it repeatedly does
    not accumulate clips across calls. The notebook-level ``inputs`` and
    ``classes`` lists are overwritten in place with the same contents so cells
    that inspect them keep working.

    Args:
        batch_size: Batch size handed to ``DataLoader``.
        file_slice: Slice applied to the list of annotation keys (one key per
            video file). Defaults to ``slice(21, 24)``; only a few files are
            processed per call because of memory limitations.

    Returns:
        A ``DataLoader`` over ``SignDataset(samples, labels)`` created with
        ``shuffle=True`` and ``num_workers=1``.
    """
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only point this at a trusted annotation file.
    with open('../input/signdataset/sign/MSSL_Train_Set/TRAIN/MSSL_TRAIN_SET_GT.pkl', 'rb') as f:
        data = pickle.load(f)

    samples = []  # transformed clip tensors (x)
    labels = []   # first field of each annotation (y)

    # keys are files so iterate only limited files due to memory limitations.
    for filename in list(data.keys())[file_slice]:
        print("file", filename)
        video = EncodedVideo.from_path("../input/signdataset/sign/MSSL_Train_Set/TRAIN/MSSL_TRAIN_SET_VIDEOS_ELAN/"+filename+".mp4")
        try:
            for annotation in data[filename]:
                label, start_time, end_time = annotation[0], annotation[1], annotation[2]

                # Annotation times are divided by 1000 before being passed as seconds.
                video_data = video.get_clip(
                    start_sec=float(start_time)/1000.0,
                    end_sec=float(end_time)/1000.0,
                )

                # Swap the first two axes before the grayscale conversion
                # (presumably (C, T, H, W) -> (T, C, H, W); confirm against get_clip).
                video_data["video"] = Grayscale(num_output_channels=1)(
                    video_data["video"].permute(1, 0, 2, 3)
                )

                # Per-clip statistics, divided by 255 to match the x/255
                # rescaling that transform_data applies before Normalize.
                std, mean = torch.std_mean(video_data["video"])
                std = std/255.0
                mean = mean/255.0

                video_data = transform_data(video_data, mean, std)

                # Append sample and label together so the two lists stay aligned.
                samples.append(video_data["video"])
                labels.append(label)
        finally:
            # Release the underlying video container even if a clip fails.
            video.close()

    # Mirror the result into the module-level lists without rebinding them.
    inputs[:] = samples
    classes[:] = labels

    signds = SignDataset(samples, labels)
    dataloader = DataLoader(signds, batch_size=batch_size, shuffle=True, num_workers=1)

    return dataloader
        



In [4]:
class conv_3d(nn.Module):
    """3D convolutional front end made of two Conv3d -> ReLU -> MaxPool3d stages.

    ``forward`` folds the channel and depth axes of the 5-D feature map into
    one axis, so the result is 4-D: (batch, channels * depth, height, width).
    """

    def __init__(self):
        super().__init__()

        def stage(in_channels, out_channels, kernel_size):
            # One conv stage; every stage halves depth, height and width.
            return nn.Sequential(
                nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, padding=1),
                nn.ReLU(),
                nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)),
            )

        # Attribute names are part of the state_dict keys; keep them as-is.
        self.conv1 = stage(1, 64, 5)
        self.conv2 = stage(64, 128, 3)
        self._features = nn.Sequential(self.conv1, self.conv2)

    def forward(self, x):
        feats = self._features(x)
        dims = feats.shape
        merged_channels = dims[1] * dims[2]
        return feats.reshape(dims[0], merged_channels, dims[3], dims[4])

class conv_2d(nn.Module):
    """2D convolutional stack: three Conv2d -> ReLU -> MaxPool2d stages, then flatten.

    Expects a 640-channel input and narrows the channels 640 -> 640 -> 512 -> 256.
    ``forward`` returns a 2-D tensor of shape (batch, features).
    """

    def __init__(self):
        super().__init__()

        def stage(in_channels, out_channels):
            # 3x3 convolution that keeps the spatial size, followed by a 2x2 pool.
            return nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
            )

        # Attribute names are part of the state_dict keys; keep them as-is.
        self.conv1 = stage(640, 640)
        self.conv2 = stage(640, 512)
        self.conv3 = stage(512, 256)

    def forward(self, x):
        feats = x
        for block in (self.conv1, self.conv2, self.conv3):
            feats = block(feats)
        # Flatten everything except the batch axis.
        return feats.view(feats.shape[0], -1)


  

In [5]:



n_classes = 60

# Three-stage network: 3D conv front end -> 2D conv stack -> fully connected head.
# The stage names ('frontend', 'mid', 'fc') are part of the checkpoint keys.
model = nn.Sequential(OrderedDict([
    ('frontend', conv_3d()),
    ('mid', conv_2d()),
    ('fc', nn.Sequential(
        nn.Dropout(p=0.4),
        # 135168 = 256 channels * 22 * 24, the flattened conv_2d output for a
        # 25-frame 720x800 single-channel clip.
        nn.Linear(135168, 1024),
        nn.Linear(1024, 256),
        nn.Linear(256, n_classes),
    )),
]))

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
# specify optimizer and learning rate (one parameter group per stage)
optimizer = optim.SGD(
    [
        {"params": model.fc.parameters(), "lr": 1e-3},
        {"params": model.mid.parameters(), "lr": 1e-5},
        {"params": model.frontend.parameters(), "lr": 1e-4},
    ],
    momentum=0.9,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
state = torch.load("../input/sign-classification/model_optimizer.pt", map_location=device)
model.load_state_dict(state['model_state_dict'])
model.half()
# Follow the selected device instead of unconditionally requiring CUDA.
model.to(device)

# Restored after the model has been converted and moved, so the optimizer
# state is loaded against the parameters in their final dtype and device.
optimizer.load_state_dict(state['optimizer_state_dict'])


def train(model, device, train_loader, optimizer, criterion, epoch):
    """Train ``model`` for ``epoch`` epochs, print per-epoch metrics, then save a checkpoint.

    Args:
        model: Network to optimise; it is put into training mode.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` tensor batches,
            with ``target`` holding one class index per sample.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(output, target)``.
        epoch: Number of passes over ``train_loader``.

    Returns:
        None. After the last epoch the model and optimizer state dicts are
        written to ``./model_optimizer.pt``.
    """
    model.train()

    # Feed inputs in the dtype of the model's weights (float16 for a model
    # converted with ``.half()``); fall back to float16 if there are none.
    input_dtype = next(
        (p.dtype for p in model.parameters() if p.is_floating_point()),
        torch.float16,
    )
    # A mean-reduced loss is a per-sample average of the batch, so it has to
    # be weighted by the batch size before averaging over the whole epoch.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"

    for epoch_idx in range(epoch):
        correct = 0
        num_samples = 0
        train_loss = 0.0

        for data, target in train_loader:
            data = data.to(device=device, dtype=input_dtype)
            target = target.to(device)

            optimizer.zero_grad()
            output = model(data)

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            batch_size = target.shape[0]
            train_loss += loss.item() * (batch_size if mean_reduced else 1)

            # Index of the highest score per sample; compared element-wise so
            # any batch size is counted correctly.
            pred = output.argmax(dim=1)
            num_samples += batch_size
            correct += int((pred == target).sum().item())

        # Avoid a ZeroDivisionError when the loader yields no batches.
        denom = max(num_samples, 1)
        train_loss /= denom

        print('Epoch: {} , Training Accuracy: {}/{} ({:.0f}%) Training Loss: {:.6f}'.format(
                epoch_idx, correct, num_samples,
                100. * correct / denom, train_loss))

    torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, "./model_optimizer.pt")
        


In [6]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the training loader with one clip per batch.
dataloader = load_dataloader(batch_size=1)


file p01_n105
file p04_n058
file p05_n043


In [7]:
# Run 15 epochs; train() prints one metrics line per epoch and saves the
# checkpoint to ./model_optimizer.pt after the last one.
train(
    model,
    device,
    dataloader,
    optimizer,
    criterion,
    15,
)

Epoch: 0 , Training Accuracy: 19/52 (37%) Training Loss: 3.603780
Epoch: 1 , Training Accuracy: 31/52 (60%) Training Loss: 1.657309
Epoch: 2 , Training Accuracy: 44/52 (85%) Training Loss: 0.665447
Epoch: 3 , Training Accuracy: 51/52 (98%) Training Loss: 0.224948
Epoch: 4 , Training Accuracy: 52/52 (100%) Training Loss: 0.059373
Epoch: 5 , Training Accuracy: 52/52 (100%) Training Loss: 0.035389
Epoch: 6 , Training Accuracy: 51/52 (98%) Training Loss: 0.091478
Epoch: 7 , Training Accuracy: 50/52 (96%) Training Loss: 0.237341
Epoch: 8 , Training Accuracy: 52/52 (100%) Training Loss: 0.037009
Epoch: 9 , Training Accuracy: 52/52 (100%) Training Loss: 0.023361
Epoch: 10 , Training Accuracy: 52/52 (100%) Training Loss: 0.019220
Epoch: 11 , Training Accuracy: 52/52 (100%) Training Loss: 0.019055
Epoch: 12 , Training Accuracy: 52/52 (100%) Training Loss: 0.014066
Epoch: 13 , Training Accuracy: 52/52 (100%) Training Loss: 0.013768
Epoch: 14 , Training Accuracy: 52/52 (100%) Training Loss: 0.013

In [8]:
# Total number of scalar values across all of the model's parameter tensors.
total_params = sum(p.numel() for p in model.parameters())
print('Number of model parameters: {}'.format(total_params))

Number of model parameters: 146736828
