In [1]:
import copy
import os

import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.models as models
from PIL import Image
from torch import nn
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import Dataset
from tqdm.autonotebook import tqdm



In [2]:
import pandas as pd
# Load the per-clip annotation table (one row per clip: ClipID plus the
# Boredom / Engagement / Confusion / Frustration columns) and preview it.
train_label = pd.read_csv(filepath_or_buffer='./TrainLabels.csv')
train_label.head(n=10)

Unnamed: 0,ClipID,Boredom,Engagement,Confusion,Frustration
0,1100011002.avi,0,2,0,0
1,1100011003.avi,0,2,0,0
2,1100011004.avi,0,3,0,0
3,1100011005.avi,0,3,0,0
4,1100011006.avi,0,3,0,0
5,1100011007.avi,1,2,0,0
6,1100011008.avi,0,3,0,0
7,1100011009.avi,0,2,1,0
8,1100011010.avi,0,3,0,0
9,1100011011.avi,0,3,0,0


In [3]:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import os
from tqdm.autonotebook import tqdm
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
# Side length (pixels) every frame is resized to before entering the network.
im_size = 224

# Per-frame preprocessing: ndarray -> PIL -> fixed-size -> float tensor in
# [0, 1] -> channel-wise standardisation. The Normalize step uses the ImageNet
# statistics that ImageNet-pretrained backbones were trained with; without it
# the pretrained features see inputs from a different distribution.
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((im_size, im_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [4]:
class video_dataset(Dataset):
    """Dataset yielding a stacked sequence of frames and an engagement label per clip.

    For a row whose ``ClipID`` is ``<id>.avi`` the frames are read from
    ``<frame_dir>/<first 6 characters of ClipID>/<id>/<k>.jpg`` for
    ``k = 0, frame_step, 2 * frame_step, ...`` while ``k < max_frame``.

    Args:
        frame_dir: Root directory containing the extracted frames.
        train_csv: pandas DataFrame with ``ClipID`` and ``Engagement`` columns.
        sequencelength: Accepted for backward compatibility; currently unused.
        skip_length: Accepted for backward compatibility; currently unused.
        transform: Optional callable applied to every RGB frame (an
            ``H x W x 3`` uint8 ndarray). When omitted the raw frame is
            converted with ``torch.from_numpy`` so stacking still works.
        max_frame: Exclusive upper bound on the sampled frame index.
        frame_step: Stride between sampled frame indices.

    Raises:
        ValueError: If ``max_frame`` or ``frame_step`` is not positive.
    """

    def __init__(self, frame_dir, train_csv, sequencelength=60, skip_length=5, transform=None,
                 max_frame=300, frame_step=8):
        if max_frame <= 0 or frame_step <= 0:
            raise ValueError("max_frame and frame_step must be positive")
        self.folder = os.listdir(frame_dir)
        self.id = train_csv['ClipID']
        self.engagement = train_csv['Engagement']
        self.frame_dir = frame_dir
        self.transform = transform
        self.not_exist = list()
        self.max_frame = max_frame
        self.frame_step = frame_step

    def __len__(self):
        """Return the number of clips listed in the label table."""
        return len(self.id)

    def __getitem__(self, idx):
        """Load the sampled frames of clip ``idx`` and its engagement label.

        Returns:
            ``(seq_image, label)`` where ``seq_image`` is the ``torch.stack``
            of the (transformed) frames and ``label`` is the ``Engagement``
            value of the row.

        Raises:
            FileNotFoundError: If an expected frame file does not exist.
            ValueError: If a frame file exists but cannot be decoded.
        """
        # Positional lookup: label-based ``series[idx]`` breaks as soon as the
        # DataFrame was filtered or shuffled and no longer has a 0..n-1 index.
        clip_id = self.id.iloc[idx]
        clip_dir = os.path.join(self.frame_dir, clip_id[:6], clip_id[:-4])

        seq_image = []
        for frame_no in range(0, self.max_frame, self.frame_step):
            frame_path = os.path.join(clip_dir, str(frame_no) + '.jpg')
            if not os.path.isfile(frame_path):
                raise FileNotFoundError(f"Missing frame: {frame_path}")
            image = cv2.imread(frame_path)
            if image is None:
                # cv2.imread signals decode failures by returning None.
                raise ValueError(f"Could not decode frame: {frame_path}")
            # OpenCV decodes to BGR; PIL-based transforms and pretrained
            # backbones expect RGB channel order.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            if self.transform:
                image = self.transform(image)
            else:
                image = torch.from_numpy(image)
            seq_image.append(image)

        seq_image = torch.stack(seq_image)
        label = self.engagement.iloc[idx]
        return seq_image, label

In [5]:
# Training dataset: frame sequences under ./frames/train paired with the
# Engagement labels from the annotation table.
data = video_dataset(
    frame_dir='./frames/train',
    train_csv=train_label,
    transform=train_transforms,
)

In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from torch.autograd import Variable
from torchvision.models import resnet152
from efficientnet_pytorch import EfficientNet
##############################
#         Encoder
##############################


class Encoder(nn.Module):
    """Per-frame feature extractor.

    A pretrained EfficientNet-B0 produces a vector per image, which is then
    projected to ``latent_dim`` by a linear layer followed by batch norm.

    Args:
        latent_dim: Size of the embedding returned for each input image.
    """

    def __init__(self, latent_dim):
        super().__init__()
        # NOTE: the attribute is called ``resnet`` for historical reasons even
        # though it holds an EfficientNet; the name is kept so parameter keys
        # stay the same.
        backbone = EfficientNet.from_pretrained('efficientnet-b0')
        self.resnet = backbone
        # The projection expects 1000 input features (presumably the
        # backbone's ImageNet class outputs — TODO confirm).
        projection = nn.Linear(1000, latent_dim)
        normalise = nn.BatchNorm1d(latent_dim, momentum=0.01)
        self.final = nn.Sequential(projection, normalise)

    def forward(self, x):
        """Embed a batch of images; returns a ``(batch, latent_dim)`` tensor."""
        backbone_out = self.resnet(x)
        flattened = backbone_out.view(backbone_out.size(0), -1)
        return self.final(flattened)


##############################
#           LSTM
##############################


class LSTM(nn.Module):
    """Batch-first ``nn.LSTM`` wrapper that remembers its last hidden state.

    The ``(h_n, c_n)`` tuple from the previous call is fed into the next call
    until ``reset_hidden_state`` is invoked.

    Args:
        latent_dim: Size of each input feature vector.
        num_layers: Number of stacked LSTM layers.
        hidden_dim: Hidden size per direction.
        bidirectional: Whether to run the LSTM in both directions.
    """

    def __init__(self, latent_dim, num_layers, hidden_dim, bidirectional):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(latent_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bidirectional)
        self.hidden_state = None

    def reset_hidden_state(self):
        """Forget the stored state so the next call starts from zeros."""
        self.hidden_state = None

    def forward(self, x):
        """Run the LSTM over ``x`` and return the per-step outputs.

        Args:
            x: Input of shape ``(batch, seq, latent_dim)``.

        Returns:
            Tensor of shape ``(batch, seq, hidden_dim * num_directions)``.
        """
        state = self.hidden_state
        if state is not None and x.dim() == 3:
            # A stale state from a batch of another size (e.g. the last,
            # smaller batch of an epoch) or another device cannot be reused.
            if state[0].size(1) != x.size(0) or state[0].device != x.device:
                state = None
        x, new_state = self.lstm(x, state)
        # Detach before storing: keeping the autograd graph would make the
        # next backward() try to traverse an already-freed graph and would
        # keep every previous batch alive in memory.
        self.hidden_state = tuple(s.detach() for s in new_state)
        return x


##############################
#      Attention Module
##############################


class Attention(nn.Module):
    """Additive attention over latent vectors conditioned on a hidden state.

    Args:
        latent_dim: Feature size of the latent representation.
        hidden_dim: Feature size of the hidden representation.
        attention_dim: Size of the shared space both are projected into.
    """

    def __init__(self, latent_dim, hidden_dim, attention_dim):
        super(Attention, self).__init__()
        self.latent_attention = nn.Linear(latent_dim, attention_dim)
        self.hidden_attention = nn.Linear(hidden_dim, attention_dim)
        self.joint_attention = nn.Linear(attention_dim, 1)

    def forward(self, latent_repr, hidden_repr):
        """Compute attention weights, normalised over the last dimension.

        Args:
            latent_repr: Tensor whose last dimension is ``latent_dim``.
            hidden_repr: Sequence whose first element is the hidden tensor
                (last dimension ``hidden_dim``, broadcastable against the
                projected ``latent_repr``), or ``None`` to use zeros.

        Returns:
            Softmax weights with the trailing singleton dimension removed.
        """
        if hidden_repr is None:
            # Zero placeholder created on the same device/dtype as the input
            # so it also works when the module lives on a GPU.
            h_t = latent_repr.new_zeros(latent_repr.size(0), 1, self.hidden_attention.in_features)
        else:
            h_t = hidden_repr[0]
        latent_att = self.latent_attention(latent_repr)
        hidden_att = self.hidden_attention(h_t)
        joint_att = self.joint_attention(F.relu(latent_att + hidden_att)).squeeze(-1)
        attention_w = F.softmax(joint_att, dim=-1)
        return attention_w


##############################
#         ConvLSTM
##############################


class ConvLSTM(nn.Module):
    """Frame encoder + LSTM + (optional) temporal attention classifier.

    Every frame is embedded by ``Encoder``, the embeddings are run through
    ``LSTM``, and the sequence is pooled either by learned attention weights
    or by taking the last time step. The head ends in a Softmax, so the
    module returns class probabilities rather than logits.

    Args:
        num_classes: Number of output classes.
        latent_dim: Size of the per-frame embedding.
        lstm_layers: Number of stacked LSTM layers.
        hidden_dim: LSTM hidden size per direction.
        bidirectional: Whether the LSTM is bidirectional.
        attention: Pool with attention if true, else use the last step.
    """

    def __init__(
        self, num_classes, latent_dim=512, lstm_layers=1, hidden_dim=1024, bidirectional=True, attention=True
    ):
        super().__init__()
        # Width of each LSTM output step (doubled when bidirectional).
        lstm_out_dim = 2 * hidden_dim if bidirectional else hidden_dim

        self.encoder = Encoder(latent_dim)
        self.lstm = LSTM(latent_dim, lstm_layers, hidden_dim, bidirectional)
        self.output_layers = nn.Sequential(
            nn.Linear(lstm_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
            nn.Softmax(dim=-1),
        )
        self.attention = attention
        # Created regardless of ``attention`` so the parameter set is fixed.
        self.attention_layer = nn.Linear(lstm_out_dim, 1)

    def forward(self, x):
        """Classify a batch of clips of shape ``(batch, seq, c, h, w)``.

        Returns:
            ``(batch, num_classes)`` tensor of class probabilities.
        """
        batch_size, seq_length, c, h, w = x.shape

        # Fold time into the batch dimension so the encoder sees single images.
        frames = x.view(batch_size * seq_length, c, h, w)
        embeddings = self.encoder(frames).view(batch_size, seq_length, -1)
        hidden_seq = self.lstm(embeddings)

        if not self.attention:
            return self.output_layers(hidden_seq[:, -1])

        # One scalar score per time step, normalised over the sequence.
        scores = self.attention_layer(hidden_seq).squeeze(-1)
        weights = F.softmax(scores, dim=-1)
        pooled = (weights.unsqueeze(-1) * hidden_seq).sum(dim=1)
        return self.output_layers(pooled)


##############################
#     Conv2D Classifier
#        (Baseline)
##############################


class ConvClassifier(nn.Module):
    """Baseline that classifies every frame independently (no temporal model).

    A pretrained ResNet-152 without its final layer extracts features; a small
    head maps them to class probabilities (the head ends in a Softmax).

    Args:
        num_classes: Number of output classes.
        latent_dim: Width of the intermediate projection in the head.
    """

    def __init__(self, num_classes, latent_dim):
        super().__init__()
        backbone = resnet152(pretrained=True)
        # Keep every child module except the last one (the ``fc`` classifier).
        trunk = list(backbone.children())[:-1]
        self.feature_extractor = nn.Sequential(*trunk)
        head = [
            nn.Linear(backbone.fc.in_features, latent_dim),
            nn.BatchNorm1d(latent_dim, momentum=0.01),
            nn.Linear(latent_dim, num_classes),
            nn.Softmax(dim=-1),
        ]
        self.final = nn.Sequential(*head)

    def forward(self, x):
        """Map ``(batch, seq, c, h, w)`` clips to ``(batch, seq, num_classes)`` probabilities."""
        batch_size, seq_length, c, h, w = x.shape
        n_frames = batch_size * seq_length
        features = self.feature_extractor(x.view(n_frames, c, h, w))
        per_frame = self.final(features.view(n_frames, -1))
        return per_frame.view(batch_size, seq_length, -1)

In [8]:
# Build the classifier: 256-d frame embeddings, one bidirectional LSTM layer
# with 1024 hidden units, attention pooling, and 4 output classes
# (presumably the engagement levels in the label table — TODO confirm).
model_kwargs = dict(
    num_classes=4,
    latent_dim=256,
    lstm_layers=1,
    hidden_dim=1024,
    bidirectional=True,
    attention=True,
)
model = ConvLSTM(**model_kwargs)

Loaded pretrained weights for efficientnet-b0


In [None]:
# Move the model to the GPU when one is available and fall back to the CPU
# otherwise, instead of crashing on machines without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Wrap for multi-GPU data parallelism. The guard keeps this cell idempotent:
# re-running it must not nest one DataParallel inside another.
if not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)

In [None]:
# Training batches of 4 clips loaded by 4 worker processes. shuffle=True so
# each epoch visits the clips in a new random order rather than in the fixed
# order of the label file.
train_loader = DataLoader(data, batch_size=4, shuffle=True, num_workers=4)

In [None]:
import sys
import cv2

# Train on the GPU when present, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The model's head ends in a Softmax, so it returns probabilities.
# CrossEntropyLoss expects raw logits and would apply a second softmax;
# NLLLoss on the log of the probabilities is the matching cross-entropy.
cls_criterion = nn.NLLLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 10

# nn.DataParallel hides the real network behind ``.module``.
core_model = getattr(model, "module", model)

model.train()
for epoch in range(num_epochs):
    epoch_metrics = {"loss": [], "acc": []}
    print(f"--- Epoch {epoch} ---")
    for batch_i, (X, y) in enumerate(train_loader):
        # Inputs do not need gradients; only the model parameters do.
        image_sequences = X.to(device)
        labels = y.to(device)

        optimizer.zero_grad()

        # Clips are independent: start every batch from a fresh LSTM state.
        if hasattr(core_model, "lstm"):
            core_model.lstm.reset_hidden_state()

        predictions = model(image_sequences)
        # clamp_min guards against log(0) = -inf for saturated probabilities.
        loss = cls_criterion(torch.log(predictions.clamp_min(1e-8)), labels)
        acc = 100 * (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

        loss.backward()
        optimizer.step()

        epoch_metrics["loss"].append(loss.item())
        epoch_metrics["acc"].append(acc)

        # Single-line progress report: current value (running epoch mean).
        sys.stdout.write(
            "\r[Epoch %d/%d] [Batch %d/%d] [Loss: %f (%f), Acc: %.2f%% (%.2f%%)]"
            % (
                epoch,
                num_epochs,
                batch_i,
                len(train_loader),
                loss.item(),
                np.mean(epoch_metrics["loss"]),
                acc,
                np.mean(epoch_metrics["acc"]),
            )
        )

        # Empty cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Finish the carriage-return progress line so the next epoch header
    # starts on its own line.
    sys.stdout.write("\n")

--- Epoch 0 ---


  self.dropout, self.training, self.bidirectional, self.batch_first)


