https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

A custom Dataset class must implement three functions: __init__, __len__, and __getitem__.

https://github.com/bomri/SlowFast/blob/master/slowfast/datasets/loader.py

https://github.com/bomri/SlowFast/blob/master/slowfast/datasets/ava_dataset.py

https://github.com/HHTseng/video-classification/blob/master/ResNetCRNN_varylength/UCF101_ResNetCRNN_varlen.py
https://www.ai-contentlab.com/2023/01/video-classification-is-important-task.html

https://discuss.pytorch.org/t/how-upload-sequence-of-image-on-video-classification/24865/13

Оптический поток
https://docs.opencv.org/2.4/modules/video/doc/motion_analysis_and_object_tracking.html

Skeleton
https://www.fireblazeaischool.in/blogs/human-pose-estimation-using-opencv/

Known error: Could not run 'aten::mkldnn_rnn_layer' with arguments from the 'CUDA' backend. (This usually indicates that the model and its input tensors are on different devices.)

In [None]:
# Mount Google Drive into the Colab runtime; the data paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

#https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_lstm_neuralnetwork/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Loader

Добавить нормализацию!!!

In [None]:
import os
import cv2
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torchvision

In [None]:
class BasicVideoDataset(Dataset):
    """Dataset of labelled video clips, each decoded into a tensor of frames.

    Each row of ``labels_df`` describes one clip through the columns
    ``attachment_id`` (video file name without the ``.mp4`` extension),
    ``text`` (class label) and ``begin`` / ``end`` (inclusive frame range).
    """

    def __init__(self, labels_list, video_dir, IMG_SIZE, labels_df):
        """Store the dataset configuration.

        Args:
            labels_list: ordered list of all class labels; its order defines
                the positions in the one-hot target vector.
            video_dir: directory that contains the ``.mp4`` files.
            IMG_SIZE: side length (pixels) of the square frames produced.
            labels_df: DataFrame with one row per clip (see class docstring).
        """
        self.video_labels = labels_df
        self.video_dir = video_dir
        self.IMG_SIZE = IMG_SIZE
        # Longest clip length (end - begin) found in the annotations.
        self.frames_cnt = max(self.video_labels['end'] - self.video_labels['begin'])
        self.labels_list = labels_list

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.video_labels)

    def crop_center_square(self, frame):
        """Return the largest centred square crop of ``frame`` (H x W [x C])."""
        y, x = frame.shape[0:2]
        min_dim = min(y, x)
        start_x = (x // 2) - (min_dim // 2)
        start_y = (y // 2) - (min_dim // 2)
        return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]

    def load_video(self, path, begin, end, max_frames=0):
        """Decode frames ``begin``..``end`` (inclusive) of the video at ``path``.

        Every frame is centre-cropped to a square, resized to
        ``IMG_SIZE x IMG_SIZE``, converted to grayscale and replicated into
        three identical channels.

        Args:
            path: path of the video file.
            begin: index of the first frame to read.
            end: index of the last frame to read (inclusive).
            max_frames: stop after this many frames; ``0`` means no limit.

        Returns:
            A tensor of shape ``(n_frames, 3, IMG_SIZE, IMG_SIZE)``. If no
            frame could be read the tensor is empty with shape ``(0,)``.
        """
        cap = cv2.VideoCapture(path)
        frames = []
        try:
            # Seek once to the first frame; each read() then advances to the
            # next frame, so there is no need to re-seek on every iteration.
            cap.set(cv2.CAP_PROP_POS_FRAMES, begin)
            for _ in range(begin, end + 1):
                ret, frame = cap.read()
                if not ret:
                    break
                frame = self.crop_center_square(frame)
                frame = cv2.resize(frame, (self.IMG_SIZE, self.IMG_SIZE))
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # convert to grayscale
                # Channel-first layout with the gray image repeated 3 times.
                frames.append(np.array([gray, gray, gray]))

                if len(frames) == max_frames:
                    break
        finally:
            cap.release()
        return torch.from_numpy(np.array(frames))

    def _encode_label(self, label):
        """Return the one-hot integer tensor of ``label`` over ``self.labels_list``."""
        one_hot = [1 if candidate == label else 0 for candidate in self.labels_list]
        return torch.from_numpy(np.array(one_hot))

    def __getitem__(self, idx):
        """Return ``(frames, target)`` for the clip at position ``idx``.

        ``frames`` are the decoded frames divided by 255; ``target`` is the
        one-hot encoding of the clip label.
        """
        row = self.video_labels.iloc[idx]
        filename = os.path.join(self.video_dir, row['attachment_id'] + ".mp4")
        frames = self.load_video(filename, row['begin'], row['end'])  # decode the clip
        # Use the label list given to this dataset rather than a global name.
        return frames / 255, self._encode_label(row['text'])

* The __init__ function is run once when instantiating the Dataset object. Here it stores the list of class labels, the directory containing the videos, the target frame size, and the annotations DataFrame.
* The __len__ function returns the number of samples in our dataset.
* The __getitem__ function loads and returns a sample from the dataset at the given index idx.

In [None]:
# Notebook configuration.
# Tab-separated annotations table; the trailing comment keeps an alternative (non-Colab) path.
annotations_file = "/content/drive/MyDrive/slovo/SLOVO_DF_SHORT.tsv" #"/home/jupyter/mnt/s3/rsl-videos/slovo/slovo_annotations/SLOVO_DATAFRAME.tsv"
# Directory that holds the .mp4 clips; the trailing comment keeps an alternative path.
video_dir = "/content/drive/MyDrive/slovo/animals" #"/home/jupyter/mnt/s3/rsl-videos/slovo/slovo"
# Side length in pixels of the square frames produced by the dataset.
IMG_SIZE = 224
# Batch size passed to every DataLoader below.
BATCH_SIZE = 1
# NOTE(review): LEARNING_RATE, NUM_EPOCHS and model_type are not referenced by the
# cells visible here (the training cell hard-codes its own values) - confirm intent.
LEARNING_RATE = 0.0001
NUM_EPOCHS = 10
model_type = 'rnn'

In [None]:
# Read the annotations and assign every clip to a train / val / test split.
video_labels = pd.read_csv(annotations_file, sep='\t')

# 1-based running index of each clip within its own label group.
video_labels['group_rank'] = video_labels.groupby(['text']).cumcount() + 1

# Per label: ranks 1-15 -> train, 16-18 -> val, everything after -> test.
rank = video_labels['group_rank']
held_out = np.where(rank < 19, 'val', 'test')
video_labels['dataset'] = np.where(rank < 16, 'train', held_out)

video_labels.head(5)

Unnamed: 0.1,Unnamed: 0,attachment_id,text,begin,end,group_rank,dataset
0,0,8f4d3be1-3a09-4d76-94ef-f8b1dbfa686b,пингвин,29,100,1,train
1,1,4f9e3cb5-b9de-48bc-a51d-875b8fea8e10,пингвин,21,79,2,train
2,2,1de7b5b0-ce08-419f-aeed-e7e480da953d,пингвин,7,59,3,train
3,3,72f70640-6931-4f57-8c72-a68e48032cfb,пингвин,22,87,4,train
4,4,6933a0f1-a0e1-48d8-91be-b445ca6c80ce,пингвин,9,64,5,train


In [None]:
# Distinct class labels in order of first appearance; this order defines the one-hot layout.
labels_list = video_labels['text'].unique().tolist()
num_classes = len(labels_list)
labels_list[:5]

['пингвин', 'жираф', 'лягушка', 'бегемот', 'козел']

In [None]:
# One dataset per split, all sharing the same label list, video directory and frame size.
training_data, val_data, test_data = (
    BasicVideoDataset(
        labels_list=labels_list,
        video_dir=video_dir,
        IMG_SIZE=IMG_SIZE,
        labels_df=video_labels[video_labels['dataset'] == split],
    )
    for split in ('train', 'val', 'test')
)

In [None]:
# Wrap every split in a shuffling DataLoader with the configured batch size.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(split_data, batch_size=BATCH_SIZE, shuffle=True)
    for split_data in (training_data, val_data, test_data)
)

In [None]:
# Pull a single batch from the training loader so its contents can be inspected below.
frames, label = next(iter(train_dataloader))

In [None]:
# Batch layout is (batch, time steps, channels, height, width); the number of
# time steps (ts) differs from clip to clip.
frames.shape
# torch.Size([1, ts, 3, 224, 224])
# e.g. torch.Size([1, 54, 3, 224, 224])
# e.g. torch.Size([1, 93, 3, 224, 224])

torch.Size([1, 54, 3, 224, 224])

In [None]:
# One one-hot vector per sample: (batch, number of classes).
label.shape

torch.Size([1, 30])

In [None]:
# Frames are floating point because the dataset divides the decoded frames by 255.
frames.dtype

torch.float32

In [None]:
# Targets are integer one-hot vectors.
label.dtype

torch.int64

# Model

https://programming.vip/docs/pytorch-basics-14-video-classification-based-on-pytorch.html

We want to be able to train our model on a hardware accelerator like the GPU or MPS, if available. Let’s check to see if torch.cuda or torch.backends.mps are available, otherwise we use the CPU.

In [None]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
class Identity(nn.Module):
    """Pass-through layer: ``forward`` returns its input unchanged.

    Used to replace a network's final layer so that the features feeding
    that layer become the network output.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x

from torchvision.models import ResNet18_Weights

# Stand-alone ResNet-18 backbone used by the exploratory cells below.
baseModel=torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT)
# Width of the feature vector that feeds the original classification layer.
num_features=baseModel.fc.in_features
# Drop the classification layer so the backbone outputs the features themselves.
baseModel.fc=Identity()

We define our neural network by subclassing nn.Module, and initialize the neural network layers in __init__. Every nn.Module subclass implements the operations on input data in the forward method.

In [None]:
# Random stand-in for one batch: (batch, time steps, channels, height, width).
test_item_shape = [1, 5, 3, 224, 224]
test_item = torch.randn(*test_item_shape)
test_item.shape

torch.Size([1, 5, 3, 224, 224])

In [None]:
# Indexing dim 1 selects one time step for every batch item, leaving (batch, channels, height, width).
(test_item[:,4].shape)

torch.Size([1, 3, 224, 224])

In [None]:
# Encode every time step with the backbone and stack the results along a
# leading time dimension: (time steps, batch, features).
b_z, ts, c, h, w = test_item.shape
frame_embeddings = []
for idx in range(ts):
    frame_embeddings.append(baseModel(test_item[:, idx]))
embeds = torch.stack(frame_embeddings)
print(embeds.shape)

torch.Size([5, 1, 512])


In [None]:
# Smoke test: a single-layer LSTM with 10 hidden units over the frame embeddings.
lstm = nn.LSTM(input_size=num_features, hidden_size=10, num_layers=1)
lstm_out, _ = lstm(embeds)
lstm_out.shape

torch.Size([5, 1, 10])

In [None]:
class Resnet18Rnn(nn.Module):
    """Video classifier: a frozen ResNet-18 frame encoder followed by an LSTM.

    Every frame is embedded by the backbone, the embedding sequence is fed
    to the LSTM, and the final hidden state of the last LSTM layer is mapped
    to one raw score (logit) per class.
    """

    def __init__(self, params_model):
        """Build the network.

        Args:
            params_model: dict with the keys ``num_classes``, ``dr_rate``,
                ``weights`` (passed to ``torchvision.models.resnet18``),
                ``rnn_hidden_size`` and ``rnn_num_layers``.
        """
        super().__init__()
        num_classes = params_model["num_classes"]
        dr_rate = params_model["dr_rate"]
        weights = params_model["weights"]
        rnn_hidden_size = params_model["rnn_hidden_size"]
        rnn_num_layers = params_model["rnn_num_layers"]

        baseModel = torchvision.models.resnet18(weights=weights)
        # Freeze the backbone: only the LSTM and the classifier are trained.
        for param in baseModel.parameters():
            param.requires_grad = False
        # Keep the embedding width on the instance so nothing in this class
        # depends on a module-level variable.
        self.num_features = baseModel.fc.in_features
        # Replace the classification layer so the backbone returns features.
        baseModel.fc = Identity()
        self.baseModel = baseModel
        # NOTE(review): the dropout layer is constructed but never applied in forward().
        self.dropout = nn.Dropout(dr_rate)
        self.lstm = nn.LSTM(self.num_features, rnn_hidden_size, rnn_num_layers)
        self.fc1 = nn.Linear(rnn_hidden_size, num_classes)
        # Kept as an attribute; forward() returns raw scores, not sigmoid outputs.
        self.sigmoid = nn.Sigmoid()

    def forward(self, frames):
        """Classify a batch of clips.

        Args:
            frames: tensor unpacked as (batch, time steps, channels, height, width).

        Returns:
            Raw class scores of shape (batch, num_classes).
        """
        b_z, ts, c, h, w = frames.shape
        # Embed each time step for the whole batch. Stacking the backbone
        # outputs yields (time steps, batch, features) directly on the
        # backbone's device and works for any batch size.
        embeds = torch.stack([self.baseModel(frames[:, idx]) for idx in range(ts)])
        _, (hidden_state, _) = self.lstm(embeds)
        # Final hidden state of the last LSTM layer: (batch, hidden size).
        return self.fc1(hidden_state[-1])
# Hyper-parameters consumed by Resnet18Rnn.__init__.
params_model = dict(
    num_classes=num_classes,
    dr_rate=0.1,
    weights=ResNet18_Weights.DEFAULT,
    rnn_num_layers=1,
    rnn_hidden_size=100,
)

model = Resnet18Rnn(params_model)

# Print model
print(model)

Resnet18Rnn(
  (baseModel): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tra

In [None]:
# List only the parameters that will receive gradient updates.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name} : {param.size()}")

lstm.weight_ih_l0 : torch.Size([400, 512])
lstm.weight_hh_l0 : torch.Size([400, 100])
lstm.bias_ih_l0 : torch.Size([400])
lstm.bias_hh_l0 : torch.Size([400])
fc1.weight : torch.Size([30, 100])
fc1.bias : torch.Size([30])


In [None]:
# Sanity check: run the sampled batch through the model; the result holds one score per class for each sample.
label_scores = model(frames)
label_scores.shape

torch.Size([1, 30])

To use the model, we pass it the input data. This executes the model’s forward, along with some background operations. Do not call model.forward() directly!

Calling the model on the input returns a 2-dimensional tensor with dim=0 corresponding to each sample in the batch and dim=1 corresponding to the raw predicted score (logit) for each class (30 classes here). To get prediction probabilities, pass the output through an instance of the nn.Softmax module.

# Обучение

In [None]:
# See what the scores are after training
def check_some_predictions(n):
  """Print the true and the predicted label for ``n`` training samples.

  For every sample one batch is drawn from a fresh iterator over
  ``train_dataloader`` and the last item of that batch is reported together
  with its highest class score and the shape of the frames tensor.

  The model is switched to evaluation mode while predicting, so that layers
  such as batch normalisation do not update their running statistics, and
  is returned to its previous mode afterwards.

  Args:
      n: number of samples to report.
  """
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for _ in range(n):
        frames, label = next(iter(train_dataloader))
        if frames.numel() == 0:
          # The clip produced no decodable frames; the model cannot process it.
          print("Skipped a sample with no decodable frames")
          continue
        frames = frames.to(device)
        label = label.to(device)
        # argmax returns the position of the 1 in the one-hot target.
        true_label = labels_list[label[-1].argmax().item()]
        sample_scores = model(frames)[-1]
        pred_label = labels_list[sample_scores.argmax().item()]
        print(f"Label: {true_label}; Predicted: {pred_label}; Max_score: {sample_scores.max().item()}; Frames shape: {frames.shape}")
  finally:
    model.train(was_training)

In [None]:
# Baseline before training: print true vs. predicted labels for 5 training samples.
check_some_predictions(5)

Label: лягушка; Predicted: дельфин; Max_score: 0.486176073551178; Frames shape: torch.Size([1, 76, 3, 224, 224])
Label: бегемот; Predicted: дельфин; Max_score: 0.4786910116672516; Frames shape: torch.Size([1, 56, 3, 224, 224])
Label: козел; Predicted: дельфин; Max_score: 0.5088500380516052; Frames shape: torch.Size([1, 50, 3, 224, 224])
Label: лось; Predicted: дельфин; Max_score: 0.5024398565292358; Frames shape: torch.Size([1, 45, 3, 224, 224])
Label: динозавр; Predicted: дельфин; Max_score: 0.49851155281066895; Frames shape: torch.Size([1, 63, 3, 224, 224])


In [None]:
# m = nn.LogSoftmax(dim=1)
# test_loss = nn.NLLLoss()
# # input is of size N x C = 3 x 5
# test_input = torch.randn(3, 5, requires_grad=True)
# print(test_input)
# print(m(test_input))
# # each element in target has to have 0 <= value < C
# test_target = torch.tensor([1, 0, 4])
# print(test_target)
# test_output = test_loss(m(test_input), test_target)
# print(test_output)

In [None]:
# hidden_state = torch.Tensor([[[-0.3384, -0.1265,  0.3567, -0.6371,  0.1451, -0.1924,  0.1628,
#            0.2251, -0.3264, -0.5475, -0.2158,  0.0917,  0.0525, -0.1003,
#           -0.0601, -0.2791,  0.0443,  0.5774,  0.0596,  0.0711, -0.7636,
#            0.0071, -0.4925, -0.3220,  0.0986,  0.3968, -0.0641, -0.3004,
#           -0.0459,  0.4087, -0.2239,  0.1176,  0.1774, -0.0835,  0.3158,
#           -0.2811, -0.0243, -0.0141,  0.3228,  0.2082, -0.0114, -0.2839,
#            0.5712,  0.8170, -0.4613,  0.1820, -0.0657,  0.0526, -0.4621,
#           -0.6268,  0.7839, -0.5987, -0.1115, -0.4608, -0.1647, -0.0189,
#            0.3420,  0.3679, -0.2579,  0.5030,  0.1194,  0.1415,  0.1913,
#           -0.7466, -0.0555, -0.0983,  0.4235,  0.0553,  0.0920,  0.2105,
#           -0.3041,  0.2723,  0.2214,  0.1893, -0.7795,  0.0991,  0.0176,
#            0.4491,  0.2184,  0.0711,  0.1846,  0.1492,  0.2785,  0.1911,
#           -0.4886,  0.2761, -0.6493, -0.4522,  0.7365, -0.4166, -0.3030,
#            0.2727, -0.2115, -0.2878, -0.0615,  0.7614,  0.0398,  0.1143,
#           -0.4373, -0.3712]]])


#torch.cat((hidden_state[-1,:,:], hidden_state[-1,:,:]), dim = 1)

In [None]:
# Number of epochs already completed; the training cell numbers its epochs starting after this value.
previous_epochs = 0


In [None]:
from tqdm import tqdm

# The model returns raw class scores (logits). CrossEntropyLoss applies
# log-softmax internally, whereas NLLLoss expects log-probabilities and
# produces meaningless (even negative) losses when given raw scores.
loss_function = nn.CrossEntropyLoss()
# Move the model first so the optimizer is built over the parameters on `device`.
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)
epoches = previous_epochs+15
save = True
# A checkpoint is written only when the evaluation accuracy exceeds this value.
best_acc = 0.033

losses = []       # mean training loss of every epoch
test_losses = []  # mean evaluation loss of every epoch
for epoch in range(previous_epochs+1, epoches+1, 1):
    print(f"epoch: {epoch}", end=":")

    # train
    model.train()
    total_loss = []
    pbar = tqdm(train_dataloader, desc=f'Train Epoch{epoch}/{epoches}')
    for frames, label in pbar:
        if frames.numel() == 0:
            # The clip produced no decodable frames; skip it.
            continue
        frames = frames.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        label_scores = model(frames)
        # Targets are one-hot vectors; the loss needs class indices.
        loss = loss_function(label_scores, torch.argmax(label, 1))
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())
        pbar.set_description(f'Train Epoch:{epoch}/{epoches} train_loss:{round(np.mean(total_loss), 4)}')
    losses.append(np.mean(total_loss))

    # test
    # NOTE(review): checkpoints are selected on test_dataloader while
    # val_dataloader is never used - confirm this is intended.
    model.eval()  # use running statistics instead of updating them
    total_loss = []
    test_loss = 0
    correct = 0
    total = 0
    predict_acc = 0.0  # stays defined even if every batch is skipped
    pbar = tqdm(test_dataloader, desc=f'Test Epoch{epoch}/{epoches}', mininterval=0.3)
    with torch.no_grad():
        for frames, label in pbar:
            if frames.numel() == 0:
                continue
            frames = frames.to(device)
            label = label.to(device)
            target = torch.argmax(label, 1)
            output = model(frames)
            batch_loss = loss_function(output, target).item()
            total_loss.append(batch_loss)
            test_loss += batch_loss  # sum up batch loss
            pred = torch.argmax(output, 1)
            correct += (pred == target).sum().item()
            total += len(label)
            predict_acc = correct / total
            pbar.set_description(f'Test Epoch:{epoch}/{epoches} acc:{predict_acc:.3f}')
    model.train()  # leave the model in training mode for whatever runs next
    test_losses.append(np.mean(total_loss))
    if save and predict_acc > best_acc:
        best_acc = predict_acc
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': round(np.mean(total_loss), 2)
            },
                "/content/drive/MyDrive/slovo/weights" + f'/Resnet18_LSTM-Epoch-{epoch}-Test_acc-{best_acc:.3f}.pth')
    check_some_predictions(5)

epoch: 1:

Train Epoch:1/16 train_loss:-87.6863: 100%|██████████| 450/450 [1:36:11<00:00, 12.83s/it]
Test Epoch1/16:   0%|          | 0/60 [00:13<?, ?it/s]


AttributeError: ignored

In [None]:
# Evaluation only: re-run the test phase for the current `epoch` with the
# model, loss function and bookkeeping lists defined by the training cell.
was_training = model.training
model.eval()  # use running statistics instead of updating them
test_loss = 0
correct = 0
total = 0  # must start from zero, otherwise the accuracy is computed against a stale count
total_loss = []
predict_acc = 0.0  # stays defined even if every batch is skipped
pbar = tqdm(test_dataloader, desc=f'Test Epoch{epoch}/{epoches}', mininterval=0.3)
with torch.no_grad():
    for frames, label in pbar:
        if frames.numel() == 0:
            # The clip produced no decodable frames; skip it.
            continue
        frames = frames.to(device)
        label = label.to(device)
        # Targets are one-hot vectors; the loss needs class indices.
        target = torch.argmax(label, 1)
        output = model(frames)
        batch_loss = loss_function(output, target).item()
        total_loss.append(batch_loss)
        test_loss += batch_loss  # sum up batch loss
        pred = torch.argmax(output, 1)
        correct += (pred == target).sum().item()
        total += len(label)
        predict_acc = correct / total
        pbar.set_description(f'Test Epoch:{epoch}/{epoches} acc:{predict_acc:.3f}')
model.train(was_training)  # restore the mode the model was in before this cell
test_losses.append(np.mean(total_loss))
if save and predict_acc > best_acc:
    best_acc = predict_acc
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': round(np.mean(total_loss), 2)
        },
            "/content/drive/MyDrive/slovo/weights" + f'/Resnet18_LSTM-Epoch-{epoch}-Test_acc-{best_acc:.3f}.pth')
check_some_predictions(5)

Test Epoch:1/16 acc:0.033: 100%|██████████| 60/60 [13:34<00:00, 13.58s/it]


Label: лев; Predicted: дельфин; Max_score: 0.9107490181922913; Frames shape: torch.Size([1, 64, 3, 224, 224])
Label: лебедь; Predicted: дельфин; Max_score: 0.8992813229560852; Frames shape: torch.Size([1, 70, 3, 224, 224])
Label: бегемот; Predicted: дельфин; Max_score: 0.9192471504211426; Frames shape: torch.Size([1, 60, 3, 224, 224])
Label: тигр; Predicted: дельфин; Max_score: 0.9049146771430969; Frames shape: torch.Size([1, 41, 3, 224, 224])
Label: слон; Predicted: дельфин; Max_score: 0.9085675477981567; Frames shape: torch.Size([1, 50, 3, 224, 224])


In [None]:
# Continue training from the second epoch onwards, reusing the model,
# optimizer, loss function and bookkeeping lists created by the training cell.
for epoch in range(previous_epochs+2, epoches+1, 1):
    print(f"epoch: {epoch}", end=":")

    # train
    model.train()
    total_loss = []
    pbar = tqdm(train_dataloader, desc=f'Train Epoch{epoch}/{epoches}')
    for frames, label in pbar:
        if frames.numel() == 0:
            # The clip produced no decodable frames; skip it.
            continue
        frames = frames.to(device)
        label = label.to(device)
        optimizer.zero_grad()
        label_scores = model(frames)
        # Targets are one-hot vectors; the loss needs class indices.
        loss = loss_function(label_scores, torch.argmax(label, 1))
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())
        pbar.set_description(f'Train Epoch:{epoch}/{epoches} train_loss:{round(np.mean(total_loss), 4)}')
    losses.append(np.mean(total_loss))

    # test
    model.eval()  # use running statistics instead of updating them
    total_loss = []
    test_loss = 0
    correct = 0
    total = 0
    predict_acc = 0.0  # stays defined even if every batch is skipped
    pbar = tqdm(test_dataloader, desc=f'Test Epoch{epoch}/{epoches}', mininterval=0.3)
    with torch.no_grad():
        for frames, label in pbar:
            if frames.numel() == 0:
                continue
            frames = frames.to(device)
            label = label.to(device)
            target = torch.argmax(label, 1)
            output = model(frames)
            batch_loss = loss_function(output, target).item()
            total_loss.append(batch_loss)
            test_loss += batch_loss  # sum up batch loss
            pred = torch.argmax(output, 1)
            correct += (pred == target).sum().item()
            total += len(label)
            predict_acc = correct / total
            pbar.set_description(f'Test Epoch:{epoch}/{epoches} acc:{predict_acc:.3f}')
    model.train()  # leave the model in training mode for whatever runs next
    test_losses.append(np.mean(total_loss))
    if save and predict_acc > best_acc:
        best_acc = predict_acc
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': round(np.mean(total_loss), 2)
            },
                "/content/drive/MyDrive/slovo/weights" + f'/Resnet18_LSTM-Epoch-{epoch}-Test_acc-{best_acc:.3f}.pth')
    check_some_predictions(5)

epoch: 2:

Train Epoch:2/16 train_loss:-0.5781: 100%|██████████| 450/450 [1:32:39<00:00, 12.35s/it]
Test Epoch:2/16 acc:0.033: 100%|██████████| 60/60 [13:00<00:00, 13.01s/it]


Label: курица; Predicted: дельфин; Max_score: 1.4125105142593384; Frames shape: torch.Size([1, 47, 3, 224, 224])
Label: слон; Predicted: дельфин; Max_score: 1.412015676498413; Frames shape: torch.Size([1, 82, 3, 224, 224])
Label: пчела; Predicted: дельфин; Max_score: 1.4162449836730957; Frames shape: torch.Size([1, 48, 3, 224, 224])
Label: жираф; Predicted: дельфин; Max_score: 1.4113614559173584; Frames shape: torch.Size([1, 47, 3, 224, 224])
Label: обезьяна; Predicted: дельфин; Max_score: 1.4133634567260742; Frames shape: torch.Size([1, 46, 3, 224, 224])
epoch: 3:

Train Epoch:3/16 train_loss:-0.9277:  45%|████▍     | 201/450 [40:05<54:22, 13.10s/it]  