https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

A custom Dataset class must implement three functions: __init__, __len__, and __getitem__.

https://github.com/bomri/SlowFast/blob/master/slowfast/datasets/loader.py

https://github.com/bomri/SlowFast/blob/master/slowfast/datasets/ava_dataset.py

https://github.com/HHTseng/video-classification/blob/master/ResNetCRNN_varylength/UCF101_ResNetCRNN_varlen.py
https://www.ai-contentlab.com/2023/01/video-classification-is-important-task.html

https://discuss.pytorch.org/t/how-upload-sequence-of-image-on-video-classification/24865/13

Оптический поток
https://docs.opencv.org/2.4/modules/video/doc/motion_analysis_and_object_tracking.html

Skeleton
https://www.fireblazeaischool.in/blogs/human-pose-estimation-using-opencv/

# Data Loader

Добавить нормализацию!!!

In [None]:
import os
import cv2
import torch
torch.cuda.empty_cache()
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torchvision

In [24]:
class BasicVideoDataset(Dataset):
    def __init__(self, labels_list, video_dir, IMG_SIZE, labels_df):
        self.video_labels = labels_df
        self.video_dir = video_dir
        self.IMG_SIZE = IMG_SIZE
        self.frames_cnt = max(self.video_labels['end']-self.video_labels['begin'])
        self.labels_list = labels_list

    def __len__(self):
        return len(self.video_labels)

    def crop_center_square(self, frame):
        y, x = frame.shape[0:2]
        min_dim = min(y, x)
        start_x = (x // 2) - (min_dim // 2)
        start_y = (y // 2) - (min_dim // 2)
        return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


    def load_video(self, path, begin, end, max_frames=0, resize=(10, 10)):
        cap = cv2.VideoCapture(path)
        frames = []

        frame_index=begin
        try:
            while True and frame_index <= end:
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
                ret, frame = cap.read()
                if not ret:
                    break
                frame = self.crop_center_square(frame)
                frame = cv2.resize(frame, resize)
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # convert to grayscale
                #frame = Concatenate()([frame, frame, frame])
                #frame = np.dstack((frame, frame, frame))
                frame = np.array([frame, frame, frame])
                frames.append(frame)
                frame_index+=1

                if len(frames) == max_frames:
                    break
        finally:
            cap.release()
        return torch.from_numpy(np.array(frames))

    def __getitem__(self, idx):
        filename  = os.path.join(self.video_dir, self.video_labels.iloc[idx]['attachment_id']+".mp4")
        label = self.video_labels.iloc[idx]['text']
        begin = self.video_labels.iloc[idx]['begin']
        end = self.video_labels.iloc[idx]['end']
        frames = self.load_video(filename, begin, end, resize=(self.IMG_SIZE, self.IMG_SIZE)) # Загрузка видео!!!!
        return frames/255, labels_list.index(label)

* The __init__ function is run once when instantiating the Dataset object. We initialize the directory containing the images, the annotations file, and both transforms (covered in more detail in the next section).
* The __len__ function returns the number of samples in our dataset.
* The __getitem__ function loads and returns a sample from the dataset at the given index idx.

In [30]:
annotations_file = "/home/jupyter/mnt/s3/rsl-videos/slovo/slovo_annotations/SLOVO_DATAFRAME.tsv"
video_dir = "/home/jupyter/mnt/s3/rsl-videos/slovo/slovo"
IMG_SIZE = 224
BATCH_SIZE = 1
LEARNING_RATE = 0.0001
NUM_EPOCHS = 10
model_type = 'rnn'

In [21]:
video_labels = pd.read_csv(annotations_file, sep='\t')
video_labels['group_rank'] = video_labels.groupby(['text']).cumcount()+1;
video_labels['dataset'] = np.where(video_labels['group_rank']<16,'train', np.where(video_labels['group_rank']<19,'val', 'test'))
video_labels.head(5)

Unnamed: 0,attachment_id,text,user_id,height,width,length,begin,end,group_rank,dataset
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,640,360,156.0,36,112,1,train
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,640,360,150.0,36,76,1,train
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,640,360,133.0,40,97,1,train
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,640,360,144.0,43,107,1,train
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,640,360,96.0,20,70,1,train


In [36]:
labels_list = list(video_labels['text'].unique())
num_classes = len(labels_list)
labels_list[:5]

['Ё', 'А', 'Р', 'Е', 'Ч']

In [27]:
training_data = BasicVideoDataset(labels_list=labels_list, video_dir=video_dir, IMG_SIZE=IMG_SIZE, labels_df=video_labels[video_labels['dataset']=='train'])
val_data = BasicVideoDataset(labels_list=labels_list, video_dir=video_dir, IMG_SIZE=IMG_SIZE, labels_df=video_labels[video_labels['dataset']=='val'])
test_data = BasicVideoDataset(labels_list=labels_list, video_dir=video_dir, IMG_SIZE=IMG_SIZE, labels_df=video_labels[video_labels['dataset']=='test'])

In [31]:
train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

In [32]:
frames, label = next(iter(train_dataloader))

In [33]:
frames.shape
# 1,x,3,244,244

torch.Size([1, 52, 3, 224, 224])

In [34]:
label.shape

torch.Size([1])

In [14]:
frames.dtype

torch.float32

In [15]:
label.dtype

torch.int64

# Model

https://programming.vip/docs/pytorch-basics-14-video-classification-based-on-pytorch.html

We want to be able to train our model on a hardware accelerator like the GPU or MPS, if available. Let’s check to see if torch.cuda or torch.backends.mps are available, otherwise we use the CPU.

In [16]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")
#device = "cuda"

Using cpu device


We define our neural network by subclassing nn.Module, and initialize the neural network layers in __init__. Every nn.Module subclass implements the operations on input data in the forward method.

In [37]:
model_type=="rnn"

class Resnet18Rnn(nn.Module):
	def __init__(self,params_model):
		super(Resnet18Rnn,self).__init__()
		num_classes=params_model["num_classes"]
		dr_rate=params_model["dr_rate"]
		pretrained=params_model["pretrained"]
		rnn_hidden_size=params_model["rnn_hidden_size"]
		rnn_num_layers=params_model["rnn_num_layers"]
		baseModel=torchvision.models.resnet18(pretrained=pretrained)
		num_features=baseModel.fc.in_features
    # baseModel.classifier[-1]=Identity()
		baseModel.fc=Identity() # обнуляем fully connected layer
		self.baseModel=baseModel
		self.dropout=nn.Dropout(dr_rate)
		self.rnn=nn.LSTM(num_features,rnn_hidden_size,rnn_num_layers)
		self.fc1=nn.Linear(rnn_hidden_size, num_classes)
	def forward(self,x):
		try:
				b_z,ts,c,h,w=x.shape
				ii=0
				y=self.baseModel((x[:,ii]))
				out,(hn,cn)=self.rnn(y.unsqueeze(1))
				for ii in range(1,ts):
					y=self.baseModel((x[:,ii]))
					out,(hn,cn)=self.rnn(y.unsqueeze(1),(hn,cn))
				out=self.dropout(out[:,-1])
				out=self.fc1(out)
		except:
				print(f'x: {x}')
				print(f'x.shape: {x.shape}')
				raise
		return out

class Identity(nn.Module):
	def __init__(self):
		super(Identity,self).__init__()
	def forward(self,x):
		return x

params_model={
		"num_classes":num_classes,
		"dr_rate":0.1,
		"pretrained":True,
		"rnn_num_layers":1,
		"rnn_hidden_size":100,
		}
model=Resnet18Rnn(params_model)

#3. Use some virtual input to test the model
with torch.no_grad():
	if model_type=="rnn":
		x=torch.zeros(1,16,3,244,244)
	else:
		x=torch.zeros(1,3,16,244,244)
	y = model(x)
	print(y.shape)

#4. Move the model to the GPU device
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model=model.to(device)

#5. Print model
print(model)

# According to model_type, the corresponding model will be printed. The following is the result of printing 3dcnn model:
# VideoResNet(
# (stem): BasicStem(
# (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
# (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
# (2): ReLU(inplace=True)
# )
# ...
# The printing results of rnn model are as follows:
# Resnt18Rnn(
# (baseModel): ResNet(
# (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
# (relu): ReLU(inplace=True)
# (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
# ...




torch.Size([1, 1000])
Resnet18Rnn(
  (baseModel): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum

We create an instance of NeuralNetwork, and move it to the device, and print its structure.

To use the model, we pass it the input data. This executes the model’s forward, along with some background operations. Do not call model.forward() directly!

Calling the model on the input returns a 2-dimensional tensor with dim=0 corresponding to each output of 10 raw predicted values for each class, and dim=1 corresponding to the individual values of each output. We get the prediction probabilities by passing it through an instance of the nn.Softmax module.

# Обучение

In [38]:
import matplotlib.pylab as plt
import copy

def get_lr(opt):
	for param_group in opt.param_groups:
		return param_group["lr"]

def metrics_batch(output, target):
	pred=output.argmax(dim=1,keepdim=True)
	corrects=pred.eq(target.view_as(pred)).sum().item()
	return corrects

def loss_batch(loss_func, output, target, opt=None):
	loss=loss_func(output, target)
	with torch.no_grad():
		metric_b=metrics_batch(output,target)
	if opt is not None:
		opt.zero_grad()
		loss.backward()
		opt.step()
	return loss.item(), metric_b

def loss_epoch(model, loss_func, dataset_dl, sanity_check=False,opt=None):
	running_loss=0.0
	running_metric=0.0
	len_data=len(dataset_dl.dataset)
	for xb,yb in dataset_dl:
		xb=xb.to(device)
		yb=yb.to(device)
		output=model(xb)
		loss_b,metric_b=loss_batch(loss_func,output,yb,opt)
		running_loss+=loss_b
		if metric_b is not None:
			running_metric+=metric_b
		if sanity_check is True:
			break
	loss=running_loss/float(len_data)
	metric=running_metric/float(len_data)
	return loss, metric

def plot_loss(loss_hist, metric_hist):
	num_epochs=len(loss_hist["train"])
	plt.title("Train-Val Loss")
	plt.plot(range(1,num_epochs+1),loss_hist["train"],label="train")
	plt.plot(range(1,num_epochs+1),loss_hist["val"],label="val")
	plt.ylabel("Loss")
	plt.xlabel("Training Epochs")
	plt.legend()
	plt.show()
	plt.title("Train-Val Accuracy")
	plt.plot(range(1,num_epochs+1),metric_hist["train"],label="train")
	plt.plot(range(1,num_epochs+1),metric_hist["val"],label="val")
	plt.ylabel("Accuracy")
	plt.xlabel("Training Epochs")
	plt.legend()
	plt.show()

def train_val(model, params):
	num_epochs=params["num_epochs"]
	loss_func=params["loss_func"]
	opt=params["optimizer"]
	train_dl=params["train_dl"]
	val_dl=params["val_dl"]
	sanity_check=params["sanity_check"]
	lr_scheduler=params["lr_scheduler"]
	path2weights=params["path2weights"]

	loss_history={"train":[],"val":[]}
	metric_history={"train":[],"val":[]}
	best_model_wts=copy.deepcopy(model.state_dict())
	best_loss=float("inf")
	for epoch in range(num_epochs):
		current_lr=get_lr(opt)
		print("Epoch {}/{}, current lr={}".format(epoch, num_epochs-1,current_lr))
		model.train()
		train_loss,train_metric = loss_epoch(model, loss_func, train_dl, sanity_check,opt)
		loss_history["train"].append(train_loss)
		metric_history["train"].append(train_metric)
		model.eval()
		with torch.no_grad():
			val_loss, val_metric = loss_epoch(model,loss_func,val_dl,sanity_check)
		if val_loss<best_loss:
			best_loss=val_loss
			best_model_wts=copy.deepcopy(model.state_dict())
			torch.save(model.state_dict(),path2weights)
			print("Copied best model weights")
		loss_history["val"].append(val_loss)
		metric_history["val"].append(val_metric)
		lr_scheduler.step(val_loss)
		if current_lr!=get_lr(opt):
			print("Loading best model weights")
			model.load_state_dict(best_model_wts)
		print("Train loss:%.6f, dev loss:%.6f, accuracy:%.2f" % (train_loss, val_loss, 100*val_metric))
		print("-"*10)
	model.load_state_dict(best_model_wts)
	return model, loss_history, metric_history

In [39]:
model = model.to(device)
# criterion = nn.CrossEntropyLoss().to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [40]:
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
loss_func=nn.CrossEntropyLoss(reduction="sum")
opt=torch.optim.Adam(model.parameters(),lr=3e-5)
# The change of LR in cosine annealing learning rate is periodic, T_max is 1 / 2 of the period; eta_min(float) represents the minimum learning rate, which is 0 by default;
# last_epoch(int) represents the number of previous epoch, which is used to indicate whether the learning rate needs to be adjusted. When last_ When the epoch meets the set interval,
# The learning rate will be adjusted. When - 1, the learning rate is set to the initial value.
# lr_scheduler = CosineAnnealingLR(opt, T_max=20, verbose=True)
lr_scheduler=ReduceLROnPlateau(opt,mode="min",factor=0.5,patience=5,verbose=1)
os.makedirs("./models",exist_ok=True)
#2. Call train in myutils_ Val auxiliary function training model
params_train={
	"num_epochs":200,
	"optimizer":opt,
	"loss_func":loss_func,
	"train_dl":train_dataloader,
	"val_dl":val_dataloader,
	"sanity_check":True,
	"lr_scheduler":lr_scheduler,
	"path2weights":"./models/weights_"+model_type+".pt",
}
model,loss_hist,metric_hist=train_val(model,params_train)
# After running the previous code snippet, the training will begin, and you should see its progress on the screen.
#3. After the training, draw the training progress
plot_loss(loss_hist, metric_hist)
# The previous clip will show a graph of loss and accuracy.

Epoch 0/199, current lr=3e-05
Copied best model weights
Train loss:0.000440, dev loss:0.002235, accuracy:0.00
----------
Epoch 1/199, current lr=3e-05
Copied best model weights
Train loss:0.000468, dev loss:0.002161, accuracy:0.00
----------
Epoch 2/199, current lr=3e-05
Train loss:0.000494, dev loss:0.002411, accuracy:0.00
----------
Epoch 3/199, current lr=3e-05
Train loss:0.000447, dev loss:0.002378, accuracy:0.00
----------
Epoch 4/199, current lr=3e-05
Train loss:0.000464, dev loss:0.002294, accuracy:0.00
----------
Epoch 5/199, current lr=3e-05
Train loss:0.000458, dev loss:0.002370, accuracy:0.00
----------
Epoch 6/199, current lr=3e-05
Train loss:0.000467, dev loss:0.002250, accuracy:0.00
----------
Epoch 7/199, current lr=3e-05
Epoch 00008: reducing learning rate of group 0 to 1.5000e-05.
Loading best model weights
Train loss:0.000453, dev loss:0.002387, accuracy:0.00
----------
Epoch 8/199, current lr=1.5e-05
Train loss:0.000463, dev loss:0.002422, accuracy:0.00
----------
Ep

KeyboardInterrupt: 

In [None]:
y

In [None]:
# model.eval()
# with torch.no_grad():
#     correct = 0
#     total = 0
#     for i, (frames, labels) in enumerate(train_dataloader):
#       frames = frames.to(device, dtype=torch.float)
#       labels = labels.to(device, dtype=torch.float)
#       outputs = model(frames)
#       _, predicted = torch.max(outputs.data, 1)
#       total += labels.size(0)
#       correct += (predicted == labels).sum().item()

# print('Test Accuracy of the model on the 10000 test images: {} %'.format((correct / total) * 100))



In [None]:
# # Сохраняем модель и строим график
# torch.save(model.state_dict(), MODEL_STORE_PATH + 'conv_net_model.ckpt')

In [None]:
# images.dtype