In [1]:
import os
import multiprocessing as mp
import time
import random

In [2]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning
import pytorchvideo.data
import pytorchvideo.models.resnet
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths
from torchmetrics import Accuracy, F1Score, MetricCollection
from torch.utils.data import Dataset, DataLoader
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)
from torch.autograd import Variable



In [3]:
import pandas as pd
import numpy as np
import timm
import evaluate
import av
import yt_dlp
import albumentations as A
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import TrainingArguments, Trainer
from albumentations.pytorch import ToTensorV2

In [26]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fdab641a8f0>

In [27]:
"""
Установим единный seed для всего
"""
def seed_everything(seed: int):

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(42)

In [3]:
"""
Для использования HuggingFace необходимо ввести логин
"""

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Prepare data

In [5]:
"""
Чтение разметки Kinetics700-2020
"""
df_train = pd.read_csv("data/kinetics700_2020/train.csv")
df_valid = pd.read_csv("data/kinetics700_2020/validate.csv")

In [6]:
df_train

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,clay pottery making,---0dWlqevI,19,29,train
1,news anchoring,---aQ-tA5_A,9,19,train
2,using bagging machine,---j12rm3WI,14,24,train
3,javelin throw,--07WQ2iBlw,1,11,train
4,climbing a rope,--0NTAs-fA0,29,39,train
...,...,...,...,...,...
532901,washing dishes,zzz_3yWpTXo,0,10,train
532902,juggling fire,zzzkS3amkWE,124,134,train
532903,taking photo,zzzsd1R7H0E,6,16,train
532904,brush painting,zzzxltuPx2Q,84,94,train


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532906 entries, 0 to 532905
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   label       532906 non-null  object
 1   youtube_id  532906 non-null  object
 2   time_start  532906 non-null  int64 
 3   time_end    532906 non-null  int64 
 4   split       532906 non-null  object
dtypes: int64(2), object(3)
memory usage: 20.3+ MB


In [8]:
"""
Проверяем наличие None в таблице
"""

df_train.isnull().sum()

label         0
youtube_id    0
time_start    0
time_end      0
split         0
dtype: int64

In [9]:
df_valid

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,testifying,---QUuC4vJs,84,94,validate
1,washing feet,--GkrdYZ9Tc,0,10,validate
2,air drumming,--nQbRBEz2s,104,114,validate
3,pull ups,--rd8woSLiM,41,51,validate
4,building cabinet,--uGS0Y4D6k,9,19,validate
...,...,...,...,...,...
33309,trimming trees,zxdSPlGlSAQ,38,48,validate
33310,feeding goats,zxrvNwur1RE,194,204,validate
33311,country line dancing,zy7uvdwyK8k,3,13,validate
33312,playing paintball,zylVBFyoxZ0,94,104,validate


In [10]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33314 entries, 0 to 33313
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       33314 non-null  object
 1   youtube_id  33314 non-null  object
 2   time_start  33314 non-null  int64 
 3   time_end    33314 non-null  int64 
 4   split       33314 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.3+ MB


In [11]:
"""
Проверяем наличие None в таблице
"""
df_valid.isnull().sum()

label         0
youtube_id    0
time_start    0
time_end      0
split         0
dtype: int64

In [12]:
"""
Выбираем видео, в разметке которых содержится 'dancing'
"""

df_train = df_train[df_train["label"].str.contains("dancing")]

In [13]:
df_train["label"].str.contains("dancing").sum()

13331

In [14]:
"""
Выбираем видео, в разметке которых содержится 'dancing'
"""
df_valid = df_valid[df_valid["label"].str.contains("dancing")]

In [15]:
"""
Проверяем, что количество классов в train и valid совпадает
"""

df_train["label"].nunique() == df_valid["label"].nunique()

True

In [None]:
"""
Функция для скачивания видео с Youtube. После скачивания
видео обрезация по тем кадрам, в которых есть действия (согласно разметке)
Функцию можно параллелить с помощью multiprocess.

"""


pool = mp.Pool(processes=np.cpu_count())

os.makedirs(name="videos/train", exist_ok=True)
os.makedirs(name="videos/valid", exist_ok=True)

train_dir = "videos/train"
valid_dir = "videos/valid"

dataframes = {"train" : df_train, 
              "valid":  df_valid}

for k, v in dataframes.items():
    
    df_iter = pd.DataFrame()
    
    filename_list = []
    label_list = []
    
    for label in v['label'].unique():
        
        if k == "train":
            sample = 100
        else:
            sample = 10
        
        df_new = v[v['label'] == label].sample(sample, replace=True)
        df_iter = pd.concat([df_iter, df_new])

    def download(args):

        idx, row = args

        yt = f"http://youtube.com/watch?v={row['youtube_id']}"
        
        filename = row['youtube_id']

        label = row["label"]
        
        split = row["split"]

        if split == "train":
            output_path = train_dir
        else:
            output_path = valid_dir

        try:
          
            # Скачиваем видео
            ydl_opts = {'outtmpl': output_path + "/" + "%(id)s_temp.%(ext)s"}
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download(url_list=yt)

            start = row["time_start"]
            end = row["time_end"]
            
            # Обрезаем видео и заново сохраняем
            filepath_temp = os.path.join(output_path, filename + "_temp.mp4")
            filepath = os.path.join(output_path, filename + ".mp4")
            ffmpeg_extract_subclip(filepath_temp, start, end, targetname=filepath)
            os.remove(filepath_temp)

            filename_list.append(filepath)
            label_list.append(label)
            

        except Exception as e:
            print(f"Failed to download video {filename}. Exception: {e}")
            time.sleep(5)
        
    pool.map(download, [(idx,row) for idx,row in df_iter.iterrows()])

    # Сохраняем результат в CSV Файл
    df_result = pd.DataFrame.from_dict({"filename": filename_list,
                                        "label": label_list})
    df_result.to_csv(f"{k}_new.csv", index=False)

### Train the model on  few frames

In [5]:
"""
Читаем данные из CSV файла
"""

df_train_tr = pd.read_csv("train_new.csv")
df_valid_tr = pd.read_csv("valid_new.csv")

In [6]:
"""
Маркируем лейблы
"""

class_labels = list(df_train_tr["label"].unique())

label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

Unique classes: ['tap dancing', 'breakdancing', 'belly dancing', 'dancing charleston', 'dancing ballet', 'square dancing', 'jumpstyle dancing', 'salsa dancing', 'robot dancing', 'country line dancing', 'dancing macarena', 'mosh pit dancing', 'dancing gangnam style', 'swing dancing', 'tango dancing'].


In [7]:
"""
Подготавливаем список словарей для train датасета
"""

train_list = []

for idx, row in df_train_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    train_list.append((name, {"label": label2id[label]}))

In [8]:
"""
Подготавливаем список словарей для val датасета
"""

val_list = []

for idx, row in df_valid_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    val_list.append((name, {"label": label2id[label]}))

In [9]:
"""
Проверяем первый элемент списка
"""
val_list[0]

('videos/valid/-UI8AJgs0nI.mp4', {'label': 2})

In [10]:

class VideoOneFrameDataset(Dataset):
    def __init__(self, annotations: list, transform=None,
                 subsample: int=1):
        self.annotations = annotations
        self.transform = transform
        self.subsample = subsample

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        
        # Получаем ссылку на видео 
        video_path = self.annotations[idx][0]
        
        # Получаем label видео 
        label = self.annotations[idx][1]["label"]
        
        container = av.open(video_path)
        
        frames = []
        container.seek(0)

        # Режем видео на кадры и переводим в формат RGB
        for i, frame in enumerate(container.decode(video=0)):
            
            frame = frame.to_ndarray(format="rgb24")
            frames.append(frame)
    
        # Выбираем случаный кадр
        image_list = random.sample(frames, 8)

        if self.transform:
            image_list = [self.transform(image=image)["image"][None, :, :, :] for image in image_list]
        
        image_tensors = torch.vstack(image_list)
        
        return image_tensors, label

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [18]:
"""
Для обучения попробуем взять кадр размером 640 на 640
без аугментации
"""

train_transform = A.Compose([
            A.Resize(height=320, width=320),
            A.Normalize(),
            ToTensorV2()
        ])

val_transform = A.Compose([
            A.Resize(height=320, width=320),
            A.Normalize(),
            ToTensorV2()
        ])

In [22]:
"""
Сохраняем train и val даталоадеры
"""

train_dataset = VideoOneFrameDataset(annotations=train_list,
                                     subsample=8,
                                     transform=train_transform)
val_dataset = VideoOneFrameDataset(annotations=val_list,
                                   subsample=8,
                                   transform=val_transform)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [23]:
"""
Получаем список всех предобученных моделей
"""
all_pretrained_models = timm.list_models(pretrained=True)
all_pretrained_models;

In [24]:
"""
Для обучения возьмем предобученную модель
"efficientnet"
"""

model_name = "efficientnet_b3.ra2_in1k"
model = timm.create_model(model_name, pretrained=True)
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, 15)
)
model.to(device);

In [25]:
"""
Лучше всего обучение было на 15 эпохах
"""

epochs = 15
criterion = nn.CrossEntropyLoss()
model_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

In [28]:
"""
Запускаем процесс обучения
"""

for epoch in range(epochs):

    model.train()      

    train_loss = []
    train_targets = []
    train_preds = []
    for i, (batch, targets) in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch}")):
        
        model_optimizer.zero_grad()
        
        batch_tensors = [batch_tensor.squeeze(0) for batch_tensor in torch.split(batch, 1)]
        batch_tensors = [batch.to(device) for batch in batch_tensors]
        targets = targets.to(device)
        
        outputs = [model(batch) for batch in batch_tensors]
        outputs = torch.vstack([torch.mean(output, dim=0) for output in outputs])

        loss = criterion(outputs, targets) 
        loss.backward()
        train_loss.append(loss.item())

        model_optimizer.step()
        train_targets.extend(targets.cpu().numpy())
        train_preds.extend(outputs.argmax(axis=1).cpu().numpy())

    print('Training loss:', np.mean(train_loss))
    print('Acc:', accuracy_score(train_targets, train_preds))

    model.eval()

    val_loss = []
    val_targets = []
    val_preds = []
    for i, (batch, targets) in enumerate(tqdm(val_dataloader, desc=f"Epoch: {epoch}")):
        with torch.no_grad():

            batch_tensors = [batch_tensor.squeeze(0) for batch_tensor in torch.split(batch, 1)]
            batch_tensors = [batch.to(device) for batch in batch_tensors]
            targets = targets.to(device)
        
            outputs = [model(batch) for batch in batch_tensors]
            outputs = torch.vstack([torch.mean(output, dim=0) for output in outputs])

            loss = criterion(outputs, targets) 

            val_loss.append(loss.item())
            val_targets.extend(targets.cpu().numpy())
            val_preds.extend(outputs.argmax(axis=1).cpu().numpy())           

    print('Val loss:', np.mean(val_loss))
    print('Acc:', accuracy_score(val_targets, val_preds))

   
    torch.save(model, f"model_epoch_{epoch}.pt")

Epoch: 0:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.87800535578165
Acc: 0.055944055944055944


Epoch: 0:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 2.768263655324136
Acc: 0.04838709677419355


Epoch: 1:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.760398495271339
Acc: 0.06682206682206682


Epoch: 1:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 4.69126650979442
Acc: 0.07258064516129033


Epoch: 2:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.776558006772343
Acc: 0.06526806526806526


Epoch: 2:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 2.8460965118100567
Acc: 0.06451612903225806


Epoch: 3:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.7666463503926435
Acc: 0.06604506604506605


Epoch: 3:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 2.7450887080161803
Acc: 0.06451612903225806


Epoch: 4:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.775341404891162
Acc: 0.050505050505050504


Epoch: 4:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 2.7929591440385386
Acc: 0.06451612903225806


Epoch: 5:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure
mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
mmco: unref short failure


Training loss: 2.7695566956300914
Acc: 0.06837606837606838


Epoch: 5:   0%|          | 0/31 [00:00<?, ?it/s]

Val loss: 2.814539570962229
Acc: 0.07258064516129033


Epoch: 6:   0%|          | 0/322 [00:00<?, ?it/s]

mmco: unref short failure
Missing reference picture, default is 65562
mmco: unref short failure


KeyboardInterrupt: 

In [18]:
import gc 

gc.collect()
torch.cuda.empty_cache()

#### Метрика на 15 эпохе: Training loss: 1.62 Acc: 0.475 Val loss: 2.596 Acc: 0.258 при размере изображения 640 на 640, batch_size 8, lr - 1e-3

### VideoMAE

In [18]:
"""
Читаем данные из CSV файла
"""

df_train_tr = pd.read_csv("train_new.csv")
df_valid_tr = pd.read_csv("valid_new.csv")

In [17]:
"""
Маркируем лейблы
"""

class_labels = list(df_train_tr["label"].unique())

label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

Unique classes: ['tap dancing', 'breakdancing', 'belly dancing', 'dancing charleston', 'dancing ballet', 'square dancing', 'jumpstyle dancing', 'salsa dancing', 'robot dancing', 'country line dancing', 'dancing macarena', 'mosh pit dancing', 'dancing gangnam style', 'swing dancing', 'tango dancing'].


In [15]:
"""
Скачиваем модель VideoMAE из HugginFace
"""

model_ckpt = "MCG-NJU/videomae-base"
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: ['decoder.decoder_layers.1.attention.attention.query.weight', 'decoder.decoder_layers.3.layernorm_after.bias', 'decoder.decoder_layers.0.intermediate.dense.bias', 'decoder.decoder_layers.3.attention.attention.query.weight', 'decoder.head.weight', 'decoder.decoder_layers.0.layernorm_after.bias', 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.1.attention.attention.v_bias', 'mask_token', 'decoder.decoder_layers.0.attention.output.dense.weight', 'decoder.decoder_layers.0.attention.output.dense.bias', 'decoder.decoder_layers.1.layernorm_after.bias', 'decoder.decoder_layers.0.attention.attention.q_bias', 'decoder.norm.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.decoder_layers.3.intermediate.dense.weight', 'decoder.head.bias', 'decoder.decoder_layers.1.attention.attention.q_bias', 'decoder.decoder_layers

In [62]:
"""
Устанавливаем параметры нормализации,
количество фреймов и длительность видео для обучения
"""

mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
resize_to = (height, width)

num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

In [28]:
# Преобразование для train датасета
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)


# Преобразование для valid датасета (без аугментации)
val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)


In [19]:
"""
Подготавливаем список словарей для train датасета
"""

train_dict = []

for idx, row in df_train_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    train_dict.append((name, {"label": label2id[label]}))

In [20]:
"""
Подготавливаем список словарей для val датасета
"""
val_dict = []

for idx, row in df_valid_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    val_dict.append((name, {"label": label2id[label]}))

In [None]:
"""
Создаем датасеты с видео с помощью pytorchvideo
"""

train_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=train_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)


val_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=val_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

In [32]:
"""
Задаем параметры обучения
"""

model_name = model_ckpt.split("/")[-1]
new_model_name = f"{model_name}-finetuned"
num_epochs = 8
batch_size = 6

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
)

In [33]:
"""
Задаем метрики
"""

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [34]:
def collate_fn(examples):
    pixel_values = torch.stack(
        [example["video"].permute(1, 0, 2, 3) for example in examples]
    )
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [35]:
"""
Создаем объект класса Trainer из pytorch lightning
"""

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

/home/sergey/Projects/itmo/action_recognition/videomae-base-finetuned is already a clone of https://huggingface.co/serjsaraev/videomae-base-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


In [36]:
# Обучаем на 10 эпохах
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,1.8106,2.20758,0.262834
1,1.6798,2.204916,0.297741
2,1.5431,2.14514,0.340862
3,1.3922,2.272062,0.299795
4,0.9802,2.354137,0.316222
5,1.2273,2.206298,0.355236
6,0.8035,2.242723,0.361396
7,0.6862,2.259268,0.37577


mmco: unref short failure
 (repeated 3 more times)
Missing reference picture, default is 65562
mmco: unref short failure


In [37]:
# Дообучаем еще на 10 эпохах (дообучение результата не дало)
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.7896,2.259268,0.37577
1,0.6979,2.259268,0.37577
2,0.8734,2.259268,0.37577
3,0.6812,2.259268,0.37577
4,0.6131,2.259268,0.37577
5,0.5244,2.259268,0.37577
6,0.8647,2.259268,0.37577
7,0.7842,2.259268,0.37577


mmco: unref short failure
 (repeated 20 more times)
Missing reference picture, default is 65562
mmco: unref short failure


In [191]:
torch.cuda.empty_cache()

#### Метрики на 16 эпохах: Training loss: 0.784200	 Val loss:2.259268	Val acc: 0.375770

### ResNet-152

In [5]:
"""
Читаем данные из CSV файла
"""

df_train_tr = pd.read_csv("train_new.csv")
df_valid_tr = pd.read_csv("valid_new.csv")

In [6]:
"""
Маркируем лейблы
"""

class_labels = list(df_train_tr["label"].unique())

label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

Unique classes: ['tap dancing', 'breakdancing', 'belly dancing', 'dancing charleston', 'dancing ballet', 'square dancing', 'jumpstyle dancing', 'salsa dancing', 'robot dancing', 'country line dancing', 'dancing macarena', 'mosh pit dancing', 'dancing gangnam style', 'swing dancing', 'tango dancing'].


In [7]:
"""
Подготавливаем список словарей для train датасета
"""

train_dict = []

for idx, row in df_train_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    train_dict.append((name, {"label": label2id[label]}))

In [8]:
"""
Подготавливаем список словарей для val датасета
"""

val_dict = []

for idx, row in df_valid_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    val_dict.append((name, {"label": label2id[label]}))

In [9]:
class KineticsDataModule(pytorch_lightning.LightningDataModule):

  # Dataset configuration

  _CLIP_DURATION = 2  # Duration of sampled clip for each video
  _BATCH_SIZE = 8
  _NUM_WORKERS = 2  # Number of parallel processes fetching data

  def train_dataloader(self):
    """
    Создаем train dataset (c аугментацией)
    """
    
    train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(8),
                    Lambda(lambda x: x / 255.0),
                    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(224),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
    )
    
    train_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=train_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
    decode_audio=False,
    transform=train_transform)

    return torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )

  def val_dataloader(self):
    """
    Создаем val dataset (без аугментации)
    """
    
    val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(8),
                    Lambda(lambda x: x / 255.0),
                    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                    Resize((224, 224)),
                ]
            ),
        ),
    ]
    )
    
    val_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=val_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self._CLIP_DURATION),
    decode_audio=False,
    transform=val_transform)
    
    
    return torch.utils.data.DataLoader(
        val_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )


In [10]:
def make_resnet():
  return pytorchvideo.models.resnet.create_resnet(
      input_channel=3,
      model_depth=152,
      model_num_class=15,
      norm=nn.BatchNorm3d,
      activation=nn.ReLU,
  )

In [11]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
  def __init__(self):
      super().__init__()
      self.model = make_resnet()
      self.accuracy = Accuracy(task="multiclass", num_classes=15)

  def forward(self, x):
      return self.model(x)

  def training_step(self, batch, batch_idx):
      # The model expects a video tensor of shape (B, C, T, H, W), which is the
      # format provided by the dataset
      y_hat = self.model(batch["video"])

      # Compute cross entropy loss, loss.backwards will be called behind the scenes
      # by PyTorchLightning after being returned from this method.
      loss = F.cross_entropy(y_hat, batch["label"])
      acc = self.accuracy(y_hat, batch["label"])
      # Log the train loss to Tensorboard
      self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True, logger=True)
      self.log("train_acc", acc, prog_bar=True, on_step=False, on_epoch=True, logger=True)

      return loss

  def validation_step(self, batch, batch_idx):
      y_hat = self.model(batch["video"])
      loss = F.cross_entropy(y_hat, batch["label"])
      acc = self.accuracy(y_hat, batch["label"])
      self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, logger=True)
      self.log("val_acc", acc, prog_bar=True, on_step=False, on_epoch=True, logger=True)

      return loss

  def configure_optimizers(self):
      """
      Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
      usually useful for training video models.
      """
      return torch.optim.Adam(self.parameters(), lr=5e-3)

In [12]:
"""
Создаем функцию для обучения
"""

def train():
    classification_module = VideoClassificationLightningModule()
    data_module = KineticsDataModule()
    trainer = pytorch_lightning.Trainer(max_epochs=10)
    trainer.fit(classification_module, data_module)

In [13]:
"""
Обучаем модкль на 10 эпохах
"""

train()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type               | Params
------------------------------------------------
0 | model    | Net                | 82.0 M
1 | accuracy | MulticlassAccuracy | 0     
------------------------------------------------
82.0 M    Trainable params
0         Non-trainable params
82.0 M    Total params
328.194   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Missing reference picture, default is 65562
mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure
Missing reference picture, default is 65562
Missing reference picture, default is 65562
mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure
mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure
mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure


Validation: 0it [00:00, ?it/s]

mmco: unref short failure
mmco: unref short failure


Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [83]:
torch.cuda.empty_cache()

##### Метрики: val_loss=2.670, val_acc=0.0737, train_loss=2.680, train_acc=0.0971 на 10 эпохах при batch_size 8, размеру изображения 224 на 224, lr-5e-3

### Preratrained VideoMAE on Kinetics

In [None]:
"""
Читаем данные из CSV файла
"""

df_train_tr = pd.read_csv("train_new.csv")
df_valid_tr = pd.read_csv("valid_new.csv")

In [None]:
"""
Маркируем лейблы
"""

class_labels = list(df_train_tr["label"].unique())

label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

In [None]:
"""
Скачиваем модель VideoMAE из HuggingFace
"""

model_ckpt = "MCG-NJU/videomae-base-finetuned-kinetics"
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

In [8]:
for param in model.parameters():
    param.requires_grad = False

In [9]:
for param in model.classifier.parameters():
    param.requires_grad = True

In [10]:
"""
Устанавливаем параметры нормализации,
количество фреймов и длительность видео для обучения
"""

mean = image_processor.image_mean
std = image_processor.image_std
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
resize_to = (height, width)

num_frames_to_sample = model.config.num_frames
sample_rate = 4
fps = 30
clip_duration = num_frames_to_sample * sample_rate / fps

In [11]:
"""
Подготавливаем список словарей для train датасета
"""

train_dict = []

for idx, row in df_train_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    train_dict.append((name, {"label": label2id[label]}))

In [12]:
"""
Подготавливаем список словарей для val датасета
"""
val_dict = []

for idx, row in df_valid_tr.iterrows():
    
    name = row["filename"]
    label = row["label"]
    val_dict.append((name, {"label": label2id[label]}))

In [13]:
# Преобразование для train датасета
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)


# Преобразование для valid датасета (без аугментации)
val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)

In [14]:
"""
Создаем датасеты с видео с помощью pytorchvideo
"""

train_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=train_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
    decode_audio=False,
    transform=train_transform,
)


val_dataset = pytorchvideo.data.LabeledVideoDataset(
    labeled_video_paths=val_dict,
    clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
    decode_audio=False,
    transform=val_transform,
)

In [15]:
"""
Задаем параметры обучения
"""

model_name = model_ckpt.split("/")[-1]
new_model_name = f"{model_name}-finetuned"
num_epochs = 5
batch_size = 8

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
)

In [16]:
"""
Задаем метрики
"""

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [17]:
def collate_fn(examples):
    pixel_values = torch.stack(
        [example["video"].permute(1, 0, 2, 3) for example in examples]
    )
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [18]:
"""
Создаем объект класса Trainer из pytorch lightning
"""

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

/home/sergey/Projects/itmo/action_recognition/videomae-base-finetuned-kinetics-finetuned is already a clone of https://huggingface.co/serjsaraev/videomae-base-finetuned-kinetics-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


In [19]:
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,0.8946,1.029962,0.726899
1,0.3988,0.88087,0.747433
2,0.5135,0.849743,0.765914
3,0.4215,0.839303,0.761807
4,0.5874,0.833318,0.7577


mmco: unref short failure
mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
mmco: unref short failure


In [20]:
for param in model.parameters():
    param.requires_grad = True

In [21]:
"""
Задаем параметры обучения
"""

model_name = model_ckpt.split("/")[-1]
new_model_name = f"{model_name}-finetuned"
num_epochs = 15
batch_size = 8

args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
)

In [22]:
"""
Создаем объект класса Trainer из pytorch lightning
"""

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

/home/sergey/Projects/itmo/action_recognition/videomae-base-finetuned-kinetics-finetuned is already a clone of https://huggingface.co/serjsaraev/videomae-base-finetuned-kinetics-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


In [19]:
train_results = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,1.3662,1.422348,0.691992
1,0.4574,0.904139,0.743326
2,0.5387,0.858284,0.747433
3,0.4267,0.831807,0.755647
4,0.5572,0.834854,0.749487
5,0.5291,0.81589,0.7577
6,0.332,0.825939,0.749487
7,0.3562,0.853387,0.749487
8,0.3396,0.849226,0.753593
9,0.4382,0.859426,0.741273


mmco: unref short failure
mmco: unref short failure
 (repeated 2 more times)
Missing reference picture, default is 65562
mmco: unref short failure


Several commits (2) will be pushed upstream.
