## Imports

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import os
import pickle
import gc
import json

import av
from huggingface_hub import hf_hub_download

from transformers import VivitImageProcessor, VivitModel
from transformers import AutoImageProcessor, VideoMAEModel
from transformers import TimesformerConfig, TimesformerModel
from transformers import XCLIPProcessor, XCLIPModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
from transformers import pipeline

from datasets import load_dataset

import librosa
from moviepy.editor import VideoFileClip

from minio import Minio

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


## Auxiliary code

In [127]:
np.random.seed(0)
torch.manual_seed(0)


def get_model_params_count(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def read_video_pyav(container, indices):
    """
    Decode the video with PyAV decoder.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): List of frame indices to decode.

    Returns:
        np.ndarray: Decoded frames of shape (num_frames, height, width, 3).
    """
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    if not frames:
        raise ValueError("No frames were decoded. Check the frame indices and video file.")
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """
    Sample a given number of frame indices from the video.

    Args:
        clip_len (int): Total number of frames to sample.
        frame_sample_rate (int): Sample every n-th frame.
        seg_len (int): Total number of frames in the video.

    Returns:
        np.ndarray: List of sampled frame indices.
    """
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len < converted_len:
        end_idx = seg_len - 1
    else:
        end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


def process_video(file_path, image_processor, model, clip_len=32, frame_sample_rate=1, device='cpu'):
    """
    Process a single video and return the last_hidden_states.

    Args:
        file_path (str): Path to the video file.
        image_processor (VivitImageProcessor): Image processor instance.
        model (VivitModel): ViViT model instance.
        clip_len (int, optional): Number of frames to sample. Defaults to 32.
        frame_sample_rate (int, optional): Sampling rate. Defaults to 1.
        device (str, optional): Device to run the model on ('cpu' or 'cuda'). Defaults to 'cpu'.

    Returns:
        torch.Tensor: The last hidden states from the model.
    """
    try:
        container = av.open(file_path)
        video_stream = container.streams.video[0]
        total_frames = video_stream.frames
        if total_frames is None:
            # Sometimes PyAV cannot retrieve frame count; estimate it
            total_frames = int(video_stream.duration * video_stream.average_rate)
        
        indices = sample_frame_indices(clip_len, frame_sample_rate, total_frames)
        video = read_video_pyav(container, indices)
        
        # Preprocess frames
        inputs = image_processor(list(video), return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        
        last_hidden_states = outputs.last_hidden_state.cpu()
        return last_hidden_states
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

## ViViT

Download the model

In [3]:
IS_LOADED = True

if not IS_LOADED:
    model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")
    image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
    
    model.save_pretrained('models/vivit_model')
    image_processor.save_pretrained('models/vivit_image_processor')
else:
    model = VivitModel.from_pretrained("models/vivit_model")
    image_processor = VivitImageProcessor.from_pretrained("models/vivit_image_processor")

Download video + move model to gpu

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

data_dir = 'data'
video_files = [f for f in os.listdir(data_dir) if f.lower().endswith('.mp4')]
video_file = 'Битлджус 2 (Beetlejuice Beetlejuice) - ненужный сиквел хорошего фильма [Глянул на днях].mp4'
file_path = os.path.join(data_dir, video_file)

Split video into several windows (because the model takes videos with 32 frames)

In [5]:
container = av.open(file_path)
video_stream = container.streams.video[0]
total_frames = video_stream.frames

if total_frames == 0:
    total_frames = int(video_stream.duration * video_stream.average_rate * video_stream.time_base)

WINDOW_SIZE = 32
clip_len = 2 * WINDOW_SIZE
frame_sample_rate = 10
indices = sample_frame_indices(clip_len, frame_sample_rate, total_frames)
video = read_video_pyav(container, indices)

In [21]:
total_frames

16183

In [20]:
fps = float(video_stream.average_rate)
duration = float(video_stream.duration * video_stream.time_base)
frame_sample_rate = max(1, int(round((fps * 64) / 96)))

fps, duration, frame_sample_rate

(25.0, 647.32, 17)

Inference

In [6]:
%%time
inputs = image_processor(list(video), return_tensors="pt")
inputs = {k: torch.concat(torch.split(v, WINDOW_SIZE, dim=1)).to(device) for k, v in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

output = outputs.pooler_output.cpu()

CPU times: user 1.35 s, sys: 73.9 ms, total: 1.43 s
Wall time: 1.43 s


In [9]:
inputs['pixel_values'].shape

torch.Size([2, 32, 3, 224, 224])

**Вывод**

Время инференса 1 окна на GPU -- 0.6 секунд

Если брать 1 минуту и уменьшить частоту с 25 Гц до 2.5 Гц, то понадобится `math.ceil(150 / 32) * 0.6 = 3` секунды

## TimeSformer

Downlaod the model

In [4]:
IS_LOADED = True

if not IS_LOADED:
    model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400")
    image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
    
    model.save_pretrained('models/timesformer_model')
    image_processor.save_pretrained('models/timesformer_image_processor')
else:
    model = TimesformerModel.from_pretrained("models/timesformer_model")
    image_processor = AutoImageProcessor.from_pretrained("models/timesformer_image_processor")

config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Download video + move model to gpu

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

data_dir = 'data'
video_files = [f for f in os.listdir(data_dir) if f.lower().endswith('.mp4')]
video_file = 'Битлджус 2 (Beetlejuice Beetlejuice) - ненужный сиквел хорошего фильма [Глянул на днях].mp4'
file_path = os.path.join(data_dir, video_file)

Split video into several windows

In [6]:
container = av.open(file_path)
video_stream = container.streams.video[0]
total_frames = video_stream.frames

if total_frames == 0:
    total_frames = int(video_stream.duration * video_stream.average_rate * video_stream.time_base)

WINDOW_SIZE = 8
clip_len = 19 * WINDOW_SIZE
frame_sample_rate = 10
indices = sample_frame_indices(clip_len, frame_sample_rate, total_frames)
video = read_video_pyav(container, indices)

Inference

In [19]:
%%time
inputs = image_processor(list(video), return_tensors="pt")
inputs = {k: torch.concat(torch.split(v, WINDOW_SIZE, dim=1)).to(device) for k, v in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

output = outputs.last_hidden_state.cpu()

CPU times: user 3.03 s, sys: 79.6 ms, total: 3.11 s
Wall time: 3.06 s


In [27]:
print("Number of model's parameters:", get_model_params_count(model))

Number of model's parameters: 121258752


## Video-MAE

Download the model

In [8]:
IS_LOADED = True

if not IS_LOADED:
    image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
    model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
    
    model.save_pretrained('models/videomae_model')
    image_processor.save_pretrained('models/videomae_image_processor')
else:
    model = VideoMAEModel.from_pretrained("models/videomae_model")
    image_processor = AutoImageProcessor.from_pretrained("models/videomae_image_processor")


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/repos/91/b5/91b59ab5f7189d5b5d9289172d47e785957f89733f8ed6e444edc31a85cef58a/bc053ca2840a038b1068269a4eec06ca569689e9a1ed9376a5b2b8a111be5290?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1727261723&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNzI2MTcyM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy85MS9iNS85MWI1OWFiNWY3MTg5ZDViNWQ5Mjg5MTcyZDQ3ZTc4NTk1N2Y4OTczM2Y4ZWQ2ZTQ0NGVkYzMxYTg1Y2VmNThhL2JjMDUzY2EyODQwYTAzOGIxMDY4MjY5YTRlZWMwNmNhNTY5Njg5ZTlhMWVkOTM3NmE1YjJiOGExMTFiZTUyOTA%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=GHvYMJVt-qLXcQ1ISkZQI-iQHXgDoONDc3neSHmb9mgLXN5d-c9kwFJbmtYZaLvJHV3PZgJwTLYjSiC1sX5tpR%7E1LhmiRM-RGa5%7EZmOyZPP6gESUizf9moaxxbuHq1prMN4r8pPI8NxmOAnHOPPPb1vQ6%7EkJmupD6taA7B8SnDlIV%7En2FTtA0JDvlmppxviNqVebm0NbGAOoVRJdFyVdK5jM41AzTDrpfx7EAnlg2X8pM3

model.safetensors:  83%|########3 | 315M/377M [00:00<?, ?B/s]

Download video + move model to gpu

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

data_dir = 'data'
video_files = [f for f in os.listdir(data_dir) if f.lower().endswith('.mp4')]
video_file = 'Битлджус 2 (Beetlejuice Beetlejuice) - ненужный сиквел хорошего фильма [Глянул на днях].mp4'
file_path = os.path.join(data_dir, video_file)

Split video into several windows

In [7]:
container = av.open(file_path)
video_stream = container.streams.video[0]
total_frames = video_stream.frames

if total_frames == 0:
    total_frames = int(video_stream.duration * video_stream.average_rate * video_stream.time_base)

WINDOW_SIZE = 16
clip_len = 10 * WINDOW_SIZE
frame_sample_rate = 10
indices = sample_frame_indices(clip_len, frame_sample_rate, total_frames)
video = read_video_pyav(container, indices)

Inference

In [11]:
%%time
inputs = image_processor(list(video), return_tensors="pt")
inputs = {k: torch.concat(torch.split(v, WINDOW_SIZE, dim=1)).to(device) for k, v in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

output = outputs.last_hidden_state.cpu()

  return torch.tensor(value)


CPU times: user 2.59 s, sys: 63.5 ms, total: 2.65 s
Wall time: 2.62 s


In [12]:
print("Number of model's parameters:", get_model_params_count(model))

Number of model's parameters: 86227200


## X-CLIP (baseline)

Download the model

In [128]:
IS_LOADED = True
model_name = "microsoft/xclip-base-patch16-zero-shot"

if not IS_LOADED:
    image_processor = XCLIPProcessor.from_pretrained(model_name)
    model = XCLIPModel.from_pretrained(model_name)
    
    model.save_pretrained('models/xclip_model')
    image_processor.save_pretrained('models/xclip_image_processor')
else:
    model = XCLIPModel.from_pretrained("models/xclip_model")
    image_processor = XCLIPProcessor.from_pretrained("models/xclip_image_processor")


Download video + move model to gpu

In [129]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

data_dir = 'data'
video_files = [f for f in os.listdir(data_dir) if f.lower().endswith('.mp4')]
video_file = 'Битлджус 2 (Beetlejuice Beetlejuice) - ненужный сиквел хорошего фильма [Глянул на днях].mp4'
file_path = os.path.join(data_dir, video_file)

Split video into several windows

In [130]:
container = av.open(file_path)
video_stream = container.streams.video[0]
total_frames = video_stream.frames

if total_frames == 0:
    total_frames = int(video_stream.duration * video_stream.average_rate * video_stream.time_base)

WINDOW_SIZE = 32
clip_len = WINDOW_SIZE
frame_sample_rate = 60
indices = sample_frame_indices(clip_len, frame_sample_rate, total_frames)
video = read_video_pyav(container, indices)

Inference

In [131]:
categories = """Auto-moto,
Anime,
Audiobooks,
Business,
Video games,
Interview,
Art,
Movie,
Beauty,
Cooking,
Life Hacks,
Music,
Cartoons,
News,
Training,
Hunting and fishing,
Politics,
Psychology,
Journeys,
Serials,
Sport,
Humor,
Lifestyle,
Realty,
Health,
Nature,
Design,
Machinery and equipment,
Business and entrepreneurship,
Culture,
Religion,
Construction and renovation,
Garden and vegetable garden,
Food,
Entertainment,
Esotericism,
The science,
Audio,
Technology and the Internet,
TV shows,
For children,
Hobby,
Various,
Animals,
News and media,
Films,
Bloggers,
Podcasts,
"""

text_prompt = [f"The video belongs to category '{x}'" for x in categories.split(',\n') if x.strip() != '']

In [138]:
inputs['input_ids'].shape

torch.Size([48, 14])

In [136]:
inputs['attention_mask'].shape, inputs['pixel_values'].shape,

(torch.Size([48, 14]), torch.Size([1, 32, 3, 224, 224]))

In [None]:
image_processor

In [132]:
%%time
inputs = image_processor(
    text=text_prompt,
    videos=list(video),
    return_tensors="pt",
    padding=True
)

inputs['pixel_values'] = torch.concat(torch.split(inputs['pixel_values'], WINDOW_SIZE, dim=1))
inputs = {k: v.to(device) for k, v in inputs.items()}

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

CPU times: user 859 ms, sys: 232 ms, total: 1.09 s
Wall time: 2.08 s


Какое место заняла истинная категория (45, "Фильмы")?

In [22]:
inputs['pixel_values'].shape

torch.Size([1, 32, 3, 224, 224])

In [29]:
outputs.logits_per_video.shape

torch.Size([1, 48])

In [20]:
outputs.vision_model_output.pooler_output.shape

torch.Size([32, 768])

In [17]:
outputs.video_embeds.shape

torch.Size([1, 512])

In [26]:
outputs.mit_output.pooler_output.shape

torch.Size([1, 512])

In [16]:
outputs.keys()

odict_keys(['logits_per_video', 'logits_per_text', 'text_embeds', 'video_embeds', 'text_model_output', 'vision_model_output', 'mit_output'])

In [14]:
logits = outputs.logits_per_video.cpu()

prob = logits.softmax(dim=1).numpy().reshape(-1)
order = logits.argsort(dim=1, descending=True).numpy().reshape(-1)
place = np.flatnonzero(order == 45)[0] + 1

print(f"Правильная категория заняла {place}-ое место")
for x, y in zip(np.array(text_prompt)[order[:place]], prob[order[:place]]):
    print(round(y, 3), '|', x)

Правильная категория заняла 8-ое место
0.116 | The video belongs to category 'Art'
0.103 | The video belongs to category 'Interview'
0.101 | The video belongs to category 'Esotericism'
0.045 | The video belongs to category 'Design'
0.039 | The video belongs to category 'Beauty'
0.037 | The video belongs to category 'Journeys'
0.037 | The video belongs to category 'Sport'
0.036 | The video belongs to category 'Films'


Inference (russian)

In [14]:
rus_categories = """
Авто-мото
Аниме
Аудиокниги
Бизнес
Видеоигры
Интервью
Искусство
Кино
Красота
Кулинария
Лайфхаки
Музыка
Мультфильмы
Новости
Обучение
Охота_и_рыбалка
Политика
Психология
Путешествия
Сериалы
Спорт
Юмор
Лайфстайл
Недвижимость
Здоровье
Природа
Дизайн
Техника_и_оборудование
Бизнес_и_предпринимательство
Культура
Религия
Строительство_и_ремонт
Сад_и_огород
Еда
Развлечения
Эзотерика
Наука
Аудио
Технологии_и_интернет
Телепередачи
Детям
Хобби
Разное
Животные
Новости_и_СМИ
Фильмы
Блогеры
"""

rus_text_prompt = [f"Видео принадлежит категории '{x}'" for x in rus_categories.split() if x.strip() != '']

In [17]:
%%time
inputs = image_processor(
    text=rus_text_prompt,
    videos=list(video),
    return_tensors="pt",
    padding=True
)

inputs['pixel_values'] = torch.concat(torch.split(inputs['pixel_values'], WINDOW_SIZE, dim=1))
inputs = {k: v.to(device) for k, v in inputs.items()}

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

CPU times: user 613 ms, sys: 24.9 ms, total: 638 ms
Wall time: 615 ms


Какое место заняла истинная категория (45, "Фильмы")?

In [18]:
logits = outputs.logits_per_video.cpu()

prob = logits.softmax(dim=1).numpy().reshape(-1)
order = logits.argsort(dim=1, descending=True).numpy().reshape(-1)
place = np.flatnonzero(order == 45)[0] + 1

print(f"Правильная категория заняла {place}-ое место")
for x, y in zip(np.array(text_prompt)[order[:place]], prob[order[:place]]):
    print(round(y, 3), '|', x)

Правильная категория заняла 1-ое место
0.099 | The video belongs to category 'Films'


Размер модели

In [21]:
get_model_params_count(model)

194941441

## Whisper

Download the model

In [3]:
IS_LOADED = True


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"

if not IS_LOADED:
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    processor = AutoProcessor.from_pretrained(model_id)
    
    model.save_pretrained('models/whisper_model')
    processor.save_pretrained('models/whisper_processor')
else:
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "models/whisper_model", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    processor = AutoProcessor.from_pretrained("models/whisper_processor")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
model = model.to(device)
model.eval()
get_model_params_count(model)

1541570560

In [18]:
result.keys()

dict_keys(['text'])

In [16]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])

README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

(…)-00000-of-00001-913508124a40cb97.parquet:   0%|          | 0.00/1.98M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Upguards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, Next man!


In [57]:
dataset[0]["audio"]

{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
        0.0005188 ]),
 'sampling_rate': 16000}

In [54]:
processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt", dtype=torch.float16)

{'input_features': tensor([[[ 0.0436, -0.1703, -0.1855,  ...,  0.1478,  0.1967,  0.3374],
         [ 0.1411, -0.0728, -0.0880,  ...,  0.2454,  0.2942,  0.4350],
         [-0.0206, -0.0893, -0.0648,  ...,  0.4284,  0.4698,  0.4153],
         ...,
         [-0.6398, -0.6398, -0.6398,  ..., -0.0424, -0.1619, -0.0533],
         [-0.6398, -0.6398, -0.6398,  ..., -0.1244, -0.1859, -0.0593],
         [-0.6398, -0.6398, -0.6398,  ..., -0.1040, -0.1921, -0.0369]]])}

In [56]:
sample = dataset[0]["audio"]
inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
input_features = inputs.input_features.type(torch.float16).to(device)

with torch.no_grad():
    # Forward pass through the encoder
    encoder_outputs = model.model.encoder(input_features)

# Access the last hidden state
last_hidden_state = encoder_outputs.last_hidden_state

print("Shape of the last hidden state:", last_hidden_state.shape)

Shape of the last hidden state: torch.Size([1, 1500, 1280])


## Wav2Vec-2

Загрузка модели

In [3]:
IS_LOADED = True

if not IS_LOADED:
    MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-russian"
    
    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    
    model.save_pretrained('models/wav2vec2_model')
    processor.save_pretrained('models/wav2vec2_processor')
else:
    model = Wav2Vec2ForCTC.from_pretrained("models/wav2vec2_model")
    processor = Wav2Vec2Processor.from_pretrained("models/wav2vec2_processor")


Download video + move model to gpu

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [7]:
get_model_params_count(model)

315478695

In [15]:
# Preprocessing the datasets.
# We need to read the audio files as arrays
def extract_audio_from_video(video_path, output_audio_path="temp_audio.wav", duration=60):
    """
    Extract audio from the first `duration` seconds of the video.

    Args:
    video_path (str): Path to the input video file.
    output_audio_path (str): Path to save the extracted audio file.
    duration (int): The duration of audio to extract in seconds (default: 60 seconds).

    Returns:
    str: Path to the saved audio file.
    """
    video_clip = VideoFileClip(video_path)
    
    # Extract only the first `duration` seconds of audio
    audio_clip = video_clip.audio.subclip(0, duration)
    
    # Save the audio to the specified path
    audio_clip.write_audiofile(output_audio_path, codec='pcm_s16le')
    
    # Close the clip to release resources
    video_clip.close()
    audio_clip.close()
    
    return output_audio_path


def speech_file_to_array_fn(batch):
    # Extract the audio from the mp4 file
    audio_path = extract_audio_from_video(batch["path"])

    # Load the extracted audio using librosa
    speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

    # Prepare for Wav2Vec2 inference
    batch["speech"] = speech_array
    return batch


test_dataset = [
    {"path": "data/49.0_bloggers/Аниме  ＂Волшебница и злой офицер＂.mp4"},
]
test_dataset = [speech_file_to_array_fn(item) for item in test_dataset]

# Convert the audio into a format suitable for Wav2Vec2
inputs = processor([item["speech"] for item in test_dataset], sampling_rate=16_000, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

MoviePy - Writing audio in temp_audio.wav


                                                                                                                                                        

MoviePy - Done.




NameError: name 'processor' is not defined

Inference

In [58]:
with torch.no_grad():
    output = model(inputs['input_values'], attention_mask=inputs['attention_mask'])

# Decode predictions
logits = output.logits
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

# Print predictions and references
for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Prediction:", predicted_sentence)

----------------------------------------------------------------------------------------------------
Prediction: друз я всем брвет вонагоналя оне могает с вами макс еслинлкам разгово давноя вы с своей второй половенькой со свое женое  ебовницы   чутка сое девошкое смотреле что-нибудь романтическое нежно приятное где а вы деяте стипомни только снемашкое но еще зруб другу такот сегодно я вас засталлю посмотреть месте слебимом челоедувся быт куртое гадте а упкорм дите читься бзете како-то пробушенный свет сет его бнинку и включите тонима  если вас еще не убедил  тодайте покажонтреля расскажу о б это немышке я надеюсь что вом захочется ты сдеть поство пролитвемя слебины роднымите боле подткою хороше вес и почем дастоми пважно етноно  сегодня нас занимаю тысячи двацятого года жедисть даже второй сезон сейчасны о нем поэтому яочень хочу чтобы вы ты фонули сегодня павеже как и пону я приброснотратого немо  сенимо больше погнале смотреться ми паено         мире охоченом гнуснобюдянем зло орган

In [16]:
inputs['input_values'].shape

torch.Size([1, 10359040])

In [34]:
from huggingface_hub import login
from datasets import load_dataset


TOKEN = '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'
SAMPLES = 5

login(token=TOKEN)
dataset = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "ru",
    split=f"test[:{SAMPLES}]",
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/eduard/.cache/huggingface/token
Login successful


n_shards.json:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

ru_train_0.tar:   0%|          | 0.00/976M [00:00<?, ?B/s]

ru_dev_0.tar:   0%|          | 0.00/392M [00:00<?, ?B/s]

ru_test_0.tar:   0%|          | 0.00/397M [00:00<?, ?B/s]

ru_other_0.tar:   0%|          | 0.00/597M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/a3/86/a386bf65687d8a6928c1ea57c383aa3faade32f5171150e25af3fc1cfc273db8/ed2a884829f1d4fc92bf654699fa5b518afdf867155b59cd38e4b458c50450d8?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27ru_other_0.tar%3B+filename%3D%22ru_other_0.tar%22%3B&response-content-type=application%2Fx-tar&Expires=1727445161&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNzQ0NTE2MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2EzLzg2L2EzODZiZjY1Njg3ZDhhNjkyOGMxZWE1N2MzODNhYTNmYWFkZTMyZjUxNzExNTBlMjVhZjNmYzFjZmMyNzNkYjgvZWQyYTg4NDgyOWYxZDRmYzkyYmY2NTQ2OTlmYTViNTE4YWZkZjg2NzE1NWI1OWNkMzhlNGI0NThjNTA0NTBkOD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=rN9sq6mipUcfKH1V-rWdGzj2gb7ed4OF1uZG00PalTS7Wr76wC2gy6383MbXkYZT9Df2YL%7ElBJXzdpdtB8d%7E6tmPeCr8-cMlMH-xrH6%7ESuqe944%7E90E3BahL6bYP2wzIVt2NHfPOy9LeTCkLqgWX%7EnekWzfdalGgR

ru_other_0.tar:  32%|###1      | 189M/597M [00:00<?, ?B/s]

ru_invalidated_0.tar:   0%|          | 0.00/387M [00:00<?, ?B/s]

ru_validated_0.tar:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

ru_validated_1.tar:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

ru_validated_2.tar:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

ru_validated_3.tar:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

ru_validated_4.tar:   0%|          | 0.00/114M [00:00<?, ?B/s]

transcript/ru/train.tsv:   0%|          | 0.00/9.85M [00:00<?, ?B/s]

transcript/ru/dev.tsv:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

transcript/ru/test.tsv:   0%|          | 0.00/3.64M [00:00<?, ?B/s]

transcript/ru/other.tsv:   0%|          | 0.00/6.55M [00:00<?, ?B/s]

transcript/ru/invalidated.tsv:   0%|          | 0.00/3.79M [00:00<?, ?B/s]

validated.tsv:   0%|          | 0.00/61.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 26377it [00:00, 323197.70it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10203it [00:00, 315544.67it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10203it [00:00, 316167.12it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 17456it [00:00, 320471.02it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10018it [00:00, 316376.06it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 32351it [00:00, 323479.46it/s][A
Reading metadata...: 64699it [00:00, 307393.36it/s][A
Reading metadata...: 95493it [00:00, 293935.88it/s][A
Reading metadata...: 124951it [00:00, 289836.97it/s][A
Reading metadata...: 163387it [00:00, 292140.27it/s][A


In [38]:
# We need to read the audio files as arrays
def speech_file_to_array_fn_v2(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

dataset = dataset.map(speech_file_to_array_fn_v2)
inputs_2 = processor(dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [65]:
with torch.no_grad():
    output_2 = model(inputs_2['input_values'].to(device), attention_mask=inputs_2['attention_mask'].to(device))

# Decode predictions
logits = output_2.logits
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

# Print predictions and references
for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Prediction:", predicted_sentence)
    print("Target:", dataset['sentence'][i])

----------------------------------------------------------------------------------------------------
Prediction: масштабы финансово-экономического кризиса и темпывого распространения застали самых опытных специалистов мирах врасплох
Target: МАСШТАБЫ ФИНАНСОВО-ЭКОНОМИЧЕСКОГО КРИЗИСА И ТЕМПЫ ЕГО РАСПРОСТРАНЕНИЯ ЗАСТАЛИ САМЫХ ОПЫТНЫХ СПЕЦИАЛИСТОВ МИРА ВРАСПЛОХ.
----------------------------------------------------------------------------------------------------
Prediction: к сожалению эти предложения не носли отражения в тексте
Target: К СОЖАЛЕНИЮ, ЭТИ ПРЕДЛОЖЕНИЯ НЕ НАШЛИ ОТРАЖЕНИЯ В ТЕКСТЕ.
----------------------------------------------------------------------------------------------------
Prediction: наконец я хочу поблагодарить всех присутствующих здесь сегодня участников
Target: НАКОНЕЦ, Я ХОЧУ ПОБЛАГОДАРИТЬ ВСЕХ ПРИСУТСТВУЮЩИХ ЗДЕСЬ СЕГОДНЯ УЧАСТНИКОВ.
----------------------------------------------------------------------------------------------------
Prediction: толпа звереет будет 

**Вывод**

Модель -- какашка, не использовать

## GigaAM

In [30]:
import hydra
import soundfile as sf
from omegaconf import OmegaConf
import locale

import torch
import torch.nn as nn


locale.getpreferredencoding = lambda: "UTF-8"

# !wget -P /models/giga_am/ https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ssl_model_weights.ckpt
# !wget -P /models/giga_am/ https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/encoder_config.yaml
# !wget -P /models/giga_am/ https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav

In [32]:
def extract_audio_from_video(video_path, output_audio_path="temp_audio.wav"):
    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(output_audio_path, codec='pcm_s16le')
    return output_audio_path


class SpecScaler(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.log(x.clamp_(1e-9, 1e9))


device = "cuda" if torch.cuda.is_available() else "cpu"
dirname = 'models/giga_am/'
encoder_config = dirname+"encoder_config.yaml"
model_weights = dirname+"ssl_model_weights.ckpt"
audio_path = dirname+"example.wav"

conf = OmegaConf.load(encoder_config)

encoder = hydra.utils.instantiate(conf.encoder)
ckpt = torch.load(model_weights, map_location="cpu")
encoder.load_state_dict(ckpt, strict=True)
encoder.to(device)
encoder.eval()

feature_extractor = hydra.utils.instantiate(conf.feature_extractor)

audio_signal, _ = sf.read(audio_path, dtype="float32")
features = feature_extractor(torch.tensor(audio_signal).float())
features = features.to(device)

      ckpt = torch.load(model_weights, map_location="cpu")
    


In [34]:
%%time

with torch.no_grad():
    encoded, _ = encoder.forward(
        audio_signal=features.unsqueeze(0),
        length=torch.tensor([features.shape[-1]]).to(device),
    )
    print(f"encoded signal shape: {encoded.shape}")

encoded signal shape: torch.Size([1, 768, 283])
CPU times: user 68.5 ms, sys: 26.1 ms, total: 94.7 ms
Wall time: 331 ms


Кодируем аудио со всех видео

In [35]:
def extract_audio_from_video(video_path, output_audio_path="temp_audio.wav", duration=60):
    video_clip = VideoFileClip(video_path)
    
    # Extract only the first `duration` seconds of audio
    audio_clip = video_clip.audio.subclip(0, duration)
    
    # Save the audio to the specified path
    audio_clip.write_audiofile(output_audio_path, codec='pcm_s16le')
    
    # Close the clip to release resources
    video_clip.close()
    audio_clip.close()
    
    return output_audio_path


def speech_file_to_array_fn(batch):
    # Extract the audio from the mp4 file
    audio_path = extract_audio_from_video(batch["path"])

    # Load the extracted audio using librosa
    speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

    # Prepare for Wav2Vec2 inference
    batch["speech"] = speech_array
    return batch


test_dataset = [
    {"path": "data/49.0_bloggers/Аниме  ＂Волшебница и злой офицер＂.mp4"},
]
test_dataset = [speech_file_to_array_fn(item) for item in test_dataset]
test_dataset[0]['speech'].shape
features_2 = feature_extractor(torch.tensor(test_dataset[0]['speech']).float()).to(device)

MoviePy - Writing audio in temp_audio.wav


                                                                                                                                                        

MoviePy - Done.


In [36]:
%%time

with torch.no_grad():
    encoded_2, _ = encoder.forward(
        audio_signal=features_2.unsqueeze(0),
        length=torch.tensor([features_2.shape[-1]]).to(device),
    )
    print(f"encoded signal shape: {encoded_2.shape}")

encoded signal shape: torch.Size([1, 768, 1501])
CPU times: user 13.9 ms, sys: 11.8 ms, total: 25.7 ms
Wall time: 37.9 ms


In [None]:
audio_signal, _ = sf.read(audio_path, dtype="float32")
features = feature_extractor(torch.tensor(audio_signal).float())
features = features.to(device)

## Stuff

In [20]:
df = pd.read_excel('Categories.xlsx')
df.head()

Unnamed: 0,id,lvl,parent id,name,name (eng),Unnamed: 5,url1,url2,url3,url4,url5,url6,url7,url8,url9,url10,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,0.0,1.0,0.0,Авто-мото,Auto-moto,Вадим,https://rutube.ru/video/77fb221d8c1e75da78bf40...,https://rutube.ru/video/a3a5e23a53a4f295210d79...,https://rutube.ru/shorts/f508b3c1b7efbd72a710f...,https://rutube.ru/video/b2784e7803b339ec734609...,https://rutube.ru/video/671040092dd0f2f25da6ce...,https://rutube.ru/video/a4f693322a0b3a31d2f3e4...,https://rutube.ru/video/460d0196cbfe0a76eb3c01...,https://rutube.ru/video/d83bd4cda356071372e701...,https://rutube.ru/video/0d6f5d3d204b3bcc24d62f...,https://rutube.ru/video/f1869f4b1e664a7886ff86...,,,
1,1.0,1.0,1.0,Аниме,Anime,Вадим,https://rutube.ru/video/71bae0ad0ee61339fc8fae...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/4313978b579b72bb9a19fc...,https://rutube.ru/video/09577d13d97015440875a8...,https://rutube.ru/video/d9229991afcc92464e04d3...,https://rutube.ru/video/6c96553552c8f88866e5f2...,https://rutube.ru/video/ade8f3276e6312e86a9ce5...,https://rutube.ru/video/5e6a566bea497140f58bc0...,https://rutube.ru/video/2e6c5019183bea0a449474...,https://rutube.ru/video/b7de1ae017b850e5f20152...,,,
2,2.0,1.0,2.0,Аудиокниги,Audiobooks,Вадим,https://rutube.ru/video/ed11140b6e776891aa92c4...,https://rutube.ru/video/51b377e74c82fa13252059...,https://rutube.ru/video/3fea6bd428e02c38e32dd7...,https://rutube.ru/video/44304ec49790b2c98d05f4...,https://rutube.ru/video/36f8736e9095805c852b20...,https://rutube.ru/video/f19ad1c071d7dd18575af7...,https://rutube.ru/video/82a4d82bdfa4669170ade5...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/527867fac9d91b32d77d7b...,https://rutube.ru/video/ef27cc0068154d708c7c54...,,,
3,3.0,1.0,3.0,Бизнес,Business,Вадим,https://rutube.ru/video/5b6a2166ff1ba47314baf6...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/b2b9172b9d4264d11145a9...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/c1f56ba54b222ae9f7c9b3...,https://rutube.ru/video/ff2a8becb2fa7f5f82bba4...,https://rutube.ru/video/158265763c7782618af6e4...,https://rutube.ru/video/a49a0b07c149af2b4ed4cc...,,,
4,4.0,1.0,4.0,Видеоигры,Video games,Вадим,https://rutube.ru/video/c2c5a9f12842dc688f95c4...,https://rutube.ru/video/75029800c1b279e65ea37b...,https://rutube.ru/video/664e41a9df7197ad439c67...,https://rutube.ru/video/2ecb0a65a836507ea111ee...,https://rutube.ru/video/6bed3415e99727b9a8e8cb...,https://rutube.ru/video/7fe2d08520b032d3f631e3...,https://rutube.ru/video/eb400b42bc6de612a96fc3...,https://rutube.ru/video/25696c40ca62dbb73bad1a...,https://rutube.ru/video/5a02ffbb2b48f403f7c7f8...,https://rutube.ru/video/0acda6bbe6ced9fb7beb07...,,,


In [33]:
df.drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'], inplace=True)

In [32]:
df = df.iloc[]

Unnamed: 0,id,lvl,parent id,name,name (eng),Unnamed: 5,url1,url2,url3,url4,url5,url6,url7,url8,url9,url10,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,0.0,1.0,0.0,Авто-мото,Auto-moto,Вадим,https://rutube.ru/video/77fb221d8c1e75da78bf40...,https://rutube.ru/video/a3a5e23a53a4f295210d79...,https://rutube.ru/shorts/f508b3c1b7efbd72a710f...,https://rutube.ru/video/b2784e7803b339ec734609...,https://rutube.ru/video/671040092dd0f2f25da6ce...,https://rutube.ru/video/a4f693322a0b3a31d2f3e4...,https://rutube.ru/video/460d0196cbfe0a76eb3c01...,https://rutube.ru/video/d83bd4cda356071372e701...,https://rutube.ru/video/0d6f5d3d204b3bcc24d62f...,https://rutube.ru/video/f1869f4b1e664a7886ff86...,,,
1,1.0,1.0,1.0,Аниме,Anime,Вадим,https://rutube.ru/video/71bae0ad0ee61339fc8fae...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/4313978b579b72bb9a19fc...,https://rutube.ru/video/09577d13d97015440875a8...,https://rutube.ru/video/d9229991afcc92464e04d3...,https://rutube.ru/video/6c96553552c8f88866e5f2...,https://rutube.ru/video/ade8f3276e6312e86a9ce5...,https://rutube.ru/video/5e6a566bea497140f58bc0...,https://rutube.ru/video/2e6c5019183bea0a449474...,https://rutube.ru/video/b7de1ae017b850e5f20152...,,,
2,2.0,1.0,2.0,Аудиокниги,Audiobooks,Вадим,https://rutube.ru/video/ed11140b6e776891aa92c4...,https://rutube.ru/video/51b377e74c82fa13252059...,https://rutube.ru/video/3fea6bd428e02c38e32dd7...,https://rutube.ru/video/44304ec49790b2c98d05f4...,https://rutube.ru/video/36f8736e9095805c852b20...,https://rutube.ru/video/f19ad1c071d7dd18575af7...,https://rutube.ru/video/82a4d82bdfa4669170ade5...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/527867fac9d91b32d77d7b...,https://rutube.ru/video/ef27cc0068154d708c7c54...,,,
3,3.0,1.0,3.0,Бизнес,Business,Вадим,https://rutube.ru/video/5b6a2166ff1ba47314baf6...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/b2b9172b9d4264d11145a9...,https://rutube.ru/video/85105a8dee2f1fade90e39...,https://rutube.ru/video/c1f56ba54b222ae9f7c9b3...,https://rutube.ru/video/ff2a8becb2fa7f5f82bba4...,https://rutube.ru/video/158265763c7782618af6e4...,https://rutube.ru/video/a49a0b07c149af2b4ed4cc...,,,
4,4.0,1.0,4.0,Видеоигры,Video games,Вадим,https://rutube.ru/video/c2c5a9f12842dc688f95c4...,https://rutube.ru/video/75029800c1b279e65ea37b...,https://rutube.ru/video/664e41a9df7197ad439c67...,https://rutube.ru/video/2ecb0a65a836507ea111ee...,https://rutube.ru/video/6bed3415e99727b9a8e8cb...,https://rutube.ru/video/7fe2d08520b032d3f631e3...,https://rutube.ru/video/eb400b42bc6de612a96fc3...,https://rutube.ru/video/25696c40ca62dbb73bad1a...,https://rutube.ru/video/5a02ffbb2b48f403f7c7f8...,https://rutube.ru/video/0acda6bbe6ced9fb7beb07...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,,,,рэп,,,,,,,,,,,,,,,
224,,,,Электроника,,,,,,,,,,,,,,,
225,,,,Ретро,,,,,,,,,,,,,,,
226,,,,Рок,,,,,,,,,,,,,,,


In [43]:
def download_rutube_video(video_url, output_path='.', n_minutes=10):
    # Options for yt-dlp
    end_time = n_minutes * 60
    
    ydl_opts = {
        'format': 'best[height<=360]',  # Download the best available quality
        'outtmpl': f'{output_path}/%(title)s.%(ext)s',  # Output file template
        'download-sections': f'*0-{end_time}',
        'force_keyframes_at_cuts': True, 
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])


df = pd.read_excel('Categories.xlsx')
df.head()

df.drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'], inplace=True)
df.drop(index=df.index[df.id.isna()], inplace=True)

for index, row in df.iterrows():
    category_id = row['id']
    category_name = row['name (eng)']

    if category_id <= 5:
        continue
    
    # Create a folder named with the category ID and name
    folder_name = f"data/{category_id}_{category_name.lower()}"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    # Download videos from all URL columns
    for i in range(1, 11):  # Assuming there are up to 10 URLs (url1 to url10)
        url_column = f'url{i}'
        if pd.notna(row.get(url_column)):  # Check if URL exists
            url = row.get(url_column)
            print(f"Downloading video from {url} into folder {folder_name}")

            try:
                download_rutube_video(url, folder_name)
            except Exception as e:
                print(f'Video {i} from category {category_id}. {category_name} was not downloaded')
                continue

Check connection to s3

In [4]:
from minio import Minio

client = Minio(
    endpoint="storage.yandexcloud.net",
    access_key="YCAJESQqZUja9X-F1glArEPSY",
    secret_key="YCP6M_QUdKUF1XBlgz_hOWAlTkcMbnEUyLG5hsQv",
)

bucket = "rutube-tagging"


def upload_file_to_s3(local_file_path, s3_file_name, bucket_name=bucket_name):
    """
    Upload a file from local path to the S3 bucket.
    """
    try:
        # Upload file
        client.fput_object(
            bucket_name, s3_file_name, local_file_path
        )
        print(f"File '{local_file_path}' uploaded to bucket '{bucket_name}' as '{s3_file_name}'.")
    except S3Error as e:
        print(f"Error uploading file to S3: {e}")


def download_file_from_s3(s3_file_name, local_file_path, bucket_name=bucket_name):
    """
    Download a file from the S3 bucket to local path.
    """
    try:
        # Download file
        client.fget_object(
            bucket_name, s3_file_name, local_file_path
        )
        print(f"File '{s3_file_name}' downloaded from bucket '{bucket_name}' to '{local_file_path}'.")
    except S3Error as e:
        print(f"Error downloading file from S3: {e}")


def list_files_in_bucket(bucket_name=bucket_name):
    """
    List all files in the S3 bucket.
    """
    try:
        objects = client.list_objects(bucket_name)
        for obj in objects:
            print(obj.object_name)
    except S3Error as e:
        print(f"Error listing objects in bucket: {e}")

# Examples
# download_file_from_s3("file_on_s3.txt", "path/to/local/file.txt")
# upload_file_to_s3("path/to/local/file.txt", "file_on_s3.txt")
# list_files_in_bucket()

In [117]:
def top_k_accuracy(target: np.ndarray, preds: np.ndarray, k: int = 10) -> float:
    numer = 0
    denom = target.size
    
    for label, labels_set in zip(target, preds):
        if label in labels_set[:k]:
            numer += 1

    return numer * 1.0 / denom

def string_to_numpy(tensor_string):
    tensor_string = tensor_string.replace('tensor(', '').rstrip(')')
    tensor_data = ast.literal_eval(tensor_string)
    tensor = torch.tensor(tensor_data)
    
    return tensor

def string_to_numpy(tensor_string):
    tensor_string = tensor_string.replace('tensor(', '').rstrip(')')
    tensor_data = ast.literal_eval(tensor_string)
    tensor_numpied = torch.tensor(tensor_data).numpy()
    
    return tensor_numpied

label_data = [
    (0, 'Авто-мото'),
    (1, 'Аниме'),
    (2, 'Аудиокниги'),
    (3, 'Бизнес'),
    (4, 'Видеоигры'),
    (5, 'Интервью'),
    (6, 'Искусство'),
    (7, 'Кино'),
    (8, 'Красота'),
    (9, 'Кулинария'),
    (10, 'Лайфхаки'),
    (11, 'Музыка'),
    (12, 'Мультфильмы'),
    (13, 'Новости'),
    (14, 'Обучение'),
    (15, 'Охота и рыбалка'),
    (16, 'Политика'),
    (17, 'Психология'),
    (18, 'Путешествия'),
    (19, 'Сериалы'),
    (20, 'Спорт'),
    (22, 'Юмор'),
    (25, 'Лайфстайл'),
    (26, 'Недвижимость'),
    (27, 'Здоровье'),
    (28, 'Природа'),
    (29, 'Дизайн'),
    (30, 'Техника и оборудование'),
    (31, 'Бизнес и предпринимательство'),
    (32, 'Культура'),
    (33, 'Религия'),
    (34, 'Строительство и ремонт'),
    (35, 'Сад и огород'),
    (36, 'Еда'),
    (37, 'Развлечения'),
    (38, 'Эзотерика'),
    (39, 'Наука'),
    (40, 'Аудио'),
    (41, 'Технологии и интернет'),
    (42, 'Телепередачи'),
    (43, 'Детям'),
    (44, 'Хобби'),
    (45, 'Разное'),
    (46, 'Животные'),
    (47, 'Новости и СМИ'),
    (48, 'Фильмы'),
    (49, 'Блогеры')
]

clip_df = pd.read_csv('data/embeddings/x-clip-v2.csv')

label_df = pd.DataFrame(label_data, columns=['tag_id', 'tag'])
tag_id_to_index = dict(zip(label_df.tag_id, label_df.index))

In [125]:
clip_id_label = clip_df.label.apply(lambda x: tag_id_to_index[x]).values
top_n_predictions = np.vstack(clip_df.probs.apply(lambda x: np.argsort(string_to_numpy(x)).mean(axis=0)[::-1]))

print('Top 1 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 1) * 100)
print('Top 5 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 5) * 100)
print('Top 10 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 10) * 100)

top_n_predictions.shape

Top 1 accuracy: 3.9603960396039604
Top 5 accuracy: 6.9306930693069315
Top 10 accuracy: 10.891089108910892


(404, 47)

In [119]:
clip_id_label = clip_df.label.apply(lambda x: tag_id_to_index[x]).values
top_n_predictions = np.vstack(clip_df.probs.apply(lambda x: np.argsort(string_to_numpy(x))[0, :][::-1]))

print('Top 1 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 1) * 100)
print('Top 5 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 5) * 100)
print('Top 10 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 10) * 100)

Top 1 accuracy: 8.16831683168317
Top 5 accuracy: 19.554455445544555
Top 10 accuracy: 32.17821782178218


In [120]:
clip_id_label = clip_df.label.apply(lambda x: tag_id_to_index[x]).values
top_n_predictions = np.vstack(clip_df.probs.apply(lambda x: np.argsort(string_to_numpy(x))[1, :][::-1]))

print('Top 1 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 1) * 100)
print('Top 5 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 5) * 100)
print('Top 10 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 10) * 100)

Top 1 accuracy: 9.158415841584159
Top 5 accuracy: 20.049504950495052
Top 10 accuracy: 31.18811881188119


In [121]:
clip_id_label = clip_df.label.apply(lambda x: tag_id_to_index[x]).values
top_n_predictions = np.vstack(clip_df.probs.apply(lambda x: np.argsort(string_to_numpy(x))[2, :][::-1]))

print('Top 1 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 1) * 100)
print('Top 5 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 5) * 100)
print('Top 10 accuracy:', top_k_accuracy(clip_id_label, top_n_predictions, 10) * 100)

Top 1 accuracy: 5.445544554455446
Top 5 accuracy: 20.049504950495052
Top 10 accuracy: 30.198019801980198
