# Сбор датасета

In [10]:
import pandas as pd
import requests
import subprocess
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

category_map = {
    "0": "music",
    "1": "people", 
    "2": "gaming",
    "3": "sports/actions",
    "4": "news/events/politics",
    "5": "education",
    "6": "tv shows",
    "7": "movie/comedy", 
    "8": "animation",
    "9": "vehicles/autos",
    "10": "howto",
    "11": "travel",
    "12": "science/technology",
    "13": "animals/pets",
    "14": "kids/family",
    "15": "documentary",
    "16": "food/drink",
    "17": "cooking",
    "18": "beauty/fashion",
    "19": "advertisement"
}

def download_msrvtt_zip():
    zip_path = "MSRVTT_Videos.zip"
    if not os.path.exists(zip_path):
        url = "https://huggingface.co/datasets/friedrichor/MSR-VTT/resolve/main/MSRVTT_Videos.zip"
        print("Скачиваю архив...")
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
    return zip_path

def extract_video(zip_ref, video_file, output_dir):
    video_path = os.path.join(output_dir, video_file.split('/')[-1])
    if os.path.exists(video_path):
        return video_path
    
    try:
        with zip_ref.open(f"video/{video_file}") as src, open(video_path, 'wb') as dst:
            dst.write(src.read())
    except:
        return None
    
    return video_path

def process_msrvtt_video(zip_ref, item, output_dir):
    video_file = item['video']
    duration = item['end time'] - item['start time']
    
    if duration > 30:
        return None
    
    video_path = extract_video(zip_ref, video_file, output_dir)
    if not video_path:
        return None
    
    return {
        'video_path': video_path,
        'caption': item['caption'],
        'category': category_map.get(str(item['category']), "unknown")
    }

def download_msrvtt_dataset(save_path, num_videos, max_workers=10):
    dataset = load_dataset("friedrichor/MSR-VTT", "test_1k", split="test", streaming=True).shuffle(seed=42, buffer_size=1000)
    
    items = []
    for i, item in enumerate(dataset):
        if item['end time'] - item['start time'] <= 30:
            items.append(item)
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_msrvtt_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_msrvtt_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "msrvtt_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_msrvtt_dataset("MSRVTT_videos", 1000)

Найдено 999 видео до 30 секунд
Скачиваю архив...


Извлечение: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 999/999 [00:00<00:00, 1946.47it/s]

Обработано 999 видео





Unnamed: 0,video_path,caption,category
0,MSRVTT_videos/video9778.mp4,a little boy singing in front of judges and crowd,kids/family
1,MSRVTT_videos/video9832.mp4,a video game character rides around on a motor...,vehicles/autos
2,MSRVTT_videos/video7767.mp4,a slideshow with captions,music
3,MSRVTT_videos/video7369.mp4,a man is talking about opening a laptop case,howto
4,MSRVTT_videos/video9731.mp4,a woman is mixing food in a mixing bowl,cooking
...,...,...,...
994,MSRVTT_videos/video8814.mp4,the judges make a decision,kids/family
995,MSRVTT_videos/video9827.mp4,lady gaga sings in a music video,music
996,MSRVTT_videos/video9815.mp4,a mashup of music videos is being played,music
997,MSRVTT_videos/video8901.mp4,men pushing a car down assembly line,vehicles/autos


In [11]:
import pandas as pd
import requests
import subprocess
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import tempfile
import threading
from datasets import load_dataset
import queue

def download_video(video_id):
    url = f"https://huggingface.co/datasets/VLM2Vec/VATEX/resolve/main/raw_videos/{video_id}.mp4"
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
        tmp_path = tmp.name
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            tmp.write(response.content)
            return tmp_path
        except:
            return None

def cut_video(temp_path, video_id, output_dir):
    parts = video_id.split('_')
    start_time = int(parts[-2])
    end_time = int(parts[-1])
    cut_path = os.path.join(output_dir, f"{video_id}.mp4")
    
    if os.path.exists(cut_path):
        os.unlink(temp_path)
        return cut_path
    
    try:
        result = subprocess.run([
            'ffmpeg',
            '-ss', str(start_time),
            '-i', temp_path,
            '-t', str(end_time - start_time),
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-c:a', 'aac',
            '-y',
            '-loglevel', 'error',
            cut_path
        ], capture_output=True, timeout=30)
    except:
        return None
    
    os.unlink(temp_path)
    return cut_path if result.returncode == 0 else None

def process_video(item, output_dir):
    video_id = item['videoID']
    temp_path = download_video(video_id)
    if not temp_path:
        return None
    return cut_video(temp_path, video_id, output_dir)

def download_vatex_dataset(save_path, num_videos, max_workers=10):
    dataset = load_dataset("VLM2Vec/VATEX", "vatex_test", split="test", streaming=True).shuffle(seed=42, buffer_size=1000)
    items = []
    for i, item in enumerate(dataset):
        if i >= num_videos:
            break
        items.append(item)
    
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_video, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Обработка"):
            video_path = future.result()
            if video_path:
                item = futures[future]
                with lock:
                    samples.append({
                        'video_path': video_path,
                        'caption': item['enCap'],
                        'category': "people"
                    })
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "vatex_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_vatex_dataset("VATEX_videos", 1000)

Обработка: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:06<00:00,  5.36it/s]

Обработано 0 видео





In [12]:
import pandas as pd
import requests
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

def download_youcook2_zip():
    zip_path = "YouCookIIVideos.zip"
    url = "https://huggingface.co/datasets/lmms-lab/YouCook2/resolve/main/YouCookIIVideos.zip"
    
    if not os.path.exists(zip_path):
        print("Скачиваю архив...")
        response = requests.get(url, stream=True)
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return zip_path

def extract_video(zip_ref, video_path, output_dir):
    full_video_path = f"YouCookIIVideos/{video_path}"
    filename = os.path.basename(video_path)
    save_path = os.path.join(output_dir, filename)
    
    if os.path.exists(save_path):
        return save_path
    
    try:
        with zip_ref.open(full_video_path) as src, open(save_path, 'wb') as dst:
            dst.write(src.read())
        return save_path
    except:
        return None

def process_youcook2_video(zip_ref, item, output_dir):
    video_path = item['video_path']
    
    if not video_path.endswith('.mp4'):
        return None
    
    start, end = item['segment']
    if end - start > 30:
        return None
    
    extracted_path = extract_video(zip_ref, video_path, output_dir)
    if not extracted_path:
        return None
    
    return {
        'video_path': extracted_path,
        'caption': item['sentence'],
        'category': "cooking"
    }

def download_youcook2_dataset(save_path, num_videos, split="val", max_workers=10):
    dataset = load_dataset("lmms-lab/YouCook2", split=split, streaming=True).shuffle(seed=42, buffer_size=1000)
    
    items = []
    for i, item in enumerate(dataset):
        if not item['video_path'].endswith('.mp4'):
            continue
        
        start, end = item['segment']
        if end - start <= 30:
            items.append(item)
        
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_youcook2_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_youcook2_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "youcook2_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_youcook2_dataset("YouCook2_videos", 1000)

Найдено 1000 видео до 30 секунд
Скачиваю архив...


Извлечение: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1674.40it/s]


Обработано 1000 видео


Unnamed: 0,video_path,caption,category
0,YouCook2_videos/sGzBQrg1adY_9.mp4,add marsala powder,cooking
1,YouCook2_videos/pNAwkqm4t3A_5.mp4,remove onion rings from the pot,cooking
2,YouCook2_videos/F564e476ULM_7.mp4,place the lobster on a towel to dry off,cooking
3,YouCook2_videos/InDwfZmSikI_3.mp4,season the pizza with sea salt and basil,cooking
4,YouCook2_videos/eQZEf3NCCo4_4.mp4,place the seaweed down and put the rice on it,cooking
...,...,...,...
995,YouCook2_videos/mV3m2svj3XE_1.mp4,slice chilis and a lime and add the pieces and...,cooking
996,YouCook2_videos/4apR0YypAGc_2.mp4,add some udon noodles to the broth,cooking
997,YouCook2_videos/Nbh64ntT3EM_5.mp4,spread some parmesan cheese and stir the egg m...,cooking
998,YouCook2_videos/G-AUY-jWzck_3.mp4,add pepper and sauerkraut to the pot,cooking


# Получение CLIP-эмбеддингов

In [13]:
# %pip install av einops timm protobuf==3.20.3

In [15]:
DEVICE = torch.device("cuda:6")
print(f"Device: {DEVICE}")

Device: cuda:6


In [16]:
import torch
import av
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from PIL import Image
from transformers import (
    AutoProcessor, AutoModel, AutoConfig, AutoTokenizer,
    CLIPImageProcessor, CLIPTokenizer
)
from sklearn.metrics.pairwise import cosine_similarity

class UniversalVideoModel:
    def __init__(self, model_type="xclip", num_frames=8):
        self.model_type = model_type.lower()
        self.num_frames = num_frames
        self.device = DEVICE
        
        print(f"Загрузка модели: {self.model_type.upper()}...")
        
        if self.model_type == "xclip":
            model_id = "microsoft/xclip-large-patch14" 
            self.processor = AutoProcessor.from_pretrained(model_id)
            self.model = AutoModel.from_pretrained(model_id).to(self.device)
        
        elif self.model_type == "siglip":
            model_id = "google/siglip-so400m-patch14-384"
            self.processor = AutoProcessor.from_pretrained(model_id)
            self.model = AutoModel.from_pretrained(model_id).to(self.device)
        
        self.model.eval()
        print("Модель загружена")

    def _get_video_frames(self, video_path):
        try:
            container = av.open(video_path)
            total_frames = container.streams.video[0].frames
            if total_frames == 0: total_frames = 100
            
            indices = np.linspace(0, total_frames - 1, self.num_frames).astype(int)
            
            frames = []
            container.seek(0)
            for i, frame in enumerate(container.decode(video=0)):
                if i in indices:
                    frames.append(frame.to_image().convert("RGB"))
                    if len(frames) >= self.num_frames:
                        break
            
            if frames and len(frames) < self.num_frames:
                while len(frames) < self.num_frames:
                    frames.append(frames[-1])
                    
            return frames if len(frames) > 0 else None
        except Exception as e:
            return None

    def encode(self, video_path, text):
        frames = self._get_video_frames(video_path)
        if frames is None:
            return None, None
            
        with torch.no_grad():
            if self.model_type == "xclip":
                inputs = self.processor(
                    text=[text], 
                    videos=list(frames), 
                    return_tensors="pt", 
                    padding=True,
                ).to(self.device)
                
                outputs = self.model(**inputs, interpolate_pos_encoding=True)
                
                v_emb = outputs.video_embeds.cpu().numpy()
                t_emb = outputs.text_embeds.reshape(1, -1).cpu().numpy()
                
            elif self.model_type == "siglip":
                inputs_text = self.processor(text=[text], return_tensors="pt", padding="max_length", truncation=True, max_length=64).to(self.device)
                t_feat = self.model.get_text_features(**inputs_text)
                
                inputs_video = self.processor(images=frames, return_tensors="pt").to(self.device)
                frame_feats = self.model.get_image_features(**inputs_video)
                
                v_feat = torch.mean(frame_feats, dim=0, keepdim=True)
                
                v_feat = v_feat / v_feat.norm(p=2, dim=-1, keepdim=True)
                t_feat = t_feat / t_feat.norm(p=2, dim=-1, keepdim=True)
                
                v_emb, t_emb = v_feat.cpu().numpy(), t_feat.cpu().numpy()
                
        return v_emb, t_emb
        

def calculate_metrics(v_emb, t_emb):
    if v_emb.ndim == 3: v_emb = v_emb.squeeze(1)
    if t_emb.ndim == 3: t_emb = t_emb.squeeze(1)
    
    sim_matrix = cosine_similarity(v_emb, t_emb)
    
    ranks = []
    for i in range(len(sim_matrix)):
        sorted_indices = np.argsort(sim_matrix[i])[::-1]
        rank = np.where(sorted_indices == i)[0][0] + 1
        ranks.append(rank)
        
    ranks = np.array(ranks)
    
    print("Итоговые метрики")
    print(f"Recall@1:  {np.mean(ranks == 1) * 100:.2f}%")
    print(f"Recall@5:  {np.mean(ranks <= 5) * 100:.2f}%")
    print(f"Recall@10: {np.mean(ranks <= 10) * 100:.2f}%")
    print(f"Median Rank: {np.median(ranks)}")
    print()


def generate_embeddings(pkl_path, save_path, model_name, num_frames=8, multi_captions=False):
    df_full = pd.read_pickle(pkl_path)
    df = df_full.copy()
    if multi_captions:
        df["caption"] = df["caption"].apply(lambda captions: max(captions, key=len))
    
    engine = UniversalVideoModel(model_type=model_name, num_frames=num_frames)

    emb_id = []
    video_paths = []
    captions = []
    categories = []
    v_embeddings = []
    t_embeddings = []
    
    print("Старт инференса")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if idx == 100:
            break
        v, t = engine.encode(row['video_path'], row['caption'])
        if v is not None:
            emb_id.append(idx)
            video_paths.append(row['video_path'])
            captions.append(row['caption'])
            categories.append(row['category'])
            v_embeddings.append(v)
            t_embeddings.append(t)
            
            
    v_emb_all = np.vstack(v_embeddings)
    t_emb_all = np.vstack(t_embeddings)
    calculate_metrics(v_emb_all, t_emb_all)

    res = pd.DataFrame({
        "video_path": video_paths,
        "caption": captions,
        "category": categories,
        "video_emb": v_embeddings,
        "text_emb": t_embeddings
    }, index=emb_id)

    res.to_pickle(save_path)
        
    return res

In [17]:
pd.read_pickle("MSRVTT_videos/msrvtt_dataset.pkl")

Unnamed: 0,video_path,caption,category
0,MSRVTT_videos/video9778.mp4,a little boy singing in front of judges and crowd,kids/family
1,MSRVTT_videos/video9832.mp4,a video game character rides around on a motor...,vehicles/autos
2,MSRVTT_videos/video7767.mp4,a slideshow with captions,music
3,MSRVTT_videos/video7369.mp4,a man is talking about opening a laptop case,howto
4,MSRVTT_videos/video9731.mp4,a woman is mixing food in a mixing bowl,cooking
...,...,...,...
994,MSRVTT_videos/video8814.mp4,the judges make a decision,kids/family
995,MSRVTT_videos/video9827.mp4,lady gaga sings in a music video,music
996,MSRVTT_videos/video9815.mp4,a mashup of music videos is being played,music
997,MSRVTT_videos/video8901.mp4,men pushing a car down assembly line,vehicles/autos


In [18]:
generate_embeddings(
    pkl_path="MSRVTT_videos/msrvtt_dataset.pkl",
    save_path="MSRVTT_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: XCLIP...


preprocessor_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

Модель загружена
Старт инференса


  0%|                                                                                                                                                                          | 0/999 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
generate_embeddings(
    pkl_path="MSRVTT_videos/msrvtt_dataset.pkl",
    save_path="MSRVTT_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=False
)

In [19]:
generate_embeddings(
    pkl_path="VATEX_videos/vatex_dataset.pkl",
    save_path="VATEX_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=True
)

KeyError: 'caption'

In [None]:
generate_embeddings(
    pkl_path="VATEX_videos/vatex_dataset.pkl",
    save_path="VATEX_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=True
)

In [20]:
generate_embeddings(
    pkl_path="YouCook2_videos/youcook2_dataset.pkl",
    save_path="YouCook2_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: XCLIP...
Модель загружена
Старт инференса


  0%|                                                                                                                                                                         | 0/1000 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'shape'

In [22]:
%pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1
Note: you may need to restart the kernel to use updated packages.


In [23]:
generate_embeddings(
    pkl_path="YouCook2_videos/youcook2_dataset.pkl",
    save_path="YouCook2_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: SIGLIP...


ImportError: 
SiglipTokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
