# Сбор датасета

In [9]:
import pandas as pd
import requests
import subprocess
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

category_map = {
    "0": "music",
    "1": "people", 
    "2": "gaming",
    "3": "sports/actions",
    "4": "news/events/politics",
    "5": "education",
    "6": "tv shows",
    "7": "movie/comedy", 
    "8": "animation",
    "9": "vehicles/autos",
    "10": "howto",
    "11": "travel",
    "12": "science/technology",
    "13": "animals/pets",
    "14": "kids/family",
    "15": "documentary",
    "16": "food/drink",
    "17": "cooking",
    "18": "beauty/fashion",
    "19": "advertisement"
}

def download_msrvtt_zip():
    zip_path = "../data/MSRVTT_Videos.zip"
    if not os.path.exists(zip_path):
        url = "https://huggingface.co/datasets/friedrichor/MSR-VTT/resolve/main/MSRVTT_Videos.zip"
        print("Скачиваю архив...")
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
    return zip_path

def extract_video(zip_ref, video_file, output_dir):
    video_path = os.path.join(output_dir, video_file.split('/')[-1])
    if os.path.exists(video_path):
        return video_path
    
    try:
        with zip_ref.open(f"../data/MSRVTT_videos/{video_file}") as src, open(video_path, 'wb') as dst:
            dst.write(src.read())
    except:
        return None
    
    return video_path

def process_msrvtt_video(zip_ref, item, output_dir):
    video_file = item['video']
    duration = item['end time'] - item['start time']
    
    if duration > 30:
        return None
    
    video_path = extract_video(zip_ref, video_file, output_dir)
    if not video_path:
        return None
    
    return {
        'video_path': video_path,
        'caption': item['caption'],
        'category': category_map.get(str(item['category']), "unknown")
    }

def download_msrvtt_dataset(save_path, num_videos, max_workers=10):
    dataset = load_dataset("friedrichor/MSR-VTT", "test_1k", split="test", streaming=True).shuffle(seed=42, buffer_size=1000)
    
    items = []
    for i, item in enumerate(dataset):
        if item['end time'] - item['start time'] <= 30:
            items.append(item)
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_msrvtt_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_msrvtt_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "msrvtt_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_msrvtt_dataset("../data/MSRVTT_videos", 1000)

Найдено 999 видео до 30 секунд


Извлечение: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 999/999 [00:00<00:00, 397543.61it/s]

Обработано 0 видео





In [80]:
import pandas as pd
import requests
import subprocess
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import tempfile
import threading
from datasets import load_dataset
import queue

def download_video(video_id):
    url = f"https://huggingface.co/datasets/VLM2Vec/VATEX/resolve/main/raw_videos/{video_id}.mp4"
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
        tmp_path = tmp.name
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            tmp.write(response.content)
            return tmp_path
        except:
            return None

def cut_video(temp_path, video_id, output_dir):
    parts = video_id.split('_')
    start_time = int(parts[-2])
    end_time = int(parts[-1])
    cut_path = os.path.join(output_dir, f"{video_id}.mp4")
    
    if os.path.exists(cut_path):
        os.unlink(temp_path)
        return cut_path
    
    try:
        result = subprocess.run([
            'ffmpeg',
            '-ss', str(start_time),
            '-i', temp_path,
            '-t', str(end_time - start_time),
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-c:a', 'aac',
            '-y',
            '-loglevel', 'error',
            cut_path
        ], capture_output=True, timeout=30)
    except:
        return None
    
    os.unlink(temp_path)
    return cut_path if result.returncode == 0 else None

def process_video(item, output_dir):
    video_id = item['videoID']
    temp_path = download_video(video_id)
    if not temp_path:
        return None
    return cut_video(temp_path, video_id, output_dir)

def download_vatex_dataset(save_path, num_videos, max_workers=20):
    dataset = load_dataset("VLM2Vec/VATEX", "vatex_test", split="test", streaming=True).shuffle(seed=42, buffer_size=1000)
    items = []
    for i, item in enumerate(dataset):
        if i >= num_videos:
            break
        items.append(item)
    
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_video, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Обработка"):
            video_path = future.result()
            if video_path:
                item = futures[future]
                with lock:
                    samples.append({
                        'video_path': video_path,
                        'caption': item['enCap'],
                        'category': "people"
                    })
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "vatex_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_vatex_dataset("VATEX_videos", 1000)

Обработка: 100%|██████████| 1000/1000 [11:57<00:00,  1.39it/s]

Обработано 977 видео





Unnamed: 0,video_path,caption,category
0,VATEX_videos/d27aRRPH8wI_000001_000011.mp4,[A man working out with weights in a park on ...,people
1,VATEX_videos/xoC9cDva8Fw_000258_000268.mp4,[A man is performing intricate work with feath...,people
2,VATEX_videos/0zNEeg9UCrM_000010_000020.mp4,[A teen girl smoothly rides her skateboard alo...,people
3,VATEX_videos/H1iU3kQ05yQ_000009_000019.mp4,[A baby was crawling on the floor and was smil...,people
4,VATEX_videos/U7AVv6ONUoY_000011_000021.mp4,[An older gentleman plays the bagpipe in doors...,people
...,...,...,...
972,VATEX_videos/Fegs9N1v5X8_000000_000010.mp4,[Dogs run down a snowy driveway that is being ...,people
973,VATEX_videos/5VVhorzxF-8_000027_000037.mp4,[A woman is exercising outdoors using an exerc...,people
974,VATEX_videos/dfI2Xm-hdSc_000000_000010.mp4,[Woman in ice skates skating down a street tha...,people
975,VATEX_videos/DZc1LcVyL0c_000092_000102.mp4,[A boy removes the batteries from a toy then s...,people


In [81]:
import pandas as pd
import requests
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

def download_youcook2_zip():
    zip_path = "YouCookIIVideos.zip"
    url = "https://huggingface.co/datasets/lmms-lab/YouCook2/resolve/main/YouCookIIVideos.zip"
    
    if not os.path.exists(zip_path):
        print("Скачиваю архив...")
        response = requests.get(url, stream=True)
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return zip_path

def extract_video(zip_ref, video_path, output_dir):
    full_video_path = f"YouCookIIVideos/{video_path}"
    filename = os.path.basename(video_path)
    save_path = os.path.join(output_dir, filename)
    
    if os.path.exists(save_path):
        return save_path
    
    try:
        with zip_ref.open(full_video_path) as src, open(save_path, 'wb') as dst:
            dst.write(src.read())
        return save_path
    except:
        return None

def process_youcook2_video(zip_ref, item, output_dir):
    video_path = item['video_path']
    
    if not video_path.endswith('.mp4'):
        return None
    
    start, end = item['segment']
    if end - start > 30:
        return None
    
    extracted_path = extract_video(zip_ref, video_path, output_dir)
    if not extracted_path:
        return None
    
    return {
        'video_path': extracted_path,
        'caption': item['sentence'],
        'category': "cooking"
    }

def download_youcook2_dataset(save_path, num_videos, split="val", max_workers=10):
    dataset = load_dataset("lmms-lab/YouCook2", split=split, streaming=True).shuffle(seed=42, buffer_size=1000)
    
    items = []
    for i, item in enumerate(dataset):
        if not item['video_path'].endswith('.mp4'):
            continue
        
        start, end = item['segment']
        if end - start <= 30:
            items.append(item)
        
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_youcook2_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_youcook2_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_pickle(os.path.join(save_path, "youcook2_dataset.pkl"))
    print(f"Обработано {len(df)} видео")
    return df

download_youcook2_dataset("YouCook2_videos", 1000)

Найдено 1000 видео до 30 секунд


Извлечение: 100%|██████████| 1000/1000 [00:08<00:00, 120.83it/s]

Обработано 1000 видео





Unnamed: 0,video_path,caption,category
0,YouCook2_videos/eQZEf3NCCo4_4.mp4,place the seaweed down and put the rice on it,cooking
1,YouCook2_videos/c9eELn4axpg_0.mp4,cut the avocado and place in a bowl,cooking
2,YouCook2_videos/F564e476ULM_7.mp4,place the lobster on a towel to dry off,cooking
3,YouCook2_videos/vVZsj1t9R70_6.mp4,boil the chicken stock and add the chicken mus...,cooking
4,YouCook2_videos/sGzBQrg1adY_9.mp4,add marsala powder,cooking
...,...,...,...
995,YouCook2_videos/Nbh64ntT3EM_5.mp4,spread some parmesan cheese and stir the egg m...,cooking
996,YouCook2_videos/peld2w63tpM_8.mp4,stir the bowl until all the ingredients are co...,cooking
997,YouCook2_videos/4apR0YypAGc_2.mp4,add some udon noodles to the broth,cooking
998,YouCook2_videos/JPbFE731Y0c_2.mp4,place the bratwurst on the grill and cook them...,cooking


# Получение CLIP-эмбеддингов

In [2]:
%pip install av einops timm protobuf==3.20.3

Collecting av
  Using cached av-16.0.1-cp311-cp311-manylinux_2_28_x86_64.whl (40.2 MB)
Collecting einops
  Using cached einops-0.8.1-py3-none-any.whl (64 kB)
Collecting timm
  Using cached timm-1.0.22-py3-none-any.whl (2.5 MB)
Collecting protobuf==3.20.3
  Using cached protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Installing collected packages: protobuf, einops, av, timm
Successfully installed av-16.0.1 einops-0.8.1 protobuf-3.20.3 timm-1.0.22
Note: you may need to restart the kernel to use updated packages.


In [2]:
DEVICE = torch.device("cuda:6")
print(f"Device: {DEVICE}")

Device: cuda:6


In [3]:
import torch
import av
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from PIL import Image
from transformers import (
    AutoProcessor, AutoModel, AutoConfig, AutoTokenizer,
    CLIPImageProcessor, CLIPTokenizer
)
from sklearn.metrics.pairwise import cosine_similarity


class UniversalVideoModel:
    def __init__(self, model_type="xclip", num_frames=8):
        self.model_type = model_type.lower()
        self.num_frames = num_frames
        self.device = DEVICE
        
        print(f"Загрузка модели: {self.model_type.upper()}...")
        
        if self.model_type == "xclip":
            model_id = "microsoft/xclip-large-patch14" 
            self.processor = AutoProcessor.from_pretrained(model_id)
            self.model = AutoModel.from_pretrained(model_id).to(self.device)
        
        elif self.model_type == "siglip":
            model_id = "google/siglip-so400m-patch14-384"
            self.processor = AutoProcessor.from_pretrained(model_id)
            self.model = AutoModel.from_pretrained(model_id).to(self.device)
        
        self.model.eval()
        print("Модель загружена")

    def _get_video_frames(self, video_path):
        try:
            container = av.open(video_path)
            total_frames = container.streams.video[0].frames
            if total_frames == 0: total_frames = 100
            
            indices = np.linspace(0, total_frames - 1, self.num_frames).astype(int)
            
            frames = []
            container.seek(0)
            for i, frame in enumerate(container.decode(video=0)):
                if i in indices:
                    frames.append(frame.to_image().convert("RGB"))
                    if len(frames) >= self.num_frames:
                        break
            
            if frames and len(frames) < self.num_frames:
                while len(frames) < self.num_frames:
                    frames.append(frames[-1])
                    
            return frames if len(frames) > 0 else None
        except Exception as e:
            return None

    def encode(self, video_path, text):
        frames = self._get_video_frames(video_path)
        if frames is None:
            return None, None
            
        with torch.no_grad():
            if self.model_type == "xclip":
                inputs = self.processor(
                    text=[text], 
                    videos=list(frames), 
                    return_tensors="pt", 
                    padding=True,
                ).to(self.device)
                
                outputs = self.model(**inputs, interpolate_pos_encoding=True)
                
                v_emb = outputs.video_embeds.cpu().numpy()
                t_emb = outputs.text_embeds.reshape(1, -1).cpu().numpy()
                
            elif self.model_type == "siglip":
                inputs_text = self.processor(text=[text], return_tensors="pt", padding="max_length", truncation=True, max_length=64).to(self.device)
                t_feat = self.model.get_text_features(**inputs_text)
                
                inputs_video = self.processor(images=frames, return_tensors="pt").to(self.device)
                frame_feats = self.model.get_image_features(**inputs_video)
                
                v_feat = torch.mean(frame_feats, dim=0, keepdim=True)
                
                v_feat = v_feat / v_feat.norm(p=2, dim=-1, keepdim=True)
                t_feat = t_feat / t_feat.norm(p=2, dim=-1, keepdim=True)
                
                v_emb, t_emb = v_feat.cpu().numpy(), t_feat.cpu().numpy()
                
        return v_emb, t_emb
        

def calculate_metrics(v_emb, t_emb):
    if v_emb.ndim == 3: v_emb = v_emb.squeeze(1)
    if t_emb.ndim == 3: t_emb = t_emb.squeeze(1)
    
    sim_matrix = cosine_similarity(v_emb, t_emb)
    
    ranks = []
    for i in range(len(sim_matrix)):
        sorted_indices = np.argsort(sim_matrix[i])[::-1]
        rank = np.where(sorted_indices == i)[0][0] + 1
        ranks.append(rank)
        
    ranks = np.array(ranks)
    
    print("Итоговые метрики")
    print(f"Recall@1:  {np.mean(ranks == 1) * 100:.2f}%")
    print(f"Recall@5:  {np.mean(ranks <= 5) * 100:.2f}%")
    print(f"Recall@10: {np.mean(ranks <= 10) * 100:.2f}%")
    print(f"Median Rank: {np.median(ranks)}")
    print()


def generate_embeddings(pkl_path, save_path, model_name, num_frames=8, multi_captions=False):
    df_full = pd.read_pickle(pkl_path)
    df = df_full.copy()
    if multi_captions:
        df["caption"] = df["caption"].apply(lambda captions: max(captions, key=len))
    
    engine = UniversalVideoModel(model_type=model_name, num_frames=num_frames)

    emb_id = []
    video_paths = []
    captions = []
    categories = []
    v_embeddings = []
    t_embeddings = []
    
    print("Старт инференса")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if idx == 100:
            break
        v, t = engine.encode(row['video_path'], row['caption'])
        if v is not None:
            emb_id.append(idx)
            video_paths.append(row['video_path'])
            captions.append(row['caption'])
            categories.append(row['category'])
            v_embeddings.append(v)
            t_embeddings.append(t)
            
            
    v_emb_all = np.vstack(v_embeddings)
    t_emb_all = np.vstack(t_embeddings)
    calculate_metrics(v_emb_all, t_emb_all)

    res = pd.DataFrame({
        "video_path": video_paths,
        "caption": captions,
        "category": categories,
        "video_emb": v_embeddings,
        "text_emb": t_embeddings
    }, index=emb_id)

    res.to_pickle(save_path)
        
    return res

In [3]:
pd.read_pickle("../data/MSRVTT_videos/msrvtt_dataset.pkl")

Unnamed: 0,video_path,caption,category
0,MSRVTT_videos/video8344.mp4,a man is driving a car through the countryside,vehicles/autos
1,MSRVTT_videos/video9782.mp4,a man talking about a womans genital problems,people
2,MSRVTT_videos/video9957.mp4,a group of people are dancing in a room,music
3,MSRVTT_videos/video7358.mp4,it is a vine compilation,sports/actions
4,MSRVTT_videos/video9795.mp4,a person playing a video game and commentating,gaming
...,...,...,...
994,MSRVTT_videos/video9600.mp4,handsome man plays guitar and sings,gaming
995,MSRVTT_videos/video8678.mp4,a lady is walking in the beach,music
996,MSRVTT_videos/video9254.mp4,band playing music and people dancing,music
997,MSRVTT_videos/video9657.mp4,a monkey and a man feeding monkey with hand di...,animals/pets


In [7]:
generate_embeddings(
    pkl_path="MSRVTT_videos/msrvtt_dataset.pkl",
    save_path="MSRVTT_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: XCLIP...
Модель загружена
Старт инференса


  return self.preprocess(images, **kwargs)
 10%|█         | 100/999 [00:58<08:43,  1.72it/s]

Итоговые метрики
Recall@1:  38.00%
Recall@5:  68.00%
Recall@10: 75.00%
Median Rank: 2.5






In [8]:
generate_embeddings(
    pkl_path="MSRVTT_videos/msrvtt_dataset.pkl",
    save_path="MSRVTT_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: SIGLIP...


preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

Модель загружена
Старт инференса


 10%|█         | 100/999 [03:30<31:33,  2.11s/it]

Итоговые метрики
Recall@1:  62.00%
Recall@5:  84.00%
Recall@10: 90.00%
Median Rank: 1.0






Unnamed: 0,video_path,caption,category,video_emb,text_emb
0,MSRVTT_videos/video8344.mp4,a man is driving a car through the countryside,vehicles/autos,"[[0.005568306, 0.018281836, 0.0033502562, -0.0...","[[0.04114747, -0.020645713, -0.017524177, -0.0..."
1,MSRVTT_videos/video9782.mp4,a man talking about a womans genital problems,people,"[[0.006083999, 0.030000951, -0.004325057, 0.02...","[[0.03685377, -0.034494396, -0.035467677, 0.01..."
2,MSRVTT_videos/video9957.mp4,a group of people are dancing in a room,music,"[[-0.013810642, 0.03088814, 0.004287778, 0.012...","[[0.006228385, 0.021298371, 0.00011487911, -0...."
3,MSRVTT_videos/video7358.mp4,it is a vine compilation,sports/actions,"[[-0.019884914, -0.010000564, 0.009245677, 0.0...","[[-0.0026536807, -0.019370096, 0.01162443, -0...."
4,MSRVTT_videos/video9795.mp4,a person playing a video game and commentating,gaming,"[[0.016168682, 0.018595964, -0.026182847, 0.00...","[[0.009020409, 0.008731681, -0.016151076, 0.01..."
...,...,...,...,...,...
95,MSRVTT_videos/video7826.mp4,a girl is preparing potato ball and explains t...,food/drink,"[[0.024526069, 0.021513576, -0.031370573, 0.01...","[[0.032636873, -0.029693805, -0.032719813, -0...."
96,MSRVTT_videos/video8311.mp4,a man is filming as he and a woman watch the n...,news/events/politics,"[[0.004804106, 0.02237392, 0.030321484, -0.024...","[[0.01939655, 0.028341116, -0.006826788, 0.038..."
97,MSRVTT_videos/video8861.mp4,a white male raps while another plays guitar,sports/actions,"[[0.015914131, 0.021098292, -0.0037321204, -0....","[[0.025347028, 0.041452583, -0.00933835, -0.02..."
98,MSRVTT_videos/video8489.mp4,a tv channel named how to cook great foodcom i...,food/drink,"[[0.008864168, 0.03375285, -0.004276591, 0.029...","[[-0.00027470264, -0.014061792, -0.002076669, ..."


In [9]:
generate_embeddings(
    pkl_path="VATEX_videos/vatex_dataset.pkl",
    save_path="VATEX_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=True
)

Загрузка модели: XCLIP...
Модель загружена
Старт инференса


  return self.preprocess(images, **kwargs)
 10%|█         | 100/977 [01:19<11:37,  1.26it/s]

Итоговые метрики
Recall@1:  53.00%
Recall@5:  80.00%
Recall@10: 90.00%
Median Rank: 1.0






Unnamed: 0,video_path,caption,category,video_emb,text_emb
0,VATEX_videos/d27aRRPH8wI_000001_000011.mp4,"On a grassy field, a man is doing leg squats w...",people,"[[0.0063255075, 0.00681206, -0.0029704461, 0.0...","[[0.025065178, -0.0038286413, 0.012830251, -0...."
1,VATEX_videos/xoC9cDva8Fw_000258_000268.mp4,Man examines feathered fishing lure with hangi...,people,"[[0.008018075, -0.056263532, -0.04000052, -0.0...","[[0.023749618, -0.01196129, 0.011794416, -0.00..."
2,VATEX_videos/0zNEeg9UCrM_000010_000020.mp4,"A person rides a skateboard as people walk, an...",people,"[[-0.028438143, -0.037691377, 0.0122595085, 0....","[[0.021119647, -0.017354745, 0.013320466, 0.03..."
3,VATEX_videos/H1iU3kQ05yQ_000009_000019.mp4,A little boy starts to crawl across a tiled fl...,people,"[[-0.009611565, -2.9507093e-05, 0.041168258, -...","[[0.020701647, -0.020210799, 0.03599124, 0.008..."
4,VATEX_videos/U7AVv6ONUoY_000011_000021.mp4,An older gentleman plays the bagpipe in doors ...,people,"[[-0.04337162, -0.0016207125, 0.030112438, 0.0...","[[0.018673802, -0.011787751, 0.024677666, 0.01..."
...,...,...,...,...,...
95,VATEX_videos/N1J1AgIV0dE_000051_000061.mp4,A person is putting three knots in a rope and ...,people,"[[-0.01231091, -0.022889571, 0.00031686947, 0....","[[0.028448246, -0.01496662, 0.009031912, 0.024..."
96,VATEX_videos/WdqzgMVGRvM_000021_000031.mp4,Bandmembers wearing white shirts and maroon pa...,people,"[[0.010988326, -0.009875147, -0.024606504, 0.0...","[[0.03445376, -0.0085826935, 0.013450398, 0.04..."
97,VATEX_videos/v-E6NalwaAg_000012_000022.mp4,A boy is splitting wood with a long axe when h...,people,"[[-0.03757493, -0.015379061, -0.005095528, -0....","[[-0.012647054, -0.029635614, 0.029953485, 0.0..."
98,VATEX_videos/79PeclarQcI_000000_000010.mp4,In a backyard a man is holding two axes on his...,people,"[[-0.04418258, -0.050258484, -0.015053316, -0....","[[-0.009564586, -0.014351341, 0.0053071333, 0...."


In [10]:
generate_embeddings(
    pkl_path="VATEX_videos/vatex_dataset.pkl",
    save_path="VATEX_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=True
)

Загрузка модели: SIGLIP...
Модель загружена
Старт инференса


 10%|█         | 100/977 [03:45<33:00,  2.26s/it]

Итоговые метрики
Recall@1:  85.00%
Recall@5:  98.00%
Recall@10: 99.00%
Median Rank: 1.0






Unnamed: 0,video_path,caption,category,video_emb,text_emb
0,VATEX_videos/d27aRRPH8wI_000001_000011.mp4,"On a grassy field, a man is doing leg squats w...",people,"[[0.030421853, 0.0056569004, -0.020903332, -0....","[[0.0716869, -0.009495197, -0.028387925, -0.00..."
1,VATEX_videos/xoC9cDva8Fw_000258_000268.mp4,Man examines feathered fishing lure with hangi...,people,"[[-0.015034669, 0.04615889, 0.0057943426, 0.02...","[[0.01859426, -0.008128827, -0.02023826, 0.004..."
2,VATEX_videos/0zNEeg9UCrM_000010_000020.mp4,"A person rides a skateboard as people walk, an...",people,"[[0.009827573, -0.025678769, -0.019723598, -0....","[[0.02648049, 0.016830022, -0.000309548, -0.00..."
3,VATEX_videos/H1iU3kQ05yQ_000009_000019.mp4,A little boy starts to crawl across a tiled fl...,people,"[[-0.0032877177, 0.020898664, 0.012495684, -0....","[[0.00822017, 0.06392025, -0.014704492, 0.0204..."
4,VATEX_videos/U7AVv6ONUoY_000011_000021.mp4,An older gentleman plays the bagpipe in doors ...,people,"[[0.038320612, 0.011203081, -0.02071322, -0.01...","[[0.0023840168, 0.014588093, -0.01652322, -0.0..."
...,...,...,...,...,...
95,VATEX_videos/N1J1AgIV0dE_000051_000061.mp4,A person is putting three knots in a rope and ...,people,"[[0.012692091, 0.011022026, 0.015570089, -0.01...","[[0.024555273, 0.016592037, -0.004004332, 0.03..."
96,VATEX_videos/WdqzgMVGRvM_000021_000031.mp4,Bandmembers wearing white shirts and maroon pa...,people,"[[-0.0076653548, 0.0036949476, 0.015182001, -0...","[[0.03413237, -0.02582269, -0.032291654, 0.013..."
97,VATEX_videos/v-E6NalwaAg_000012_000022.mp4,A boy is splitting wood with a long axe when h...,people,"[[0.011557384, 0.010007614, 0.009592113, 0.011...","[[0.014801285, 0.02868833, -0.01819171, -0.003..."
98,VATEX_videos/79PeclarQcI_000000_000010.mp4,In a backyard a man is holding two axes on his...,people,"[[0.004466114, 0.02329243, -0.014850066, -0.02...","[[0.015043298, 0.033359744, -0.036666, 0.01131..."


In [12]:
generate_embeddings(
    pkl_path="YouCook2_videos/youcook2_dataset.pkl",
    save_path="YouCook2_videos/xclip_embeddings.pkl",
    model_name="xclip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: XCLIP...
Модель загружена
Старт инференса


  return self.preprocess(images, **kwargs)
 10%|█         | 100/1000 [01:32<13:51,  1.08it/s]

Итоговые метрики
Recall@1:  22.00%
Recall@5:  54.00%
Recall@10: 63.00%
Median Rank: 4.0






Unnamed: 0,video_path,caption,category,video_emb,text_emb
0,YouCook2_videos/eQZEf3NCCo4_4.mp4,place the seaweed down and put the rice on it,cooking,"[[-0.055789243, 0.026038729, -0.013334454, 0.0...","[[0.020044597, -0.03407648, 0.01915744, 0.0255..."
1,YouCook2_videos/c9eELn4axpg_0.mp4,cut the avocado and place in a bowl,cooking,"[[-0.015321425, -0.0031113576, 0.00830998, -0....","[[0.030471895, -0.024026817, 0.04135763, 0.004..."
2,YouCook2_videos/F564e476ULM_7.mp4,place the lobster on a towel to dry off,cooking,"[[-0.002775279, 0.023662115, -0.03618147, 0.02...","[[0.019301496, -0.022063183, 0.038658563, 0.00..."
3,YouCook2_videos/vVZsj1t9R70_6.mp4,boil the chicken stock and add the chicken mus...,cooking,"[[0.009529003, 0.027561307, 0.05359916, 0.0276...","[[0.023898594, -0.022719027, 0.0367974, 0.0206..."
4,YouCook2_videos/sGzBQrg1adY_9.mp4,add marsala powder,cooking,"[[-0.06052983, 0.015003544, -0.0471395, 0.0146...","[[0.013476306, -0.011583632, 0.03308287, -0.00..."
...,...,...,...,...,...
95,YouCook2_videos/sGzBQrg1adY_0.mp4,add black cardamom to a large pot,cooking,"[[0.0061335512, -0.0031297728, -0.016839009, 0...","[[0.007411627, -0.010071175, 0.04222338, 0.024..."
96,YouCook2_videos/YRZ8zZElALQ_2.mp4,cover with plastic wrap and the mat and flip over,cooking,"[[-0.017831137, 0.017088057, -0.014022268, 0.0...","[[0.009080215, -0.020269133, 0.01785999, 0.017..."
97,YouCook2_videos/Mzn6Q4gUDBo_0.mp4,slice the potato pumpkin and eggplant,cooking,"[[-0.006812448, -0.009627244, 0.043367404, -0....","[[0.03180502, -0.037126847, 0.031412125, 0.014..."
98,YouCook2_videos/QISvGTL2VDc_3.mp4,place the noodles and bean sprout in boiling w...,cooking,"[[-0.035623465, 0.008497529, 0.020271558, 0.01...","[[0.034849003, -0.0066987863, 0.041697916, 0.0..."


In [13]:
generate_embeddings(
    pkl_path="YouCook2_videos/youcook2_dataset.pkl",
    save_path="YouCook2_videos/siglip_embeddings.pkl",
    model_name="siglip",
    num_frames=8,
    multi_captions=False
)

Загрузка модели: SIGLIP...
Модель загружена
Старт инференса


 10%|█         | 100/1000 [03:59<35:53,  2.39s/it]

Итоговые метрики
Recall@1:  69.00%
Recall@5:  90.00%
Recall@10: 91.00%
Median Rank: 1.0






Unnamed: 0,video_path,caption,category,video_emb,text_emb
0,YouCook2_videos/eQZEf3NCCo4_4.mp4,place the seaweed down and put the rice on it,cooking,"[[0.022520188, 0.022497643, 0.010788886, 0.010...","[[0.0013357521, 0.044625245, 0.020840168, 0.04..."
1,YouCook2_videos/c9eELn4axpg_0.mp4,cut the avocado and place in a bowl,cooking,"[[-0.0050012264, 0.028763447, -0.035141815, 0....","[[0.034264877, 0.014012894, -0.025376717, -0.0..."
2,YouCook2_videos/F564e476ULM_7.mp4,place the lobster on a towel to dry off,cooking,"[[0.018896105, 0.0071423445, -0.023572667, 0.0...","[[0.020457974, 0.011588331, -0.010102598, 0.00..."
3,YouCook2_videos/vVZsj1t9R70_6.mp4,boil the chicken stock and add the chicken mus...,cooking,"[[0.0050523994, 0.0054796115, -0.011549139, 0....","[[0.025584359, -0.0024581614, -0.02377987, 0.0..."
4,YouCook2_videos/sGzBQrg1adY_9.mp4,add marsala powder,cooking,"[[0.018815126, 0.013348903, -0.021074533, -0.0...","[[0.03046106, 0.007963675, 0.00880684, 0.05478..."
...,...,...,...,...,...
95,YouCook2_videos/sGzBQrg1adY_0.mp4,add black cardamom to a large pot,cooking,"[[0.012083572, 0.017505946, -0.0250624, -0.005...","[[0.03488043, -0.009024936, -0.022062473, -0.0..."
96,YouCook2_videos/YRZ8zZElALQ_2.mp4,cover with plastic wrap and the mat and flip over,cooking,"[[0.016129881, 0.02586133, 0.022507144, 0.0228...","[[-0.032752875, -0.00043203577, -0.018012578, ..."
97,YouCook2_videos/Mzn6Q4gUDBo_0.mp4,slice the potato pumpkin and eggplant,cooking,"[[0.022826636, -0.0076447376, 0.0012467931, 0....","[[0.029759703, -0.033345066, -0.0025492143, 0...."
98,YouCook2_videos/QISvGTL2VDc_3.mp4,place the noodles and bean sprout in boiling w...,cooking,"[[0.048664644, 0.01457016, 0.0063882302, -0.00...","[[0.02497317, 0.00725591, -0.03443087, 0.01109..."
