In [2]:
import pandas as pd
import requests
import subprocess
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

category_map = {
    "0": "music",
    "1": "people", 
    "2": "gaming",
    "3": "sports/actions",
    "4": "news/events/politics",
    "5": "education",
    "6": "tv shows",
    "7": "movie/comedy", 
    "8": "animation",
    "9": "vehicles/autos",
    "10": "howto",
    "11": "travel",
    "12": "science/technology",
    "13": "animals/pets",
    "14": "kids/family",
    "15": "documentary",
    "16": "food/drink",
    "17": "cooking",
    "18": "beauty/fashion",
    "19": "advertisement"
}

def download_msrvtt_zip():
    zip_path = "MSRVTT_Videos.zip"
    if not os.path.exists(zip_path):
        url = "https://huggingface.co/datasets/friedrichor/MSR-VTT/resolve/main/MSRVTT_Videos.zip"
        print("Скачиваю архив...")
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
    return zip_path

def extract_video(zip_ref, video_file, output_dir):
    video_path = os.path.join(output_dir, video_file.split('/')[-1])
    if os.path.exists(video_path):
        return video_path
    
    try:
        with zip_ref.open(f"video/{video_file}") as src, open(video_path, 'wb') as dst:
            dst.write(src.read())
    except:
        return None
    
    return video_path

def process_msrvtt_video(zip_ref, item, output_dir):
    video_file = item['video']
    duration = item['end time'] - item['start time']
    
    if duration > 30:
        return None
    
    video_path = extract_video(zip_ref, video_file, output_dir)
    if not video_path:
        return None
    
    return {
        'video_path': video_path,
        'caption': item['caption'],
        'category': category_map.get(str(item['category']), "unknown")
    }

def download_msrvtt_dataset(save_path, num_videos, max_workers=10):
    dataset = load_dataset("friedrichor/MSR-VTT", "train_9k", split="train", streaming=True)
    
    items = []
    for i, item in enumerate(dataset):
        if item['end time'] - item['start time'] <= 30:
            items.append(item)
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_msrvtt_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_msrvtt_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_csv(os.path.join(save_path, "msrvtt_dataset.csv"), index=False)
    print(f"Обработано {len(df)} видео")
    return df

download_msrvtt_dataset("MSRVTT_videos", 100, max_workers=12)

Найдено 100 видео до 30 секунд
Скачиваю архив...


Извлечение: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1640.50it/s]

Обработано 100 видео





Unnamed: 0,video_path,caption,category
0,MSRVTT_videos/video1.mp4,[in a kitchen a woman adds different ingredien...,food/drink
1,MSRVTT_videos/video2.mp4,"[a guying showing a tool, a man fixes a car, a...",vehicles/autos
2,MSRVTT_videos/video0.mp4,"[a car is shown, a group is dancing, a man dri...",vehicles/autos
3,MSRVTT_videos/video4.mp4,"[a girl wearing a black shirt, a man is arguin...",kids/family
4,MSRVTT_videos/video12.mp4,"[a man is angrily talking to another man, a ma...",movie/comedy
...,...,...,...
95,MSRVTT_videos/video96.mp4,[a cellphone s apps are being scrolled page by...,howto
96,MSRVTT_videos/video90.mp4,"[a little dog runs across the floor, a little ...",animals/pets
97,MSRVTT_videos/video97.mp4,"[a family gathered together in a room, a man i...",music
98,MSRVTT_videos/video92.mp4,"[a few horses are riding down a track, a horse...",animals/pets


In [3]:
import pandas as pd
import requests
import subprocess
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import tempfile
import threading
from datasets import load_dataset
import queue

def download_video(video_id):
    url = f"https://huggingface.co/datasets/VLM2Vec/VATEX/resolve/main/raw_videos/{video_id}.mp4"
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
        tmp_path = tmp.name
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            tmp.write(response.content)
            return tmp_path
        except:
            return None

def cut_video(temp_path, video_id, output_dir):
    parts = video_id.split('_')
    start_time = int(parts[-2])
    end_time = int(parts[-1])
    cut_path = os.path.join(output_dir, f"{video_id}.mp4")
    
    if os.path.exists(cut_path):
        os.unlink(temp_path)
        return cut_path
    
    try:
        result = subprocess.run([
            'ffmpeg',
            '-ss', str(start_time),
            '-i', temp_path,
            '-t', str(end_time - start_time),
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-c:a', 'aac',
            '-y',
            '-loglevel', 'error',
            cut_path
        ], capture_output=True, timeout=30)
    except:
        return None
    
    os.unlink(temp_path)
    return cut_path if result.returncode == 0 else None

def process_video(item, output_dir):
    video_id = item['videoID']
    temp_path = download_video(video_id)
    if not temp_path:
        return None
    return cut_video(temp_path, video_id, output_dir)

def download_vatex_dataset(save_path, num_videos, max_workers=20):
    dataset = load_dataset("VLM2Vec/VATEX", "vatex_test", split="test", streaming=True)
    items = []
    for i, item in enumerate(dataset):
        if i >= num_videos:
            break
        items.append(item)
    
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_video, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Обработка"):
            video_path = future.result()
            if video_path:
                item = futures[future]
                with lock:
                    samples.append({
                        'video_path': video_path,
                        'caption': item['enCap'],
                        'category': "people"
                    })
    
    df = pd.DataFrame(samples)
    df.to_csv(os.path.join(save_path, "vatex_dataset.csv"), index=False)
    print(f"Обработано {len(df)} видео")
    return df

download_vatex_dataset("VATEX_videos", 100, max_workers=12)

README.md:   0%|          | 0.00/393 [00:00<?, ?B/s]

Обработка: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.09it/s]

Обработано 0 видео





In [4]:
import pandas as pd
import requests
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from datasets import load_dataset

def download_youcook2_zip():
    zip_path = "YouCookIIVideos.zip"
    url = "https://huggingface.co/datasets/lmms-lab/YouCook2/resolve/main/YouCookIIVideos.zip"
    
    if not os.path.exists(zip_path):
        print("Скачиваю архив...")
        response = requests.get(url, stream=True)
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return zip_path

def extract_video(zip_ref, video_path, output_dir):
    full_video_path = f"YouCookIIVideos/{video_path}"
    filename = os.path.basename(video_path)
    save_path = os.path.join(output_dir, filename)
    
    if os.path.exists(save_path):
        return save_path
    
    try:
        with zip_ref.open(full_video_path) as src, open(save_path, 'wb') as dst:
            dst.write(src.read())
        return save_path
    except:
        return None

def process_youcook2_video(zip_ref, item, output_dir):
    video_path = item['video_path']
    
    if not video_path.endswith('.mp4'):
        return None
    
    start, end = item['segment']
    if end - start > 30:
        return None
    
    extracted_path = extract_video(zip_ref, video_path, output_dir)
    if not extracted_path:
        return None
    
    return {
        'video_path': extracted_path,
        'caption': item['sentence'],
        'category': "cooking"
    }

def download_youcook2_dataset(save_path, num_videos, split="val", max_workers=10):
    dataset = load_dataset("lmms-lab/YouCook2", split=split, streaming=True)
    
    items = []
    for i, item in enumerate(dataset):
        if not item['video_path'].endswith('.mp4'):
            continue
        
        start, end = item['segment']
        if end - start <= 30:
            items.append(item)
        
        if len(items) >= num_videos:
            break
    
    print(f"Найдено {len(items)} видео до 30 секунд")
    
    zip_path = download_youcook2_zip()
    os.makedirs(save_path, exist_ok=True)
    
    samples = []
    lock = threading.Lock()
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_youcook2_video, zip_ref, item, save_path): item for item in items}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Извлечение"):
            result = future.result()
            if result:
                with lock:
                    samples.append(result)
    
    df = pd.DataFrame(samples)
    df.to_csv(os.path.join(save_path, "youcook2_dataset.csv"), index=False)
    print(f"Обработано {len(df)} видео")
    return df

download_youcook2_dataset("YouCook2_videos", 100, max_workers=12)

README.md:   0%|          | 0.00/608 [00:00<?, ?B/s]

Найдено 100 видео до 30 секунд
Скачиваю архив...


Извлечение: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 866.46it/s]

Обработано 100 видео





Unnamed: 0,video_path,caption,category
0,YouCook2_videos/xHr8X2Wpmno_0.mp4,pick the ends off the verdalago,cooking
1,YouCook2_videos/xHr8X2Wpmno_5.mp4,add the fried pita to the salad and mix,cooking
2,YouCook2_videos/xHr8X2Wpmno_1.mp4,combine lemon juice sumac garlic salt and oil ...,cooking
3,YouCook2_videos/V53XmPeyjIU_1.mp4,place chicken in a small bowl and pour brine o...,cooking
4,YouCook2_videos/xHr8X2Wpmno_2.mp4,chop lettuce and place it in a bowl,cooking
...,...,...,...
95,YouCook2_videos/vDDeMg2dhEM_3.mp4,drain the soaked shiitake mushroom discard the...,cooking
96,YouCook2_videos/vDDeMg2dhEM_1.mp4,cut the carrots to julienne,cooking
97,YouCook2_videos/c9eELn4axpg_7.mp4,spread avocado mixture on one toast and on the...,cooking
98,YouCook2_videos/vDDeMg2dhEM_11.mp4,deep fry the egg rolls in a pan with hot oil u...,cooking
