# Преобразование видео в пары звук-картинка

In [None]:
import os
import random
import concurrent.futures
from pathlib import Path
import moviepy.editor as mp
import cv2
import logging
import warnings

# Настройка логгирования и предупреждений
logging.getLogger('moviepy').setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

def get_random_frame(cap, max_attempts=5):
    """Более надежный способ получения случайного кадра"""
    for _ in range(max_attempts):
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            return None
            
        random_frame = random.randint(0, frame_count - 1)
        cap.set(cv2.CAP_PROP_POS_FRAMES, random_frame)
        
        # Даем видео несколько попыток декодирования
        for _ in range(3):
            ret, frame = cap.read()
            if ret:
                return frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, random_frame)
    
    return None

def process_video(video_path, sounds_dir, images_dir):
    try:
        # Обработка аудио
        audio_path = sounds_dir / f"{video_path.stem}.wav"
        clip = mp.VideoFileClip(str(video_path))
            
        if clip.duration > 10:
            audio = clip.subclip(0, 10).audio
        else:
            audio = clip.audio
        
        
        
        # Обработка изображения с улучшенным чтением кадров
        cap = cv2.VideoCapture(str(video_path))
        frame = get_random_frame(cap)
        
        if frame is not None:
            resized = cv2.resize(frame, (128, 128))
            img_path = images_dir / f"{video_path.stem}.jpg"
            cv2.imwrite(str(img_path), resized)

            audio.write_audiofile(str(audio_path), fps=44100, codec='pcm_s16le', verbose=False, logger=None)
        else:
            print(f"Не удалось декодировать файл {video_path}")
        
        cap.release()
        clip.close()
        
    except Exception as e:
        print(f"Ошибка при обработке {video_path}: {str(e)}")

def process_all_videos(root_dir):
    root_path = Path(root_dir)
    sounds_dir = root_path / "sounds"
    images_dir = root_path / "images"

    root_path = root_path / "videos"
    
    sounds_dir.mkdir(exist_ok=True)
    images_dir.mkdir(exist_ok=True)
    
    video_files = []
    for dirpath, _, filenames in os.walk(root_path):
        for filename in filenames:
            if filename.lower().endswith('.mp4'):
                video_files.append(Path(dirpath) / filename)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_video, video_path, sounds_dir, images_dir) 
                 for video_path in video_files]
        concurrent.futures.wait(futures)

In [None]:
root_directory = "data"
# process_all_videos(root_directory)

# Преобразование звука в записанные эмбединги

In [1]:
from torch.utils.data import Dataset, DataLoader
import glob
from torchvision.io import read_image
import torchaudio
import torch
from models.AudioEncoder import AudioEncoder


class SoundDataset(Dataset):
    def __init__(self , image_path, sound_path):
        names = []
        for path in glob.glob(f'{image_path}/*.jpg'):
            name = path.split('/')[-1][:-4]
            names.append(name)

        self.names = names
        self.im_path = image_path
        self.au_path = sound_path
        self.stanart_len = 480000

    def __len__(self):
        return len(self.names)

    def __getitem__(self , index):
        audio_path = f"{self.au_path}/{self.names[index]}.wav"
        waveform, _ = torchaudio.load(audio_path)

        # необходим стерео звук, если он одноканальный то приводим к стерео
        if waveform.shape[0] == 1:
            stereo = torch.zeros((2, waveform.shape[1]), dtype=torch.float)
            stereo[0] = waveform[0]
            stereo[1] = waveform[0]
            waveform = stereo

        elif waveform.shape[0] != 2:
            raise ValueError(f"audio {self.names[index]} must be stereo or mono, but {waveform.shape[0]} channels were given")

        # все тензоры должны быть стандартного размера (только для обучения)
        if waveform.shape[1] < self.stanart_len:
            ext_waveform = torch.zeros((2, self.stanart_len), dtype=torch.float)
            ext_waveform[:, :waveform.shape[1]] = waveform
            waveform = ext_waveform
        elif waveform.shape[1] > self.stanart_len:
            waveform = waveform[:, :self.stanart_len]

        return waveform.float(), self.names[index]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
image_path = "data/images"
sound_path = "data/sounds"

data = SoundDataset(image_path, sound_path)
loader = DataLoader(data, 128)

encode = AudioEncoder("facebook/wav2vec2-base-960h",
                    torch.device(f'cuda:{0}'),
                    48000)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import h5py
import numpy as np

def write_tensors(loader, encode):
    loader = iter(loader)
    
    
    # Параметры
    TENSOR_SHAPE = (499, 768)
    STRING_LENGTH = 32
    CHUNK_SIZE = 128
    TOTAL_RECORDS = len(data)
    
    # Определяем структурированный тип данных
    dt = np.dtype([
        ('tensor', np.float32, TENSOR_SHAPE),
        ('metadata', f'S{STRING_LENGTH}')  # Строка фиксированной длины
    ])
    
    # Создаем HDF5 файл и датасет
    with h5py.File('data/embeds/sound_embeds.h5', 'w') as f:
        dset = f.create_dataset(
            'data',
            shape=(0,),
            maxshape=(None,),
            dtype=dt,
            chunks=(CHUNK_SIZE,)  # Размер чанка = размеру пакета
        )
    
        # Цикл записи по пакетам
        for i in range(0, TOTAL_RECORDS, CHUNK_SIZE):
            num_to_write = min(CHUNK_SIZE, TOTAL_RECORDS - i)

            sound, metadata_strings = next(loader)
            tensors = encode(sound.cuda()).cpu().numpy()
    
            # Подготавливаем буфер для записи
            buffer = np.empty(num_to_write, dtype=dt)
            for k in range(num_to_write):
                buffer[k]['tensor'] = tensors[k]
                buffer[k]['metadata'] = metadata_strings[k].encode('utf-8')
    
            # Расширяем датасет и записываем данные
            dset.resize(dset.shape[0] + num_to_write, axis=0)
            dset[-num_to_write:] = buffer

            print(f"\rWritten rows: {i+CHUNK_SIZE} out of {TOTAL_RECORDS}", end="")

In [4]:
write_tensors(loader, encode)

Written rows: 197888 out of 197889

RuntimeError: Given groups=1, weight of size [512, 1, 10], expected input[1, 160000, 1] to have 1 channels, but got 160000 channels instead