In [None]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import numpy as np
from IPython.display import YouTubeVideo

import requests
import json

import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from scipy.signal import peak_prominences
from scipy.signal import find_peaks

import matplotlib.pyplot as plt
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence


In [None]:
# Root folder that holds the 'video' and 'audio' feature directories.
# Set the YOUTUBE8M_DIR environment variable to point somewhere else; the
# original local path is kept as the fallback so existing setups still work.
path = os.environ.get("YOUTUBE8M_DIR", r"/Users/scottmerrill/Documents/UNC/MultiModal/VMR/Youtube8m")

In [None]:
# os.listdir returns entries in arbitrary order, so sort the listing to make
# "the first file" the same file on every run.
filenames = sorted(os.listdir(os.path.join(path, 'video')))
file_name = filenames[0]

### DataLoader
- The data loader class loads the data in batches of `batch_size` videos.
- It also precomputes the highest and lowest optical-flow segment for each video, so these don't need to be recomputed.

In [4]:
class OpticalFlowProcessor:
    """Find the highest- and lowest-flow segments of an embedding sequence.

    "Optical flow" here is the Euclidean distance between consecutive rows of
    an embedding sequence, smoothed with a moving average. The smoothed curve
    is cut at its most prominent peaks, and the resulting segments are ranked
    by their mean flow.

    Args:
        method: ``'video'`` to compute flow from the ``rgb`` input, ``'audio'``
            to compute it from the ``audio`` input.
        window_size: Stored on the instance.
            NOTE(review): smoothing currently relies on ``_moving_average``'s
            own default window (5) and never reads this value - confirm which
            one is intended before wiring it in.
        max_segments: Number of most-prominent peaks kept as cut candidates.
        min_frames: Minimum distance between two kept cut points.
    """

    def __init__(self, method='video', window_size=20, max_segments=10, min_frames=10):
        self.method = method
        self.window_size = window_size
        self.max_segments = max_segments
        self.min_frames = min_frames

    def get_best_worst_flow(self, rgb, audio):
        """Return ``((top_start, top_end), (bottom_start, bottom_end))``.

        The first pair bounds the segment with the highest mean flow, the
        second the segment with the lowest. Bounds are indices into the
        smoothed flow curve (which is shorter than the input sequence).
        """
        flow = self._compute_flow(rgb, audio)
        segments = self._optical_flow_segments(flow)
        ranks = self._rank_averages(self._compute_segment_means(segments, flow))
        return self._extract_best_worst_segments(segments, ranks)

    def _compute_flow(self, rgb, audio):
        """Return the smoothed flow curve for the modality named by ``self.method``.

        Raises:
            ValueError: If ``self.method`` is neither ``'video'`` nor ``'audio'``.
        """
        if self.method == 'video':
            source = rgb
        elif self.method == 'audio':
            source = audio
        else:
            raise ValueError("Method must be 'video' or 'audio'")
        return self._moving_average(self._calculate_optical_flow_euclidean(source))

    @staticmethod
    def _calculate_optical_flow_euclidean(embedding_seq):
        """Return the Euclidean norm of each consecutive-row difference."""
        return np.linalg.norm(embedding_seq[1:] - embedding_seq[:-1], axis=1)

    @staticmethod
    def _moving_average(arr, window_size=5):
        """Smooth ``arr`` with a uniform window, keeping only full-overlap positions."""
        return np.convolve(arr, np.ones(window_size) / window_size, mode='valid')

    def _optical_flow_segments(self, optical_flow):
        """Return segment boundaries ``[0, cut_1, ..., len(optical_flow)]``.

        Cuts are the ``max_segments`` most prominent peaks of the curve, thinned
        so that neighbouring cuts are at least ``min_frames`` apart. When the
        curve has no peaks the result is just ``[0, len(optical_flow)]``.
        """
        peaks, _ = find_peaks(optical_flow)
        prominences = peak_prominences(optical_flow, peaks)[0]
        # argsort is ascending, so the tail holds the most prominent peaks.
        peak_index = peaks[np.argsort(prominences)[-self.max_segments:]]
        peak_index = self._merge_intervals(np.sort(peak_index))
        return np.insert(np.append(peak_index, len(optical_flow)), 0, 0)

    def _merge_intervals(self, arr):
        """Drop cut points closer than ``min_frames`` to the previously kept one.

        ``arr`` must be sorted ascending. An empty input (no peaks found) yields
        an empty integer array instead of raising ``IndexError``.
        """
        if len(arr) == 0:
            # Keep an integer dtype so the boundaries built from this array can
            # still be used as slice indices.
            return np.array([], dtype=int)
        merged = [arr[0]]
        for candidate in arr[1:]:
            if candidate - merged[-1] >= self.min_frames:
                merged.append(candidate)
        return np.array(merged)

    @staticmethod
    def _compute_segment_means(segments, values):
        """Return the mean of ``values`` over each segment; empty segments score 0."""
        return [values[start:end].mean() if start < end else 0 for start, end in zip(segments[:-1], segments[1:])]

    @staticmethod
    def _rank_averages(averages):
        """Return 1-based ranks for ``averages``; rank 1 is the largest value."""
        sorted_indices = np.argsort(averages)[::-1]
        ranks = np.zeros_like(sorted_indices) + 1
        for rank, idx in enumerate(sorted_indices, start=1):
            ranks[idx] = rank
        return ranks

    def _extract_best_worst_segments(self, segments, ranks):
        """Return the bounds of the rank-1 segment and of the last-ranked segment."""
        best = np.where(ranks == 1)[0][0]
        worst = np.where(ranks == max(ranks))[0][0]
        return (segments[best], segments[best + 1]), (segments[worst], segments[worst + 1])

In [5]:
class VideoAudioDataset(Dataset):
    """Paired video/audio arrays stored as same-named files under ``path``.

    Every entry of ``<path>/video`` is treated as one sample; the matching
    audio array is read from ``<path>/audio`` under the same file name.

    Args:
        path: Root directory containing the ``video`` and ``audio`` folders.
        video_dim: Number of leading columns kept from each video array.
    """

    def __init__(self, path, video_dim=1024):
        self.path = path
        self.video_dim = video_dim
        # os.listdir returns entries in arbitrary order; sorting makes an index
        # refer to the same file on every run.
        # NOTE(review): the listing is not filtered, so any stray entry in the
        # video folder would be treated as a sample.
        self.filenames = sorted(os.listdir(os.path.join(path, 'video')))

    def __len__(self):
        """Return the number of entries found in the video folder."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load and return ``(video_data, audio_data)`` for the ``idx``-th file."""
        filename = self.filenames[idx]
        video_data = np.load(os.path.join(self.path, 'video', filename))
        audio_data = np.load(os.path.join(self.path, 'audio', filename))
        # Keep only the first ``video_dim`` columns of the video array.
        video_data = video_data[:, :self.video_dim]
        return video_data, audio_data

In [6]:
def collate_fn(batch, processor):
    """Collate ``(video, audio)`` samples and attach their flow segments.

    Args:
        batch: Sequence of ``(video, audio)`` pairs.
        processor: Object exposing ``get_best_worst_flow(video, audio)``.

    Returns:
        ``(videos, audios, flow_ranks)`` - three lists with one entry per
        sample. Videos and audios are float32 tensors kept as lists rather than
        stacked; ``flow_ranks`` holds the processor's result for each pair.
    """
    def as_float_tensors(arrays):
        return [torch.tensor(array, dtype=torch.float32) for array in arrays]

    raw_videos, raw_audios = zip(*batch)
    videos = as_float_tensors(raw_videos)
    audios = as_float_tensors(raw_audios)
    flow_ranks = [
        processor.get_best_worst_flow(video, audio)
        for video, audio in zip(videos, audios)
    ]
    return videos, audios, flow_ranks

def get_dataloader(path, batch_size=32, shuffle=True, method='video', window_size=20, num_workers=0):
    """Build a DataLoader over ``VideoAudioDataset(path)`` with flow-aware collation.

    Args:
        path: Root directory handed to ``VideoAudioDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples.
        method: Forwarded to ``OpticalFlowProcessor`` (``'video'`` or ``'audio'``).
        window_size: Forwarded to ``OpticalFlowProcessor``.
        num_workers: Forwarded to ``DataLoader``; 0 loads in the main process.

    Returns:
        A ``DataLoader`` whose batches are ``(videos, audios, flow_ranks)`` as
        produced by ``collate_fn``.
    """
    from functools import partial

    dataset = VideoAudioDataset(path)
    processor = OpticalFlowProcessor(method=method, window_size=window_size)
    # A partial over the module-level collate_fn can be pickled, unlike a
    # lambda, which matters once worker processes are used.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=partial(collate_fn, processor=processor),
    )


In [9]:
# Smoke test: build a two-sample loader and look at its first batch only.
dataloader = get_dataloader(
    path,
    batch_size=2,
    shuffle=True,
    method='video',
    window_size=20,
)
for video_batch, audio_batch, flow_ranks in dataloader:
    # flow_ranks is unpacked here but not printed.
    print(video_batch, audio_batch)
    break  # stop after the first batch

[tensor([[0.0708, 0.2848, 0.9329,  ..., 0.0251, 1.0495, 1.0415],
        [0.0709, 0.2851, 0.9354,  ..., 0.0251, 1.0503, 1.0413],
        [0.0709, 0.2852, 0.9370,  ..., 0.0251, 1.0503, 1.0408],
        ...,
        [0.1030, 0.0788, 0.5218,  ..., 0.1216, 0.1901, 0.4744],
        [0.1121, 0.0774, 0.5024,  ..., 0.1099, 0.2072, 0.4266],
        [0.1112, 0.0767, 0.5026,  ..., 0.1036, 0.2095, 0.4245]]), tensor([[1.2378, 0.1007, 0.1081,  ..., 0.0797, 2.4997, 0.0496],
        [1.4817, 0.0463, 0.0477,  ..., 0.1112, 0.5615, 0.0599],
        [0.8693, 0.2124, 0.2118,  ..., 0.0164, 1.3910, 0.1514],
        ...,
        [0.2726, 0.0000, 0.1176,  ..., 0.0000, 0.0000, 0.0060],
        [0.2726, 0.0000, 0.1176,  ..., 0.0000, 0.0000, 0.0060],
        [0.2726, 0.0000, 0.1176,  ..., 0.0000, 0.0000, 0.0060]])] [tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2288, 0.1777],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2288, 0.1777],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2288, 0.1777],
        