In [1]:
import numpy as np
import pandas as pd
import os

import numpy as np
from IPython.display import YouTubeVideo

import requests
import json

import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from scipy.signal import peak_prominences
from scipy.signal import find_peaks

import matplotlib.pyplot as plt
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import itertools
import random
import ruptures as rpt

In [2]:
path = r"/Users/scottmerrill/Documents/UNC/MultiModal/VMR/Youtube8m"
filenames = os.listdir(path + '/video')
file_name = filenames[0]

### 1.  DataLoader

In [3]:
class VideoAudioDataset(Dataset):
    def __init__(self, path):
        self.path = path
        self.filenames = os.listdir(os.path.join(path, 'video'))
    
    def __len__(self):
        return len(self.filenames)
    
    def __getitem__(self, idx):
        filename = self.filenames[idx]
        video_data = np.load(os.path.join(self.path, 'video', filename))
        audio_data = np.load(os.path.join(self.path, 'audio', filename))
        video_data = video_data[:, :1024]
        return video_data, audio_data

### 2. Transformer Class

In [182]:
class Transformer(nn.Module):
    def __init__(self, input_dim=1024, embed_dim=512, num_heads=8, num_layers=2, max_seq_len=50):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, embed_dim)  # Project input to embedding dim
        self.pos_encoder = self._generate_sinusoidal_positional_encoding(max_seq_len, embed_dim)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads), num_layers=num_layers)
        self.output_proj = nn.Linear(embed_dim, embed_dim)  # Project to final embedding

    def _generate_sinusoidal_positional_encoding(self, max_len, embed_dim):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)

    def forward(self, x, mask):
        x = self.input_proj(x)  # Shape: (seq_len, embed_dim)
        seq_len = x.size(0)
        x = x + self.pos_encoder[:, :seq_len, :].squeeze(0).to(x.device)
        x = self.transformer(x.unsqueeze(1), src_key_padding_mask=mask).squeeze(1)
        x = x.mean(dim=0)  # Aggregate sequence to fixed-size embedding
        return self.output_proj(x)

### Optical Flow Class

In [222]:
class OpticalFlowProcessor:
    def __init__(self, method='video', window_size=10, min_segments=9, min_frames=10):
        self.method = method
        self.window_size = window_size
        self.min_segments = min_segments
        self.min_frames = min_frames

    def get_of_ranks(self, rgb, audio):
        flow = self._compute_flow(rgb, audio)
        segments = self._optical_flow_segments(flow)
        ranks = self._rank_averages(self._compute_segment_means(segments, flow))
        return ranks

    def get_best_worst_flow(self, rgb, audio):
        flow = self._compute_flow(rgb, audio)
        segments = self._optical_flow_segments(flow)
        ranks = self._rank_averages(self._compute_segment_means(segments, flow))
        return self._extract_best_worst_segments(segments, ranks)

    def _compute_flow(self, rgb, audio):
        if self.method == 'video':
            return self._moving_average(self._calculate_optical_flow_euclidean(rgb))
        elif self.method == 'audio':
            return self._moving_average(self._calculate_optical_flow_euclidean(audio))
        else:
            raise ValueError("Method must be 'video' or 'audio'")

    @staticmethod
    def _calculate_optical_flow_euclidean(embedding_seq):
        return np.linalg.norm(embedding_seq[1:] - embedding_seq[:-1], axis=1)

    @staticmethod
    def _moving_average(arr, window_size=5):
        return np.convolve(arr, np.ones(window_size) / window_size, mode='valid')

    def _optical_flow_segments_old(self, optical_flow):
        peaks, _ = find_peaks(optical_flow)
        prominences = peak_prominences(optical_flow, peaks)[0]
        peak_index = peaks[np.argsort(prominences)[-self.max_segments:]]
        peak_index = self._merge_intervals(np.sort(peak_index))
        return np.insert(np.append(peak_index, len(optical_flow)), 0, 0)

    def _optical_flow_segments(self, optical_flow_video, max_seq_len=100):
    
        algo = rpt.Dynp(model='l2', min_size=self.min_segments, jump=3).fit(optical_flow_video)
        change_points = algo.predict(n_bkps=min_segments)  # The 'pen' parameter controls sensitivity

        # insert zero for start segment
        change_points.insert(0,0)
        return change_points

    
    def _merge_intervals(self, arr):
        merged = [arr[0]]
        for i in range(1, len(arr)):
            if arr[i] - merged[-1] >= self.min_frames:
                merged.append(arr[i])
        return np.array(merged)

    @staticmethod
    def _compute_segment_means(segments, values):
        return [values[start:end].mean() if start < end else 0 for start, end in zip(segments[:-1], segments[1:])]

    @staticmethod
    def _rank_averages(averages):
        sorted_indices = np.argsort(averages)[::-1]
        ranks = np.zeros_like(sorted_indices) + 1
        for rank, idx in enumerate(sorted_indices):
            ranks[idx] = rank + 1
        return ranks

    def _extract_best_worst_segments(self, segments, ranks):
        top_start, top_end = segments[np.where(ranks == 1)[0][0]], segments[np.where(ranks == 1)[0][0] + 1]
        bottom_start, bottom_end = segments[np.where(ranks == max(ranks))[0][0]], segments[np.where(ranks == max(ranks))[0][0] + 1]
        return (top_start, top_end), (bottom_start, bottom_end)

### Script Functions

In [223]:
def collate_fn(batch, processor):
    video_batch, audio_batch = zip(*batch)
    video_batch = [torch.tensor(v, dtype=torch.float32) for v in video_batch]
    audio_batch = [torch.tensor(a, dtype=torch.float32) for a in audio_batch]
    flow_ranks = [processor.get_of_ranks(video_batch[i], audio_batch[i]) for i in range(len(video_batch))]
    return video_batch, audio_batch, flow_ranks

def get_dataloader(path, batch_size=32, shuffle=True, method='video', window_size=20):
    dataset = VideoAudioDataset(path)
    processor = OpticalFlowProcessor(method=method, window_size=window_size)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=lambda batch: collate_fn(batch, processor))

def perform_feature_padding(video_features, audio_features, start_segment, end_segment, max_seq_len):
    vf = video_features.clone().detach()
    af = audio_features.clone().detach()
    vf =vf[start_segment:end_segment,:]
    af = af[start_segment:end_segment,:]

    pvf = torch.zeros(max_seq_len, 1024)
    pvf[:vf.shape[0], :] = vf

    paf = torch.zeros(max_seq_len, 128)
    paf[:af.shape[0], :] = af

    # Create mask (True for padding positions)
    mask = torch.arange(max_seq_len) >= vf.shape[0]
    mask = mask.unsqueeze(0)  # Convert to 2D (batch_size=1, seq_len)
    return pvf, paf, mask

# Function to find pairs with approximately equal differences
def find_matching_index_pairs(array1, array2, tolerance=5):
    # Calculate differences in array1 and array2
    array1_diffs = np.diff(array1)
    array2_diffs = np.diff(array2)

    matching_pairs = []

    # Loop through differences in array1
    for i, diff1 in enumerate(array1_diffs):
        # Find pairs of consecutive indices in array2 with similar differences
        for j, diff2 in enumerate(array2_diffs):
            if abs(diff1 - diff2) <= tolerance:  # If the difference is within the tolerance
                matching_pairs.append(((i, i + 1), (j, j + 1), diff1, diff2))

    return matching_pairs


def get_similar_length_segments(positive_segments, negative_segments, tolerance = 5):
    
    while True:
        matching_indexes = find_matching_index_pairs(positive_segments, negative_segments, tolerance=tolerance)
        tolerance += 5
        if len(matching_indexes) > 0:
            break
            
    # sample randomly for all segments within the tolerance band
    pos_segment, negative_segment, pos_time, neg_time = matching_indexes[np.random.randint(0, len(matching_indexes))]
    
    return pos_segment, negative_segment, pos_time, neg_time

def get_positive_negative_embeddings(filenames, tolerance=5):

    positive_record_file = np.random.choice(filenames)
    negative_record_file = np.random.choice(filenames)

    positive_rgb, positive_audio = sample_dataset(positive_record_file)
    negative_rgb, negative_audio = sample_dataset(negative_record_file)

    optical_flow_pos = calculate_optical_flow_euclidean(positive_rgb)
    optical_flow_pos = moving_average(optical_flow_pos, window_size=20)

    optical_flow_neg = calculate_optical_flow_euclidean(negative_rgb)
    optical_flow_neg = moving_average(optical_flow_neg, window_size=20)

    positive_segments = optical_flow_segments(optical_flow_pos)
    negative_segments = optical_flow_segments(optical_flow_neg)

    pos_segment, negative_segment, pos_time, neg_time = get_similar_length_segments(positive_segments, negative_segments, tolerance = 5)

    # converting segment index to time in seconds
    pos_start, pos_end = pos_segment
    pos_start = positive_segments[pos_start]
    pos_end = positive_segments[pos_end]
    print(pos_start, pos_end)

    # retrieving segment and padding it appropriately
    pos_video, pos_audio, pos_mask = perform_feature_padding(positive_rgb, positive_audio, pos_start, pos_end, max_seq_len)

    # computing embedding
    positive_video_embedding = video_model(pos_video, pos_mask)
    positive_audio_embedding = audio_model(pos_audio, pos_mask)

    # converting segment index to time in seconds
    neg_start, neg_end = negative_segment
    neg_start = negative_segments[neg_start]
    neg_end = negative_segments[neg_end]

    # retrieving segment and padding it appropriately
    neg_video, neg_audio, neg_mask = perform_feature_padding(negative_rgb, negative_audio, neg_start, neg_end, max_seq_len)

    # computing embedding
    negative_video_embedding = video_model(neg_video, neg_mask)
    negative_audio_embedding = audio_model(neg_audio, neg_mask)
    return positive_video_embedding, positive_audio_embedding, negative_video_embedding, negative_audio_embedding

def save_checkpoint(model, optimizer, epoch, filename):
    """Saves model and optimizer state dict."""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)
    print(f"Checkpoint saved at epoch {epoch} to {filename}")


In [224]:
def get_positive_negative_embeddings(video_batch, tolerance=5):

    positive_record_file = np.random.choice(filenames)
    negative_record_file = np.random.choice(filenames)

    positive_rgb, positive_audio = sample_dataset(positive_record_file)
    negative_rgb, negative_audio = sample_dataset(negative_record_file)

    optical_flow_pos = calculate_optical_flow_euclidean(positive_rgb)
    optical_flow_pos = moving_average(optical_flow_pos, window_size=20)

    optical_flow_neg = calculate_optical_flow_euclidean(negative_rgb)
    optical_flow_neg = moving_average(optical_flow_neg, window_size=20)

    positive_segments = optical_flow_segments(optical_flow_pos)
    negative_segments = optical_flow_segments(optical_flow_neg)

    pos_segment, negative_segment, pos_time, neg_time = get_similar_length_segments(positive_segments, negative_segments, tolerance = 5)

    # converting segment index to time in seconds
    pos_start, pos_end = pos_segment
    pos_start = positive_segments[pos_start]
    pos_end = positive_segments[pos_end]
    print(pos_start, pos_end)

    # retrieving segment and padding it appropriately
    pos_video, pos_audio, pos_mask = perform_feature_padding(positive_rgb, positive_audio, pos_start, pos_end, max_seq_len)

    # computing embedding
    positive_video_embedding = video_model(pos_video, pos_mask)
    positive_audio_embedding = audio_model(pos_audio, pos_mask)

    # converting segment index to time in seconds
    neg_start, neg_end = negative_segment
    neg_start = negative_segments[neg_start]
    neg_end = negative_segments[neg_end]

    # retrieving segment and padding it appropriately
    neg_video, neg_audio, neg_mask = perform_feature_padding(negative_rgb, negative_audio, neg_start, neg_end, max_seq_len)

    # computing embedding
    negative_video_embedding = video_model(neg_video, neg_mask)
    negative_audio_embedding = audio_model(neg_audio, neg_mask)
    return positive_video_embedding, positive_audio_embedding, negative_video_embedding, negative_audio_embedding

In [225]:
max_seq_len = 100


def get_segmentd_embeddings(video_model, audio_model, vid, aud):
    vid_segment_embeddings = []
    

    flow = OpticalFlowProcessor()._compute_flow(vid, aud)
    segments = OpticalFlowProcessor()._optical_flow_segments(flow)

    vid_segment_embeddings = []
    aud_segment_embeddings = []
    for i in range(1, len(segments)):
        start = segments[i-1]
        end = segments[i]

        vid_emb, aud_emb, mask = perform_feature_padding(vid, aud, start, end, max_seq_len)
        
        vid_segment_embeddings.append(video_model(vid_emb, mask))
        aud_segment_embeddings.append(audio_model(aud_emb, mask))
    return vid_segment_embeddings, aud_segment_embeddings

In [226]:
num_heads =1
num_layers=1
audio_model = Transformer(input_dim=128, embed_dim=256, num_heads=num_heads, num_layers=num_layers, max_seq_len=max_seq_len)
video_model = Transformer(input_dim=1024, embed_dim=256, num_heads=num_heads, num_layers=num_layers, max_seq_len=max_seq_len)

def get_batch_embeddings(video_model, audio_model, video_batch, audio_batch):
    # We precompute the segment embeddings in each batch.  We do this once and then proceed to processing batch
    batch_vid_embeddings = []
    batch_aud_embeddings = []
    for i in range(len(video_batch)):
        vid = video_batch[i]
        aud = audio_batch[i]
        vid_sgmt_emb, aud_sgmt_emb = get_segmentd_embeddings(video_model, audio_model, vid, aud)
        batch_vid_embeddings.extend(vid_sgmt_emb)
        batch_aud_embeddings.extend(aud_sgmt_emb)
        
    # Shape will by (total segments X embedding dim)
    # total segments is clip dependent
    batch_aud_embeddings = torch.stack(batch_aud_embeddings)
    batch_vid_embeddings = torch.stack(batch_vid_embeddings)
    
    # MAKE SURE VECTORS ARE NORMALIZED FIRST idk if I want to do here or later..
    batch_aud_embeddings = torch.nn.functional.normalize(batch_aud_embeddings, p=2, dim=1)
    batch_vid_embeddings = torch.nn.functional.normalize(batch_vid_embeddings, p=2, dim=1)

    return batch_aud_embeddings, batch_vid_embeddings

In [227]:
def get_intermodal_loss(batch_vid_embeddings, batch_aud_embeddings, k=5, min_val=0):
    # batch_vid_embeddings and  batch_aud_embeddings should already be normalized so 
    # multiplying them is a similarity metric

    # convert simliarity to distance by (-1) >> high value indicates the distance between the samples is long
    dist_xy = (-1) *torch.matmul(batch_vid_embeddings, batch_aud_embeddings.T)

    positive_pairs = torch.diag(dist_xy)

    # Get non-diagonal elements (negative examples)
    # First, create a mask for non-diagonal elements
    mask = ~torch.eye(dist_xy.size(0), dtype=torch.bool)

    # Apply the mask to extract non-diagonal elements
    negative_pairs = dist_xy[mask]

    # First we find the positive pairs that are furthest in embedding space
    topk_pos_values, _ = torch.topk(positive_pairs.flatten(), k, largest=True)

    # next we find the negative pairs that are closest in embedding space
    topk_neg_values, _ = torch.topk(negative_pairs.flatten(), k, largest=False)

    # expand so we compare all possible combinations of pos/neg pairs
    topk_pos_values_expanded = topk_pos_values.unsqueeze(1)  # Shape: (k, 1)
    topk_neg_values_expanded = topk_neg_values.unsqueeze(0)  # Shape: (1, k)
    loss = torch.maximum(torch.tensor(min_value), topk_pos_values_expanded - topk_neg_values_expanded)
    loss = loss.mean()
    return loss

## Training Loop

In [228]:
### Parameters

In [366]:
lambda1 = 0.33
lambda2 = 0.33
lambda3 = 0.33

lr = 1e-4

# Define the Adam optimizer for the audio model
audio_optimizer = optim.Adam(audio_model.parameters(), lr=lr)

# Define the Adam optimizer for the video model
video_optimizer = optim.Adam(video_model.parameters(), lr=lr)

batch_size = 10
window_size = 20 # for optical flow smoothing (ie 20 frame mavg flow)

dataloader = get_dataloader(path, batch_size=batch_size, shuffle=True, method='video', window_size=window_size)
num_flow_matching = 10
k = 20
margin = 0.1
triplet_loss = nn.TripletMarginLoss(margin=margin, p=4, eps=1e-3)


In [367]:
# Batch iterator
for video_batch, audio_batch, flow_ranks in dataloader:
    try:
        audio_optimizer.zero_grad()
        video_optimizer.zero_grad()
        total_loss = 0

        # create segments for each batch and compute embeddings for the segments
        # stack all the embeddings into single tensors
        batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)

        
        # 1. Inter-modal loss
        inter_modal_loss = get_intermodal_loss(batch_vid_embeddings, batch_aud_embeddings, k=5, min_val=0)


        # 2. optical flow loss

        # this code finds the top and bottom ranked optical flow for a particular
        # video.  This is specified in flow ranks.  It then converts these indexes to 
        # their corresponding position in the stacked embeddings
        top_rank_idxs = []
        bottom_rank_idxs = []
        current_index = 0
        for ranks in flow_ranks:
            top_rank_idxs.append(current_index + np.argmin(ranks))
            bottom_rank_idxs.append(current_index + np.argmax(ranks))
            current_index += len(ranks)

        # Randomly choose num_flow_matching pairs to match (top_rank, top_rank, bottom rank)
        top_matching_samples = list(itertools.product(top_rank_idxs, top_rank_idxs, bottom_rank_idxs))
        top_matching_samples = [random.choice(top_matching_samples) for _ in range(num_flow_matching)]

        # Randomly choose num_flow_matching pairs to match (bottom, bottom, top_rank rank)
        bottom_matching_samples = list(itertools.product(bottom_rank_idxs, bottom_rank_idxs, top_rank_idxs))
        bottom_matching_samples = [random.choice(bottom_matching_samples) for _ in range(num_flow_matching)]

        of_loss_top = 0
        for anchor, pos, neg in top_matching_samples:
            of_loss_top += triplet_loss(batch_vid_embeddings[anchor], batch_vid_embeddings[pos], batch_vid_embeddings[neg])
            of_loss_top += triplet_loss(batch_aud_embeddings[anchor], batch_aud_embeddings[pos], batch_aud_embeddings[neg])

        of_loss_bottom = 0
        for anchor, pos, neg in bottom_matching_samples:
            of_loss_bottom += triplet_loss(batch_vid_embeddings[anchor], batch_vid_embeddings[pos], batch_vid_embeddings[neg])
            of_loss_bottom += triplet_loss(batch_aud_embeddings[anchor], batch_aud_embeddings[pos], batch_aud_embeddings[neg])

        loss = lambda1*inter_modal_loss + lambda2*of_loss_top + lambda3*of_loss_bottom
        loss.backward()
        audio_optimizer.step()
        video_optimizer.step()
        
    except Exception as e:
        # adding a wrapper just in case
        print(e)
    break

In [354]:
for video_batch, audio_batch, flow_ranks in dataloader:
    break

In [None]:
flow_ranks

In [None]:
batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)


In [None]:
k=1
min_val = 0
# batch_vid_embeddings and  batch_aud_embeddings should already be normalized so 
# multiplying them is a similarity metric

# convert simliarity to distance by (-1) >> high value indicates the distance between the samples is long
dist_xy = (-1) *torch.matmul(batch_vid_embeddings, batch_aud_embeddings.T)

positive_pairs = torch.diag(dist_xy)

# Get non-diagonal elements (negative examples)
# First, create a mask for non-diagonal elements
mask = ~torch.eye(dist_xy.size(0), dtype=torch.bool)

# Apply the mask to extract non-diagonal elements
negative_pairs = dist_xy[mask]

# top k positive and negative pairs
topk_pos_values, _ = torch.topk(positive_pairs.flatten(), k)
topk_neg_values, _ = torch.topk(negative_pairs.flatten(), k)

#
topk_pos_values_expanded = topk_pos_values.unsqueeze(1)  # Shape: (3, 1)
topk_neg_values_expanded = topk_neg_values.unsqueeze(0)  # Shape: (1, 3)
loss = torch.maximum(torch.tensor(0.0), topk_pos_values_expanded - topk_neg_values_expanded)
loss.mean()

In [None]:
top_k_neg_pair_xy

In [316]:
dist_pos_pair

<tf.Tensor: shape=(13, 13), dtype=float32, numpy=
array([[-3.3602990e-02, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06],
       [-1.0000000e+06, -4.2709179e-02, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06],
       [-1.0000000e+06, -1.0000000e+06, -2.5183383e-02, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06],
       [-1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -3.7141740e-02,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.0000000e+06, -1.0000000e+06, -1.0000000e+06, -1.0000000e+06,
        -1.

In [317]:
# convert simliarity to distance by (-1) >> high value indicates the distance between the samples is long
dist_xy = (-1) *torch.matmul(batch_vid_embeddings, batch_aud_embeddings.T)

positive_pairs = torch.diag(dist_xy)

# Get non-diagonal elements (negative examples)
# First, create a mask for non-diagonal elements
mask = ~torch.eye(dist_xy.size(0), dtype=torch.bool)

# Apply the mask to extract non-diagonal elements
negative_pairs = dist_xy[mask]

# First we find the positive pairs that are furthest in embedding space
topk_pos_values, _ = torch.topk(positive_pairs.flatten(), k, largest=True)

# next we find the negative pairs that are closest in embedding space
topk_neg_values, _ = torch.topk(negative_pairs.flatten(), k, largest=False)

# expand so we compare all possible combinations of pos/neg pairs
topk_pos_values_expanded = topk_pos_values.unsqueeze(1)  # Shape: (k, 1)
topk_neg_values_expanded = topk_neg_values.unsqueeze(0)  # Shape: (1, k)
loss = torch.maximum(torch.tensor(min_value), topk_pos_values_expanded - topk_neg_values_expanded)
loss = loss.mean()

In [318]:
loss.mean()

tensor(0.0671, grad_fn=<MeanBackward0>)

In [319]:
topk_neg_values

tensor([0.0460, 0.0485, 0.0492, 0.0504, 0.0513, 0.0522, 0.0524, 0.0524, 0.0525,
        0.0540, 0.0542, 0.0548, 0.0556, 0.0557, 0.0561, 0.0562, 0.0563, 0.0564,
        0.0564, 0.0564], grad_fn=<TopkBackward0>)

In [320]:
topk__values

NameError: name 'topk__values' is not defined

In [321]:
-0.0261-( -0.0535)

0.027399999999999997

In [322]:
#dist_xy = dist_xy.deatch().numpy()
dist_xy = (-1) *torch.matmul(batch_vid_embeddings, batch_aud_embeddings.T)
dist_xy = dist_xy.detach().numpy()
dist_pos_pair = tf.where(aff_xy, dist_xy, tf.ones_like(dist_xy, dtype=tf.float32) * (-1e+6))
dist_neg_pair = tf.where(tf.logical_not(aff_xy), dist_xy, tf.ones_like(dist_xy, dtype=tf.float32) * (1e+6))

top_k_pos_pair_xy, _ = tf.nn.top_k(dist_pos_pair, k)
top_k_pos_pair_yx, _ = tf.nn.top_k(tf.transpose(dist_pos_pair), k)

top_k_neg_pair_xy, _ = tf.nn.top_k(tf.negative(dist_neg_pair), k=k)
top_k_neg_pair_yx, _ = tf.nn.top_k(tf.transpose(tf.negative(dist_neg_pair)), k=k)

InvalidArgumentError: {{function_node __wrapped__SelectV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} condition [13,13], then [100,100], and else [100,100] must be broadcastable [Op:SelectV2] name: 

In [323]:
torch.topk(positive_pairs.flatten(), k, largest=True)

torch.return_types.topk(
values=tensor([0.1260, 0.1260, 0.1246, 0.1228, 0.1220, 0.1218, 0.1208, 0.1208, 0.1202,
        0.1201, 0.1199, 0.1191, 0.1190, 0.1187, 0.1185, 0.1185, 0.1184, 0.1180,
        0.1168, 0.1168], grad_fn=<TopkBackward0>),
indices=tensor([73, 79, 42, 43, 74, 21, 45, 32, 92, 36, 44, 33, 14, 29, 96, 31, 37, 10,
         6, 57]))

In [324]:
top_k_pos_pair_yx

<tf.Tensor: shape=(13, 3), dtype=float32, numpy=
array([[-3.3602990e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.2709179e-02, -1.0000000e+06, -1.0000000e+06],
       [-2.5183383e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.7141740e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.5998885e-02, -1.0000000e+06, -1.0000000e+06],
       [-1.7300233e-02, -1.0000000e+06, -1.0000000e+06],
       [-2.6054785e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.9366400e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.5822185e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.0678170e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.6392711e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.5263870e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.6147674e-02, -1.0000000e+06, -1.0000000e+06]], dtype=float32)>

In [325]:
top_k_pos_pair_xy

<tf.Tensor: shape=(13, 3), dtype=float32, numpy=
array([[-3.3602990e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.2709179e-02, -1.0000000e+06, -1.0000000e+06],
       [-2.5183383e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.7141740e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.5998885e-02, -1.0000000e+06, -1.0000000e+06],
       [-1.7300233e-02, -1.0000000e+06, -1.0000000e+06],
       [-2.6054785e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.9366400e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.5822185e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.0678170e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.6392711e-02, -1.0000000e+06, -1.0000000e+06],
       [-3.5263870e-02, -1.0000000e+06, -1.0000000e+06],
       [-4.6147674e-02, -1.0000000e+06, -1.0000000e+06]], dtype=float32)>

In [326]:
top_k_neg_pair_xy

<tf.Tensor: shape=(13, 3), dtype=float32, numpy=
array([[0.05375832, 0.05159993, 0.05075669],
       [0.05293407, 0.05169568, 0.05119767],
       [0.05683632, 0.05351436, 0.05201124],
       [0.04753361, 0.04561421, 0.04552523],
       [0.05258773, 0.05053359, 0.04976318],
       [0.02967066, 0.02767048, 0.02642583],
       [0.0351287 , 0.03366867, 0.0335088 ],
       [0.05294877, 0.05142174, 0.05061736],
       [0.05047916, 0.04883974, 0.0488084 ],
       [0.04248676, 0.0408433 , 0.03894409],
       [0.04294197, 0.04116744, 0.04040598],
       [0.03703857, 0.0348169 , 0.03311012],
       [0.04446257, 0.04444519, 0.04253142]], dtype=float32)>

In [327]:
top_k_neg_pair_yx

<tf.Tensor: shape=(13, 3), dtype=float32, numpy=
array([[0.03689217, 0.03509057, 0.03457905],
       [0.04213858, 0.04175154, 0.04088757],
       [0.02850987, 0.02848868, 0.02655753],
       [0.04275013, 0.04201677, 0.041509  ],
       [0.04033705, 0.03785755, 0.03764169],
       [0.04292034, 0.04264598, 0.04182005],
       [0.04283515, 0.04231308, 0.04121768],
       [0.05102314, 0.04979153, 0.04940118],
       [0.04943028, 0.04871679, 0.04826463],
       [0.05351436, 0.05169568, 0.05159993],
       [0.04795438, 0.04740145, 0.04738085],
       [0.05201124, 0.05119767, 0.05075669],
       [0.05683632, 0.05375832, 0.05294877]], dtype=float32)>

In [328]:
topk_neg_values2

tensor([-0.0568, -0.0538, -0.0535], grad_fn=<TopkBackward0>)

In [419]:
batch_size = 20
dataloader = get_dataloader(path, batch_size=batch_size, shuffle=True, method='video', window_size=window_size)

for video_batch, audio_batch, flow_ranks in dataloader:

    break

In [420]:
# To test we need to conver these segmented embeddings to 
# stacked segmented emebeddings each corresponding to a single video/audio
batch_vid_embeddings_long = batch_vid_embeddings.reshape(batch_size, -1)
batch_aud_embeddings_long = batch_aud_embeddings.reshape(batch_size, -1)

In [421]:
similarity_matrix = torch.matmul(batch_vid_embeddings_long, batch_aud_embeddings_long.T)

In [422]:
similarity_matrix.shape

torch.Size([20, 20])

In [423]:
def top_k_recall1(similarity_matrix, k):
    """
    Compute top-k recall from a similarity matrix.
    
    similarity_matrix: Tensor of shape (batch_size, batch_size), where
                       diagonal elements represent positive pairs.
    k: The value of k for top-k recall.
    
    Returns:
        recall: The top-k recall for each row (video or audio example).
    """
    batch_size = similarity_matrix.size(0)
    
    # Create a mask for the diagonal (positive examples)
    diagonal_mask = torch.eye(batch_size, device=similarity_matrix.device)
    
    # We will calculate recall for each row (video or audio)
    recall_list = []
    
    for i in range(batch_size):
        # Get the similarity values for the current row
        row_similarities = similarity_matrix[i]
        
        # Find the top K indices based on similarity (excluding the diagonal element)
        _, top_k_indices = torch.topk(row_similarities, k)
        
        # Check if the true positive (diagonal element) is in the top-k indices
        # The diagonal element is at index (i, i)
        true_positive_idx = i
        
        # If the true positive is in the top K, add 1 to recall
        if true_positive_idx in top_k_indices:
            recall_list.append(1.0)
        else:
            recall_list.append(0.0)
    
    # Return the mean recall across all examples (rows)
    recall = torch.mean(torch.tensor(recall_list, device=similarity_matrix.device))
    return recall


In [424]:
def top_k_recall(similarity_matrix, k):
    """
    Compute top-k recall from a similarity matrix (vectorized version).
    
    similarity_matrix: Tensor of shape (batch_size, batch_size), where
                       diagonal elements represent positive pairs.
    k: The value of k for top-k recall.
    
    Returns:
        recall: The top-k recall (mean recall across all examples).
    """
    batch_size = similarity_matrix.size(0)

    # Get the indices of the top-k most similar items for each row (video/audio example)
    _, top_k_indices = torch.topk(similarity_matrix, k, dim=1)

    # Create a tensor for diagonal indices (i, i) for each row
    diagonal_indices = torch.arange(batch_size, device=similarity_matrix.device)

    # Check if the diagonal index of each row is in the top-K indices of that row
    # `top_k_indices` is of shape (batch_size, k)
    is_true_positive_in_top_k = (top_k_indices == diagonal_indices.unsqueeze(1))

    # Calculate recall: for each row, check if the diagonal index is in the top-K
    recall_per_row = is_true_positive_in_top_k.any(dim=1).float()

    # Return the mean recall across all examples (rows)
    return recall_per_row.mean()

In [425]:
top_k_recall2(similarity_matrix, 1)

tensor(0.0500)

In [426]:
top_k_recall1(similarity_matrix, 1)

tensor(0.0500)

In [427]:
1/32

0.03125

In [462]:
def compute_mean_and_covariance(embeddings):
    """
    Computes the mean and covariance matrix of the embeddings.
    
    Args:
        embeddings (torch.Tensor): A tensor of shape (num_samples, feature_dim)
    
    Returns:
        mean (torch.Tensor): The mean of the embeddings.
        covariance (torch.Tensor): The covariance matrix of the embeddings.
    """
    mean = embeddings.mean(dim=0)
    centered_embeddings = embeddings - mean
    covariance = torch.matmul(centered_embeddings.T, centered_embeddings) / (embeddings.size(0) - 1)
    return mean, covariance

def calculate_frechet_audio_distance(embeddings_ground_truth, embeddings_retrieved, epsilon=1e-6):
    """
    Calculates the Fréchet Audio Distance (FAD) between two sets of embeddings.
    
    Args:
        embeddings_ground_truth (torch.Tensor): Ground truth audio embeddings (num_samples, feature_dim).
        embeddings_retrieved (torch.Tensor): Retrieved audio embeddings (num_samples, feature_dim).
        epsilon (float): Regularization term to ensure positive semi-definiteness of covariance matrices.
        
    Returns:
        fad (float): The Fréchet Audio Distance between the two sets of embeddings.
    """
    # Compute the mean and covariance for ground truth and retrieved embeddings
    mu_x, sigma_x = compute_mean_and_covariance(embeddings_ground_truth)
    mu_y, sigma_y = compute_mean_and_covariance(embeddings_retrieved)

    # Calculate the term: ||mu_x - mu_y||^2
    diff_mu = mu_x - mu_y
    mu_term = torch.sum(diff_mu ** 2)

    # Add epsilon to the covariance matrices to ensure they are positive semi-definite
    sigma_x += epsilon * torch.eye(sigma_x.size(0), device=sigma_x.device)
    sigma_y += epsilon * torch.eye(sigma_y.size(0), device=sigma_y.device)

    # Perform Cholesky decomposition to compute the matrix square root of covariance matrices
    try:
        sigma_x_sqrt = torch.linalg.cholesky(sigma_x)
        sigma_y_sqrt = torch.linalg.cholesky(sigma_y)
    except RuntimeError:
        raise ValueError("Covariance matrices are not positive semi-definite, even with regularization")
        return float('inf')
    # Calculate the term: Tr(sigma_x + sigma_y - 2(sigma_x^0.5 * sigma_y * sigma_x^0.5)^0.5)
    term = torch.trace(sigma_x + sigma_y - 2 * torch.matmul(sigma_x_sqrt, torch.matmul(sigma_y_sqrt, sigma_x_sqrt.T)))

    # The FAD is the sum of the two terms
    fad = mu_term + term
    return fad.item()


In [463]:
# Get the indices of the maximum values along each row (dim=1)
_, most_similar_indices = torch.max(similarity_matrix, dim=1)
retrieved_audio_embeddings = batch_aud_embeddings_long[most_similar_indices]
calculate_frechet_audio_distance(batch_aud_embeddings_long, retrieved_audio_embeddings)

In [464]:
retrieved_audio_embeddings = batch_aud_embeddings_long[most_similar_indices]

In [466]:
calculate_frechet_audio_distance(batch_aud_embeddings_long, retrieved_audio_embeddings)

0.3900473713874817

In [475]:
import torch
import numpy as np
#from scipy.signal import find_peaks
#from sklearn.metrics import peak_prominences
import itertools

# Function to detect local maxima in embeddings using scipy's find_peaks
def find_local_maxima_in_embeddings(embeddings, prominence_threshold=0.5):
    """
    Detect local maxima in the embeddings using scipy's find_peaks and sklearn's peak_prominences.

    Args:
        embeddings (torch.Tensor): Tensor of shape (batch_size, feature_dim).
        prominence_threshold (float): Minimum prominence required to consider a peak.

    Returns:
        peaks_list (list): List of indices where the local maxima (peaks) occur in the embeddings.
    """
    embeddings = embeddings.cpu().detach().numpy()  # Convert to numpy for peak detection

    peaks_list = []
    for i in range(embeddings.shape[0]):  # Iterate through each embedding (e.g., audio or video)
        # Find local maxima (peaks) in the embedding
        peaks, _ = find_peaks(embeddings[i])
        prominences = peak_prominences(embeddings[i], peaks)[0]
        
        # Filter peaks based on prominence
        significant_peaks = peaks[prominences >= prominence_threshold]
        peaks_list.append(significant_peaks)

    return peaks_list


# Function to calculate Intersection over Union (IoU) between audio and video peaks
def calc_intersection_over_union(audio_peaks, video_peaks):
    """
    Calculate Intersection over Union (IoU) between the audio and video peaks.

    Args:
        audio_peaks (list): Indices of audio peaks.
        video_peaks (list): Indices of video peaks.

    Returns:
        float: IoU score between audio and video peaks.
    """
    intersection = len(set(audio_peaks).intersection(set(video_peaks)))
    union = len(set(audio_peaks).union(set(video_peaks)))
    iou_score = intersection / union
    return iou_score


# Function to compute the AV-Align score from audio and video embeddings
def compute_av_align_score(audio_embeddings, video_embeddings, prominence_threshold=0.5):
    """
    Compute the AV-Align score between the audio and video embeddings.

    Args:
        audio_embeddings (torch.Tensor): Audio embeddings of shape (batch_size, feature_dim).
        video_embeddings (torch.Tensor): Video embeddings of shape (batch_size, feature_dim).
        prominence_threshold (float): Minimum prominence to filter significant peaks.

    Returns:
        float: AV-Align score (IoU between audio and video peaks).
    """
    # Detect peaks in the audio and video embeddings
    audio_peaks = find_local_maxima_in_embeddings(audio_embeddings, prominence_threshold)
    video_peaks = find_local_maxima_in_embeddings(video_embeddings, prominence_threshold)

    # Flatten the list of peaks before calculating the IoU
    audio_peaks_flattened = list(itertools.chain(*audio_peaks))
    video_peaks_flattened = list(itertools.chain(*video_peaks))

    # Calculate the Intersection over Union (IoU) for the audio and video peaks
    iou_score = calc_intersection_over_union(audio_peaks_flattened, video_peaks_flattened)
    
    return iou_score



In [476]:
audio_peaks = find_local_maxima_in_embeddings(retrieved_audio_embeddings, prominence_threshold=0.1)

In [479]:
compute_av_align_score(batch_vid_embeddings_long, retrieved_audio_embeddings, prominence_threshold=0.1)

0.10040983606557377

In [481]:
compute_av_align_score(batch_vid_embeddings_long, batch_aud_embeddings_long, prominence_threshold=0.1)

0.125

In [541]:
import torch
import numpy as np
import itertools

class Eval():
    def __init__(self):
        pass

    # Function to detect local maxima in embeddings using scipy's find_peaks
    def find_local_maxima_in_embeddings(self, embeddings, prominence_threshold=0.5):
        """
        Detect local maxima in the embeddings using scipy's find_peaks and sklearn's peak_prominences.

        Args:
            embeddings (torch.Tensor): Tensor of shape (batch_size, feature_dim).
            prominence_threshold (float): Minimum prominence required to consider a peak.

        Returns:
            peaks_list (list): List of indices where the local maxima (peaks) occur in the embeddings.
        """
        embeddings = embeddings.cpu().detach().numpy()  # Convert to numpy for peak detection

        peaks_list = []
        for i in range(embeddings.shape[0]):  # Iterate through each embedding (e.g., audio or video)
            # Find local maxima (peaks) in the embedding
            peaks, _ = find_peaks(embeddings[i])
            prominences = peak_prominences(embeddings[i], peaks)[0]

            # Filter peaks based on prominence
            significant_peaks = peaks[prominences >= prominence_threshold]
            peaks_list.append(significant_peaks)

        return peaks_list

    # Function to calculate Intersection over Union (IoU) between audio and video peaks
    def calc_intersection_over_union(self, audio_peaks, video_peaks):
        """
        Calculate Intersection over Union (IoU) between the audio and video peaks.

        Args:
            audio_peaks (list): Indices of audio peaks.
            video_peaks (list): Indices of video peaks.

        Returns:
            float: IoU score between audio and video peaks.
        """
        intersection = len(set(audio_peaks).intersection(set(video_peaks)))
        union = len(set(audio_peaks).union(set(video_peaks)))
        iou_score = intersection / union
        return iou_score

    # Function to compute the AV-Align score from audio and video embeddings
    def compute_av_align_score(self, audio_embeddings, video_embeddings, prominence_threshold=0.1):
        """
        Compute the AV-Align score between the audio and video embeddings.

        Args:
            audio_embeddings (torch.Tensor): Audio embeddings of shape (batch_size, feature_dim).
            video_embeddings (torch.Tensor): Video embeddings of shape (batch_size, feature_dim).
            prominence_threshold (float): Minimum prominence to filter significant peaks.

        Returns:
            float: AV-Align score (IoU between audio and video peaks).
        """
        # Detect peaks in the audio and video embeddings
        audio_peaks = self.find_local_maxima_in_embeddings(audio_embeddings, prominence_threshold)
        video_peaks = self.find_local_maxima_in_embeddings(video_embeddings, prominence_threshold)

        # Flatten the list of peaks before calculating the IoU
        audio_peaks_flattened = list(itertools.chain(*audio_peaks))
        video_peaks_flattened = list(itertools.chain(*video_peaks))

        # Calculate the Intersection over Union (IoU) for the audio and video peaks
        iou_score = self.calc_intersection_over_union(audio_peaks_flattened, video_peaks_flattened)

        return iou_score

    def compute_mean_and_covariance(self, embeddings):
        """
        Computes the mean and covariance matrix of the embeddings.

        Args:
            embeddings (torch.Tensor): A tensor of shape (num_samples, feature_dim)

        Returns:
            mean (torch.Tensor): The mean of the embeddings.
            covariance (torch.Tensor): The covariance matrix of the embeddings.
        """
        mean = embeddings.mean(dim=0)
        centered_embeddings = embeddings - mean
        covariance = torch.matmul(centered_embeddings.T, centered_embeddings) / (embeddings.size(0) - 1)
        return mean, covariance

    def calculate_frechet_audio_distance(self, embeddings_ground_truth, embeddings_retrieved, epsilon=1e-6):
        """
        Calculates the Fréchet Audio Distance (FAD) between two sets of embeddings.

        Args:
            embeddings_ground_truth (torch.Tensor): Ground truth audio embeddings (num_samples, feature_dim).
            embeddings_retrieved (torch.Tensor): Retrieved audio embeddings (num_samples, feature_dim).
            epsilon (float): Regularization term to ensure positive semi-definiteness of covariance matrices.

        Returns:
            fad (float): The Fréchet Audio Distance between the two sets of embeddings.
        """
        # Compute the mean and covariance for ground truth and retrieved embeddings
        mu_x, sigma_x = self.compute_mean_and_covariance(embeddings_ground_truth)
        mu_y, sigma_y = self.compute_mean_and_covariance(embeddings_retrieved)

        # Calculate the term: ||mu_x - mu_y||^2
        diff_mu = mu_x - mu_y
        mu_term = torch.sum(diff_mu ** 2)

        # Add epsilon to the covariance matrices to ensure they are positive semi-definite
        sigma_x += epsilon * torch.eye(sigma_x.size(0), device=sigma_x.device)
        sigma_y += epsilon * torch.eye(sigma_y.size(0), device=sigma_y.device)

        # Perform Cholesky decomposition to compute the matrix square root of covariance matrices
        try:
            sigma_x_sqrt = torch.linalg.cholesky(sigma_x)
            sigma_y_sqrt = torch.linalg.cholesky(sigma_y)
        except RuntimeError:
            raise ValueError("Covariance matrices are not positive semi-definite, even with regularization")
            return float('inf')

        # Calculate the term: Tr(sigma_x + sigma_y - 2(sigma_x^0.5 * sigma_y * sigma_x^0.5)^0.5)
        term = torch.trace(sigma_x + sigma_y - 2 * torch.matmul(sigma_x_sqrt, torch.matmul(sigma_y_sqrt, sigma_x_sqrt.T)))

        # The FAD is the sum of the two terms
        fad = mu_term + term
        return fad.item()

    def top_k_recall(self, similarity_matrix, k):
        """
        Compute top-k recall from a similarity matrix (vectorized version).

        similarity_matrix: Tensor of shape (batch_size, batch_size), where
                           diagonal elements represent positive pairs.
        k: The value of k for top-k recall.

        Returns:
            recall: The top-k recall (mean recall across all examples).
        """
        batch_size = similarity_matrix.size(0)

        # Get the indices of the top-k most similar items for each row (video/audio example)
        _, top_k_indices = torch.topk(similarity_matrix, k, dim=1)

        # Create a tensor for diagonal indices (i, i) for each row
        diagonal_indices = torch.arange(batch_size, device=similarity_matrix.device)

        # Check if the diagonal index of each row is in the top-K indices of that row
        # `top_k_indices` is of shape (batch_size, k)
        is_true_positive_in_top_k = (top_k_indices == diagonal_indices.unsqueeze(1))

        # Calculate recall: for each row, check if the diagonal index is in the top-K
        recall_per_row = is_true_positive_in_top_k.any(dim=1).float()

        # Return the mean recall across all examples (rows)
        return recall_per_row.mean()


In [619]:
def compute_evaluations(video_model, audio_model, batch_size, window_size, path, epoch, ks=[1, 5]):

    metrics = Eval()
    testloader = get_dataloader(path, batch_size=batch_size, shuffle=True, method='video', window_size=window_size)
    audio_model.eval()
    video_model.eval()

    recalls = []
    fads = []
    av_aligns = []

    
    for video_batch, audio_batch, flow_ranks in testloader:
        batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)

        # These were in (#segments*batchsize, 256)
        # Now they are in (batchsize, 256 * #segments)
        batch_vid_embeddings = batch_vid_embeddings.reshape(batch_size, -1)
        batch_aud_embeddings = batch_aud_embeddings.reshape(batch_size, -1)

        # we are going to do a naive cosine similarity based retrieval strategy
        similarity_matrix = torch.matmul(batch_vid_embeddings, batch_aud_embeddings.T)

        # Get the most similar audio embeddings for each video
        _, most_similar_indices = torch.max(similarity_matrix, dim=1)

        # recall@k
        recall = [metrics.top_k_recall(similarity_matrix, k) for k in ks]

        retrieved_audio_embeddings = batch_aud_embeddings[most_similar_indices]
        fad = metrics.calculate_frechet_audio_distance(batch_aud_embeddings, retrieved_audio_embeddings)
        av_align = metrics.compute_av_align_score(batch_vid_embeddings, retrieved_audio_embeddings, prominence_threshold=0.1)


        recalls.append(recall)
        fads.append(fad)
        av_aligns.append(av_align)
        break
    mean_recalls = np.mean(recalls, axis=0)
    print(f'Mean Recall@{ks}: {mean_recalls}')
    print(f'Mean AV-Align: {np.mean(av_aligns)}')
    print(f'Mean AV-Align: {np.mean(fads)}')
    
    tmp = pd.DataFrame({'epoch':epoch,
       'fad':np.mean(av_aligns),
       'av_align':np.mean(fads)
      }, index=[0])

    mean_recalls = np.mean(recalls, axis=0)
    for i, k in enumerate(ks):
        tmp[f'recall@{k}'] = mean_recalls[i]
        
    return tmp

In [None]:
import pandas as pd

In [612]:
tmp = pd.DataFrame({'epoch':1,
       'fad':np.mean(av_aligns),
       'av_align':np.mean(fads)
      }, index=[0])
mean_recalls = np.mean(recalls, axis=0)
for i, k in enumerate(ks):
    tmp[f'recall@{k}'] = mean_recalls[i]

In [621]:
df = pd.DataFrame()

In [620]:
tmp = compute_evaluations(video_model, audio_model, 6, window_size, path, 1, ks=[1, 5])

Mean Recall@[1, 5]: [0.16666667 0.8333333 ]
Mean AV-Align: 0.07398568019093078
Mean AV-Align: 0.6593323945999146


In [625]:
df = pd.concat([df, tmp])

In [626]:
df

Unnamed: 0,epoch,fad,av_align,recall@1,recall@5
0,1,0.073986,0.659332,0.166667,0.833333
0,1,0.073986,0.659332,0.166667,0.833333
0,1,0.073986,0.659332,0.166667,0.833333


In [632]:
from .OFVMNET.OFProcessor import OpticalFlowProcessor

ImportError: attempted relative import with no known parent package