In [1]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import numpy as np
from IPython.display import YouTubeVideo

import requests
import json

import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from scipy.signal import peak_prominences
from scipy.signal import find_peaks

import matplotlib.pyplot as plt
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence


In [3]:
path = r"/Users/scottmerrill/Documents/UNC/MultiModal/VMR/Youtube8m"
filenames = os.listdir(path + '/video')
file_name = filenames[0]

### 1.  DataLoader

In [10]:
class VideoAudioDataset(Dataset):
    def __init__(self, path):
        self.path = path
        self.filenames = os.listdir(os.path.join(path, 'video'))
    
    def __len__(self):
        return len(self.filenames)
    
    def __getitem__(self, idx):
        filename = self.filenames[idx]
        video_data = np.load(os.path.join(self.path, 'video', filename))
        audio_data = np.load(os.path.join(self.path, 'audio', filename))
        video_data = video_data[:, :1024]
        return video_data, audio_data

### 2. Transformer Class

In [6]:
class Transformer(nn.Module):
    def __init__(self, input_dim=1024, embed_dim=512, num_heads=8, num_layers=2, max_seq_len=50):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, embed_dim)  # Project input to embedding dim
        self.pos_encoder = self._generate_sinusoidal_positional_encoding(max_seq_len, embed_dim)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads), num_layers=num_layers)
        self.output_proj = nn.Linear(embed_dim, embed_dim)  # Project to final embedding

    def _generate_sinusoidal_positional_encoding(self, max_len, embed_dim):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)

    def forward(self, x, mask):
        x = self.input_proj(x)  # Shape: (seq_len, embed_dim)
        seq_len = x.size(0)
        x = x + self.pos_encoder[:, :seq_len, :].squeeze(0).to(x.device)
        x = self.transformer(x.unsqueeze(1), src_key_padding_mask=mask).squeeze(1)
        x = x.mean(dim=0)  # Aggregate sequence to fixed-size embedding
        return self.output_proj(x)

### Optical Flow Class

In [7]:
class OpticalFlowProcessor:
    def __init__(self, method='video', window_size=20, max_segments=10, min_frames=10):
        self.method = method
        self.window_size = window_size
        self.max_segments = max_segments
        self.min_frames = min_frames

    def get_best_worst_flow(self, rgb, audio):
        flow = self._compute_flow(rgb, audio)
        segments = self._optical_flow_segments(flow)
        ranks = self._rank_averages(self._compute_segment_means(segments, flow))
        return self._extract_best_worst_segments(segments, ranks)

    def _compute_flow(self, rgb, audio):
        if self.method == 'video':
            return self._moving_average(self._calculate_optical_flow_euclidean(rgb))
        elif self.method == 'audio':
            return self._moving_average(self._calculate_optical_flow_euclidean(audio))
        else:
            raise ValueError("Method must be 'video' or 'audio'")

    @staticmethod
    def _calculate_optical_flow_euclidean(embedding_seq):
        return np.linalg.norm(embedding_seq[1:] - embedding_seq[:-1], axis=1)

    @staticmethod
    def _moving_average(arr, window_size=5):
        return np.convolve(arr, np.ones(window_size) / window_size, mode='valid')

    def _optical_flow_segments(self, optical_flow):
        peaks, _ = find_peaks(optical_flow)
        prominences = peak_prominences(optical_flow, peaks)[0]
        peak_index = peaks[np.argsort(prominences)[-self.max_segments:]]
        peak_index = self._merge_intervals(np.sort(peak_index))
        return np.insert(np.append(peak_index, len(optical_flow)), 0, 0)

    def _merge_intervals(self, arr):
        merged = [arr[0]]
        for i in range(1, len(arr)):
            if arr[i] - merged[-1] >= self.min_frames:
                merged.append(arr[i])
        return np.array(merged)

    @staticmethod
    def _compute_segment_means(segments, values):
        return [values[start:end].mean() if start < end else 0 for start, end in zip(segments[:-1], segments[1:])]

    @staticmethod
    def _rank_averages(averages):
        sorted_indices = np.argsort(averages)[::-1]
        ranks = np.zeros_like(sorted_indices) + 1
        for rank, idx in enumerate(sorted_indices):
            ranks[idx] = rank + 1
        return ranks

    def _extract_best_worst_segments(self, segments, ranks):
        top_start, top_end = segments[np.where(ranks == 1)[0][0]], segments[np.where(ranks == 1)[0][0] + 1]
        bottom_start, bottom_end = segments[np.where(ranks == max(ranks))[0][0]], segments[np.where(ranks == max(ranks))[0][0] + 1]
        return (top_start, top_end), (bottom_start, bottom_end)

### Script Functions

In [8]:
def collate_fn(batch, processor):
    video_batch, audio_batch = zip(*batch)
    video_batch = [torch.tensor(v, dtype=torch.float32) for v in video_batch]
    audio_batch = [torch.tensor(a, dtype=torch.float32) for a in audio_batch]
    flow_ranks = [processor.get_best_worst_flow(video_batch[i], audio_batch[i]) for i in range(len(video_batch))]
    return video_batch, audio_batch, flow_ranks

def get_dataloader(path, batch_size=32, shuffle=True, method='video', window_size=20):
    dataset = VideoAudioDataset(path)
    processor = OpticalFlowProcessor(method=method, window_size=window_size)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=lambda batch: collate_fn(batch, processor))

def perform_feature_padding(video_features, audio_features, start_segment, end_segment, max_seq_len):
    vf = torch.tensor(video_features[start_segment:end_segment,:])
    af = torch.tensor(audio_features[start_segment:end_segment,:])

    pvf = torch.zeros(max_seq_len, 1024)
    pvf[:vf.shape[0], :] = vf

    paf = torch.zeros(max_seq_len, 128)
    paf[:af.shape[0], :] = af

    # Create mask (True for padding positions)
    mask = torch.arange(max_seq_len) >= vf.shape[0]
    mask = mask.unsqueeze(0)  # Convert to 2D (batch_size=1, seq_len)
    return pvf, paf, mask

# Function to find pairs with approximately equal differences
def find_matching_index_pairs(array1, array2, tolerance=5):
    # Calculate differences in array1 and array2
    array1_diffs = np.diff(array1)
    array2_diffs = np.diff(array2)

    matching_pairs = []

    # Loop through differences in array1
    for i, diff1 in enumerate(array1_diffs):
        # Find pairs of consecutive indices in array2 with similar differences
        for j, diff2 in enumerate(array2_diffs):
            if abs(diff1 - diff2) <= tolerance:  # If the difference is within the tolerance
                matching_pairs.append(((i, i + 1), (j, j + 1), diff1, diff2))

    return matching_pairs


def get_similar_length_segments(positive_segments, negative_segments, tolerance = 5):
    
    while True:
        matching_indexes = find_matching_index_pairs(positive_segments, negative_segments, tolerance=tolerance)
        tolerance += 5
        if len(matching_indexes) > 0:
            break
            
    # sample randomly for all segments within the tolerance band
    pos_segment, negative_segment, pos_time, neg_time = matching_indexes[np.random.randint(0, len(matching_indexes))]
    
    return pos_segment, negative_segment, pos_time, neg_time

def get_positive_negative_embeddings(filenames, tolerance=5):

    positive_record_file = np.random.choice(filenames)
    negative_record_file = np.random.choice(filenames)

    positive_rgb, positive_audio = sample_dataset(positive_record_file)
    negative_rgb, negative_audio = sample_dataset(negative_record_file)

    optical_flow_pos = calculate_optical_flow_euclidean(positive_rgb)
    optical_flow_pos = moving_average(optical_flow_pos, window_size=20)

    optical_flow_neg = calculate_optical_flow_euclidean(negative_rgb)
    optical_flow_neg = moving_average(optical_flow_neg, window_size=20)

    positive_segments = optical_flow_segments(optical_flow_pos)
    negative_segments = optical_flow_segments(optical_flow_neg)

    pos_segment, negative_segment, pos_time, neg_time = get_similar_length_segments(positive_segments, negative_segments, tolerance = 5)

    # converting segment index to time in seconds
    pos_start, pos_end = pos_segment
    pos_start = positive_segments[pos_start]
    pos_end = positive_segments[pos_end]
    print(pos_start, pos_end)

    # retrieving segment and padding it appropriately
    pos_video, pos_audio, pos_mask = perform_feature_padding(positive_rgb, positive_audio, pos_start, pos_end, max_seq_len)

    # computing embedding
    positive_video_embedding = video_model(pos_video, pos_mask)
    positive_audio_embedding = audio_model(pos_audio, pos_mask)

    # converting segment index to time in seconds
    neg_start, neg_end = negative_segment
    neg_start = negative_segments[neg_start]
    neg_end = negative_segments[neg_end]

    # retrieving segment and padding it appropriately
    neg_video, neg_audio, neg_mask = perform_feature_padding(negative_rgb, negative_audio, neg_start, neg_end, max_seq_len)

    # computing embedding
    negative_video_embedding = video_model(neg_video, neg_mask)
    negative_audio_embedding = audio_model(neg_audio, neg_mask)
    return positive_video_embedding, positive_audio_embedding, negative_video_embedding, negative_audio_embedding

def save_checkpoint(model, optimizer, epoch, filename):
    """Saves model and optimizer state dict."""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)
    print(f"Checkpoint saved at epoch {epoch} to {filename}")


In [None]:
def get_positive_negative_embeddings(video_batch, tolerance=5):

    positive_record_file = np.random.choice(filenames)
    negative_record_file = np.random.choice(filenames)

    positive_rgb, positive_audio = sample_dataset(positive_record_file)
    negative_rgb, negative_audio = sample_dataset(negative_record_file)

    optical_flow_pos = calculate_optical_flow_euclidean(positive_rgb)
    optical_flow_pos = moving_average(optical_flow_pos, window_size=20)

    optical_flow_neg = calculate_optical_flow_euclidean(negative_rgb)
    optical_flow_neg = moving_average(optical_flow_neg, window_size=20)

    positive_segments = optical_flow_segments(optical_flow_pos)
    negative_segments = optical_flow_segments(optical_flow_neg)

    pos_segment, negative_segment, pos_time, neg_time = get_similar_length_segments(positive_segments, negative_segments, tolerance = 5)

    # converting segment index to time in seconds
    pos_start, pos_end = pos_segment
    pos_start = positive_segments[pos_start]
    pos_end = positive_segments[pos_end]
    print(pos_start, pos_end)

    # retrieving segment and padding it appropriately
    pos_video, pos_audio, pos_mask = perform_feature_padding(positive_rgb, positive_audio, pos_start, pos_end, max_seq_len)

    # computing embedding
    positive_video_embedding = video_model(pos_video, pos_mask)
    positive_audio_embedding = audio_model(pos_audio, pos_mask)

    # converting segment index to time in seconds
    neg_start, neg_end = negative_segment
    neg_start = negative_segments[neg_start]
    neg_end = negative_segments[neg_end]

    # retrieving segment and padding it appropriately
    neg_video, neg_audio, neg_mask = perform_feature_padding(negative_rgb, negative_audio, neg_start, neg_end, max_seq_len)

    # computing embedding
    negative_video_embedding = video_model(neg_video, neg_mask)
    negative_audio_embedding = audio_model(neg_audio, neg_mask)
    return positive_video_embedding, positive_audio_embedding, negative_video_embedding, negative_audio_embedding

In [55]:
max_seq_len = 100


def get_segmentd_embeddings(video_model, audio_model, vid, aud):
    vid_segment_embeddings = []
    

    of = OpticalFlowProcessor()
    flow = of._compute_flow(vid, aud)
    segments = of._optical_flow_segments(flow)

    vid_segment_embeddings = []
    aud_segment_embeddings = []
    for i in range(1, len(segments)):
        start = segments[i-1]
        end = segments[i]

        vid_emb, aud_emb, mask = perform_feature_padding(vid, aud, start, end, max_seq_len)
        
        vid_segment_embeddings.append(video_model(vid_emb, mask))
        aud_segment_embeddings.append(audio_model(aud_emb, mask))
    return vid_segment_embeddings, aud_segment_embeddings

In [213]:
num_heads =1
num_layers=1
audio_model = Transformer(input_dim=128, embed_dim=256, num_heads=num_heads, num_layers=num_layers, max_seq_len=max_seq_len)
video_model = Transformer(input_dim=1024, embed_dim=256, num_heads=num_heads, num_layers=num_layers, max_seq_len=max_seq_len)

def get_batch_embeddings(video_model, audio_model, video_batch, audio_batch):
    # We precompute the segment embeddings in each batch.  We do this once and then proceed to processing batch
    batch_vid_embeddings = []
    batch_aud_embeddings = []
    for i in range(len(video_batch)):
        vid = video_batch[i]
        aud = audio_batch[i]
        vid_sgmt_emb, aud_sgmt_emb = get_segmentd_embeddings(video_model, audio_model, vid, aud)
        batch_vid_embeddings.extend(vid_sgmt_emb)
        batch_aud_embeddings.extend(aud_sgmt_emb)
    # Shape will by (total segments X embedding dim)
    # total segments is clip dependent
    batch_aud_embeddings = torch.stack(batch_aud_embeddings)
    batch_vid_embeddings = torch.stack(batch_vid_embeddings)
    
    # MAKE SURE VECTORS ARE NORMALIZED FIRST idk if I want to do here or later..
    batch_aud_embeddings = torch.nn.functional.normalize(batch_aud_embeddings, p=2, dim=1)
    batch_vid_embeddings = torch.nn.functional.normalize(batch_vid_embeddings, p=2, dim=1)

    return batch_aud_embeddings, batch_vid_embeddings



In [214]:
batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)

  vf = torch.tensor(video_features[start_segment:end_segment,:])
  af = torch.tensor(audio_features[start_segment:end_segment,:])


In [215]:
# MAKE SURE VECTORS ARE NORMALIZED FIRST
# embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
video_similities = torch.matmul(batch_vid_embeddings, batch_vid_embeddings.T)

In [216]:
def get_top_k(batch_embeddings):
    similarity = torch.matmul(batch_embeddings, batch_embeddings.T)

    # Set diagonal to -inf to exclude self-similarity (or you could set it to NaN)
    # This is done so the "most similar" excludes the embedding itself.
    similarity.fill_diagonal_(-float('inf'))

    # Find the top k most similar embeddings for each embedding
    k = 5  # Number of similar embeddings to find
    top_k_similarities, top_k_indices = torch.topk(similarity, k, dim=1, largest=True)

    # Print the results
    #print("Top k most similar embeddings:")
    #for i in range(batch_embeddings.shape[0]):
    #    print(f"Embedding {i} - Most Similar Embeddings (Indices): {top_k_indices[i]}")
    #    print(f"Cosine Similarities: {top_k_similarities[i]}")
    return top_k_similarities, top_k_indices

In [217]:
torch.nn.functional.normalize(batch_vid_embeddings, p=2, dim=1).sum(axis=1)

tensor([-1.1292, -0.9560, -1.0216, -1.2049, -0.9785, -0.9997, -1.2286, -0.7535,
        -0.8464, -0.9180, -0.7393, -0.9416, -0.9702, -1.0371, -0.7102, -0.8329,
        -1.1699, -1.0593, -0.9823, -0.8386, -1.4440, -0.9205],
       grad_fn=<SumBackward1>)

In [218]:
top_k_similarities, top_k_indices = get_top_k(batch_vid_embeddings)
top_k_indices[0]
# so for vid 0, we will use negative anchors 3, 4, 7, 5 and 6

tensor([ 6,  3,  2, 20,  8])

## Training Loop

In [219]:
### Parameters

In [220]:
lambda1 = 0.1

In [221]:
lambda2 = 0.1
lambda3 = 0.1
lambda4 = 0.1
lambda5 = 0.1
lambda6 = 0.1

lr = 1e-4

# Define the Adam optimizer for the audio model
audio_optimizer = optim.Adam(audio_model.parameters(), lr=lr)

# Define the Adam optimizer for the video model
video_optimizer = optim.Adam(video_model.parameters(), lr=lr)

batch_size = 2
window_size = 20 # for optical flow smoothing (ie 20 frame mavg flow)

dataloader = get_dataloader(path, batch_size=batch_size, shuffle=True, method='video', window_size=window_size)

### Showing how to mine negative embeddings in vectorized way

In [222]:
batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)

_, vid_top_k = get_top_k(batch_vid_embeddings)
vid_top_k

  vf = torch.tensor(video_features[start_segment:end_segment,:])
  af = torch.tensor(audio_features[start_segment:end_segment,:])


tensor([[ 3,  6,  2, 20,  8],
        [ 4,  9,  5,  8, 11],
        [ 3,  0,  6,  8,  5],
        [ 0,  6,  2, 20, 13],
        [ 9,  8,  5,  1, 11],
        [ 4,  8,  9,  1, 11],
        [ 0,  3,  2, 20, 13],
        [ 1, 11,  8,  4,  9],
        [ 9,  4,  5,  1, 11],
        [ 8,  4,  5,  1, 11],
        [ 7, 11,  1,  4,  9],
        [21,  1,  9,  4,  8],
        [21, 11, 19, 13,  9],
        [17, 18,  8, 12, 20],
        [19, 15, 17, 13,  9],
        [14, 21, 19, 11,  9],
        [ 3,  6,  0,  2, 13],
        [18, 13, 19,  8,  9],
        [17, 13, 19,  8,  9],
        [17, 18, 14, 21, 12],
        [13,  2, 18, 17,  3],
        [12, 11, 15, 19,  9]])

In [223]:
batch_vid_embeddings[vid_top_k].shape

torch.Size([22, 5, 256])

In [224]:
batch_vid_embeddings.shape

torch.Size([22, 256])

In [225]:
batch_vid_embeddings.unsqueeze(1).shape

torch.Size([22, 1, 256])

In [226]:
((batch_vid_embeddings.unsqueeze(1))*batch_vid_embeddings[vid_top_k]).shape

torch.Size([22, 5, 256])

In [227]:
((batch_vid_embeddings.unsqueeze(1))*batch_vid_embeddings[vid_top_k])[0].sum(axis=1)

tensor([0.9974, 0.9972, 0.9937, 0.9794, 0.9752], grad_fn=<SumBackward1>)

In [228]:
torch.stack([batch_vid_embeddings[0]*batch_vid_embeddings[7],
             batch_vid_embeddings[0]*batch_vid_embeddings[4],
             batch_vid_embeddings[0]*batch_vid_embeddings[2],
             batch_vid_embeddings[0]*batch_vid_embeddings[5],
             batch_vid_embeddings[0]*batch_vid_embeddings[8]]).sum(axis=1)

tensor([0.9550, 0.9706, 0.9937, 0.9725, 0.9752], grad_fn=<SumBackward1>)

In [229]:
torch.matmul(batch_vid_embeddings[0],batch_vid_embeddings[7])

tensor(0.9550, grad_fn=<DotBackward0>)

In [230]:
torch.matmul(batch_vid_embeddings[0],batch_vid_embeddings[4])

tensor(0.9706, grad_fn=<DotBackward0>)

### Showing how to get positive embedding

In [231]:
(batch_vid_embeddings*batch_aud_embeddings).sum(axis=1)

tensor([0.0242, 0.0371, 0.0467, 0.0355, 0.0358, 0.0414, 0.0277, 0.0235, 0.0361,
        0.0443, 0.0335, 0.0372, 0.0560, 0.0570, 0.0380, 0.0585, 0.0453, 0.0495,
        0.0545, 0.0459, 0.0617, 0.0424], grad_fn=<SumBackward1>)

In [232]:
### total loss vectorized
(batch_vid_embeddings*batch_aud_embeddings)

tensor([[ 3.3245e-03,  4.1721e-03,  2.2809e-03,  ...,  4.1571e-03,
          2.5281e-03, -1.6599e-03],
        [ 3.9830e-03,  6.2003e-03,  2.3320e-03,  ...,  5.2013e-03,
          8.6622e-04, -5.2390e-04],
        [ 2.6704e-03,  4.9460e-03,  2.8417e-03,  ...,  4.4937e-03,
          2.8771e-03, -1.6045e-03],
        ...,
        [ 4.1135e-03,  1.1327e-02,  2.9570e-03,  ...,  6.3373e-03,
          8.8758e-04, -2.2910e-04],
        [ 3.5647e-03,  7.9073e-03,  2.0349e-03,  ...,  4.6994e-03,
          1.5677e-03,  2.3363e-04],
        [ 4.0173e-03,  1.0205e-02,  1.8800e-03,  ...,  6.0003e-03,
          5.6711e-04,  3.1909e-05]], grad_fn=<MulBackward0>)

In [233]:
top_k_sims = (((batch_vid_embeddings.unsqueeze(1))*batch_vid_embeddings[vid_top_k])).sum(axis=-1)
top_k_sims

tensor([[0.9974, 0.9972, 0.9937, 0.9794, 0.9752],
        [0.9963, 0.9954, 0.9948, 0.9937, 0.9919],
        [0.9944, 0.9937, 0.9918, 0.9875, 0.9873],
        [0.9974, 0.9967, 0.9944, 0.9818, 0.9788],
        [0.9970, 0.9970, 0.9965, 0.9963, 0.9908],
        [0.9965, 0.9961, 0.9961, 0.9948, 0.9880],
        [0.9972, 0.9967, 0.9918, 0.9773, 0.9733],
        [0.9901, 0.9886, 0.9879, 0.9876, 0.9874],
        [0.9974, 0.9970, 0.9961, 0.9937, 0.9899],
        [0.9974, 0.9970, 0.9961, 0.9954, 0.9916],
        [0.9841, 0.9748, 0.9734, 0.9729, 0.9727],
        [0.9940, 0.9919, 0.9916, 0.9908, 0.9899],
        [0.9964, 0.9896, 0.9887, 0.9870, 0.9866],
        [0.9944, 0.9923, 0.9872, 0.9870, 0.9868],
        [0.9894, 0.9891, 0.9870, 0.9866, 0.9854],
        [0.9891, 0.9890, 0.9885, 0.9873, 0.9856],
        [0.9697, 0.9695, 0.9677, 0.9568, 0.9559],
        [0.9952, 0.9944, 0.9905, 0.9885, 0.9875],
        [0.9952, 0.9923, 0.9897, 0.9869, 0.9858],
        [0.9905, 0.9897, 0.9894, 0.9889, 0.9887],


In [238]:
pos_sims = ((batch_vid_embeddings*batch_aud_embeddings).sum(axis=1)).unsqueeze(1)
pos_sims

tensor([[0.0242],
        [0.0371],
        [0.0467],
        [0.0355],
        [0.0358],
        [0.0414],
        [0.0277],
        [0.0235],
        [0.0361],
        [0.0443],
        [0.0335],
        [0.0372],
        [0.0560],
        [0.0570],
        [0.0380],
        [0.0585],
        [0.0453],
        [0.0495],
        [0.0545],
        [0.0459],
        [0.0617],
        [0.0424]], grad_fn=<UnsqueezeBackward0>)

In [235]:
torch.clip(pos_sims - top_k_sims,0)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], grad_fn=<ClampBackward1>)

In [196]:
loss = torch.sum(torch.clip(pos_sims - top_k_sims,0))

In [197]:
# Batch iterator
for video_batch, audio_batch, flow_ranks in dataloader:
    audio_optimizer.zero_grad()
    video_optimizer.zero_grad()
    total_loss = 0

    # create segments for each batch and compute embeddings for the segments
    # stack all the embeddings into single tensors
    batch_aud_embeddings, batch_vid_embeddings = get_batch_embeddings(video_model, audio_model, video_batch, audio_batch)

    # for each video embedding find the k most similar video embeddings
    _, vid_top_k = get_top_k(batch_vid_embeddings)
    
    # for each audio embedding, find the k most simlar audio embeddings
    _, aud_top_k = get_top_k(batch_aud_embeddings)

    
    
    break

  vf = torch.tensor(video_features[start_segment:end_segment,:])
  af = torch.tensor(audio_features[start_segment:end_segment,:])
