In [2]:
%load_ext autoreload
%autoreload 2

from datasets.MSDWild import MSDWildChunks
from datasets.MSDWildOptimized import UpfrontNPZDataset, LazyNPZDataset
import time
import os

In [3]:
# Notebook configuration: S3 location of the preprocessed data and the RTTM
# partition file handed to the dataset constructor.
S3_BUCKET_NAME = "mmml-proj"
S3_VIDEO_DIR = "preprocessed"
# S3 URIs always use "/" as the separator. os.path.join would insert the host
# OS separator instead (a backslash on Windows), so the URI is built explicitly.
TRAIN_DATA_PATH = f"s3://{S3_BUCKET_NAME}/{S3_VIDEO_DIR}"
TRAIN_RTTM_PATH = "few.train.rttm"
# Passed as `subset=` to MSDWildChunks. In the logged run it reduced
# 523104 pairs to 13077, i.e. it acts as the fraction of pairs to keep.
SUBSET = 0.025

In [None]:
# Est Time: 27.75 minutes
# Construct the MSDWildChunks dataset from the configured bucket/prefix and
# report how long construction took.
# time.perf_counter() is a monotonic clock intended for measuring durations;
# unlike time.time(), it cannot jump if the system clock is adjusted while
# this long-running cell executes.
start = time.perf_counter()
dataset = MSDWildChunks(
    data_path=S3_VIDEO_DIR,
    data_bucket=S3_BUCKET_NAME,
    partition_path=TRAIN_RTTM_PATH,
    subset=SUBSET,
    refresh_fileset=False,
)
end = time.perf_counter()
execution_time_minutes = (end - start) / 60  # seconds -> minutes

print(f"Execution time: {execution_time_minutes:.2f} minutes")

Directory has 523104 pairs


Loading Pair Metadata for Videos: 100%|██████████| 2476/2476 [02:44<00:00, 15.03it/s]


Loaded metadata for 1379378 pairs
Reducing number of pairs from 523104 to 13077


Loading Triplet Files:  30%|██▉       | 3888/13077 [08:39<31:54:43, 12.50s/it]

In [7]:
# Measured Time: 0.73 Minutes
# Construct the UpfrontNPZDataset from the "batched_triplets" prefix of the
# bucket and report how long construction took.
# NOTE: this rebinds `dataset`, replacing any MSDWildChunks instance built in
# the earlier cell; later cells iterate whichever dataset was created last.
# time.perf_counter() is a monotonic clock intended for measuring durations,
# so the result is unaffected by system clock adjustments.
start = time.perf_counter()
dataset = UpfrontNPZDataset(
    npz_dir="batched_triplets", num_batches=2, batch_size=1000, bucket=S3_BUCKET_NAME
)
end = time.perf_counter()
execution_time_minutes = (end - start) / 60  # seconds -> minutes

print(f"Execution time: {execution_time_minutes:.2f} minutes")

mmml-proj batched_triplets/triplet_batch_00000.npz


Unpacking batch 0: 100%|██████████| 5000/5000 [00:01<00:00, 4149.32it/s]


mmml-proj batched_triplets/triplet_batch_00001.npz


KeyboardInterrupt: 

In [None]:
import tqdm
from collections import defaultdict

# Count how many dataset samples carry each label value.
# Every item yielded by `dataset` is unpacked as a 3-tuple whose last element
# is the label; the first two are presumably the video and audio tensors
# (compare collate_fn) -- TODO confirm against the dataset class.
# Named throwaways are used instead of `_`: assigning `_` at notebook top level
# overwrites IPython's "last output" variable, after which IPython stops
# updating `_`, `__` and `___` for the rest of the session.
# NOTE(review): assumes `label` hashes by value (e.g. a Python int); a tensor
# label would be counted per object rather than per value -- confirm.
labels = defaultdict(int)
for _video, _audio, label in tqdm.tqdm(dataset):
    labels[label] += 1

labels

In [17]:
import torch


def collate_fn(batch):
    """Combine a list of dataset samples into one dictionary of batched tensors.

    Args:
        batch: Sequence of ``(video_data, audio_data, is_speaking)`` samples.
            The video entries (and likewise the audio entries) are combined
            with ``torch.stack``, so they must share a shape across samples;
            no padding is applied here.

    Returns:
        dict with three entries:
            ``"video_data"``: the video entries stacked along a new leading
                batch dimension.
            ``"audio_data"``: the audio entries stacked the same way.
            ``"labels"``: a tensor built from the per-sample ``is_speaking``
                values.

        In the run recorded in this notebook, a batch of 2 printed shapes
        ``(2, 3, 3, 112, 112)`` for video and ``(2, 3, 30, 22)`` for audio.
    """
    # Transpose the list of samples into one tuple per field.
    videos, audios, speaking_flags = zip(*batch)
    return {
        "video_data": torch.stack(videos),
        "audio_data": torch.stack(audios),
        "labels": torch.tensor(speaking_flags),
    }

In [23]:
from torch.utils.data import DataLoader

# Wrap the current `dataset` in a DataLoader that yields the dictionaries
# produced by collate_fn.
loader = DataLoader(
    dataset=dataset,
    collate_fn=collate_fn,  # turns a list of samples into a dict of batched tensors
    batch_size=2,  # two samples per batch
    shuffle=False,  # iterate in dataset order
    num_workers=2,  # load samples in two worker subprocesses
    pin_memory=True,  # ask the loader to return batches in pinned host memory
)

In [24]:
import numpy as np
import matplotlib.pyplot as plt


def visualize_visual_triplet(images):
    """Show the first three images of ``images`` side by side in one figure.

    Args:
        images: 4-D array-like that is transposed with axes ``(0, 2, 3, 1)``
            before plotting, i.e. it is expected in
            ``(num_images, channels, height, width)`` order. Only indices
            0, 1 and 2 along the first axis are drawn; they are titled
            "Anchor", "Positive" and "Negative" respectively.

    Returns:
        None. The figure is displayed with ``plt.show()`` and then closed.
    """
    # Move the channel axis last: (num_images, height, width, channels).
    channels_last = np.transpose(images, axes=(0, 2, 3, 1))

    # One row with three panels.
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle("Three RGB Images Side by Side")

    for idx, caption in enumerate(("Anchor", "Positive", "Negative")):
        panel = axs[idx]
        panel.imshow(channels_last[idx])
        panel.set_title(caption)
        panel.axis("off")  # hide ticks and frame for a cleaner display

    plt.tight_layout()
    plt.show()
    plt.close()

In [25]:
from itertools import islice

# Smoke-test the DataLoader: pull at most `numiters` batches and print the
# keys, tensor shapes and labels of each one.
# islice stops after `numiters` batches, replacing the hand-rolled countdown
# and `break`. It also yields nothing for a limit of 0, whereas the countdown
# always consumed one batch before checking the limit.
numiters = 15
for batch in islice(loader, numiters):
    print(batch.keys())
    video = batch["video_data"]
    audio = batch["audio_data"]
    label = batch["labels"]
    print(video.shape)
    print(audio.shape)
    print(label)
    # Uncomment to plot the first/second sample's image triplet:
    # visualize_visual_triplet(video[0])
    # visualize_visual_triplet(video[1])

dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_keys(['video_data', 'audio_data', 'labels'])
torch.Size([2, 3, 3, 112, 112])
torch.Size([2, 3, 30, 22])
tensor([1, 1])
dict_key