# Audio Dataset Exploration

This notebook explores the audio-caption dataset from Kaggle.

In [14]:
# Make the notebook's working directory importable so the project-local
# `data` package used below can be resolved.
import sys
if '.' not in sys.path:  # guard: re-running this cell must not stack duplicate entries
    sys.path.append('.')

# If the dataloader module is already loaded in this kernel, reload it so
# that edits made on disk are picked up by the import that follows.
import importlib
if 'data.dataloader' in sys.modules:
    importlib.reload(sys.modules['data.dataloader'])

from data.dataloader import get_dataloader, AudioCaptionDataset
import matplotlib.pyplot as plt
import numpy as np

In [15]:
# Configuration: Set USE_SUBSET=True for local testing, False for full dataset on HPC
USE_SUBSET = True
SUBSET_SIZE = 100  # Number of samples to use for local testing

# Build the banner once so the selected mode is both printed and inspectable.
if USE_SUBSET:
    mode_banner = f"🔧 LOCAL MODE: Using subset of {SUBSET_SIZE} samples for testing"
else:
    mode_banner = "🚀 HPC MODE: Using full dataset"
print(mode_banner)

🔧 LOCAL MODE: Using subset of 100 samples for testing


In [16]:
# Build a one-sample-per-batch, unshuffled loader and keep a handle on the
# dataset behind it; the subset size only applies when USE_SUBSET is on.
# NOTE(review): the captured output of this cell shows a large Kaggle
# download being resumed, so a cold run may be slow.
dataloader = get_dataloader(
    batch_size=1,
    shuffle=False,
    subset_size=SUBSET_SIZE if USE_SUBSET else None,
)
dataset = dataloader.dataset

# Summarise the backing DataFrame: size, column names, then a short preview.
frame = dataset.df
print(f"Dataset size: {len(dataset)}")
print(f"\nDataFrame columns: {frame.columns.tolist()}")
print("\nFirst 5 rows:")
print(frame.head())

Resuming download from 335544320 bytes (1192605731 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/mmoreaux/environmental-sound-classification-50?dataset_version_number=15 (335544320/1528150051) bytes left.
Resuming download from https://www.kaggle.com/api/v1/datasets/download/mmoreaux/environmental-sound-classification-50?dataset_version_number=15 (335544320/1528150051) bytes left.


 28%|██▊       | 402M/1.42G [01:11<15:21, 1.20MB/s] 


KeyboardInterrupt: 

In [None]:
# Fetch the item at index 0 and unpack it into its three parts:
# the waveform, its sample rate, and the caption.
first_sample = dataset[0]
waveform, sample_rate, caption = first_sample

# Report each part on its own line.
for label, value in (
    ("Waveform shape", waveform.shape),
    ("Sample rate", sample_rate),
    ("Caption", caption),
):
    print(f"{label}: {value}")

In [None]:
# Plot the first sample's waveform on a wide figure, titled with its caption.
# NOTE(review): presumably `waveform` is a 2-D torch tensor, transposed here
# so each channel becomes one plotted line — confirm against the dataset.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(waveform.t().numpy())
ax.set_title(f"Audio Waveform - {caption}")
ax.set_xlabel("Sample")
ax.set_ylabel("Amplitude")
plt.show()

In [None]:
# Rebuild the loader with four samples per batch (still unshuffled, same
# subset rule as before).
dataloader = get_dataloader(
    batch_size=4,
    shuffle=False,
    subset_size=SUBSET_SIZE if USE_SUBSET else None,
)

# Inspect only the first batch; an empty loader prints nothing.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    batch_waveforms, batch_sample_rates, batch_captions = first_batch
    print(f"Batch size: {len(batch_captions)}")
    print(f"Waveforms shape: {batch_waveforms.shape}")
    print(f"Captions: {batch_captions}")

In [None]:
# Count how often each value occurs in the last DataFrame column.
# NOTE(review): presumably the last column holds the caption — confirm
# against the dataset schema.
print("Caption distribution:")
last_column = dataset.df.columns[-1]
caption_counts = dataset.df[last_column].value_counts()
print(caption_counts)