# VoxTube Dataset Subset Analysis

This notebook adapts the `analyze_voxtube.py` script to visualize a subset of the `voice-is-cool/voxtube` dataset. It allows audio playback, DataFrame inspection, and text distribution analysis.

In [1]:
import psutil
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_from_disk
from IPython.display import Audio, display

# Configure seaborn once, up front, with the "whitegrid" style.
sns.set_theme(style="whitegrid")

  from .autonotebook import tqdm as notebook_tqdm


## 1. Helper Functions

In [2]:
def log_memory_usage(stage: str):
    """Print a one-line RAM report tagged with ``stage``.

    The figures come from ``psutil.virtual_memory()``; its ``used`` and
    ``total`` fields are converted to whole megabytes by floor-dividing
    by 1024 * 1024 before printing.

    Args:
        stage: Label placed in the log line to identify the call site.
    """
    bytes_per_mb = 1024 * 1024
    snapshot = psutil.virtual_memory()
    used_mb, total_mb = (
        amount // bytes_per_mb for amount in (snapshot.used, snapshot.total)
    )
    print(f"[RAM][{stage}] {used_mb} MB used / {total_mb} MB total")

## 2. Load and Prepare Data

In [3]:
# Where the saved dataset lives, and the cap on rows pulled into pandas.
DATASET_PATH = "/workspace/datasets/voxtube_subset"
MAX_SAMPLES = 500

log_memory_usage("start")

# Open the dataset previously saved at DATASET_PATH.
dataset = load_from_disk(DATASET_PATH)
print(f"Original dataset size: {len(dataset)} samples")

# Keep the first MAX_SAMPLES rows, or every row when the dataset is smaller.
subset_size = min(len(dataset), MAX_SAMPLES)
subset = dataset.select(range(subset_size))
print(f"Subset size: {len(subset)} samples")

# Convert the subset to a DataFrame, then record memory use after the conversion.
df = subset.to_pandas()
log_memory_usage("after loading subset")

# Preview the first rows.
display(df.head())

[RAM][start] 46447 MB used / 773712 MB total
Original dataset size: 1000 samples
Subset size: 500 samples
[RAM][after loading subset] 46496 MB used / 773712 MB total


Unnamed: 0,upload_date,segment_id,video_id,channel_id,language,gender,spk_id,spk_estim_age,spk_estim_age_mae,audio
0,2018-05-02,11,vIpK78CL1so,UC7rMVNUr7318I0MKumPbIKA,english,male,684,23.557245,3.61629,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...
1,2018-05-02,3,vIpK78CL1so,UC7rMVNUr7318I0MKumPbIKA,english,male,684,23.557245,3.61629,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...
2,2018-05-02,7,vIpK78CL1so,UC7rMVNUr7318I0MKumPbIKA,english,male,684,23.557245,3.61629,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...
3,2018-05-02,2,vIpK78CL1so,UC7rMVNUr7318I0MKumPbIKA,english,male,684,23.557245,3.61629,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...
4,2018-05-02,8,vIpK78CL1so,UC7rMVNUr7318I0MKumPbIKA,english,male,684,23.557245,3.61629,{'bytes': b'ID3\x04\x00\x00\x00\x00\x00#TSSE\x...


## 3. Audio Playback (If Available)

In [11]:
# Play back the first clip of the subset, if it carries playable audio.
# The `audio` entry is read as a dict: embedded encoded bytes are preferred,
# and a file path is used as a fallback.
# NOTE(review): assumes the entry may expose a 'path' key alongside 'bytes',
# as Hugging Face audio dicts do — confirm for this dataset.
audio_sample = subset[0]['audio']
audio_bytes = audio_sample.get('bytes')
audio_path = audio_sample.get('path')

if audio_bytes is not None:
    audio_player = Audio(audio_bytes)
elif audio_path:
    audio_player = Audio(filename=audio_path)
else:
    # Audio() raises ValueError when given no data, filename, or URL,
    # so report the gap instead of failing the cell.
    audio_player = None
    print("No playable audio found for the first sample.")

# A bare last expression renders the player; None renders nothing.
audio_player

In [None]:
# Check unique speaker IDs: count, min/max ID, and the 10 most frequent IDs.
speaker_ids = df['spk_id']  # look the column up once and reuse it below

unique_speakers = speaker_ids.nunique()
print(f"Number of unique speakers: {unique_speakers}")
print(f"Speaker ID range: {speaker_ids.min()} - {speaker_ids.max()}")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Speaker ID distribution:")
print(speaker_ids.value_counts().head(10))

log_memory_usage("end")