# MusicCaps Cache Preview

Utilities for inspecting the MusicCaps metadata, previewing cached audio clips, and downloading missing clips from YouTube.

## Imports and configuration

In [None]:
from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Sequence

import subprocess

import pandas as pd
from datasets import load_dataset
from IPython.display import Audio, display

import yt_dlp
from yt_dlp.utils import DownloadError

try:
    import browser_cookie3
except ImportError:  # browser cookies are optional for yt-dlp
    browser_cookie3 = None

# Directory where trimmed clips are stored as `<ytid>_<start>.wav`.
# It is created as soon as this cell runs so later cells can assume it exists.
AUDIO_CACHE_DIR = Path('data/musiccaps/audio')
AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Dataset id and split handed to `load_dataset`.
MUSICCAPS_DATASET = 'google/musiccaps'
MUSICCAPS_SPLIT = 'train'
# Passed to ffmpeg as the output sample rate (`-ar`) and clip duration (`-t`).
AUDIO_SAMPLING_RATE = 48_000
CLIP_SECONDS = 10

# yt-dlp settings read by `download_audio`. Set an Optional value to None to
# leave the corresponding yt-dlp option unset.
# Forwarded as yt-dlp's `download_archive`; its parent directory is created on demand.
YTDLP_DOWNLOAD_ARCHIVE: Optional[Path] = AUDIO_CACHE_DIR.parent / 'youtube_download_archive.txt'
# Browser name forwarded as `cookiesfrombrowser`; takes precedence over the cookie file.
YTDLP_COOKIES_FROM_BROWSER: Optional[str] = "chrome"
# Forwarded as `cookiefile`, but only when YTDLP_COOKIES_FROM_BROWSER is falsy.
YTDLP_COOKIES_FILE: Optional[Path] = None
# Forwarded as `sleep_interval` / `max_sleep_interval`. The maximum is only
# applied when the base interval is set, and is raised to at least that interval.
YTDLP_SLEEP_INTERVAL: Optional[float] = 5.0
YTDLP_MAX_SLEEP_INTERVAL: Optional[float] = 10.0
# Forwarded as `retries` and `fragment_retries`.
YTDLP_MAX_RETRIES: int = 10
YTDLP_FRAGMENT_RETRIES: int = 10
# Thread-pool size used by `fetch_missing_audio` (clamped there to at least 1).
YTDLP_MAX_WORKERS: int = 4


## Metadata helpers

In [2]:
@dataclass
class MusicCapsSample:
    """A single MusicCaps clip: its source video, caption, and audio file on disk."""

    # YouTube video id; also the first component of the cached clip's filename.
    ytid: str
    # Clip start offset within the video, in seconds.
    start_s: float
    # Caption text taken from the metadata row's 'caption' field.
    text: str
    # Location of the trimmed .wav file for this clip.
    audio_path: Path


def load_musiccaps_metadata(sample_limit: Optional[int] = None) -> List[Dict[str, str]]:
    """
    Fetch MusicCaps metadata rows from the Hugging Face dataset.

    With ``sample_limit`` unset (or at least as large as the dataset) every
    row is returned; otherwise only the first ``sample_limit`` rows are.
    """
    rows = load_dataset(MUSICCAPS_DATASET, split=MUSICCAPS_SPLIT)

    wants_prefix = sample_limit is not None and sample_limit < len(rows)
    if wants_prefix:
        # Index row by row so only the requested prefix is materialised.
        return [rows[position] for position in range(sample_limit)]
    return list(rows)


def prepare_musiccaps_samples(
    sample_limit: Optional[int] = None,
    *,
    metadata: Optional[List[Dict[str, str]]] = None,
) -> tuple[List[MusicCapsSample], List[Dict[str, str]]]:
    """
    Collect cached MusicCaps clips and report missing metadata rows.

    Args:
        sample_limit: Maximum number of metadata rows to consider. ``None``
            means no limit.
        metadata: Pre-loaded metadata rows. When omitted, rows are loaded via
            ``load_musiccaps_metadata(sample_limit)``; when given, they are
            truncated to ``sample_limit`` if one is set.

    Returns:
        A tuple of (cached_samples, missing_rows). Both lists follow the order
        of the metadata rows: a row lands in ``cached_samples`` when its
        ``<ytid>_<int(start_s)>.wav`` file exists in ``AUDIO_CACHE_DIR`` and in
        ``missing_rows`` otherwise.
    """
    if metadata is None:
        metadata = load_musiccaps_metadata(sample_limit)
    elif sample_limit is not None:
        metadata = metadata[:sample_limit]

    cached_samples: List[MusicCapsSample] = []
    missing_rows: List[Dict[str, str]] = []

    # Rows are visited in metadata order, so both output lists are already
    # ordered. No re-sort is needed, and sorting by a (ytid, start_s) lookup
    # would misplace rows that share the same key.
    for row in metadata:
        start_s = float(row['start_s'])
        clip_path = AUDIO_CACHE_DIR / f"{row['ytid']}_{int(start_s)}.wav"
        if not clip_path.exists():
            missing_rows.append(row)
            continue
        cached_samples.append(
            MusicCapsSample(
                ytid=row['ytid'],
                start_s=start_s,
                text=row['caption'],
                audio_path=clip_path,
            )
        )

    return cached_samples, missing_rows


## Download missing audio from YouTube

In [None]:

def download_audio(ytid: str, start_s: float) -> Optional[Path]:
    """
    Download and trim a single MusicCaps clip via yt-dlp and ffmpeg.

    Args:
        ytid: YouTube video id.
        start_s: Clip start offset in seconds.

    Returns:
        Path to the cached ``<ytid>_<int(start_s)>.wav`` clip, or ``None`` when
        the download or the trim failed (the reason is printed).

    Raises:
        RuntimeError: If ``YTDLP_COOKIES_FROM_BROWSER`` is set but
            ``browser_cookie3`` could not be imported.
    """
    start_s = float(start_s)
    clip_path = AUDIO_CACHE_DIR / f"{ytid}_{int(start_s)}.wav"
    if clip_path.exists():
        return clip_path

    # NOTE(review): the temporary download is keyed by ytid only, so two
    # concurrent calls for the same video would share this file. Confirm that
    # ytids are unique in the rows being fetched.
    temp_template = str(AUDIO_CACHE_DIR / f"{ytid}.%(ext)s")
    ydl_opts = {
        'outtmpl': temp_template,
        'format': 'bestaudio/best',
        'quiet': True,
        'no_warnings': True,
        'ignoreerrors': False,
        'retries': YTDLP_MAX_RETRIES,
        'fragment_retries': YTDLP_FRAGMENT_RETRIES,
        # Exponential back-off between HTTP retries: 1s, 2s, 4s, ...
        'retry_sleep_functions': {'http': lambda n: 2 ** max(0, n - 1)},
    }
    if YTDLP_SLEEP_INTERVAL is not None:
        ydl_opts['sleep_interval'] = YTDLP_SLEEP_INTERVAL
        if YTDLP_MAX_SLEEP_INTERVAL is not None:
            # Never let the maximum drop below the base interval.
            ydl_opts['max_sleep_interval'] = max(
                YTDLP_SLEEP_INTERVAL,
                YTDLP_MAX_SLEEP_INTERVAL,
            )
    if YTDLP_DOWNLOAD_ARCHIVE is not None:
        YTDLP_DOWNLOAD_ARCHIVE.parent.mkdir(parents=True, exist_ok=True)
        ydl_opts['download_archive'] = str(YTDLP_DOWNLOAD_ARCHIVE)
    if YTDLP_COOKIES_FROM_BROWSER:
        # NOTE(review): yt-dlp appears to ship its own browser-cookie
        # extraction, so this guard may be stricter than necessary. Confirm
        # before relaxing it.
        if browser_cookie3 is None:
            raise RuntimeError(
                'browser-cookie3 is required when YTDLP_COOKIES_FROM_BROWSER is set.'
            )
        ydl_opts['cookiesfrombrowser'] = (YTDLP_COOKIES_FROM_BROWSER, None, None, None)
    elif YTDLP_COOKIES_FILE:
        YTDLP_COOKIES_FILE.parent.mkdir(parents=True, exist_ok=True)
        ydl_opts['cookiefile'] = str(YTDLP_COOKIES_FILE)

    url = f'https://www.youtube.com/watch?v={ytid}'
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(url, download=True)
            if result is None:
                # Nothing was downloaded, so there is no file to trim. Report
                # it explicitly instead of failing inside prepare_filename.
                print(
                    f'[yt-dlp] skip {ytid}: no download result '
                    '(video may already be recorded in the download archive)'
                )
                return None
            downloaded_path = Path(ydl.prepare_filename(result))
    except DownloadError as exc:
        print(f'[yt-dlp] skip {ytid}: {exc}')
        return None
    except Exception as exc:  # resilience for unexpected extractor errors
        print(f'[yt-dlp] unexpected error for {ytid}: {exc}')
        return None

    # Trim into a sibling file and move it into place only on success, so a
    # failed or interrupted ffmpeg run never leaves a truncated clip that later
    # calls would treat as a valid cache hit. The name keeps the .wav suffix so
    # ffmpeg still selects the WAV muxer.
    partial_path = clip_path.with_name(f"{clip_path.stem}.partial.wav")
    try:
        subprocess.run(
            [
                'ffmpeg',
                '-hide_banner',
                '-loglevel',
                'error',
                '-y',
                '-ss',
                str(start_s),
                '-t',
                str(CLIP_SECONDS),
                '-i',
                str(downloaded_path),
                '-ar',
                str(AUDIO_SAMPLING_RATE),
                '-ac',
                '1',
                str(partial_path),
            ],
            check=True,
        )
        partial_path.replace(clip_path)
    except subprocess.CalledProcessError as exc:
        print(f'[ffmpeg] failed to trim {ytid}: {exc}')
        return None
    finally:
        # Always discard the full-length download and any leftover partial clip.
        downloaded_path.unlink(missing_ok=True)
        partial_path.unlink(missing_ok=True)

    return clip_path


def fetch_missing_audio(missing_rows: Sequence[Dict[str, str]]) -> List[MusicCapsSample]:
    """
    Download clips for the provided metadata rows.

    Downloads run concurrently on up to ``YTDLP_MAX_WORKERS`` threads (at least
    one). Rows whose download fails are omitted from the result.

    Args:
        missing_rows: Metadata rows providing 'ytid', 'start_s' and 'caption'.

    Returns:
        One ``MusicCapsSample`` per successfully fetched clip, in the same
        order as ``missing_rows`` regardless of which download finished first.
    """
    if not missing_rows:
        return []

    rows = list(missing_rows)
    ytids = [row['ytid'] for row in rows]
    starts = [float(row['start_s']) for row in rows]
    worker_count = max(1, YTDLP_MAX_WORKERS)

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # executor.map yields results in submission order, which keeps the
        # output deterministic and aligned with the input rows.
        paths = list(executor.map(download_audio, ytids, starts))

    return [
        MusicCapsSample(
            ytid=row['ytid'],
            start_s=start,
            text=row['caption'],
            audio_path=path,
        )
        for row, start, path in zip(rows, starts, paths)
        if path is not None
    ]


## Preview the MusicCaps metadata

In [3]:

METADATA_SAMPLE_LIMIT = 5  # number of leading dataset rows to preview

metadata_preview = load_musiccaps_metadata(sample_limit=METADATA_SAMPLE_LIMIT)
metadata_df = pd.DataFrame(metadata_preview)

# Show the identifying columns, or a hint when the dataset returned nothing.
if metadata_df.empty:
    print('No metadata rows were returned; check the dataset configuration.')
else:
    display(metadata_df[['ytid', 'start_s', 'caption']])

print(f'Total metadata rows fetched: {len(metadata_preview)}')


Unnamed: 0,ytid,start_s,caption
0,-0Gj8-vB1q4,30,The low quality recording features a ballad so...
1,-0SdAVK79lg,30,This song features an electric guitar as the m...
2,-0vPFx-wRRI,30,a male voice is singing a melody with changing...
3,-0xzrMun0Rs,30,This song contains digital drums playing a sim...
4,-1LrH01Ei1w,30,This song features a rubber instrument being p...


Total metadata rows fetched: 5


## Inspect cached audio clips

In [4]:
CACHED_SAMPLE_LIMIT = 100  # how many metadata rows to check against the cache

cached_samples, missing_rows = prepare_musiccaps_samples(sample_limit=CACHED_SAMPLE_LIMIT)

print(f'Cached clips: {len(cached_samples)} | Missing clips: {len(missing_rows)}')

# One table row per cached clip; the path is stringified for display.
cached_df = pd.DataFrame(
    [
        dict(
            ytid=clip.ytid,
            start_s=clip.start_s,
            caption=clip.text,
            audio_path=str(clip.audio_path),
        )
        for clip in cached_samples
    ]
)

if not cached_df.empty:
    display(cached_df.head())
else:
    print('No cached audio clips were found in data/musiccaps/audio.')

# Play the first cached clip inline, if there is one.
if not cached_samples:
    print('Add audio clips to AUDIO_CACHE_DIR or run the optional download cell to populate the cache.')
else:
    first_clip = cached_samples[0]
    print(f'Previewing first cached clip: {first_clip.audio_path.name}')
    print(f'Caption: {first_clip.text}')
    display(Audio(filename=str(first_clip.audio_path)))


Cached clips: 98 | Missing clips: 2


Unnamed: 0,ytid,start_s,caption,audio_path
0,-0Gj8-vB1q4,30.0,The low quality recording features a ballad so...,data/musiccaps/audio/-0Gj8-vB1q4_30.wav
1,-0SdAVK79lg,30.0,This song features an electric guitar as the m...,data/musiccaps/audio/-0SdAVK79lg_30.wav
2,-0vPFx-wRRI,30.0,a male voice is singing a melody with changing...,data/musiccaps/audio/-0vPFx-wRRI_30.wav
3,-0xzrMun0Rs,30.0,This song contains digital drums playing a sim...,data/musiccaps/audio/-0xzrMun0Rs_30.wav
4,-1LrH01Ei1w,30.0,This song features a rubber instrument being p...,data/musiccaps/audio/-1LrH01Ei1w_30.wav


Previewing first cached clip: -0Gj8-vB1q4_30.wav
Caption: The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft female vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services.
