In [None]:
from datasets import load_dataset
from datasets import Audio
import librosa
import soundfile as sf
import io
import numpy as np

# Load the KoelLabs/L2Arctic dataset; the result is indexed by split name below.
dataset = load_dataset("KoelLabs/L2Arctic")

# Take the "spontaneous" split and switch off automatic audio decoding, so the
# "audio" column keeps its undecoded form and can be decoded manually later.
spontaneous_dataset = dataset["spontaneous"].cast_column("audio", Audio(decode=False))

In [30]:
def load_hf_audio(row, target_sr=16000):
    """Decode one dataset row's audio into a mono float32 waveform.

    Parameters
    ----------
    row : mapping
        Dataset row whose ``"audio"`` entry is the undecoded form: a dict
        holding the encoded file contents under ``"bytes"`` and/or a file
        location under ``"path"``.
    target_sr : int, default 16000
        Sample rate (Hz) the waveform is resampled to when the file's own
        rate differs.

    Returns
    -------
    y : numpy.ndarray
        1-D float32 waveform (channels averaged if the file is multi-channel).
    sr : int
        Sample rate of ``y``: ``target_sr`` if resampling happened, otherwise
        the file's native rate (which then already equals ``target_sr``).

    Raises
    ------
    ValueError
        If the row carries neither audio bytes nor a path.
    """
    audio = row["audio"]
    audio_bytes = audio.get("bytes")

    # Undecoded audio may be stored inline (bytes) or only referenced by path;
    # prefer the inline bytes and fall back to the path when bytes is missing.
    if audio_bytes is not None:
        source = io.BytesIO(audio_bytes)
    elif audio.get("path"):
        source = audio["path"]
    else:
        raise ValueError("row['audio'] contains neither 'bytes' nor 'path'")

    y, sr = sf.read(source, dtype="float32")

    # stereo -> mono: a 2-D result is averaged over axis 1 (the channel axis
    # in soundfile's (frames, channels) layout)
    if y.ndim == 2:
        y = y.mean(axis=1)

    # resample only when the native rate differs from the requested one
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return y, sr


# Quick check: decode the first row of the spontaneous split with the loader.
row = spontaneous_dataset[0]
y, sr = load_hf_audio(row)

In [31]:
def cap_audio(y, sr, max_sec=75, min_sec=30):
    """Truncate a waveform to ``max_sec`` seconds, rejecting clips that are too short.

    Durations are converted to sample counts as ``int(seconds * sr)``.

    Parameters
    ----------
    y : sequence
        Waveform samples; must support ``len()`` and slicing.
    sr : int or float
        Sample rate in samples per second.
    max_sec : int or float, default 75
        Longest duration kept; anything beyond it is cut off.
    min_sec : int or float, default 30
        Shortest acceptable duration.

    Returns
    -------
    The first ``int(max_sec * sr)`` samples of ``y``, or ``None`` when ``y``
    holds fewer than ``int(min_sec * sr)`` samples (the clip is discarded).
    """
    sample_cap = int(max_sec * sr)
    sample_floor = int(min_sec * sr)

    # Clips exactly at the minimum length are kept; only shorter ones are dropped.
    return y[:sample_cap] if len(y) >= sample_floor else None

In [33]:
from pathlib import Path
import soundfile as sf

# Export every sufficiently long spontaneous recording as a capped WAV clip
# and collect one metadata record per file written.
out_dir = Path("../data/l2arctic_spontaneous")
out_dir.mkdir(parents=True, exist_ok=True)

saved = []

for i, row in enumerate(spontaneous_dataset):
    y, sr = load_hf_audio(row)
    y = cap_audio(y, sr)  # None means the clip was below the minimum length

    if y is not None:
        # File name: "L2A" (L2-Arctic) followed by the dataset index,
        # zero-padded to three digits. Skipped rows leave gaps in the numbering.
        fname = out_dir / f"L2A_{i:03d}.wav"
        sf.write(fname, y, sr)

        saved.append(
            dict(
                sample_id=fname.stem,
                speaker=row.get("speaker_id", "unknown"),
                duration_sec=len(y) / sr,  # duration after capping
                source="L2Arctic_spontaneous",
            )
        )

print(f"Saved {len(saved)} samples")

Saved 20 samples
