In [92]:
from datasets import load_dataset
import random
from collections import defaultdict
from tqdm import tqdm
import IPython
from datasets import Audio, Dataset



In [113]:
# Record the installed `datasets` version next to the results (output below: 3.2.0).
import datasets
print(datasets.__version__)

3.2.0


In [137]:
# from huggingface_hub import login
# login(token="")

In [101]:
# Annotation table for dev.clean (speaker_id, speaking_rate, pitch, gender, ...).
# The audio for the same split is loaded separately into `data` in a later cell.
meta = load_dataset("parler-tts/libritts-r-filtered-speaker-descriptions", "clean", split="dev.clean")

In [102]:
meta

Dataset({
    features: ['text', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id', 'snr', 'c50', 'speech_duration', 'speaking_rate', 'phonemes', 'stoi', 'si-sdr', 'pesq', 'gender', 'utterance_pitch_std', 'utterance_pitch_mean', 'pitch', 'noise', 'reverberation', 'speech_monotony', 'sdr_noise', 'pesq_speech_quality', 'accent', 'text_description'],
    num_rows: 5589
})

In [103]:
# Open the audio split as a stream instead of a regular download.
stream_ds = load_dataset(
    "parler-tts/libritts_r_filtered",
    "clean",
    split="dev.clean",
    streaming=True           # keeps the download limited to dev.clean shards
)

# Drain the stream into a Python list, then wrap the list so that rows can be
# indexed by position (`data[i]`), which the pairing cells below rely on.
# NOTE(review): the list holds every example, audio included, in memory at once —
# confirm that is acceptable before pointing this at a larger split.
materialised = list(stream_ds)             # pulls every example ONCE
data = Dataset.from_list(materialised)  # now a map-style Dataset

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

In [114]:
print(data)

Dataset({
    features: ['audio', 'text_normalized', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id'],
    num_rows: 5589
})


In [105]:
# The pairing cells pick row indices from `meta` and then read the audio at the
# same indices from `data`, so the two tables must line up row-for-row — equal
# length alone does not guarantee that.
assert len(data) == len(meta)
assert list(data["id"]) == list(meta["id"]), "data and meta rows are not aligned by 'id'"

## Same or different speakers

In [106]:
# count the number of utterances per speaker
count = {}
# Read the column once: `meta[i]` would build a complete row dict per lookup
# just to extract a single field.
for speaker_id in meta["speaker_id"]:
    count[speaker_id] = count.get(speaker_id, 0) + 1
print(f"Number of speakers: {len(count)}")
print(f"Average number of samples per speaker: {sum(count.values()) / len(count)}")

Number of speakers: 40
Average number of samples per speaker: 139.725


In [129]:
def build_pair_list(meta, N: int, seed: int = 0):
    """
    Create N random (idx_i, idx_j, label) triples for speaker comparison.

    Exactly ``N // 2`` triples are same-speaker pairs (label ``"same"``) and the
    remaining ``N - N // 2`` are different-speaker pairs (label ``"different"``).
    No unordered index pair appears twice, i.e. ``(i, j)`` and ``(j, i)`` are
    treated as the same pair. For a given `meta` and `seed` the returned list is
    identical on every run.

    Parameters
    ----------
    meta : datasets.Dataset or sequence of mappings
        The metadata split that mirrors `data`. Every row must contain a
        'speaker_id' field.
    N : int
        Desired number of triples in the output list.
    seed : int, optional
        RNG seed for reproducibility. Note that this re-seeds the global
        `random` module.

    Returns
    -------
    list[tuple[int, int, str]]
        List with N elements of the form (idx_i, idx_j, label), where label is
        either "same" or "different".

    Raises
    ------
    ValueError
        If N is negative, or if `meta` does not contain enough distinct
        same-speaker or different-speaker pairs to satisfy the request
        (previously this case looped forever).
    """
    if N < 0:
        raise ValueError(f"N must be non-negative, got {N}")

    random.seed(seed)

    # ── 1. Build an index list for every speaker ────────────────────────────────
    speaker2idx = defaultdict(list)
    for idx, row in enumerate(meta):
        speaker2idx[row["speaker_id"]].append(idx)

    speakers = list(speaker2idx)                      # all speakers, first-seen order
    # Only speakers with two or more utterances can form a same-speaker pair;
    # sampling two items from a single-utterance speaker would raise.
    pairable = [spk for spk in speakers if len(speaker2idx[spk]) >= 2]

    # ── 2. Decide how many same-speaker and different-speaker pairs ─────────────
    n_pos = N // 2
    n_neg = N - n_pos

    # Rejection sampling below only terminates if enough distinct pairs exist.
    sizes = [len(idxs) for idxs in speaker2idx.values()]
    total = sum(sizes)
    max_pos = sum(k * (k - 1) // 2 for k in sizes)
    max_neg = (total * total - sum(k * k for k in sizes)) // 2
    if n_pos > max_pos:
        raise ValueError(
            f"Requested {n_pos} same-speaker pairs but only {max_pos} exist"
        )
    if n_neg > max_neg:
        raise ValueError(
            f"Requested {n_neg} different-speaker pairs but only {max_neg} exist"
        )

    # Results are kept in a list (draw order) and duplicates are tracked in a
    # separate set that is never iterated. Iterating a set of tuples containing
    # strings yields a different order on every interpreter start, which used to
    # make the final shuffle irreproducible despite the seed.
    triples = []
    seen = set()               # unordered (low_idx, high_idx) keys

    # ── 3. Draw same-speaker pairs ──────────────────────────────────────────────
    while len(triples) < n_pos:
        spk = random.choice(pairable)
        i, j = random.sample(speaker2idx[spk], 2)
        key = (i, j) if i < j else (j, i)
        if key in seen:
            continue
        seen.add(key)
        triples.append((i, j, "same"))

    # ── 4. Draw different-speaker pairs ─────────────────────────────────────────
    while len(triples) < N:
        spk_a, spk_b = random.sample(speakers, 2)     # two distinct speakers
        i = random.choice(speaker2idx[spk_a])
        j = random.choice(speaker2idx[spk_b])
        key = (i, j) if i < j else (j, i)
        if key in seen:
            continue
        seen.add(key)
        triples.append((i, j, "different"))

    # ── 5. Shuffle so the two labels are interleaved, then return ───────────────
    random.shuffle(triples)
    return triples

In [130]:
triples = build_pair_list(meta, 240, seed=42)
my_data1 = []
for idx_a, idx_b, label in tqdm(triples):
    # Fetch each row exactly once. Every `data[i]` lookup materialises the whole
    # row, audio included, so indexing once per field multiplied the cost per pair.
    row_a = data[idx_a]
    row_b = data[idx_b]
    speaker_a = row_a['speaker_id']
    speaker_b = row_b['speaker_id']

    # Cross-check the label (derived from `meta`) against the speakers in `data`.
    if label == "same":
        assert speaker_a == speaker_b
    elif label == "different":
        assert speaker_a != speaker_b
    else:
        raise ValueError(f"Unknown label: {label}")

    my_data1.append({
        "audio_a": row_a['audio'],
        "audio_b": row_b['audio'],
        "label": label,
        "text_normalized_a": row_a['text_normalized'],
        "text_normalized_b": row_b['text_normalized'],
        "speaker_a": speaker_a,
        "speaker_b": speaker_b,
        "id_a": row_a['id'],
        "id_b": row_b['id'],
        "meta_a": meta[idx_a],
        "meta_b": meta[idx_b],
    })

100%|██████████| 240/240 [01:09<00:00,  3.44it/s]


In [131]:
# Play clip A of the selected pair inline in the notebook
idx = 0
IPython.display.Audio(my_data1[idx]['audio_a']["array"], rate=my_data1[idx]['audio_a']["sampling_rate"])

In [132]:
IPython.display.Audio(my_data1[idx]['audio_b']["array"], rate=my_data1[idx]['audio_b']["sampling_rate"])

In [133]:
my_data1[idx]['label']

'same'

In [134]:
# Wrap the pair records in a Dataset and cast both audio columns to the Audio feature type.
hf_data1 = Dataset.from_list(my_data1)
hf_data1 = hf_data1.cast_column("audio_a", Audio()).cast_column("audio_b", Audio())

In [138]:
# Publish the speaker-comparison pairs to the Hub.
# NOTE(review): the login cell near the top is commented out — presumably
# credentials come from the environment; confirm before re-running.
hf_data1.push_to_hub("potsawee/paralinguistic-judge-speaker")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/paralinguistic-judge-speaker/commit/41c7ddc9eadd0fc56027159ae999442e0769d063', commit_message='Upload dataset', commit_description='', oid='41c7ddc9eadd0fc56027159ae999442e0769d063', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/paralinguistic-judge-speaker', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/paralinguistic-judge-speaker'), pr_revision=None, pr_num=None)

## Faster or Slower

In [139]:
meta

Dataset({
    features: ['text', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id', 'snr', 'c50', 'speech_duration', 'speaking_rate', 'phonemes', 'stoi', 'si-sdr', 'pesq', 'gender', 'utterance_pitch_std', 'utterance_pitch_mean', 'pitch', 'noise', 'reverberation', 'speech_monotony', 'sdr_noise', 'pesq_speech_quality', 'accent', 'text_description'],
    num_rows: 5589
})

In [140]:
# count the number of utterances per speaking_rate description
count = {}
# Read the column once: `meta[i]` would build a complete row dict per lookup
# just to extract a single field.
for speaking_rate in meta["speaking_rate"]:
    count[speaking_rate] = count.get(speaking_rate, 0) + 1
print(f"Number of unique speaking_rate: {len(count)}")
print(f"Average number of samples per speaking_rate: {sum(count.values()) / len(count)}")
for k, v in count.items():
    print(f"{k}: {v}")

Number of unique speaking_rate: 7
Average number of samples per speaking_rate: 798.4285714285714
moderate speed: 2193
slightly slowly: 279
slightly fast: 2577
fast: 504
slowly: 15
very fast: 18
very slowly: 3


In [141]:
count.keys()

dict_keys(['moderate speed', 'slightly slowly', 'slightly fast', 'fast', 'slowly', 'very fast', 'very slowly'])

In [215]:
import random
from collections import defaultdict
from typing import List, Tuple, Optional

def build_speed_pairs_unique(
        meta,
        seed: Optional[int] = None,
) -> List[Tuple[int, int, str]]:
    """
    Create within-speaker fast/slow pairs where each utterance index is used
    at most once.

    Filtering rules
    ---------------
    • Ignore meta rows whose speaking_rate == 'moderate speed'.
    • Ignore meta rows whose speech_duration < 1.0 seconds.
    • Rows whose speaking_rate is in neither class below are ignored as well.

    Pairing rules
    -------------
    • One utterance must be fast  ('slightly fast', 'fast', 'very fast')
    • One utterance must be slow  ('slightly slowly', 'slowly', 'very slowly')
    • Both from the same speaker
    • Per speaker, min(#fast, #slow) pairs are produced; the surplus is dropped
    • Random orientation → roughly 50 / 50 labels

    Labels
    ------
    'a' → idx_i is faster
    'b' → idx_j is faster

    Parameters
    ----------
    meta : datasets.Dataset or sequence of mappings
        Rows must provide 'speaking_rate', 'speech_duration' and 'speaker_id'.
    seed : int or None, optional
        When given, re-seeds the global `random` module so that the same `meta`
        always produces the same list of pairs.

    Returns
    -------
    list[(int, int, str)]
        Shuffled (idx_i, idx_j, label) triples.
    """
    if seed is not None:
        random.seed(seed)

    FAST = {'slightly fast', 'fast', 'very fast'}
    SLOW = {'slightly slowly', 'slowly', 'very slowly'}

    # ── 1. Collect indices by speaker + speed class, applying the filters ──────
    fast_by_spk, slow_by_spk = defaultdict(list), defaultdict(list)

    for idx, row in enumerate(meta):
        rate = row["speaking_rate"]
        dur  = row["speech_duration"]
        if rate == "moderate speed" or dur < 1.0:
            continue                           # skip this utterance
        spk = row["speaker_id"]

        if rate in FAST:
            fast_by_spk[spk].append(idx)
        elif rate in SLOW:
            slow_by_spk[spk].append(idx)

    # ── 2. Build pairs without re-using indices ───────────────────────────────
    # Visit speakers in first-seen order. Intersecting the key views gives a set,
    # whose iteration order for string ids changes between interpreter runs and
    # would therefore change which RNG draws each speaker receives, defeating
    # the seed.
    speakers_with_both = [spk for spk in fast_by_spk if spk in slow_by_spk]

    pairs = []
    for spk in speakers_with_both:
        fast_idxs = fast_by_spk[spk]
        slow_idxs = slow_by_spk[spk]

        random.shuffle(fast_idxs)
        random.shuffle(slow_idxs)

        # pair up in lock-step so each index is used at most once
        for idx_fast, idx_slow in zip(fast_idxs, slow_idxs):
            if random.random() < 0.5:
                pairs.append((idx_fast, idx_slow, "a"))  # fast first
            else:
                pairs.append((idx_slow, idx_fast, "b"))  # slow first
    random.shuffle(pairs)  # mix speakers so pairs are not grouped by speaker
    return pairs

In [216]:
# Build the fast/slow pairs with a fixed seed and report how many were produced.
triples = build_speed_pairs_unique(meta, seed=42)
print(len(triples))

187


In [217]:
my_data1 = []
for idx_a, idx_b, label in tqdm(triples):
    # Fetch each row exactly once. Every `data[i]` lookup materialises the whole
    # row, audio included, so indexing once per field multiplied the cost per pair.
    row_a = data[idx_a]
    row_b = data[idx_b]
    speaker_a = row_a['speaker_id']
    speaker_b = row_b['speaker_id']
    # Speed pairs are built within a speaker; verify that against `data`.
    assert speaker_a == speaker_b
    my_data1.append({
        "audio_a": row_a['audio'],
        "audio_b": row_b['audio'],
        "label": label,
        "text_normalized_a": row_a['text_normalized'],
        "text_normalized_b": row_b['text_normalized'],
        "speaker_a": speaker_a,
        "speaker_b": speaker_b,
        "id_a": row_a['id'],
        "id_b": row_b['id'],
        "meta_a": meta[idx_a],
        "meta_b": meta[idx_b],
    })

100%|██████████| 187/187 [00:38<00:00,  4.90it/s]


In [218]:
# Play clip A of the selected pair inline in the notebook
idx = 2
IPython.display.Audio(my_data1[idx]['audio_a']["array"], rate=my_data1[idx]['audio_a']["sampling_rate"])

In [219]:
IPython.display.Audio(my_data1[idx]['audio_b']["array"], rate=my_data1[idx]['audio_b']["sampling_rate"])

In [220]:
my_data1[idx]['label']

'a'

In [221]:
# Wrap the pair records in a Dataset and cast both audio columns to the Audio feature type.
hf_data1 = Dataset.from_list(my_data1)
hf_data1 = hf_data1.cast_column("audio_a", Audio()).cast_column("audio_b", Audio())

In [222]:
# Publish the speaking-rate pairs to the Hub.
# NOTE(review): the login cell near the top is commented out — presumably
# credentials come from the environment; confirm before re-running.
hf_data1.push_to_hub("potsawee/paralinguistic-judge-speed")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/paralinguistic-judge-speed/commit/9e0f4fb36f2599c02eab4e822f81129d09baaa14', commit_message='Upload dataset', commit_description='', oid='9e0f4fb36f2599c02eab4e822f81129d09baaa14', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/paralinguistic-judge-speed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/paralinguistic-judge-speed'), pr_revision=None, pr_num=None)

## Pitch -- Higher or Lower

In [223]:
meta

Dataset({
    features: ['text', 'text_original', 'speaker_id', 'path', 'chapter_id', 'id', 'snr', 'c50', 'speech_duration', 'speaking_rate', 'phonemes', 'stoi', 'si-sdr', 'pesq', 'gender', 'utterance_pitch_std', 'utterance_pitch_mean', 'pitch', 'noise', 'reverberation', 'speech_monotony', 'sdr_noise', 'pesq_speech_quality', 'accent', 'text_description'],
    num_rows: 5589
})

In [224]:
# count the number of utterances per pitch description
count = {}
# Read the column once: `meta[i]` would build a complete row dict per lookup
# just to extract a single field.
for pitch in meta["pitch"]:
    count[pitch] = count.get(pitch, 0) + 1
print(f"Number of unique pitch: {len(count)}")
print(f"Average number of samples per pitch: {sum(count.values()) / len(count)}")
for k, v in count.items():
    print(f"{k}: {v}")

Number of unique pitch: 6
Average number of samples per pitch: 931.5
very high-pitch: 264
moderate pitch: 2846
high-pitch: 256
slightly low-pitch: 932
low-pitch: 6
slightly high-pitch: 1285


In [225]:
count.keys()

dict_keys(['very high-pitch', 'moderate pitch', 'high-pitch', 'slightly low-pitch', 'low-pitch', 'slightly high-pitch'])

In [231]:
import random
from collections import defaultdict
from typing import List, Tuple, Optional

def build_pitch_pairs_by_gender(
        meta,
        seed: Optional[int] = None,
) -> List[Tuple[int, int, str]]:
    """
    Create (idx_i, idx_j, label) triples comparing utterance pitch,
    where the two utterances come from **any speakers that share the
    same gender**.

    Filtering rules
    ---------------
    • Ignore rows with pitch == 'moderate pitch'.
    • Only rows whose gender (lower-cased) is 'male' or 'female' are paired.

    Pitch classes
    -------------
    • “High-pitch” : {'very high-pitch', 'high-pitch', 'slightly high-pitch'}
    • “Low-pitch”  : {'low-pitch', 'slightly low-pitch'}

    Pairing & labels
    ----------------
    • Each pair contains one high-pitch utterance and one low-pitch utterance
      of the **same gender**.
    • Every utterance index is used **at most once** (any surplus is dropped).
    • The order of the two indices is chosen at random:

          ┌────────────┬────────────────────────┐
          │ label = 'a'│ idx_i is higher pitch  │
          │ label = 'b'│ idx_j is higher pitch  │
          └────────────┴────────────────────────┘

    Parameters
    ----------
    meta : datasets.Dataset or sequence of mappings
        Metadata split that mirrors `data`, with 'gender' and 'pitch' columns.
    seed : int or None, optional
        When given, re-seeds the global `random` module so that the same `meta`
        always produces the same list of pairs.

    Returns
    -------
    list[(int, int, str)]
        Shuffled (idx_i, idx_j, label) triples.
    """
    if seed is not None:
        random.seed(seed)

    HIGH = {'very high-pitch', 'high-pitch', 'slightly high-pitch'}
    LOW  = {'low-pitch', 'slightly low-pitch'}
    # A tuple, not a set: the iteration order of a set of strings changes between
    # interpreter runs (hash randomisation), which changes which RNG draws each
    # gender receives and so made the output irreproducible despite the seed.
    GENDERS = ('male', 'female')

    # ── 1. Collect indices per gender & pitch class ────────────────────────────
    high_by_gender = defaultdict(list)
    low_by_gender = defaultdict(list)

    for idx, row in enumerate(meta):
        pitch = row['pitch']
        if pitch == 'moderate pitch':          # skip moderate
            continue
        gender = row['gender'].lower()         # normalise case before grouping

        if pitch in HIGH:
            high_by_gender[gender].append(idx)
        elif pitch in LOW:
            low_by_gender[gender].append(idx)

    # ── 2. Pair without re-using indices ───────────────────────────────────────
    pairs = []
    for gender in GENDERS:
        # .get() avoids inserting empty entries into the defaultdicts
        highs = high_by_gender.get(gender, [])
        lows  = low_by_gender.get(gender, [])

        if not highs or not lows:
            continue                           # no possible pairs for this gender

        random.shuffle(highs)
        random.shuffle(lows)

        # zip stops at the shorter list, so the surplus of the larger class is dropped
        for idx_high, idx_low in zip(highs, lows):
            # random orientation → balanced 'a' / 'b'
            if random.random() < 0.5:
                pairs.append((idx_high, idx_low, 'a'))  # high first
            else:
                pairs.append((idx_low, idx_high, 'b'))  # low first
    random.shuffle(pairs)  # mix genders so pairs are not grouped by gender
    return pairs

In [232]:
# Build the high/low pitch pairs with a fixed seed and report how many were produced.
triples = build_pitch_pairs_by_gender(meta, seed=42)
print(len(triples))

938


In [267]:
# Play clip A of the selected pair inline in the notebook
# NOTE(review): no cell between the pitch-pair construction and this one rebuilds
# `my_data1` from the pitch triples — confirm this is not replaying the
# speaking-rate pairs left over from the previous section.
idx = 20
print(my_data1[idx]['label'])
IPython.display.Audio(my_data1[idx]['audio_a']["array"], rate=my_data1[idx]['audio_a']["sampling_rate"])

a


In [268]:
IPython.display.Audio(my_data1[idx]['audio_b']["array"], rate=my_data1[idx]['audio_b']["sampling_rate"])

'a'