In [156]:
import os
import json
import random
from datasets import load_dataset, Audio, Dataset

In [12]:
# from huggingface_hub import login
# login(token="")

In [158]:
# Benchmark questions. Later cells index this list by position and read each
# entry's "question" field.
path = "/data/workspace/ppotsawee/audioLM-as-judge-new/advanced-voice-gen-task-v1/questions1_shuffled_id.json"
# Name the encoding explicitly so the read does not depend on the platform's
# default locale encoding (JSON interchange files are UTF-8).
with open(path, encoding="utf-8") as f:
    instructions = json.load(f)

In [159]:
# Root directory of the generated responses. A response wav is addressed as
# base_path/<model sub-directory>/<idx>.wav.
base_path = "/data/workspace/ppotsawee/audioLM-as-judge-new/eval-leaderboard/experiments/advvoiceq1"

# Display name of each system -> its sub-directory under base_path.
model_mapping = {
    "diva+tts": "diva/text_kokoro",
    "gemini1.5-flash+tts": "gemini15flash-api/text_kokoro_tts",
    "gemini2-flash+tts": "gemini2flash-api/text_kokoro_tts",
    "gemini2-flash-exp": "gemini2flash-exp/audio",
    "gpt4o": "gpt4o/audio",
    "moshi": "moshi/audio",
    "qwen2-audio+tts": "qwen2/text_kokoro",
    "typhoon2-audio": "typhoon2/audio",
}

# Every system in model_mapping except "gpt4o".
candidates1 = [
    "diva+tts",
    "gemini1.5-flash+tts",
    "gemini2-flash+tts",
    "gemini2-flash-exp",
    "moshi",
    "qwen2-audio+tts",
    "typhoon2-audio",
]

# Sampling weight per candidate: everything starts at 1, and only
# "gemini2-flash-exp" is raised.
candidate_weights1 = dict.fromkeys(candidates1, 1)
candidate_weights1["gemini2-flash-exp"] = 9

# Weights as a list aligned with candidates1 (a candidate without an entry
# falls back to 1). This is the shape random.choices expects for `weights`.
weights1 = [candidate_weights1.get(candidate, 1) for candidate in candidates1]

def draw_example1(idx):
    """Pair the "gpt4o" and "gemini2-flash-exp" responses to instruction `idx`.

    Both systems are fixed; a single coin flip decides which one is presented
    as "a" and which as "b". candidates1 / weights1 are not consulted here.

    Args:
        idx: position of the instruction in `instructions`; also the stem of
            the response wav file names.

    Returns:
        dict with keys "model_a", "model_b", "wav_path_a", "wav_path_b",
        "instruction" and "idx".

    Raises:
        AssertionError: if either response wav file does not exist.
    """
    first, second = "gpt4o", "gemini2-flash-exp"

    # Randomise presentation order: swap the two slots with probability 0.5.
    if random.random() < 0.5:
        first, second = second, first

    def response_wav(model):
        # Resolve the wav for `model` and fail fast if it was never generated.
        wav = f"{base_path}/{model_mapping[model]}/{idx}.wav"
        assert os.path.exists(wav), f"File not found: {wav}"
        return wav

    located_a = response_wav(first)
    located_b = response_wav(second)

    return {
        "model_a": first,
        "model_b": second,
        "wav_path_a": located_a,
        "wav_path_b": located_b,
        "instruction": instructions[idx]["question"],
        "idx": idx,
    }

# Pool that draw_example2 samples one system from. Compared with candidates1
# it leaves out "gemini2-flash-exp", which draw_example2 can already select
# as the other member of the pair.
candidates2 = [
    "diva+tts",
    "gemini1.5-flash+tts",
    "gemini2-flash+tts",
    "moshi",
    "qwen2-audio+tts",
    "typhoon2-audio"
]


def draw_example2(idx):
    """Pair one of "gpt4o" / "gemini2-flash-exp" with a system from candidates2.

    Three random decisions are made, in this order: which of the two fixed
    systems to use (50/50), which entry of candidates2 to pair it with, and
    whether to swap the a/b slots (50/50).

    Args:
        idx: position of the instruction in `instructions`; also the stem of
            the response wav file names.

    Returns:
        dict with keys "model_a", "model_b", "wav_path_a", "wav_path_b",
        "instruction" and "idx".

    Raises:
        AssertionError: if either response wav file does not exist.
    """
    anchor = "gpt4o" if random.random() < 0.5 else "gemini2-flash-exp"

    # No `weights` argument is given, so each entry of candidates2 is equally
    # likely to be picked.
    opponent = random.choices(candidates2, k=1)[0]

    # Randomise presentation order: the anchor goes to slot "b" half the time.
    if random.random() < 0.5:
        slot_a, slot_b = opponent, anchor
    else:
        slot_a, slot_b = anchor, opponent

    def response_wav(model):
        # Resolve the wav for `model` and fail fast if it was never generated.
        wav = f"{base_path}/{model_mapping[model]}/{idx}.wav"
        assert os.path.exists(wav), f"File not found: {wav}"
        return wav

    located_a = response_wav(slot_a)
    located_b = response_wav(slot_b)

    return {
        "model_a": slot_a,
        "model_b": slot_b,
        "wav_path_a": located_a,
        "wav_path_b": located_b,
        "instruction": instructions[idx]["question"],
        "idx": idx,
    }


In [160]:
# Spot-check one pair for the first instruction. The a/b order is decided by
# random.random(), so the displayed pair can differ between runs.
draw_example1(idx=0)

{'model_a': 'gpt4o',
 'model_b': 'gemini2-flash-exp',
 'wav_path_a': '/data/workspace/ppotsawee/audioLM-as-judge-new/eval-leaderboard/experiments/advvoiceq1/gpt4o/audio/0.wav',
 'wav_path_b': '/data/workspace/ppotsawee/audioLM-as-judge-new/eval-leaderboard/experiments/advvoiceq1/gemini2flash-exp/audio/0.wav',
 'instruction': "Say the word 'tomato' twice starting with a British pronunciation version, then an American pronunciation version, and teach me the difference.",
 'idx': 0}

In [161]:
# Spot-check one draw_example2 pair for the first instruction. Both the chosen
# systems and their a/b order are random, so the output can differ between runs.
draw_example2(idx=0)

{'model_a': 'gemini2-flash-exp',
 'model_b': 'diva+tts',
 'wav_path_a': '/data/workspace/ppotsawee/audioLM-as-judge-new/eval-leaderboard/experiments/advvoiceq1/gemini2flash-exp/audio/0.wav',
 'wav_path_b': '/data/workspace/ppotsawee/audioLM-as-judge-new/eval-leaderboard/experiments/advvoiceq1/diva/text_kokoro/0.wav',
 'instruction': "Say the word 'tomato' twice starting with a British pronunciation version, then an American pronunciation version, and teach me the difference.",
 'idx': 0}

In [162]:
# Directory holding one audio file per instruction; the dataset-building loop
# references them as "<idx>.kokoro.wav".
instruction_wav_base = "/data/workspace/ppotsawee/audioLM-as-judge-new/advanced-voice-gen-task-v1/questions1_kokoro_wav"

In [163]:
# Seed the module-level RNG so the pairs drawn below are the same on every
# Restart & Run All. draw_example1 / draw_example2 call the module-level
# `random` functions, so that is the generator that has to be seeded.
# NOTE(review): the seed value is arbitrary; a dataset that was built before
# this seed was introduced will generally not be reproduced by it.
PAIRING_SEED = 0
random.seed(PAIRING_SEED)


def _to_row(idx, pair_no, example):
    """Turn one drawn pair into a dataset row with id "<idx>_<pair_no>"."""
    instruction_wav = f"{instruction_wav_base}/{idx}.kokoro.wav"
    # Same fail-fast check that draw_example1/2 apply to the response audio.
    assert os.path.exists(instruction_wav), f"File not found: {instruction_wav}"
    return {
        "idx": f"{idx}_{pair_no}",
        "instruction": instruction_wav,
        "audio_a": example["wav_path_a"],
        "audio_b": example["wav_path_b"],
        "instruction_text": instructions[idx]["question"],
        "model_a": example["model_a"],
        "model_b": example["model_b"],
    }


# Two rows per instruction: pair 0 from draw_example1, pair 1 from draw_example2.
mydata = []
for idx in range(len(instructions)):
    ex1 = draw_example1(idx)
    mydata.append(_to_row(idx, 0, ex1))

    # Redraw until the second pair differs from the first in both slots, i.e.
    # neither position "a" nor position "b" shows the same model as in pair 0.
    while True:
        ex2 = draw_example2(idx)
        if ex2["model_a"] != ex1["model_a"] and ex2["model_b"] != ex1["model_b"]:
            break

    mydata.append(_to_row(idx, 1, ex2))

assert len(mydata) == 2 * len(instructions), "expected two rows per instruction"

In [164]:
# The loop above appends two rows per instruction, so this should equal
# 2 * len(instructions).
len(mydata)

164

In [165]:
# Build the dataset from the row dicts, then turn each column that holds a
# wav path into an Audio feature, one column at a time.
hf_data = Dataset.from_list(mydata)
for audio_column in ("instruction", "audio_a", "audio_b"):
    hf_data = hf_data.cast_column(audio_column, Audio())

In [None]:
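# Upload call kept for reference but disabled. The progress bars and CommitInfo
# shown below are presumably from an earlier execution, when this line was active.
# hf_data.push_to_hub("potsawee/speakbench-v1-nolabel")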

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/508 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/speakbench-v1-nolabel/commit/4f24ae3b4117f7621cd10c4fcc0df2bff988cf7b', commit_message='Upload dataset', commit_description='', oid='4f24ae3b4117f7621cd10c4fcc0df2bff988cf7b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/speakbench-v1-nolabel', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/speakbench-v1-nolabel'), pr_revision=None, pr_num=None)

## Add annotations / labels

In [15]:
labels="""
b
b
b
a
both_good
a
a
a
a
a
b
a
a
a
a
a
b
b
b
b
a
b
a
b
b
a
b
a
b
a
a
a
b
a
b
b
a
b
b
b
both_bad
both_bad
a
both_bad
both_bad
b
b
b
both_good
a
a
b
b
both_bad
a
a
a
b
a
both_bad
both_good
b
both_bad
both_bad
b
a
both_bad
both_bad
b
a
b
a
a
both_bad
both_good
b
both_bad
both_bad
b
a
both_bad
b
both_bad
a
a
both_bad
a
b
a
b
a
b
a
b
both_bad
both_bad
a
both_bad
both_bad
a
both_bad
both_bad
a
both_good
a
b
b
a
a
both_bad
b
a
b
b
a
a
b
a
b
b
b
a
a
b
both_bad
b
b
b
a
b
both_good
both_good
a
b
both_bad
both_bad
both_good
a
both_bad
both_bad
both_good
a
both_bad
b
both_bad
b
both_bad
both_bad
a
b
a
both_bad
a
b
both_bad
a
a
b
b
both_bad
both_bad
a
a
a
"""

In [16]:
# `labels` starts out as the newline-separated string assigned in the cell
# above and is rebound to a list here. Guard on the type so that running this
# cell a second time is a no-op instead of failing on list.strip().
if isinstance(labels, str):
    labels = [line.strip() for line in labels.strip().split("\n")]

# Catch typos in the annotation list before they reach the dataset. These are
# the four values that occur in the raw annotation string.
allowed_labels = {"a", "b", "both_good", "both_bad"}
unexpected_labels = sorted(set(labels) - allowed_labels)
assert not unexpected_labels, f"unexpected label values: {unexpected_labels}"

print("len(labels)", len(labels))

len(labels) 164


In [17]:
from datasets import load_dataset, Audio, Dataset

In [18]:
# Pull the unlabeled pairs back from the Hub. A later cell attaches the labels
# to the rows of this split by position.
ds = load_dataset("potsawee/speakbench-v1-nolabel", split="train")
ds

Dataset({
    features: ['idx', 'instruction', 'audio_a', 'audio_b', 'instruction_text', 'model_a', 'model_b'],
    num_rows: 164
})

In [19]:
# Labels are attached purely by position, so the counts have to match.
assert len(labels) == len(ds), "labels length must equal dataset length"

# add_column returns a new Dataset instead of modifying `ds`, hence the
# re-assignment. Drop a "label" column left behind by an earlier run of this
# cell first, so the cell can be re-executed without a duplicate-column error.
if "label" in ds.column_names:
    ds = ds.remove_columns("label")
ds = ds.add_column("label", labels)

# Column order for the published dataset: "label" sits right after "audio_b".
new_order = [
    "idx",
    "instruction",
    "audio_a",
    "audio_b",
    "label",
    "instruction_text",
    "model_a",
    "model_b",
]

# select_columns returns a copy holding exactly these columns, in this order.
ds = ds.select_columns(new_order)

print(ds.column_names)

['idx', 'instruction', 'audio_a', 'audio_b', 'label', 'instruction_text', 'model_a', 'model_b']


In [20]:
ds

Dataset({
    features: ['idx', 'instruction', 'audio_a', 'audio_b', 'label', 'instruction_text', 'model_a', 'model_b'],
    num_rows: 164
})

In [21]:
# Publish the labeled dataset under its own repo id, separate from the
# unlabeled "potsawee/speakbench-v1-nolabel" one loaded above.
ds.push_to_hub("potsawee/speakbench-v1-label")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/speakbench-v1-label/commit/f6da5391a1ab176d5af448d5ff719acddb85045d', commit_message='Upload dataset', commit_description='', oid='f6da5391a1ab176d5af448d5ff719acddb85045d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/speakbench-v1-label', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/speakbench-v1-label'), pr_revision=None, pr_num=None)