In [120]:
import random
from datasets import Dataset, Audio

In [48]:
from datasets import load_dataset

# Accessing this dataset needs a Hugging Face login (e.g. `huggingface-cli login`).
# Only the "train" split is used from here on.
ds = load_dataset("potsawee/chatbot-arena-spoken-style")["train"]

In [49]:
# Display the dataset summary (feature names and number of rows).
ds

Dataset({
    features: ['id', 'original_id', 'question_refined_wav', 'assistant_a_wav', 'assistant_b_wav', 'winner_content', 'winner_style', 'style', 'question_original', 'question_refined', 'assistant_a', 'assistant_b', 'model_text_a', 'model_text_b'],
    num_rows: 654
})

In [50]:
# Drop the original audio columns together with the old `style` and
# `question_refined` fields; columns with these names are re-created later in the notebook.
columns_to_drop = [
    'question_refined_wav',
    'assistant_a_wav',
    'assistant_b_wav',
    'style',
    'question_refined',
]
ds = ds.remove_columns(columns_to_drop)

In [51]:
import pandas as pd
# Load the tab-separated speaker table.
elevenlabs_styles = pd.read_csv("./11lab_speakers.tsv", sep="\t")
# Keep only the three fields used downstream and convert the frame to one plain
# dict per speaker in a single call (instead of building a Series per row with
# `iloc`), so that a whole voice record can be drawn at once later.
elevenlabs_styles_list = elevenlabs_styles[['voice_id', 'name', 'description']].to_dict("records")

In [52]:
# Sanity check: number of speaker records loaded from the TSV.
len(elevenlabs_styles_list)

20

In [53]:
def add_elevenlabs_style_column(example):
    """Assign a randomly drawn speaker record to the example.

    One entry of the module-level ``elevenlabs_styles_list`` is picked with
    ``random.choice`` (the shared ``random`` module RNG, so the pick varies
    between runs unless that RNG is seeded) and stored under
    ``example["style"]``.

    Args:
        example: Dataset row supporting item assignment.

    Returns:
        The same ``example`` object, modified in place.
    """
    picked_voice = random.choice(elevenlabs_styles_list)
    example["style"] = picked_voice
    return example

In [54]:
# Give every row a `style` entry. The mapped function draws with `random.choice`,
# so the assignment is not reproducible unless the `random` RNG is seeded first.
ds = ds.map(add_elevenlabs_style_column)

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

In [55]:
# Inspect the first example.
ds[0]

{'id': 5434,
 'original_id': '15793788be324bd588e7b9707615c279',
 'winner_content': 'model_b',
 'winner_style': 'model_b',
 'question_original': 'write a blog about eLearning',
 'assistant_a': 'The world of eLearning is rapidly growing, and it is becoming increasingly important for businesses to stay competitive. As more people turn to eLearning as a way to learn new skills, companies must find ways to make their eLearning programs engaging and effective. To do this, companies must focus on creating courses that are visually appealing and easy to navigate, while also providing engaging content that keeps learners engaged and motivated. Additionally, companies must ensure that their eLearning platform is secure, allowing learners to access their courses from any device with an internet connection. By investing in these strategies, companies can create eLearning programs that are both effective and engaging, ensuring that their employees are equipped with the skills they need to succeed.

In [78]:
# ds.save_to_disk("data-chatbot-arena-spoken-style-11labs")

## Data with Voice

In [132]:
import os
from datasets import load_from_disk
# Reload the dataset from a local directory.
# NOTE(review): the only `save_to_disk` call for this path in the notebook is
# commented out, so the directory must already exist on disk for this to work.
ds = load_from_disk("data-chatbot-arena-spoken-style-11labs")

In [133]:
def add_questions_refined(ex):
    """Attach the refined question text to the example.

    Reads ``./refined_questions/{id}.refined_question.txt`` (``id`` comes from
    ``ex['id']``), removes surrounding whitespace and any double quotes
    wrapping the text, and stores the result under ``ex['question_refined']``.

    Args:
        ex: Dataset row supporting item access/assignment; must contain ``'id'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        OSError: If the refined-question file cannot be opened
            (e.g. ``FileNotFoundError``).
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    refined_question_path = f"./refined_questions/{example_id}.refined_question.txt"
    # NOTE(review): the text files are assumed to be UTF-8 encoded — confirm.
    with open(refined_question_path, encoding="utf-8") as f:
        raw_text = f.read()
    # Trim whitespace BEFORE stripping quotes: for a file ending in `"\n`,
    # strip('"') alone stops at the newline and the closing quote would survive.
    ex['question_refined'] = raw_text.strip().strip('"').strip()
    return ex

def add_question_refined_wav(ex):
    """Store the path of the refined-question audio file on the example.

    The path is built from ``ex['id']`` under ``./refined_questions_kokoro_wav/``
    and written to ``ex['question_refined_wav']``. The file's existence is not
    checked here.

    Args:
        ex: Dataset row supporting item access/assignment; must contain ``'id'``.

    Returns:
        The same ``ex`` object, modified in place.
    """
    ex['question_refined_wav'] = f"./refined_questions_kokoro_wav/{ex['id']}.wav"
    return ex

def add_assistant_a_wav(ex):
    """Attach the audio path for assistant A's response.

    The source directory depends on ``ex['winner_style']``:

    * ``'model_a'`` -> ``./generated_11labs_audio/{id}_model_a.11labs.wav``
    * ``'model_b'`` -> ``./generated_kokoro_audio/{id}_model_a.kokoro.wav``

    The path is stored under ``ex['assistant_a_wav']`` only if the file exists;
    otherwise ``None`` is stored.

    Args:
        ex: Dataset row containing ``'id'`` and ``'winner_style'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        ValueError: If ``ex['winner_style']`` is neither ``'model_a'`` nor
            ``'model_b'``.
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    winner = ex['winner_style']
    if winner == 'model_a':
        path = f"./generated_11labs_audio/{example_id}_model_a.11labs.wav"
    elif winner == 'model_b':
        path = f"./generated_kokoro_audio/{example_id}_model_a.kokoro.wav"
    else:
        # ValueError is still an Exception, but now says which row/value was bad.
        raise ValueError(
            f"unexpected winner_style {winner!r} for example id {example_id!r}"
        )
    # Missing files are recorded as None rather than raising.
    ex['assistant_a_wav'] = path if os.path.exists(path) else None
    return ex
    
def add_assistant_b_wav(ex):
    """Attach the audio path for assistant B's response.

    The source directory depends on ``ex['winner_style']``:

    * ``'model_b'`` -> ``./generated_11labs_audio/{id}_model_b.11labs.wav``
    * ``'model_a'`` -> ``./generated_kokoro_audio/{id}_model_b.kokoro.wav``

    The path is stored under ``ex['assistant_b_wav']`` only if the file exists;
    otherwise ``None`` is stored.

    Args:
        ex: Dataset row containing ``'id'`` and ``'winner_style'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        ValueError: If ``ex['winner_style']`` is neither ``'model_a'`` nor
            ``'model_b'``.
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    winner = ex['winner_style']
    if winner == 'model_b':
        path = f"./generated_11labs_audio/{example_id}_model_b.11labs.wav"
    elif winner == 'model_a':
        path = f"./generated_kokoro_audio/{example_id}_model_b.kokoro.wav"
    else:
        # ValueError is still an Exception, but now says which row/value was bad.
        raise ValueError(
            f"unexpected winner_style {winner!r} for example id {example_id!r}"
        )
    # Missing files are recorded as None rather than raising.
    ex['assistant_b_wav'] = path if os.path.exists(path) else None
    return ex
    
# Add the refined question text and the three audio-path columns, then keep
# only rows where both assistant audio files were found on disk.
ds = (
    ds.map(add_questions_refined)
    .map(add_question_refined_wav)
    .map(add_assistant_a_wav)
    .map(add_assistant_b_wav)
    .filter(lambda row: row['assistant_a_wav'] is not None)
    .filter(lambda row: row['assistant_b_wav'] is not None)
)

In [134]:
# Number of rows left after dropping examples with a missing assistant audio file.
len(ds)

632

In [135]:
# Cast the three path columns to the `Audio` feature type, one column at a time.
for audio_column in ("question_refined_wav", "assistant_a_wav", "assistant_b_wav"):
    ds = ds.cast_column(audio_column, Audio())

In [136]:
# from huggingface_hub import login
# login(token="")

In [137]:
# Upload the prepared dataset to the Hub repository named below.
# NOTE(review): the login cell in this notebook is commented out, so this
# presumably relies on credentials already configured in the environment — confirm.
ds.push_to_hub("potsawee/chatbot-arena-spoken-style-11labs")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/954 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/chatbot-arena-spoken-style-11labs/commit/eb65aa348e277a84a958febbd01c1296a6d063dd', commit_message='Upload dataset', commit_description='', oid='eb65aa348e277a84a958febbd01c1296a6d063dd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/chatbot-arena-spoken-style-11labs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/chatbot-arena-spoken-style-11labs'), pr_revision=None, pr_num=None)

### Same Content

In [138]:
import os
from datasets import load_from_disk
# Start again from the on-disk dataset: `ds` is rebound here, so the columns
# added to the previous in-memory `ds` are rebuilt below rather than reused.
ds = load_from_disk("data-chatbot-arena-spoken-style-11labs")

In [139]:
def add_questions_refined(ex):
    """Attach the refined question text to the example.

    Reads ``./refined_questions/{id}.refined_question.txt`` (``id`` comes from
    ``ex['id']``), removes surrounding whitespace and any double quotes
    wrapping the text, and stores the result under ``ex['question_refined']``.

    Args:
        ex: Dataset row supporting item access/assignment; must contain ``'id'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        OSError: If the refined-question file cannot be opened
            (e.g. ``FileNotFoundError``).
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    refined_question_path = f"./refined_questions/{example_id}.refined_question.txt"
    # NOTE(review): the text files are assumed to be UTF-8 encoded — confirm.
    with open(refined_question_path, encoding="utf-8") as f:
        raw_text = f.read()
    # Trim whitespace BEFORE stripping quotes: for a file ending in `"\n`,
    # strip('"') alone stops at the newline and the closing quote would survive.
    ex['question_refined'] = raw_text.strip().strip('"').strip()
    return ex

def add_question_refined_wav(ex):
    """Store the path of the refined-question audio file on the example.

    The path is built from ``ex['id']`` under ``./refined_questions_kokoro_wav/``
    and written to ``ex['question_refined_wav']``. The file's existence is not
    checked here.

    Args:
        ex: Dataset row supporting item access/assignment; must contain ``'id'``.

    Returns:
        The same ``ex`` object, modified in place.
    """
    ex['question_refined_wav'] = f"./refined_questions_kokoro_wav/{ex['id']}.wav"
    return ex

def add_assistant_a_wav(ex):
    """Attach the audio path for slot A in the same-content variant.

    The file is chosen from ``ex['winner_style']``:

    * ``'model_a'`` -> ``./generated_11labs_audio/{id}_model_a.11labs.wav``
    * ``'model_b'`` -> ``./generated_kokoro_audio/{id}_model_b.kokoro.wav``
      (slot A carries the ``model_b`` file, i.e. the winner's file name)

    The path is stored under ``ex['assistant_a_wav']`` only if the file exists;
    otherwise ``None`` is stored.

    Args:
        ex: Dataset row containing ``'id'`` and ``'winner_style'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        ValueError: If ``ex['winner_style']`` is neither ``'model_a'`` nor
            ``'model_b'``.
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    winner = ex['winner_style']
    if winner == 'model_a':
        path = f"./generated_11labs_audio/{example_id}_model_a.11labs.wav"
    elif winner == 'model_b':
        path = f"./generated_kokoro_audio/{example_id}_model_b.kokoro.wav"
    else:
        # ValueError is still an Exception, but now says which row/value was bad.
        raise ValueError(
            f"unexpected winner_style {winner!r} for example id {example_id!r}"
        )
    # Missing files are recorded as None rather than raising.
    ex['assistant_a_wav'] = path if os.path.exists(path) else None
    return ex
    
def add_assistant_b_wav(ex):
    """Attach the audio path for slot B in the same-content variant.

    The file is chosen from ``ex['winner_style']`` so that slot B uses the same
    model's file name as slot A does in this section:

    * ``'model_b'`` -> ``./generated_11labs_audio/{id}_model_b.11labs.wav``
    * ``'model_a'`` -> ``./generated_kokoro_audio/{id}_model_a.kokoro.wav``

    The path is stored under ``ex['assistant_b_wav']`` only if the file exists;
    otherwise ``None`` is stored.

    Args:
        ex: Dataset row containing ``'id'`` and ``'winner_style'``.

    Returns:
        The same ``ex`` object, modified in place.

    Raises:
        ValueError: If ``ex['winner_style']`` is neither ``'model_a'`` nor
            ``'model_b'``.
    """
    example_id = ex['id']  # local name that does not shadow the built-in `id`
    winner = ex['winner_style']
    if winner == 'model_b':
        path = f"./generated_11labs_audio/{example_id}_model_b.11labs.wav"
    elif winner == 'model_a':
        # Fix: this branch used the `_model_b` kokoro file, which paired
        # model_a's 11labs audio in slot A with model_b's audio in slot B.
        # NOTE(review): needs `{id}_model_a.kokoro.wav` to exist for rows where
        # model_a won; rows without it get None here — confirm the files exist.
        path = f"./generated_kokoro_audio/{example_id}_model_a.kokoro.wav"
    else:
        # ValueError is still an Exception, but now says which row/value was bad.
        raise ValueError(
            f"unexpected winner_style {winner!r} for example id {example_id!r}"
        )
    # Missing files are recorded as None rather than raising.
    ex['assistant_b_wav'] = path if os.path.exists(path) else None
    return ex
    
# Add the refined question text and the three audio-path columns, then keep
# only rows where both assistant audio files were found on disk.
ds = (
    ds.map(add_questions_refined)
    .map(add_question_refined_wav)
    .map(add_assistant_a_wav)
    .map(add_assistant_b_wav)
    .filter(lambda row: row['assistant_a_wav'] is not None)
    .filter(lambda row: row['assistant_b_wav'] is not None)
)

In [140]:
# Cast the three path columns to the `Audio` feature type, one column at a time.
for audio_column in ("question_refined_wav", "assistant_a_wav", "assistant_b_wav"):
    ds = ds.cast_column(audio_column, Audio())

In [141]:
# Upload the same-content variant to its own Hub repository.
# NOTE(review): the login cell in this notebook is commented out, so this
# presumably relies on credentials already configured in the environment — confirm.
ds.push_to_hub("potsawee/chatbot-arena-spoken-style-11labs-samecontent")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/956 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/potsawee/chatbot-arena-spoken-style-11labs-samecontent/commit/3b00160368249c2fd094579d134bcdd8d573f28f', commit_message='Upload dataset', commit_description='', oid='3b00160368249c2fd094579d134bcdd8d573f28f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/potsawee/chatbot-arena-spoken-style-11labs-samecontent', endpoint='https://huggingface.co', repo_type='dataset', repo_id='potsawee/chatbot-arena-spoken-style-11labs-samecontent'), pr_revision=None, pr_num=None)