In [9]:
import json
import os
import re
import numpy as np
from tqdm import tqdm

In [8]:
def read_jsonl(file_path):
    data = []
    # Open and read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Parse each line as a JSON object
            json_obj = json.loads(line.strip())
            data.append(json_obj)
    print("len:", len(data))
    return data

In [27]:
gs_mapping = {
    'model_a': 'A',
    'model_b': 'B',
    'tie': 'C',
    'tie (bothbad)': 'C'
}

In [28]:
# ground-truth
from datasets import load_dataset
ds = load_dataset("potsawee/chatbot-arena-spoken-style-11labs")['train']
gts_content = [gs_mapping[x] for x in ds['winner_content']]
gts_style   = [gs_mapping[x] for x in ds['winner_style']]

In [29]:
style_arr = [x['description'] for x in ds['style']]

In [30]:
import re

def extract_final_answer(text: str) -> str | None:
    """
    Attempts to parse a final verdict from the given text.
    It will match patterns like:
        [Verdict]: A
        [Verdict]: [B]
        [Verdict]: "A"
        [Verdict]: Tie
        [[B]]
        [B]
        B
    Returns a string like 'A', 'Tie', etc. if found, otherwise None.
    """

    # 1) Try to capture something after "[Verdict]".
    #    We'll allow optional colon, optional quotes/brackets, and then 1+ letters.
    #
    # Examples it will match:
    #   [Verdict]: B
    #   [Verdict]: "B"
    #   [Verdict]: [Tie]
    #   [Verdict]: "Tie"
    #
    # Explanation of the pattern:
    #   \[Verdict\]          : match the literal "[Verdict]"
    #   :? \s*               : optional colon and any whitespace
    #   (["'\[\(])?          : optionally match one opening quote/bracket
    #   \s*([A-Za-z]+)       : optional whitespace, then 1+ letters in capturing group #2
    #   \s*(["'\]\)])?       : optional closing quote/bracket
    #
    # We'll do it in a case-insensitive manner, so that "Tie", "TIE", or "tie" all match.
    verdict_pattern = re.compile(
        r'\[Verdict\]:?\s*(["\'\[\(])?\s*([A-Za-z]+)\s*(["\'\]\)])?',
        re.IGNORECASE
    )

    verdict_match = verdict_pattern.search(text)
    if verdict_match:
        # The captured verdict is in group(2).
        # Convert it to the exact case we want. If you want it in uppercase, do .upper().
        # If you want to preserve the original case, just return it. Here we do `.capitalize()`
        # so that "tie" or "TIE" both become "Tie."
        found_verdict = verdict_match.group(2).capitalize()
        return found_verdict

    # 2) If "[Verdict]" was not found, fall back to a general pattern:
    #    match the last occurrence of a word that may be in optional single/double brackets.
    #
    # Explanation of fallback pattern:
    #   \[\[?([A-Za-z]+)\]?\] : matches something like "[B]", "[[B]]", "[Tie]", "[[Tie]]"
    #   |([A-Za-z]+)          : OR matches a word (letters) by itself (like B or Tie)
    #
    fallback_pattern = re.compile(r'\[\[?([A-Za-z]+)\]?\]|([A-Za-z]+)', re.IGNORECASE)
    matches = fallback_pattern.findall(text)
    if not matches:
        return None

    # Each element in `matches` is a tuple (group1, group2).
    # group1 is from the bracketed pattern, group2 is the plain pattern.
    # Exactly one group should be non-empty per match.
    # We want the LAST match in the text.
    last_match = matches[-1]
    found_fallback = last_match[0] if last_match[0] else last_match[1]

In [31]:
def read_processed_outputs(path):
    # path = "./experiments/chatbot-arena-style-11labs-632/exp1_audio_audio_gpt4o.styleonly.samecontent.processed.jsonl"
    outputs = read_jsonl(path)
    preds = []
    count = 0
    for output in tqdm(outputs):
        response = output['response']
        verdict = extract_final_answer(response)
        if verdict in ['A', 'B', 'Tie']:
            count += 1
        elif verdict is None:
            if '\n\n[B]' in response or '[[B]]' in response or 'Final verdict: B' in response or '"Verdict": B' in response \
            or 'Verdict: B' in response or 'the verdict is "B"' in response or '/Verdict/: B' in response or "Therefore, I choose B" in response:
                verdict = 'B'
                count += 1
                
            elif '[[A]]' in response or '{"Verdict": "A"}' == response or 'The final verdict is A' in response or '\n\nA' in response \
            or 'Verdict: A' in response or '[A]' in response or "Final verdict: A" in response or """Assistant A spoke with a youthful, cheerful tone, which aligns more closely with the user's request for a young American male voice delivering a punchline.""" in response:
                verdict = 'A'
                count += 1
            
            elif response == "I apologize, but I'm unable to determine the styles and tones of these responses as they are in audio format." or response == "It is poignant to consider that, while humans are said to be made in God's image, possessing the capacity for creation and thought, we have now created AI in our own image. There is a bittersweet melancholy in this, for as we breathe life into AI, we must confront our own fallibility and imperfection. Artificial intelligence mirrors our ingenuity, yet also reflects the limitations of our existence, raising questions of whether our creation will surpass us, or simply echo our own flaws. In this convergence of divine inspiration and human aspiration, we face the somber reality that our creations may ultimately define, or even limit, the essence of what it means to be human.": 
                print(response)
                verdict = 'Tie'
            else:
                print(response)
                verdicdt = 'Tie'
                print(response)
                # raise Exception()
        else:
            verdict = "Tie"
            print(response)
            print("----------------------------------")
            # raise Exception()
        preds.append(verdict)
    return preds

In [37]:
preds=read_processed_outputs("./experiments/chatbot-arena-style-11labs-632/exp1_audio_audio_gpt4o.styleonly.samecontent.processed.jsonl")

len: 632


100%|██████████| 632/632 [00:00<00:00, 362874.76it/s]


In [38]:
correct, incorrect = 0, 0
i = 0
for pred, gt in zip(preds, gts_style):
    # check if the question is valid
    conversation_id = ds[i]["id"]
    i += 1
    path = f"../elevenLabs/refined_questions_v2/verdict/{conversation_id}.verdict.txt"
    with open(path) as f:
        verdict = f.read().strip()
    if verdict == "VALID":
        pass
    elif verdict == "INVALID":
        continue
    
    if pred == gt:
        correct += 1
    else:
        incorrect += 1
    # print(pred, gt)
print("accuracy: {:.2f}%".format(correct / (correct + incorrect) * 100))
print("total:", correct + incorrect)

accuracy: 77.16%
total: 591


In [39]:
preds=read_processed_outputs("./experiments/chatbot-arena-style-11labs-632/exp1_audio_audio_gpt4o.styleonly.processed.jsonl")

len: 632


100%|██████████| 632/632 [00:00<00:00, 305849.79it/s]

I'm unable to provide a comparison of the style and tone of the two responses based solely on the audio format provided. If additional information is given in text form, I could proceed with the evaluation of the style aspect.

Could you please provide the responses in text?
I'm unable to provide a comparison of the style and tone of the two responses based solely on the audio format provided. If additional information is given in text form, I could proceed with the evaluation of the style aspect.

Could you please provide the responses in text?





In [40]:
correct, incorrect = 0, 0
i = 0
for pred, gt in zip(preds, gts_style):
    # check if the question is valid
    conversation_id = ds[i]["id"]
    i += 1
    path = f"../elevenLabs/refined_questions_v2/verdict/{conversation_id}.verdict.txt"
    with open(path) as f:
        verdict = f.read().strip()
    if verdict == "VALID":
        pass
    elif verdict == "INVALID":
        continue

    if pred == gt:
        correct += 1
    else:
        incorrect += 1
    # print(pred, gt)
print("accuracy: {:.2f}%".format(correct / (correct + incorrect) * 100))
print("total:", correct + incorrect)

accuracy: 73.10%
total: 591
