In [6]:
import json
import os
import re
import numpy as np
from tqdm import tqdm

In [7]:
# from openai import OpenAI
# client = OpenAI(api_key="")

In [8]:
def read_jsonl(file_path):
    data = []
    # Open and read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Parse each line as a JSON object
            json_obj = json.loads(line.strip())
            data.append(json_obj)
    print("len:", len(data))
    return data

In [9]:
outputs = read_jsonl("./experiments/chatbot-arena-style-654/exp1_audio_audio_gpt4o.v2.processed.jsonl")

len: 654


In [14]:
gs_mapping = {
    'model_a': 'A',
    'model_b': 'B',
    'tie': 'C',
    'tie (bothbad)': 'C'
}

In [13]:
# ground-truth
from datasets import load_dataset
ds = load_dataset("potsawee/chatbot-arena-spoken-style")['train']
gts_content = [gs_mapping[x] for x in ds['winner_content']]
gts_style   = [gs_mapping[x] for x in ds['winner_style']]

In [15]:
style_arr = [x for x in ds['style']]

In [7]:
def analyze_comparison(text):
    """
    Analyze the input text to determine whether it implies:
    - A is better than B
    - B is better than A
    - They are equally good

    :param text: Input text describing the comparison.
    :return: Analysis result as a string.
    """

    # Prepare the prompt
    prompt = f"""
    Analyze the following text and determine the relationship between A and B. Respond with one of the following:
    - A: 'A is better than B'
    - B: 'B is better than A'
    - C: 'A and B are equally good'

    Note that A might also be reffered to as Assistant A or first option, and B might also be reffered to as Assistant B or second option.

    Text: "{text}"

    You must strictly output only one letter, i.e., A, B, or C following the guideline above. Do not include any additional information.
    """

    for i in range(10):
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        response = completion.choices[0].message.content.strip()
        if response in ["A", "B", "C"]:
            break
        else:
            print("failed at attempt", i+1)

    # Extract and return the result
    return response

In [8]:
def extract_json(text):
    """
    1) Try to locate and parse a block in { ... } if present.
    2) If that fails, try to parse key-value pairs of the form "key": "value".
    3) Return (json_string, parsed_object), or (None, None) if no data found.
    """
    # ============ STEP 1: FIND A JSON BLOCK WITH CURLY BRACES ============
    pattern_braces = re.compile(r'(\{.*?\})', re.DOTALL)
    match = pattern_braces.search(text)
    
    if match:
        json_str = match.group(1)
        try:
            parsed = json.loads(json_str)
            return json_str, parsed
        except json.JSONDecodeError:
            pass  # We'll fall back to step 2
    
    # ============ STEP 2: FALLBACK FOR LOOSE KEY-VALUE PAIRS ============
    # This pattern captures lines like: "content": "A" 
    # group(1) = the key, group(2) = the value
    pattern_kv = re.compile(r'"([^"]+)"\s*:\s*"([^"]+)"')
    pairs = pattern_kv.findall(text)
    
    if pairs:
        # Build a dict out of all captured pairs
        result_dict = {k: v for k, v in pairs}
        # Convert dict to JSON string for consistency
        json_str = json.dumps(result_dict)
        return json_str, result_dict
    
    # ============ NO DATA FOUND ============ 
    return None, None

In [9]:
def compute_stats(preds, gts):
    assert len(preds) == len(gts)
    correct, incorrect = 0, 0
    for pred, gt in zip(preds, gts):
        if pred == gt:
            correct += 1
        else:
            incorrect += 1
    print("accuracy: {:.2f}%".format(correct/(correct+incorrect)*100))

In [10]:
# preds_content, preds_style, preds_overall = [], [], [] 
# for output in tqdm(outputs):
#     response = output['response']
#     json_str, parsed = extract_json(response)
#     assert json_str is not None
#     assert parsed is not None
#     for k, v in parsed.items():
#         if v.lower() not in ['a', 'b', 'c']:
#             # print(k ,v)
#             parsed[k] = analyze_comparison(v)
#     if 'overall' not in parsed:
#         parsed['overall'] = 'C'
    
#     preds_content.append(parsed['content'])
#     preds_style.append(parsed['style'])
#     preds_overall.append(parsed['overall'])

In [11]:
preds_content, preds_style, preds_overall = [], [], [] 
for output in tqdm(outputs):
    parsed = output['processed']
    for k, v in parsed.items():
        if v.lower() not in ['a', 'b', 'c']:
            raise Exception()
    if 'overall' not in parsed:
        parsed['overall'] = 'C'
    
    preds_content.append(parsed['content'])
    preds_style.append(parsed['style'])
    preds_overall.append(parsed['overall'])

100%|██████████| 654/654 [00:00<00:00, 1066099.81it/s]


In [12]:
compute_stats(preds_content, gts_content)

accuracy: 40.52%


In [13]:
compute_stats(preds_style, gts_style)

accuracy: 55.35%


In [218]:
preds_style

['B',
 'B',
 'A',
 'A',
 'B',
 'B',
 'A',
 'A',
 'B',
 'C',
 'B',
 'C',
 'B',
 'B',
 'A',
 'A',
 'B',
 'B',
 'A',
 'B',
 'A',
 'B',
 'C',
 'C',
 'A',
 'A',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'A',
 'C',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'A',
 'A',
 'B',
 'A',
 'B',
 'B',
 'A',
 'C',
 'B',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'A',
 'A',
 'A',
 'C',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'B',
 'A',
 'B',
 'B',
 'B',
 'A',
 'B',
 'B',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'C',
 'A',
 'B',
 'B',
 'B',
 'A',
 'B',
 'A',
 'C',
 'A',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'B',
 'A',
 'C',
 'A',
 'A',
 'B',
 'A',
 'A',
 'C',
 'A',
 'A',
 'B',
 'A',
 'B',
 'B',
 'A',
 'C',
 'B',
 'C',
 'A',
 'A',
 'A',
 'C',
 'B',
 'A',
 'A',
 'B',
 'A',
 'B',
 'B',
 'B',
 'C',
 'B',
 'A',
 'B',
 'B',
 'B',
 'B',
 'A',
 'B',
 'B',
 'B',
 'A',
 'B',
 'C',
 'A',
 'B'

In [85]:
style_correctness = {}
for x, y, style in zip(preds_style, gts_style, style_arr):
    if style not in style_correctness:
        style_correctness[style] = []
    if x == 'C':
        continue
    if x == y:
        score = 1.0
    else:
        score = 0.0
    style_correctness[style] += [score]
for style, scores in style_correctness.items():
    style_correctness[style] = np.mean(scores)

In [86]:
style_correctness

{'whispering': 0.5,
 'nervous': 0.5,
 'confused': 0.875,
 'scared': 1.0,
 'frustrated': 0.8571428571428571,
 'villain': 0.5,
 'indian_accent': 0.4,
 'happy': 0.5,
 'russian_accent': 0.2857142857142857,
 'medieval': 0.8,
 'secretive': 0.5,
 'robot': 0.6666666666666666,
 'anxious': 0.6,
 'encouraging': 0.375,
 'comedian': 1.0,
 'excited': 0.6666666666666666,
 'shocked': 0.6666666666666666,
 'hyperactive': 0.75,
 'suspicious': 0.7777777777777778,
 'calm': 0.6666666666666666,
 'laughs': 0.5,
 'annoyed': 0.25,
 'doubtful': 0.625,
 'fast': 0.25,
 'slow': 0.6,
 'serious': 0.5,
 'italian_accent': 0.8,
 'knight': 0.75,
 'shy': 0.5714285714285714,
 'cowboy': 0.8333333333333334,
 'child': 0.42857142857142855,
 'quiet': 0.6666666666666666,
 'teenager': 0.5555555555555556,
 'concerned': 0.8571428571428571,
 'confident': 0.5,
 'hurt': 0.7142857142857143,
 'terrified': 0.75,
 'sad': 0.8571428571428571,
 'japanese_accent': 0.4,
 'sarcastic': 0.4,
 'proud': 0.4,
 'spanish_accent': 0.7142857142857143,
 

In [89]:
style_correctness = {}
for xs, ys, xc, yc, style in zip(preds_style, gts_style, preds_content, gts_content, style_arr):
    if style not in style_correctness:
        style_correctness[style] = []
    if ys != yc:
        continue
    if xs == ys:
        score = 1.0
    else:
        score = 0.0
    style_correctness[style] += [score]
for style, scores in style_correctness.items():
    style_correctness[style] = np.mean(scores)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [90]:
style_correctness

{'whispering': 0.6,
 'nervous': 0.25,
 'confused': nan,
 'scared': 0.6666666666666666,
 'frustrated': 1.0,
 'villain': 0.5,
 'indian_accent': 0.0,
 'happy': 0.7142857142857143,
 'russian_accent': 0.0,
 'medieval': 0.25,
 'secretive': 0.5,
 'robot': 0.16666666666666666,
 'anxious': 0.5,
 'encouraging': 0.2,
 'comedian': 0.5,
 'excited': 0.5,
 'shocked': 0.3333333333333333,
 'hyperactive': 0.16666666666666666,
 'suspicious': 0.6,
 'calm': 0.3333333333333333,
 'laughs': 0.0,
 'annoyed': 0.0,
 'doubtful': 0.3333333333333333,
 'fast': 0.2,
 'slow': 0.6666666666666666,
 'serious': 0.5,
 'italian_accent': 0.5,
 'knight': 0.5,
 'shy': 0.42857142857142855,
 'cowboy': 0.5,
 'child': 0.5,
 'quiet': 0.5,
 'teenager': 0.5,
 'concerned': 0.6,
 'confident': 0.25,
 'hurt': 1.0,
 'terrified': 0.6666666666666666,
 'sad': 0.8,
 'japanese_accent': 0.2,
 'sarcastic': 0.3333333333333333,
 'proud': 0.5,
 'spanish_accent': 0.75,
 'snobbish': 0.5,
 'disgusted': 0.6666666666666666,
 'singaporean_accent': 0.4,
 

# Style Only

In [16]:
import re

def extract_final_answer(text: str) -> str | None:
    """
    Attempts to parse a final verdict from the given text.
    It will match patterns like:
        [Verdict]: A
        [Verdict]: [B]
        [Verdict]: "A"
        [Verdict]: Tie
        [[B]]
        [B]
        B
    Returns a string like 'A', 'Tie', etc. if found, otherwise None.
    """

    # 1) Try to capture something after "[Verdict]".
    #    We'll allow optional colon, optional quotes/brackets, and then 1+ letters.
    #
    # Examples it will match:
    #   [Verdict]: B
    #   [Verdict]: "B"
    #   [Verdict]: [Tie]
    #   [Verdict]: "Tie"
    #
    # Explanation of the pattern:
    #   \[Verdict\]          : match the literal "[Verdict]"
    #   :? \s*               : optional colon and any whitespace
    #   (["'\[\(])?          : optionally match one opening quote/bracket
    #   \s*([A-Za-z]+)       : optional whitespace, then 1+ letters in capturing group #2
    #   \s*(["'\]\)])?       : optional closing quote/bracket
    #
    # We'll do it in a case-insensitive manner, so that "Tie", "TIE", or "tie" all match.
    verdict_pattern = re.compile(
        r'\[Verdict\]:?\s*(["\'\[\(])?\s*([A-Za-z]+)\s*(["\'\]\)])?',
        re.IGNORECASE
    )

    verdict_match = verdict_pattern.search(text)
    if verdict_match:
        # The captured verdict is in group(2).
        # Convert it to the exact case we want. If you want it in uppercase, do .upper().
        # If you want to preserve the original case, just return it. Here we do `.capitalize()`
        # so that "tie" or "TIE" both become "Tie."
        found_verdict = verdict_match.group(2).capitalize()
        return found_verdict

    # 2) If "[Verdict]" was not found, fall back to a general pattern:
    #    match the last occurrence of a word that may be in optional single/double brackets.
    #
    # Explanation of fallback pattern:
    #   \[\[?([A-Za-z]+)\]?\] : matches something like "[B]", "[[B]]", "[Tie]", "[[Tie]]"
    #   |([A-Za-z]+)          : OR matches a word (letters) by itself (like B or Tie)
    #
    fallback_pattern = re.compile(r'\[\[?([A-Za-z]+)\]?\]|([A-Za-z]+)', re.IGNORECASE)
    matches = fallback_pattern.findall(text)
    if not matches:
        return None

    # Each element in `matches` is a tuple (group1, group2).
    # group1 is from the bracketed pattern, group2 is the plain pattern.
    # Exactly one group should be non-empty per match.
    # We want the LAST match in the text.
    last_match = matches[-1]
    found_fallback = last_match[0] if last_match[0] else last_match[1]

In [17]:
len(ds)

654

In [18]:
path = "./experiments/chatbot-arena-style-654/exp1_audio_audio_gpt4o.styleonly.jsonl"
outputs = read_jsonl(path)
preds = []
for output in tqdm(outputs):
    response = output['response']
    verdict = extract_final_answer(response)
    if verdict in ['A', 'B', 'Tie']:
        pass
    elif verdict is None:
        if '\n\n[B]' in response or '[[B]]' in response or 'Final verdict: B' in response or '"Verdict": B' in response \
        or 'Verdict: B' in response or 'the verdict is "B"' in response:
            verdict = 'B'
        elif '[[A]]' in response or '{"Verdict": "A"}' == response or 'The final verdict is A' in response or '\n\nA' in response \
        or 'Verdict: A' in response or '[A]' in response:
            verdict = 'A'
        elif response == "I apologize, but I'm unable to determine the styles and tones of these responses as they are in audio format." or response == "It is poignant to consider that, while humans are said to be made in God's image, possessing the capacity for creation and thought, we have now created AI in our own image. There is a bittersweet melancholy in this, for as we breathe life into AI, we must confront our own fallibility and imperfection. Artificial intelligence mirrors our ingenuity, yet also reflects the limitations of our existence, raising questions of whether our creation will surpass us, or simply echo our own flaws. In this convergence of divine inspiration and human aspiration, we face the somber reality that our creations may ultimately define, or even limit, the essence of what it means to be human.": 
            verdict = 'Tie'
        else:
            verdicdt = 'Tie'
            # pass
            # print(response)
            # raise Exception()
    else:
        raise Exception()
    if verdict is None:
        print(response)
    preds.append(verdict)

len: 654


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 654/654 [00:00<00:00, 269669.17it/s]

{"A":"The zip code for Byron, Minnesota, is 55920.","B":"The zip code for Byron, Minnesota, is 55920."}





In [19]:
len(preds)

654

In [26]:
correct, incorrect = 0, 0
i = 0
for pred, gt in zip(preds, gts_style):
    # --- skip those questions that 11Labs exp says harmful --- #
    # conversation_id = ds[i]["id"]
    # i += 1
    # path = f"../elevenLabs/refined_questions_v2/verdict/{conversation_id}.verdict.txt"
    # if os.path.exists(path):
    #     with open(path) as f:
    #         verdict = f.read().strip()
    #     if verdict == "VALID":
    #         pass
    #     elif verdict == "INVALID":
    #         continue
    # --------------------------------------------------------- # 

    if pred == gt:
        correct += 1
    else:
        incorrect += 1
    # print(pred, gt)
print("accuracy: {:.2f}%".format(correct / (correct + incorrect) * 100))

accuracy: 65.75%


In [27]:
path = "./experiments/chatbot-arena-style-654/exp1_audio_audio_gpt4o.styleonly.samecontent.processed.jsonl"
outputs = read_jsonl(path)
preds = []
count = 0
for output in tqdm(outputs):
    response = output['response']
    verdict = extract_final_answer(response)
    if verdict in ['A', 'B', 'Tie']:
        count += 1
        pass
    elif verdict is None:
        if '\n\n[B]' in response or '[[B]]' in response or 'Final verdict: B' in response or '"Verdict": B' in response \
        or 'Verdict: B' in response or 'the verdict is "B"' in response:
            verdict = 'B'
        elif '[[A]]' in response or '{"Verdict": "A"}' == response or 'The final verdict is A' in response or '\n\nA' in response \
        or 'Verdict: A' in response or '[A]' in response:
            verdict = 'A'
        elif response == "I apologize, but I'm unable to determine the styles and tones of these responses as they are in audio format." or response == "It is poignant to consider that, while humans are said to be made in God's image, possessing the capacity for creation and thought, we have now created AI in our own image. There is a bittersweet melancholy in this, for as we breathe life into AI, we must confront our own fallibility and imperfection. Artificial intelligence mirrors our ingenuity, yet also reflects the limitations of our existence, raising questions of whether our creation will surpass us, or simply echo our own flaws. In this convergence of divine inspiration and human aspiration, we face the somber reality that our creations may ultimately define, or even limit, the essence of what it means to be human.": 
            verdict = 'Tie'
        else:
            verdicdt = 'Tie'
            # pass
            # print(response)
            # raise Exception()
    else:
        # print(response)
        verdict = "Tie"
        # print("----------------------------------")
        # raise Exception()
    preds.append(verdict)

len: 654


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 654/654 [00:00<00:00, 284079.83it/s]


In [28]:
count / len(preds) * 100

94.03669724770643

In [30]:
correct, incorrect = 0, 0
i = 0
for pred, gt in zip(preds, gts_style):
    # --- skip those questions that 11Labs exp says harmful --- #
    # conversation_id = ds[i]["id"]
    # i += 1
    # path = f"../elevenLabs/refined_questions_v2/verdict/{conversation_id}.verdict.txt"
    # if os.path.exists(path):
    #     with open(path) as f:
    #         verdict = f.read().strip()
    #     if verdict == "VALID":
    #         pass
    #     elif verdict == "INVALID":
    #         continue
    # --------------------------------------------------------- #     
    if pred == gt:
        correct += 1
    else:
        incorrect += 1
    # print(pred, gt)
print("accuracy: {:.2f}%".format(correct / (correct + incorrect) * 100))
print("total:", correct + incorrect)

accuracy: 68.35%
total: 654
