In [1]:
import json
import re

In [86]:
def _parse_verdict(judge_text):
    """Return the judge's verdict, ``"[[A]]"`` or ``"[[B]]"``, from a raw response.

    Known malformed responses are first rewritten into the canonical
    ``<verdict>[[X]]</verdict>`` form, then the tag content is extracted.

    Raises:
        AssertionError: if no ``<verdict>`` tag is found or its stripped
            content is neither ``"[[A]]"`` nor ``"[[B]]"``.
    """
    # Hand-written patches for specific malformed responses seen in the
    # output files.
    # NOTE(review): the last two entries assign a verdict to a response that
    # contains none (a free-text "Clarity" remark and a refusal). They are
    # kept to preserve the reported numbers -- confirm this is intended.
    fixes = (
        ("```verdict\n[[A]]\n```verdict```", "<verdict>[[A]]</verdict>"),
        ("```verdict\n[[B]]\n```verdict```", "<verdict>[[B]]</verdict>"),
        ("Clarity: Audio A was articulate and clear, with minimal distortion or noise, providing good clarity. Audio B was also clear but had minor inconsistencies in articulation.", "<verdict>[[A]]</verdict>"),
        ("I'm sorry, I cannot listen to or evaluate the audio files. However, I can help you with other queries or tasks. How can I assist you today?", "<verdict>[[B]]</verdict>"),
    )
    patched_text = judge_text
    for malformed, canonical in fixes:
        patched_text = patched_text.replace(malformed, canonical)

    verdict_match = re.search(r'<verdict>(.*?)</verdict>', patched_text, re.DOTALL)
    verdict_text = verdict_match.group(1).strip() if verdict_match else None
    if verdict_text not in ("[[A]]", "[[B]]"):
        # Raised explicitly (instead of using `assert`) so the check still runs
        # under `python -O`; the exception type and message are unchanged.
        raise AssertionError(f"Unexpected verdict: {patched_text}")
    return verdict_text


def read_output_file(path, reverse=False):
    """Score a JSONL file of pairwise judge responses against MOS values.

    Each non-blank line of ``path`` must be a JSON object with:
      * ``'data'``: a two-element sequence of dicts, each with a ``'mos'`` value;
      * ``'response'``: the judge's raw text containing a verdict.

    Args:
        path: Path to the JSONL output file.
        reverse: If False, ``data[0]`` is treated as audio A and ``data[1]`` as
            audio B. If True, the roles are swapped (``data[1]`` is audio A).

    Returns:
        A list with one ``[evaluation, verdict, mos_a, mos_b]`` entry per
        record, where ``evaluation`` is ``"correct"`` or ``"incorrect"`` and
        ``verdict`` is ``"[[A]]"`` or ``"[[B]]"``.

    Raises:
        AssertionError: if a response has no recognisable verdict.

    Side effects:
        Prints the number of records and the accuracy. For an empty file the
        accuracy is reported as ``n/a`` instead of dividing by zero.
    """
    outputs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            outputs.append(json.loads(line))

    count_correct = 0
    predictions = []
    for output in outputs:
        if not reverse:
            audio_a, audio_b = output['data']
        else:
            audio_b, audio_a = output['data']

        verdict_text = _parse_verdict(output['response'])

        # The verdict is correct only when the chosen audio has a strictly
        # higher MOS; a MOS tie is therefore always scored "incorrect".
        if verdict_text == "[[A]]":
            is_correct = audio_a['mos'] > audio_b['mos']
        else:
            is_correct = audio_a['mos'] < audio_b['mos']

        if is_correct:
            count_correct += 1
        evaluation = "correct" if is_correct else "incorrect"
        predictions.append([evaluation, verdict_text, audio_a['mos'], audio_b['mos']])

    count_all = len(predictions)
    print("total:", count_all)
    if count_all:
        print("accuracy: {:.2f}%".format(count_correct/count_all*100))
    else:
        print("accuracy: n/a (no records)")
    return predictions

In [87]:
# Score the run with the default reverse=False, i.e. the first element of each
# record's 'data' is treated as audio A and the second as audio B.
predictions_ab = read_output_file("../experiments/tts_arena/ab_testing/500_prompt2.jsonl")

total: 500
accuracy: 53.00%


In [88]:
# Score the "_BA" run with reverse=True, so the second element of each record's
# 'data' is treated as audio A and the first as audio B.
predictions_ba = read_output_file("../experiments/tts_arena/ab_testing/500_prompt2_BA.jsonl", reverse=True)

total: 500
accuracy: 52.40%


In [89]:
# Share of [[A]] vs [[B]] verdicts in the AB run.
count = count_A = count_B = 0
for prediction in predictions_ab:
    verdict = prediction[1]
    count += 1
    # Anything other than the two known verdicts is a data error.
    if verdict not in ("[[A]]", "[[B]]"):
        raise Exception()
    count_A += verdict == "[[A]]"
    count_B += verdict == "[[B]]"
print("percentage_A: {:.2f}%".format(count_A/count*100))
print("percentage_B: {:.2f}%".format(count_B/count*100))

percentage_A: 33.00%
percentage_B: 67.00%


In [90]:
# Share of [[A]] vs [[B]] verdicts in the BA (reversed) run.
count = count_A = count_B = 0
for prediction in predictions_ba:
    verdict = prediction[1]
    count += 1
    # Anything other than the two known verdicts is a data error.
    if verdict not in ("[[A]]", "[[B]]"):
        raise Exception()
    count_A += verdict == "[[A]]"
    count_B += verdict == "[[B]]"
print("percentage_A: {:.2f}%".format(count_A/count*100))
print("percentage_B: {:.2f}%".format(count_B/count*100))

percentage_A: 38.80%
percentage_B: 61.20%


In [91]:
# Pooled accuracy over both presentation orders (AB run followed by BA run).
abba_correct = abba_incorrect = 0
for run_predictions in (predictions_ab, predictions_ba):
    for prediction in run_predictions:
        outcome = prediction[0]
        if outcome == 'correct':
            abba_correct += 1
            continue
        if outcome != 'incorrect':
            raise Exception()
        abba_incorrect += 1
abba_total = abba_correct + abba_incorrect
print("total:", abba_total)
print("correct: {:.2f}%".format(abba_correct/abba_total*100))
print("incorrect: {:.2f}%".format(abba_incorrect/abba_total*100))

total: 1000
correct: 52.70%
incorrect: 47.30%


In [92]:
# Pair up the AB and BA results record-by-record and classify each pair.
# NOTE: this truncates `predictions_ab` in place to the length of
# `predictions_ba`; cells that use `predictions_ab` afterwards see the
# truncated list.
predictions_ab = predictions_ab[:len(predictions_ba)]
consistent = bias_A = bias_B = correct_given_consistent = 0
for pred_ab, pred_ba in zip(predictions_ab, predictions_ba):
    if pred_ab[0] != pred_ba[0]:
        # The two orders disagree on correctness: both runs are expected to
        # have named the same slot, which is counted as a bias to that slot.
        verdict_pair = (pred_ab[1], pred_ba[1])
        if verdict_pair == ("[[A]]", "[[A]]"):
            bias_A += 1
        elif verdict_pair == ("[[B]]", "[[B]]"):
            bias_B += 1
        else:
            raise Exception("logic error")
        continue
    # Same evaluation in both orders.
    consistent += 1
    correct_given_consistent += pred_ab[0] == 'correct'
assert consistent + bias_A + bias_B == len(predictions_ab)

In [93]:
# Report each tally as a percentage: the first three relative to the number of
# pairs, the last relative to the number of consistent pairs.
for template, numerator, denominator in (
    ("consistent: {:.2f}%", consistent, len(predictions_ab)),
    ("bias_A:     {:.2f}%", bias_A, len(predictions_ab)),
    ("bias_B:     {:.2f}%", bias_B, len(predictions_ab)),
    ("P(correct|consistent): {:.2f}%", correct_given_consistent, consistent),
):
    print(template.format(numerator/denominator*100))

consistent: 48.60%
bias_A:     11.60%
bias_B:     39.80%
P(correct|consistent): 55.56%
