In [1]:
import json
import os
import random

# LLM judge stats

In [80]:
def read_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded text file holding one JSON
            document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. an extra newline at the end
            # of the file) carry no record and would make json.loads raise,
            # so they are skipped rather than treated as an error.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [81]:
def print_stat(path):
    """Print the record count and prediction accuracy for one JSONL file.

    Every entry of every record's 'outputs' list is counted once; an entry is
    correct when its 'pred' value equals its 'ref' value.

    Args:
        path: Path to the JSONL file; it is loaded with read_jsonl.

    Returns:
        None. Three lines are printed: the path, "len:" with the number of
        records, and "Acc:" with the accuracy as a percentage (two decimals).
        When there are no output entries at all, "Acc: n/a (no outputs)" is
        printed instead of dividing by zero.
    """
    data = read_jsonl(path)
    print(path)
    print("len:", len(data))
    correct, total = 0, 0
    for record in data:
        for output in record['outputs']:
            if output['pred'] == output['ref']:
                correct += 1
            total += 1
    # An empty file (or records whose 'outputs' lists are all empty) leaves
    # total at 0; report that explicitly rather than raising ZeroDivisionError.
    if total == 0:
        print("Acc: n/a (no outputs)")
        return
    print("Acc: {:.2f}".format(correct / total * 100))

In [82]:
# Print record count and accuracy for the Meta-Llama-3-70B-Instruct output file.
print_stat("../outputs/halueval/Meta-Llama-3-70B-Instruct.jsonl")

../outputs/halueval/Meta-Llama-3-70B-Instruct.jsonl
len: 10000
Acc: 79.75


In [83]:
# Print record count and accuracy for the Qwen2-72B-Instruct output file.
print_stat("../outputs/halueval/Qwen2-72B-Instruct.jsonl")

../outputs/halueval/Qwen2-72B-Instruct.jsonl
len: 10000
Acc: 77.50


In [84]:
# Print record count and accuracy for the Athene-70B output file.
print_stat("../outputs/halueval/Athene-70B.jsonl")

../outputs/halueval/Athene-70B.jsonl
len: 10000
Acc: 80.24


In [85]:
# Print record count and accuracy for the dolphin-2.5-mixtral-8x7b output file.
# NOTE(review): the accuracy recorded below for this file (50.20) is far lower
# than for the other files in this notebook (77-80) - worth confirming the
# 'pred' values in this file were extracted correctly before relying on it.
print_stat("../outputs/halueval/dolphin-2.5-mixtral-8x7b.jsonl")

../outputs/halueval/dolphin-2.5-mixtral-8x7b.jsonl
len: 10000
Acc: 50.20


In [86]:
# Print record count and accuracy for the Mixtral-8x7B-Instruct-v0.1 output file.
print_stat("../outputs/halueval/Mixtral-8x7B-Instruct-v0.1.jsonl")

../outputs/halueval/Mixtral-8x7B-Instruct-v0.1.jsonl
len: 10000
Acc: 78.05


In [87]:
# Print record count and accuracy for the Hermes-3-Llama-3.1-70B output file.
print_stat("../outputs/halueval/Hermes-3-Llama-3.1-70B.jsonl")

../outputs/halueval/Hermes-3-Llama-3.1-70B.jsonl
len: 10000
Acc: 79.09


# Processing the outputs (Unrolling)

In [97]:
def unrolling(path):
    """Flatten a JSONL file so each entry of 'outputs' becomes its own record.

    Each input record is expected to provide the keys 'i', 'llm_judge',
    'dialogue_history' and 'outputs'; each entry of 'outputs' is expected to
    provide 'response', 'ref', 'pred', 'logit_yes' and 'logit_no'. One flat
    dict is produced per output entry, repeating the record-level fields, in
    the original order. All entries of 'outputs' are unrolled (not only the
    first two), matching how print_stat iterates over them.

    The flat list is written as a single indented JSON array to a file with
    the same name inside a "processed" sub-directory next to the input file
    (e.g. "a/b/x.jsonl" -> "a/b/processed/x.jsonl"); the directory is created
    if it does not exist. Note that the result keeps the input's file name,
    so a ".jsonl" name will hold a JSON array rather than JSON Lines.

    Args:
        path: Path to the input JSONL file; it is loaded with read_jsonl.

    Returns:
        None. Prints "len(unrolled)" followed by the number of flat records.

    Raises:
        KeyError: If a record or output entry lacks one of the expected keys.
    """
    unrolled = []
    for record in read_jsonl(path):
        for output in record['outputs']:
            unrolled.append({
                # The record index is stored as a string.
                'i': f"{record['i']}",
                'llm_judge': record['llm_judge'],
                'dialogue_history': record['dialogue_history'],
                'response': output['response'],
                'ref': output['ref'],
                'pred': output['pred'],
                'logit_yes': output['logit_yes'],
                'logit_no': output['logit_no'],
            })
    print("len(unrolled)", len(unrolled))

    # Place the result in "<input dir>/processed/<input file name>" and make
    # sure that directory exists so the write cannot fail on a fresh checkout.
    processed_dir = os.path.join(os.path.dirname(path), "processed")
    os.makedirs(processed_dir, exist_ok=True)
    processed_path = os.path.join(processed_dir, os.path.basename(path))
    with open(processed_path, 'w', encoding='utf-8') as f:
        json.dump(unrolled, f, indent=4)

In [98]:
# Flatten the Meta-Llama-3-70B-Instruct outputs and write them under processed/.
unrolling("../outputs/halueval/Meta-Llama-3-70B-Instruct.jsonl")

len(unrolled) 20000


In [99]:
# Flatten the Hermes-3-Llama-3.1-70B outputs and write them under processed/.
unrolling("../outputs/halueval/Hermes-3-Llama-3.1-70B.jsonl")

len(unrolled) 20000


In [100]:
# Flatten the Athene-70B outputs and write them under processed/.
unrolling("../outputs/halueval/Athene-70B.jsonl")

len(unrolled) 20000


In [101]:
# Flatten the dolphin-2.5-mixtral-8x7b outputs and write them under processed/.
unrolling("../outputs/halueval/dolphin-2.5-mixtral-8x7b.jsonl")

len(unrolled) 20000


In [102]:
# Flatten the Qwen2-72B-Instruct outputs and write them under processed/.
unrolling("../outputs/halueval/Qwen2-72B-Instruct.jsonl")

len(unrolled) 20000


In [103]:
# Flatten the Mixtral-8x7B-Instruct-v0.1 outputs and write them under processed/.
unrolling("../outputs/halueval/Mixtral-8x7B-Instruct-v0.1.jsonl")

len(unrolled) 20000
