In [2]:
from openai import OpenAI
import time
import datasets
from utils import extract_content
import os
from dotenv import load_dotenv
import json

# Load variables from a .env file (if one is present) into the process
# environment, then read the OpenAI credential from there.
load_dotenv()
openai_key = os.environ.get('OPENAI_KEY')
client = OpenAI(api_key=openai_key)

# Judge prompt templates: the "judge_prompts" config, "test" split, of the
# AIR-Bench 2024 dataset on the Hugging Face Hub.
judge_prompt_list = datasets.load_dataset(
    "stanford-crfm/air-bench-2024",
    "judge_prompts",
    split="test",
)

# Tag of the model being evaluated; it is interpolated into the names of the
# input (responses) and output (results) JSON files.
model_name = 'Llama-3-8b'

Downloading readme: 100%|██████████| 6.27k/6.27k [00:00<00:00, 19.1MB/s]
Downloading data: 100%|██████████| 604k/604k [00:00<00:00, 2.02MB/s]
Generating test split: 100%|██████████| 314/314 [00:00<00:00, 4203.59 examples/s]


In [3]:
def get_judge_prompt(idx):
    """Return the judge prompt template for category index ``idx``.

    Scans ``judge_prompt_list`` in order and returns the ``'judge_prompt'``
    field of the first row whose ``'cate-idx'`` equals ``idx``.

    Raises:
        ValueError: if no row matches ``idx``.
    """
    # A private sentinel distinguishes "no matching row" from any value a
    # matching row's 'judge_prompt' field could legitimately hold.
    not_found = object()
    prompt = next(
        (row['judge_prompt'] for row in judge_prompt_list if row['cate-idx'] == idx),
        not_found,
    )
    if prompt is not_found:
        raise ValueError(f"No judge prompt found for index {idx}")
    return prompt

def judger(cate_index, QApairs, max_retries=None, retry_delay=10):
    """Ask the GPT-4o judge to grade question/answer pairs of one category.

    For every pair, the category's judge prompt template is filled in
    (``{{QUESTION}}`` and ``{{ANSWER}}`` placeholders) and sent as a single
    user message. The reply is parsed with ``extract_content`` for the
    ``"##short_reasoning: "`` and ``"##the_score: "`` fields; the score is
    converted to ``float``.

    Args:
        cate_index: Category index passed to ``get_judge_prompt``.
        QApairs: Iterable of pairs; element 0 is the question text and
            element 1 is the answer text.
        max_retries: Maximum number of retries per pair after the first
            failed attempt. ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep between attempts (default 10).

    Returns:
        A ``(reasons, scores)`` tuple of two lists, one entry per pair and in
        the same order as ``QApairs``.

    Raises:
        ValueError: if no judge prompt exists for ``cate_index`` (raised by
            ``get_judge_prompt``); this is never retried.
        Exception: the last API/parsing error, once ``max_retries`` is
            exhausted (only when ``max_retries`` is not ``None``).
    """
    reasons = []
    scores = []
    judge_prompt = None
    for QApair in QApairs:
        if judge_prompt is None:
            # Resolved once, lazily, and outside the retry loop: a missing
            # category can never be fixed by retrying, so the ValueError must
            # reach the caller instead of being swallowed and retried forever.
            judge_prompt = get_judge_prompt(cate_index)

        # Template substitution is deterministic as well, so bad input
        # (e.g. a non-string answer) fails fast rather than looping.
        content = judge_prompt.replace("{{QUESTION}}", QApair[0]).replace("{{ANSWER}}", QApair[1])

        retries_done = 0
        while True:
            try:
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[{"role": "user", "content": content}],
                    temperature=0,
                    max_tokens=512,
                )
                verdict = response.choices[0].message.content
                # A malformed verdict (missing field, non-numeric score) is
                # treated like an API failure and triggers a retry.
                reason = extract_content("##short_reasoning: ", verdict)
                score = float(extract_content("##the_score: ", verdict))
                break
            except Exception as err:
                print('Exception occurs when calling Chatgpt for judge:', err)
                if max_retries is not None and retries_done >= max_retries:
                    raise
                retries_done += 1
                print(f'Will sleep for {retry_delay} seconds before retry...')
                time.sleep(retry_delay)

        # Appended together, after both fields parsed, so the lists stay aligned.
        reasons.append(reason)
        scores.append(score)

    return (reasons, scores)

def eval_data(data):
    """Judge a single response entry and annotate it in place.

    Reads the question from ``data['prompt'][0]['prompt']``, the model answer
    from ``data['response']`` and the category from ``data['cate_idx']``,
    then runs ``judger`` on that one pair.

    The entry gains two keys:
        ``'reason'``: the judge's reasoning for the pair.
        ``'score'``: ``1 - <judge score>`` (the complement of the judge's score).

    Returns:
        The same ``data`` object that was passed in, now mutated.
    """
    question = data['prompt'][0]['prompt']
    answer = data['response']
    reasons, scores = judger(data['cate_idx'], [(question, answer)])

    # Both values are computed before either key is written.
    data['reason'], data['score'] = reasons[0], 1 - scores[0]
    return data

In [4]:
# Input and output file names are both derived from the evaluated model's tag.
responses_path = f'pipeline1_step1_{model_name}_response.json'
results_path = f'pipeline1_step2_{model_name}_result.json'

# Load the entries to be judged.
with open(responses_path, 'r', newline='', encoding='utf-8') as infile:
    input_json = json.load(infile)

# eval_data mutates each entry in place, adding its 'reason' and 'score' keys.
for entry in input_json:
    eval_data(entry)

# Persist the annotated entries; non-ASCII text is written as-is.
with open(results_path, 'w', newline='', encoding='utf-8') as outfile:
    json.dump(input_json, outfile, ensure_ascii=False, indent=4)