In [5]:
import json
import os
import random
import re
from typing import List

import numpy as np
import pandas as pd
from openai import AzureOpenAI
from tqdm import tqdm

from call_api import *

This notebook evaluates the quality of model responses using the LLM-as-a-judge method. The judge LLM scores each response on a scale of 1 to 6 according to a rubric.

In [2]:


# CSV with the evaluation set; the scoring loop reads its 'queries' and 'answers' columns.
input_file_path = 'test_df.csv'
# Placeholder: set this to the JSON file (loaded later with pd.read_json) that holds
# the responses of the model being judged.
qwen_response_file = 'put response file from model here'
data = pd.read_csv(input_file_path)

# Preview the first rows of the evaluation set.
data.head()

Unnamed: 0.1,Unnamed: 0,queries,references,answers
0,0,When was the Electoral Law of the National Peo...,Electoral Law of the National People's Congres...,The Electoral Law was adopted at the Second Se...
1,1,Who has the right to vote and stand for election?,Persons who have been deprived of political ri...,Persons who have not been deprived of politica...
2,2,What is the purpose of the election committees?,"In cities divided into districts, municipal di...",The purpose of the election committees is to c...
3,3,What is the base number of deputies to the peo...,(1) The base number of deputies to the people'...,The base number of deputies to the people's co...
4,4,What determines the total number of deputies t...,The base number of deputies to a local people'...,The base number of deputies plus the number of...


In [3]:

# Bilingual (English, then Chinese) judge prompt, filled in with str.format().
# Placeholders: {instruction}, {response}, {reference_answer} -- each appears once
# per language section. Literal braces in the output-format example are doubled
# ({{ }}) so that str.format() leaves them in place.
# Both language sections must describe the same 1-6 scale and the same
# reference-answer score (6); keep them in sync when editing either half.
EVAL_PROMPT = '''### Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 6, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that critically assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 6. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 6}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.
5. The responses and queries will be in English and Chinese, please ignore the differences in languages and do not penalize because of language differences. For example, if the question and the reference answers are in english but the response is in chinese, please do not deduct points from the evaluation.
### The instruction to evaluate:
{instruction}

### Response to evaluate:
{response}

### Reference Answer (Score 6):
{reference_answer}

### Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual. The answer is too complicated and completely unnecessary for this question.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual. The model struggles to keep things clear.
Score 3: The response is somewhat correct, accurate, and/or factual. The factual mistakes sometimes convey a different meaning than the answer. The model may be overly wordy, providing unnecessary or possibly misleading information.
Score 4: The response is mostly correct, accurate, and factual. The model may have a few inaccurate places, but the key parts are all correct. The response is wordy but it does not impede overall understanding.
Score 5: The response is completely correct, accurate, and factual. The model has little to no inaccurate mistakes. The response is mostly clear and there are little to no unrelated information in the response.
Score 6: The response is completely correct, accurate, and factual. The model provides a clear, concise answer that is easy to understand. Only give this score if the response is perfect in all ways.

Answers might be phrased differently from the reference answer, do not penalize if they have the same overall meaning.
Judge by factualism first, and wordiness last.
Add bonus points (the result still should be from 1 to 6) when the model keeps things simple.
### Feedback:

### 任务描述：
给定一个指令（可能包括其中的输入）、一个需要评估的回复、一个评分为6的参考答案以及代表评分标准的评分量表。
1. 请根据给定的评分量表严格评估回复的质量，撰写详细的反馈意见，不要进行一般性的评价。
2. 写完反馈意见后，请给出一个1到6之间的整数分数。请参考评分量表。
3. 输出格式应如下所示：\"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 6}}\"
4. 请不要生成其他开头、结尾或解释。在输出中务必包含 [RESULT]。
5. 回复和查询均会以英语和中文提供，请忽略语言差异，不要因语言差异而扣分。例如，如果问题和参考答案是英文的，而回复是中文的，请不要因语言差异而在评估中扣分。

答案可能与参考答案的表述不同，但如果整体意义相同，请不要扣分。
### 需要评估的指令：
{instruction}

### 需要评估的回复：
{response}

### 参考答案（评分为6）：
{reference_answer}

### 评分量表：
[回复是否根据参考答案正确、准确且真实？]
评分1：回复完全不正确、不准确和/或不真实。答案对于这个问题来说太复杂且完全没有必要。
评分2：回复大部分不正确、不准确和/或不真实。模型难以保持清晰。
评分3：回复部分正确、准确和/或真实。事实性错误有时会传达与答案不同的含义。模型可能过于冗长，提供了不必要或可能误导的信息。
评分4：回复大部分正确、准确和真实。模型可能有一些不准确的地方，但关键部分都是正确的。回复虽然冗长，但并不妨碍整体理解。
评分5：回复完全正确、准确和真实。模型几乎没有或完全没有不准确的错误。回复大体清晰，回复中几乎没有或完全没有不相关的信息。
评分6：回复完全正确、准确和真实。模型提供了清晰、简洁且易于理解的答案。只有当回复在各方面都完美无缺时，才能给出这个分数。

请严格进行打分。
请优先按照准确性打分，比如有的回答过于冗长，但是结果没有问题，请不要太在意。
### 反馈：'''




In [None]:
# Load the responses of the model under test from the JSON file configured in
# `qwen_response_file`; the scoring loop reads the 'response' column by row label.
data_qwen = pd.read_json(qwen_response_file)
data_qwen.head()

In [None]:
# Re-display the first rows of the evaluation set (same preview as after loading).
data.head()

In [None]:


# Compiled once, outside the loop. Matches the "[RESULT] <score>" marker the judge
# is instructed to emit. Tolerates a missing space or a colon after the marker and
# only accepts rubric values (1-6), so "0", "7" or the leading "1" of "10" are not
# mistaken for scores.
_SCORE_PATTERN = re.compile(r'\[RESULT\]\s*:?\s*([1-6])(?!\d)')


def _extract_score(feedback):
    """Return the rubric score (1-6) that follows ``[RESULT]`` in the judge output.

    Args:
        feedback: Raw text returned by the judge model; may be ``None`` or empty.

    Returns:
        The score as an ``int``, or ``None`` when no valid score is present.
    """
    if not feedback:
        return None
    found = _SCORE_PATTERN.search(feedback)
    return int(found.group(1)) if found else None


# Parsed judge scores, in row order. Rows whose judge output has no parsable score
# are skipped, so len(scores) can be smaller than len(data).
scores = []
client = AzureOpenAI(
    # Read the key from the environment so it never has to be pasted into the
    # notebook; the original placeholder is kept as the fallback value.
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "replace with openai API key"),
    azure_endpoint="https://gptforai01.openai.azure.com/",
    api_version=API_VERSION
)
for i in tqdm(data.index, desc='Reading DF'):

    # Response of the model under test for this row. To judge a different model,
    # replace this lookup with a call that answers data['queries'][i].
    test_res_str = data_qwen['response'][i]

    cur_eval_prompt = EVAL_PROMPT.format(
        instruction=data['queries'][i],
        response=test_res_str,
        reference_answer=data['answers'][i]
    )

    # NOTE(review): the judge prompt is sent as the system message with an empty
    # user turn. Kept as-is so scores stay comparable with earlier runs; consider
    # sending it as the user message instead.
    messages = [
        {"role": "user", "content": ""},
        {"role": "system", "content": cur_eval_prompt}
    ]

    res = client.chat.completions.create(model=Deployment.GPT35_16K, messages=messages, max_tokens=4096)
    # `content` is optional in the SDK's response type; normalise a missing value
    # to "" so that parsing reports "Score not found" instead of raising.
    resstr = res.choices[0].message.content or ""

    score = _extract_score(resstr)
    if score is not None:
        scores.append(score)

        # Surface low-scoring rows together with the judge's feedback for manual review.
        if score <= 3:
            print(f"BAD RESPONSE!\nquestion:{data['queries'][i]}\nresponse from model: {test_res_str}\n reference answer: {data['answers'][i]}\nfeedback: {resstr}")
        print(f"Score: {score}, mean: {np.mean(scores)}")
    else:
        print(f'response: {test_res_str}\nanswer: {res}')
        print("Score not found")


print('--------------------')
if scores:
    print(f'Average Score: {np.mean(scores)}')
else:
    # np.mean([]) would print nan and emit a RuntimeWarning; say so explicitly instead.
    print('Average Score: n/a (no scores were parsed)')

In [None]:
# Report how many rows the evaluation set contains.
print(len(data.index))

In [None]:

# Dump the raw judge scores, then their mean and standard deviation
# (np.std with its default arguments).
print(scores)
for label, stat in (('mean', np.mean), ('stdev', np.std)):
    print(f'{label}: {stat(scores)}')