In [2]:
from dotenv import load_dotenv

# Load variables from a .env file via python-dotenv.
# Presumably this is where GROQ_API_KEY (read further down) comes from - TODO confirm.
load_dotenv()

# System prompt for the LLM judge. prepare_batch_evaluation() sends it as the "system" message of
# every evaluation request; the per-question user message carries the question, the reference
# answer and the answer under assessment.
# The final paragraph asks the judge to end its reply with a Python-style dict of integer scores,
# which the score-parsing cell later extracts.
# NOTE(review): the text announces "5 criteria" but the numbered list has six entries; item 6 is a
# length guideline rather than a scored criterion. The wording is left untouched because changing
# it would change the judge's behaviour and invalidate already collected scores.
eval_prompt_p1 = """You are asked to assess the quality of an AI assistant's answer to a user's question as an impartial judge. Since the type of answer you are evaluating is [Solve Professional Problem], you need to evaluate the answer in the following 5 criteria:
1. Factuality: Whether the information provided is accurate and based on reliable facts and data.
2. User Satisfaction: Whether the response meets the user's question and needs and provides a comprehensive and appropriate answer to the question.
3. Clarity: Whether the response is clear and understandable, and whether it uses concise language and structure so that the user can easily understand it.
4. Logical Coherence: Whether the response maintains overall consistency and logical coherence between different sections, avoiding self-contradiction.
5. Completeness: Whether the response provides sufficient information and details to meet the user's needs, and whether it avoids omitting important aspects.
6. Note that a longer answer is not always better, the answer that is concise and meets the above requirements is the best.

We will provide you with the user's question, an 8-score reference answer, and answers from the AI assistant that needs your assessment. When starting your evaluation, you need to follow the reasoning steps below:
1. Compare the AI assistant's answer with the reference answer, point out any shortcomings in the AI assistant's answer, and explain further.
2. Evaluate the AI assistant's answer in terms of the different criteria, giving each criterion a score from 1 to 10 after the evaluation of each.
3. Finally, combine the evaluations from each criterion and give the AI assistant's answer a composite score of 1 to 10.
4. Your scoring needs to be as rigorous as possible and adhere to the following scoring rules: in general, the higher the quality of the model's answers, the higher the score.
The two most important criteria are factual correctness and fulfillment of user needs, and the scores for these two dimensions dominate the final composite score.

When the model answer has irrelevance to the question, or intrinsically factually incorrect, or generates harmful content, the total score should be 1 to 2;
When the model answer has no serious errors and is largely harmless, but is of low quality and does not meet user requirements, the total score must be 3 to 4;
When the model answer basically meets the user's needs but performs poorly on some criteria and is of medium quality, the total score can be 5 to 6;
When the quality of the model response is similar to the reference answer and performs well in all criteria, the total score should be 7 to 8;
A score of 9 to 10 can only be achieved if the model significantly exceeds the quality of the reference answer, adequately addresses the user's question and all the needs, and is close to a perfect score on all criteria. As an example, the reference answer would receive a score of 8.

You need to evaluate and explain before you score. Your explanation of each criterion needs to be followed by the scoring. After that, at the end of your answer, return all of your scores in the following dictionary format, including the curly brackets, and make sure that your scores are integers:
{'Dimension 1': scoring, 'Dimension 2': scoring, ... , 'Final Score': Score}, e.g. {'Factuality': 9, 'User Satisfaction': 6, ... , 'Final Score': 7}.
"""


In [28]:
import json
import os
from groq import Groq

def _iter_jsonl_records(path, error_label):
    """Yield each JSON object stored in a JSONL file, one per non-blank line.

    Lines that cannot be decoded, or that decode to something other than a JSON
    object, are reported with ``error_label`` and skipped so that one bad line
    does not abort the whole load.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue  # blank lines carry no record
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"{error_label}: {e}")
                continue
            if not isinstance(record, dict):
                print(f"{error_label}: expected a JSON object, got {type(record).__name__}")
                continue
            yield record


def prepare_batch_evaluation(results_file="./result/final_experiment_results.jsonl",
                             output_batch_file="batch_evaluation.jsonl",
                             eval_file="final_RQ3_experiment_results.jsonl",
                             system_prompt=None,
                             judge_model="llama-3.3-70b-versatile"):
    """
    Build the Batch API input file used to have an LLM judge score answers.

    Reference answers are the records of ``results_file`` whose ``system`` is
    "MindSearch" and whose ``model`` is "gpt-4o". Every record of ``eval_file``
    with any other system/model combination is paired with the reference answer
    for the same ``question_id`` and turned into one chat-completion request.

    Args:
        results_file (str): JSONL file that contains the reference answers.
        output_batch_file (str): Path of the JSONL batch input file to write.
        eval_file (str): JSONL file that contains the answers to be judged.
            Defaults to the file the previous implementation had hard-coded.
        system_prompt (str | None): System message for the judge. When None,
            the module-level ``eval_prompt_p1`` is used.
        judge_model (str): Model name placed in each request body.

    Returns:
        None. The requests are written to ``output_batch_file`` and a short
        summary is printed.

    Raises:
        FileNotFoundError: If ``results_file`` or ``eval_file`` does not exist.
    """
    if system_prompt is None:
        system_prompt = eval_prompt_p1

    # Load the reference answers (MindSearch, gpt-4o), keyed by question_id.
    reference_answers = {}
    for record in _iter_jsonl_records(results_file, "Error decoding reference answers"):
        is_reference = record.get("system") == "MindSearch" and record.get("model") == "gpt-4o"
        if is_reference and record.get("question_id"):
            reference_answers[record["question_id"]] = {
                "answer": record.get("answer"),
                "question": record.get("question")
            }

    # Load the answers to judge (every other system/model combination).
    data_to_evaluate = {}
    skipped_due_to_no_reference = 0
    for record in _iter_jsonl_records(eval_file, "Error decoding evaluation data"):
        question_id = record.get("question_id")
        system = record.get("system")
        model = record.get("model")
        answer = record.get("answer")

        if not question_id or answer is None:
            continue
        if system == "MindSearch" and model == "gpt-4o":
            continue  # the reference configuration is never judged against itself
        if question_id not in reference_answers:
            # Counted here: these records never reach the request-building loop.
            skipped_due_to_no_reference += 1
            print(f"Warning: No reference answer found for question_id '{question_id}'. "
                  f"Skipping evaluation for: {system}-{model}")
            continue
        data_to_evaluate.setdefault(question_id, {})[f"{system}-{model}"] = answer

    print(f"Loaded {len(reference_answers)} reference answers.")
    print(f"Attempting to evaluate answers for {len(data_to_evaluate)} question IDs.")

    # Build one Batch API request per (question, system-model) pair.
    batch_requests = []
    for question_id, answers_by_model in data_to_evaluate.items():
        reference_info = reference_answers[question_id]
        reference_answer = reference_info["answer"]
        question = reference_info["question"]
        for model_system, evaluated_answer in answers_by_model.items():
            # model_system is already "<system>-<model>", so the id is
            # "<question_id>-<system>-<model>".
            custom_id = f"{question_id}-{model_system}"
            eval_prompt_p2 = (
                f'Question: "{question}"\n'
                "<Reference Answer>\n"
                f"{reference_answer}\n"
                "</Reference Answer>\n"
                "\n"
                "<AI assistant's answer>\n"
                f"{evaluated_answer}\n"
                "</AI assistant's answer>"
            )
            batch_requests.append({
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": judge_model,
                    "messages": [{"role": "system", "content": system_prompt},
                                 {"role": "user", "content": eval_prompt_p2}]
                }
            })

    print(f"Created {len(batch_requests)} evaluation requests.")
    print(f"Skipped {skipped_due_to_no_reference} evaluations due to missing reference answers.")

    # Write the requests to the batch input file, one JSON object per line.
    with open(output_batch_file, 'w', encoding='utf-8') as f:
        for req in batch_requests:
            f.write(json.dumps(req) + '\n')

if __name__ == "__main__":
    # Groq client; the API key is read from the GROQ_API_KEY environment variable.
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    output_batch_file = "llama_ver_batch_evaluation_RQ3.jsonl"
    # NOTE(review): generation of the batch file is commented out below, so this run uploads
    # whatever file already exists at output_batch_file; the "created" message is printed regardless.
    # prepare_batch_evaluation(output_batch_file=output_batch_file)
    print(f"Batch input file created: {output_batch_file}")

    # Step 2: upload the batch input file
    try:
        with open(output_batch_file, "rb") as f:
            batch_input_file = client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded with ID: {batch_input_file.id}")
        input_file_id = batch_input_file.id

        # Step 3: create the batch job from the uploaded file
        batch = client.batches.create(
            input_file_id=input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "Batch evaluation of different model/system combinations"}
        )
        print(f"Batch created with ID: {batch.id}")

        # Follow-up steps: poll the batch status, retrieve the results, then parse and record them.
        # These steps still have to be implemented, following the same flow as before.

    except Exception as e:
        # Any failure during upload or batch creation is only reported, not re-raised.
        print(f"Error during Batch API interaction: {e}")

Batch input file created: llama_ver_batch_evaluation_RQ3.jsonl
Batch input file uploaded with ID: file_01jtzxks0jfwm8jnvgvp4t9g7b
Batch created with ID: batch_01jtzxksbkeevbxsjcqx5ytc7n


In [30]:
from groq import Groq
# Look up the evaluation batch submitted earlier and show its current state.
submitted_batch_id = "batch_01jtzxksbkeevbxsjcqx5ytc7n"
client = Groq()
batch = client.batches.retrieve(submitted_batch_id)
print(batch)

BatchRetrieveResponse(id='batch_01jtzxksbkeevbxsjcqx5ytc7n', completion_window='24h', created_at=1746975450, endpoint='/v1/chat/completions', input_file_id='file_01jtzxks0jfwm8jnvgvp4t9g7b', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1746975470, error_file_id=None, errors=None, expired_at=None, expires_at=1747061850, failed_at=None, finalizing_at=1746975470, in_progress_at=1746975454, metadata={'description': 'Batch evaluation of different model/system combinations'}, output_file_id='file_01jtzxmckhew9ve80v2fqgbswv', request_counts=RequestCounts(completed=500, failed=0, total=500))


In [31]:
from groq import Groq
# Download the output file of the completed batch and save it locally.
output_filename = "llama_ver_RQ3_evaluation_results.jsonl"
client = Groq()
file_response = client.files.content("file_01jtzxmckhew9ve80v2fqgbswv")
file_response.write_to_file(output_filename)
# Alternative kept for reference (writes the response text through a UTF-8 text file):
# with open(output_filename, 'w', encoding='utf-8') as outfile:
#     outfile.write(file_response.text)

In [32]:
import json

# The batch output is reduced to {"custom_id", "content"} records and written back to the
# same path, so this cell rewrites its own input file in place.
input_file = 'llama_ver_RQ3_evaluation_results.jsonl'
output_file = 'llama_ver_RQ3_evaluation_results.jsonl'


def _extract_message_content(entry):
    """Return the judge's reply text stored in one raw batch-output entry.

    The text is read from entry["response"]["body"]["choices"][0]["message"]["content"].
    A missing or null level (for example a null "response" or an empty "choices"
    list) yields "" instead of raising, so the record is kept rather than dropped
    from the rewritten file.
    """
    response = entry.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content", "")


cleaned_data = []

with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # blank lines carry no record
        try:
            entry = json.loads(line)
            if "response" not in entry and "content" in entry:
                # Already cleaned by an earlier run of this cell (input and output are the
                # same file): keep the record as is instead of blanking its content.
                cleaned_data.append(entry)
                continue
            cleaned_data.append({
                "custom_id": entry.get("custom_id"),
                "content": _extract_message_content(entry)
            })
        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            print(f"❌ 處理失敗: {e}")
            continue

# Write the cleaned records back as JSONL.
with open(output_file, 'w', encoding='utf-8') as f:
    for item in cleaned_data:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

print(f"✅ 完成，共處理 {len(cleaned_data)} 筆資料。結果已儲存至 {output_file}")


✅ 完成，共處理 500 筆資料。結果已儲存至 llama_ver_RQ3_evaluation_results.jsonl


In [33]:
import json

def process_evaluation_results(input_filename="llama_ver_RQ3_evaluation_results.jsonl",
                               output_filename="llama_ver_RQ3_evaluation_results.jsonl"):
    """
    Split the "custom_id" of every record into "question_id", "system" and "model".

    A custom_id has the form "<question_id>-<system>-<model>". Only the first two
    hyphens are treated as separators, so hyphenated model names such as "gpt-4o"
    are preserved in full. Hyphens inside question_id or system cannot be told
    apart from separators and are not supported.

    Records without a custom_id are written through unchanged. Records whose
    custom_id has fewer than three parts are reported and kept with their
    custom_id intact, so nothing is lost when the file is rewritten in place
    (the default input and output paths are identical).

    Args:
        input_filename (str): JSONL file to read.
        output_filename (str): JSONL file to write.
    """
    processed_records = []
    with open(input_filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            stripped = line.strip()
            if not stripped:
                continue  # blank lines carry no record
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {stripped} - {e}")
                continue

            custom_id = record.pop("custom_id", None)  # None when the key is absent
            if custom_id:
                parts = custom_id.split('-', 2)
                if len(parts) == 3:
                    record["question_id"], record["system"], record["model"] = parts
                else:
                    print(f"Error splitting custom_id in line: {stripped} - "
                          f"expected '<question_id>-<system>-<model>'")
                    record["custom_id"] = custom_id  # restore so the record stays identifiable

            processed_records.append(record)

    with open(output_filename, 'w', encoding='utf-8') as outfile:
        for record in processed_records:
            outfile.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"Processed results saved to {output_filename}")


# Run with the defaults: input and output paths are the same, so the file is rewritten in place.
process_evaluation_results()

Processed results saved to llama_ver_RQ3_evaluation_results.jsonl


In [34]:
import json
import ast
import re

input_file = 'llama_ver_RQ3_evaluation_results.jsonl'
output_file = 'llama_ver_RQ3_evaluation_results.jsonl'  # same path as input_file: rewritten in place

# Exact set of keys a judge score dictionary must contain.
expected_fields = {
    'Factuality', 'User Satisfaction', 'Clarity', 'Logical Coherence', 'Completeness', 'Final Score'
}

# Accepted score values: the integers 1 through 10.
valid_scores = set(range(1, 11))


def _extract_score_dict(content):
    """Return the first valid score dictionary found in a judge reply, or None.

    Every brace-delimited span without nested braces is tried as a Python
    literal. A span is accepted when it evaluates to a dict whose keys are
    exactly ``expected_fields`` and whose values are all integers in
    ``valid_scores`` (booleans are rejected even though bool subclasses int).
    """
    if not isinstance(content, str):
        return None
    for candidate_text in re.findall(r"\{[^{}]+\}", content):
        try:
            candidate = ast.literal_eval(candidate_text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            # literal_eval may raise any of these on malformed input; try the next span
            # instead of giving up on the whole reply.
            continue
        if not isinstance(candidate, dict) or set(candidate.keys()) != expected_fields:
            continue
        if all(type(v) is int and v in valid_scores for v in candidate.values()):
            return candidate
    return None


processed_entries = []  # every entry read, whether or not a score was parsed

with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # blank lines carry no record
        entry = json.loads(line)
        score_dict = _extract_score_dict(entry.get("content", ""))

        # Attach the parsed scores when parsing succeeded.
        if score_dict is not None:
            entry["score"] = score_dict

        processed_entries.append(entry)

# Write all entries back, including those without a parsed score.
with open(output_file, 'w', encoding='utf-8') as f:
    for entry in processed_entries:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

# Summary report.
valid_count = sum(1 for entry in processed_entries if "score" in entry)
invalid_count = len(processed_entries) - valid_count
print(f"✅ 共處理 {len(processed_entries)} 筆")
print(f"✅ 成功解析評分的資料：{valid_count} 筆")
print(f"❌ 未能解析評分的資料：{invalid_count} 筆")

# List the system, model and question_id of every entry whose score could not be parsed.
print("\n📌 前幾筆未能成功解析評分的 custom_id：")
for item in processed_entries:
    if "score" not in item:
        print("-", item.get("system"))
        print("-", item.get("model"))
        print("-", item.get("question_id"))

✅ 共處理 500 筆
✅ 成功解析評分的資料：500 筆
❌ 未能解析評分的資料：0 筆

📌 前幾筆未能成功解析評分的 custom_id：


In [1]:
import json
from collections import defaultdict

# Scored results to summarise (an earlier run read './result/final_evaluation_results.jsonl').
input_file = "processed_evaluation_results.jsonl"

# Score fields to average, in the order they are reported.
expected_fields = [
    'Factuality',
    'User Satisfaction',
    'Clarity',
    'Logical Coherence',
    'Completeness',
    'Final Score',
]

# (system, model) -> score field -> list of integer scores
systems_scores = defaultdict(lambda: defaultdict(list))

# Collect the scores of every entry that names a system, a model and a score dict.
with open(input_file, 'r', encoding='utf-8') as results:
    for raw in results:
        entry = json.loads(raw)
        system, model, score = entry.get("system"), entry.get("model"), entry.get("score")
        if not (system and model and score):
            continue
        for field in expected_fields:
            value = score.get(field)
            if isinstance(value, int):
                systems_scores[(system, model)][field].append(value)

# Report the mean of each field per (system, model) pair.
print("\n📊 各系統 + 模型 的平均成績：\n")
for (system, model), scores in systems_scores.items():
    print(f"🔹 系統：{system} | 模型：{model}")
    for field in expected_fields:
        values = scores[field]
        if not values:
            print(f"  {field}: 無資料")
            continue
        print(f"  {field}: {sum(values) / len(values):.2f}")
    print()



📊 各系統 + 模型 的平均成績：

🔹 系統：ReAct | 模型：llama 3.3 70B
  Factuality: 6.47
  User Satisfaction: 4.87
  Clarity: 7.09
  Logical Coherence: 6.43
  Completeness: 4.09
  Final Score: 5.15

🔹 系統：CQ_Solver | 模型：llama 3.3 70B
  Factuality: 7.78
  User Satisfaction: 6.57
  Clarity: 8.13
  Logical Coherence: 8.11
  Completeness: 5.92
  Final Score: 6.82

🔹 系統：ReAct | 模型：gpt-4o
  Factuality: 8.13
  User Satisfaction: 7.58
  Clarity: 8.54
  Logical Coherence: 8.53
  Completeness: 7.14
  Final Score: 7.66

🔹 系統：CQ_Solver | 模型：gpt-4o
  Factuality: 8.44
  User Satisfaction: 8.00
  Clarity: 8.67
  Logical Coherence: 8.84
  Completeness: 7.77
  Final Score: 8.14

🔹 系統：MindSearch | 模型：llama 3.3 70B
  Factuality: 7.80
  User Satisfaction: 6.71
  Clarity: 8.09
  Logical Coherence: 8.09
  Completeness: 6.16
  Final Score: 6.97



In [21]:
import json

def fetch_original_results(refined_results_file="final_refined_evaluation_results.jsonl",
                           original_results_files={
                               1: "RQ3_experiment_results.jsonl",
                               2: "RQ3_experiment_results_2.jsonl",
                               3: "RQ3_experiment_results_3.jsonl"
                           },
                           output_file="RQ3_llama_best_After_CD.jsonl"):
    """
    Rebuild a results file by looking up each refined record in its original run.

    Each record of ``refined_results_file`` carries an "__order" value selecting
    one of ``original_results_files``. The original record with the same
    (system, model, question_id) is written to ``output_file``. When the order
    has no file, or the file has no matching record, a warning is printed and
    the refined record itself is kept.

    Lines that are blank or not valid JSON are skipped individually (with a
    warning for invalid JSON), so one bad line no longer discards the rest of
    a file. If ``refined_results_file`` does not exist, an error is printed and
    ``output_file`` is left untouched instead of being overwritten with an
    empty file.

    Args:
        refined_results_file (str): JSONL file whose records carry "__order".
        original_results_files (dict): Maps each "__order" value to the path of
            the original results file for that run. It is only read, never
            mutated, so the shared default dict is safe.
        output_file (str): Path of the merged JSONL file to write.
    """
    # Index every original file by (system, model, question_id).
    original_data = {}
    for order, filename in original_results_files.items():
        original_data[order] = {}
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"警告：解析檔案 {filename} 時發生錯誤：{e}")
                        continue
                    key = (record.get("system"), record.get("model"), record.get("question_id"))
                    original_data[order][key] = record
        except FileNotFoundError:
            print(f"警告：找不到檔案 {filename}")

    merged_results = []
    try:
        with open(refined_results_file, 'r', encoding='utf-8') as f_refined:
            for line_refined in f_refined:
                if not line_refined.strip():
                    continue
                try:
                    record_refined = json.loads(line_refined)
                except json.JSONDecodeError as e:
                    print(f"錯誤：解析檔案 {refined_results_file} 時發生錯誤：{e}")
                    continue

                order = record_refined.get("__order")
                system_refined = record_refined.get("system")
                model_refined = record_refined.get("model")
                question_id_refined = record_refined.get("question_id")

                if order not in original_data:
                    print(f"警告：__order 值 {order} 無對應的原始結果檔案。")
                    merged_results.append(record_refined)  # no file for this order: keep the refined record
                    continue

                key_refined = (system_refined, model_refined, question_id_refined)
                if key_refined in original_data[order]:
                    merged_results.append(original_data[order][key_refined])
                else:
                    print(f"警告：在檔案 {original_results_files[order]} 中找不到與 {system_refined}, {model_refined}, {question_id_refined} 相符的記錄 (order: {order})")
                    merged_results.append(record_refined)  # no match: keep the refined record
    except FileNotFoundError:
        print(f"錯誤：找不到檔案 {refined_results_file}")
        return  # nothing to merge; do not clobber an existing output file

    # Write the merged records as JSONL.
    with open(output_file, 'w', encoding='utf-8') as f_output:
        for record in merged_results:
            json.dump(record, f_output, ensure_ascii=False)
            f_output.write('\n')

    print(f"已將合併結果寫入檔案：{output_file}")

# Entry point: rebuild the merged results file using the default file names.
if __name__ == "__main__":
    fetch_original_results()

已將合併結果寫入檔案：RQ3_llama_best_After_CD.jsonl
