In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
from CQ.cq_solver_llama import CQ_Solver_llama, CQ_SYSTEM_PROMPT
from ReAct.react_agent_llama import ReAct_agent, REACT_SYSTEM_PROMPT
from Multi_Agent.mindsearch_llama import MindSearch, PLANNER_SYSTEM_PROMPT

In [None]:
import json
import os

from itertools import islice

# Run every benchmark question through the enabled agent system(s) and append
# one JSON record per (system, model, question) to a JSON Lines file.

# Benchmark questions; the error message below says an earlier step generates this file.
test_dataset_500 = 'test_dataset_500.json'
if not os.path.exists(test_dataset_500):
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this cell with a traceback.
    raise FileNotFoundError(
        f"找不到示範問題檔案: {test_dataset_500}。請先運行之前的程式碼生成該檔案。")

with open(test_dataset_500, 'r', encoding='utf-8') as f:
    demo_questions = json.load(f)

results = []  # in-memory copy of every record written during this run
output_file = 'experiment_results.jsonl'  # JSON Lines: one result object per line

max_questions = 500  # upper bound on how many dataset questions are processed

for i, question in enumerate(islice(demo_questions, max_questions)):
    # IDs are zero-padded to at least 2 digits (question_01 ... question_99,
    # question_100 ...). Other cells look records up by IDs in exactly this
    # form, so the format must stay as it is.
    question_id = f"question_{i+1:02d}"
    print(f"\n--- 處理問題: {question_id} ---")

    # A fresh agent is constructed for every question.
    mindsearch_llama = MindSearch(
        llm="llama 3.3 70B", system_prompt=PLANNER_SYSTEM_PROMPT, max_turns=9)

    # Other systems can be enabled by constructing them here and adding them
    # to `systems` below:
    # react_llama = ReAct_agent(llm="llama 3.3 70B", system_prompt=REACT_SYSTEM_PROMPT, max_turns=9)
    # cqsolver_llama = CQ_Solver_llama(llm="llama 3.3 70B", system_prompt=CQ_SYSTEM_PROMPT, max_turns=9, debug_log="llama_debug.log", summary_json="llama_summary.json")

    # system name -> {model name -> agent instance}
    systems = {
        # "ReAct": {"llama 3.3 70B": react_llama},
        "MindSearch": {"llama 3.3 70B": mindsearch_llama},
        # "CQ_Solver": {"llama 3.3 70B": cqsolver_llama}
    }

    for system_name, models in systems.items():
        for model_name, agent in models.items():
            print(f"系統: {system_name}, 模型: {model_name}")
            answer = None
            error_message = None
            try:
                # Every agent is assumed to expose a run(question) method.
                answer = agent.run(question)
                print(f"  回答: {answer}")
            except Exception as e:
                # Best effort: record the failure and move on to the next
                # question; failed records (answer is None) can be retried later.
                answer = None
                error_message = str(e)
                print(f"  發生錯誤: {error_message}")

            result = {
                "system": system_name,
                "model": model_name,
                "question_id": question_id,
                "question": question,
                "answer": answer,
                "error": error_message
            }
            results.append(result)

            # Persist immediately so an interrupted run keeps what it finished.
            with open(output_file, 'a', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False)
                f.write('\n')

print(f"\n實驗已完成。結果已保存至 {output_file}")

In [None]:
import json
import os

# Re-run every recorded 'llama 3.3 70B' entry whose answer is None, up to
# max_retries passes, rewriting the results file after each pass.

test_dataset_500 = 'test_dataset_500.json'
if not os.path.exists(test_dataset_500):
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this cell with a traceback.
    raise FileNotFoundError(
        f"找不到示範問題檔案: {test_dataset_500}。請先運行之前的程式碼生成該檔案。")

with open(test_dataset_500, 'r', encoding='utf-8') as f:
    demo_questions = json.load(f)

results = []
output_file = 'experiment_results.jsonl'  # JSON Lines: one result object per line
max_retries = 10
target_model = "llama 3.3 70B"

# system name -> zero-argument factory. Only the agent that a failed entry
# actually needs is constructed, instead of building all three every time.
agent_factories = {
    "ReAct": lambda: ReAct_agent(
        llm=target_model, system_prompt=REACT_SYSTEM_PROMPT, max_turns=9),
    "MindSearch": lambda: MindSearch(
        llm=target_model, system_prompt=PLANNER_SYSTEM_PROMPT, max_turns=9),
    "CQ_Solver": lambda: CQ_Solver_llama(
        llm=target_model, system_prompt=CQ_SYSTEM_PROMPT,
        max_turns=9, debug_log="llama_debug.log", summary_json="llama_summary.json"),
}

for retry_count in range(1, max_retries + 1):
    print(f"\n--- 第 {retry_count} 次重新嘗試 ---")

    # Every non-blank line of the file in its original order: a dict for a
    # parsed record, or the raw text for a line that could not be parsed
    # (kept verbatim so the rewrite below never loses data).
    file_rows = []
    # Indexes into file_rows of the records that must be retried.
    retry_positions = []

    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    entry = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"JSON 解析錯誤: {e}, 行內容: {stripped}")
                    file_rows.append(stripped)
                    continue
                if (isinstance(entry, dict)
                        and entry.get("model") == target_model
                        and entry.get("answer") is None):
                    retry_positions.append(len(file_rows))
                file_rows.append(entry)

    if not retry_positions:
        print("沒有需要重新嘗試的 'llama 3.3 70B' 錯誤回答。")
        break

    print(f"找到 {len(retry_positions)} 個需要重新嘗試的錯誤回答。")

    for position in retry_positions:
        error_entry = file_rows[position]
        question_id = error_entry.get("question_id")
        question = error_entry.get("question")
        system_name = error_entry.get("system")
        model_name = error_entry.get("model")

        print(f"\n重新嘗試系統: {system_name}, 問題 ID: {question_id}")

        build_agent = agent_factories.get(system_name)
        if build_agent is None:
            # Unknown system: leave the failed record untouched.
            print(f"  找不到系統 '{system_name}' 或模型 '{model_name}'。")
            continue

        try:
            response = build_agent().run(question)
            print(f"  重新回答成功: {response}")
            # Replace the failed record in place, clearing the old error.
            file_rows[position] = {**error_entry, "answer": response, "error": None}
        except Exception as e:
            # Keep the original failed record; a later pass will try again.
            print(f"  重新回答失敗: {str(e)}")

    # Write to a temporary file and swap it in, so a crash in the middle of
    # the write cannot leave a truncated results file behind.
    tmp_output_file = output_file + '.tmp'
    with open(tmp_output_file, 'w', encoding='utf-8') as f:
        for row in file_rows:
            if isinstance(row, str):
                f.write(row)
            else:
                json.dump(row, f, ensure_ascii=False)
            f.write('\n')
    os.replace(tmp_output_file, output_file)

print("\n重新嘗試過程結束。")

In [4]:
import json
import os

# === File paths ===
# Existing results file; this cell reads it to map each question_id to its question text.
experiment_input = 'experiment_results.jsonl'
# Results of the re-run are appended to this file, one JSON object per line.
output_file = 'llama_experiment_results_makeup_exam2.jsonl'

specified_question_id = ['question_02', 'question_03', 'question_04', 'question_07', 'question_10', 'question_13', 'question_18', 'question_19', 'question_21', 'question_22', 'question_24', 'question_25', 'question_28', 'question_31', 'question_32', 'question_36', 'question_38', 'question_39', 'question_40', 'question_43', 'question_48', 'question_50', 'question_52', 'question_54', 'question_56', 'question_61', 'question_62', 'question_65', 'question_70', 'question_71', 'question_72', 'question_73', 'question_76', 'question_77', 'question_83', 'question_87', 'question_89', 'question_91', 'question_92', 'question_93', 'question_96', 'question_100', 'question_102', 'question_103', 'question_104', 'question_106', 'question_107', 'question_108', 'question_109', 'question_110', 'question_111', 'question_113', 'question_116', 'question_118', 'question_119', 'question_121', 'question_122', 'question_124', 'question_126', 'question_127', 'question_128', 'question_131', 'question_133', 'question_134', 'question_135', 'question_136', 'question_137', 'question_138', 'question_139', 'question_141', 'question_142', 'question_143', 'question_144', 'question_145', 'question_147', 'question_152', 'question_153', 'question_154', 'question_155', 'question_159', 'question_160', 'question_165', 'question_167', 'question_168', 'question_172', 'question_174', 'question_177', 'question_182', 'question_183', 'question_184', 'question_189', 'question_192', 'question_193', 'question_194', 'question_199', 'question_200', 'question_203', 'question_209', 'question_210', 'question_211', 'question_214', 'question_215', 'question_217', 'question_218', 'question_221', 'question_222', 'question_224', 'question_227', 'question_230', 'question_231', 'question_232', 'question_234', 'question_235', 'question_237', 'question_238', 'question_241', 'question_243', 'question_245', 'question_246', 'question_247', 'question_249', 'question_251', 'question_252', 'question_254', 'question_255', 'question_257', 
'question_259', 'question_260', 'question_262', 'question_264', 'question_265', 'question_266', 'question_269', 'question_270', 'question_271', 'question_272', 'question_275', 'question_276', 'question_278', 'question_279', 'question_280',
                         'question_281', 'question_282', 'question_283', 'question_286', 'question_287', 'question_293', 'question_294', 'question_298', 'question_299', 'question_300', 'question_301', 'question_302', 'question_305', 'question_306', 'question_307', 'question_308', 'question_309', 'question_310', 'question_311', 'question_312', 'question_313', 'question_315', 'question_316', 'question_318', 'question_319', 'question_320', 'question_321', 'question_322', 'question_324', 'question_325', 'question_326', 'question_328', 'question_331', 'question_332', 'question_335', 'question_338', 'question_343', 'question_344', 'question_345', 'question_349', 'question_351', 'question_352', 'question_353', 'question_354', 'question_356', 'question_357', 'question_358', 'question_361', 'question_362', 'question_363', 'question_365', 'question_370', 'question_371', 'question_373', 'question_375', 'question_376', 'question_380', 'question_383', 'question_385', 'question_387', 'question_388', 'question_391', 'question_395', 'question_396', 'question_397', 'question_399', 'question_402', 'question_403', 'question_405', 'question_406', 'question_407', 'question_408', 'question_409', 'question_411', 'question_414', 'question_415', 'question_418', 'question_419', 'question_422', 'question_428', 'question_429', 'question_430', 'question_431', 'question_432', 'question_433', 'question_438', 'question_441', 'question_442', 'question_444', 'question_445', 'question_446', 'question_447', 'question_448', 'question_449', 'question_451', 'question_454', 'question_456', 'question_459', 'question_460', 'question_461', 'question_463', 'question_465', 'question_468', 'question_472', 'question_474', 'question_476', 'question_480', 'question_481', 'question_482', 'question_485', 'question_487', 'question_489', 'question_490', 'question_492', 'question_493', 'question_494', 'question_496', 'question_497', 'question_498', 'question_41', 'question_53', 'question_55', 'question_90', 
'question_97', 'question_188', 'question_196', 'question_206', 'question_212', 'question_216', 'question_334', 'question_336', 'question_382', 'question_390', 'question_393', 'question_410', 'question_417', 'question_466', 'question_475', 'question_163', 'question_66', 'question_125']

# Re-run the CQ_Solver system with llama 3.3 70B on the question IDs listed in
# `specified_question_id`, appending each result to `output_file`.

# question_id -> question text, taken from the existing results file.
question_lookup = {}

if not os.path.exists(experiment_input):
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception just stops this cell with a traceback.
    raise FileNotFoundError(f"找不到原始結果檔案: {experiment_input}")

with open(experiment_input, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            continue
        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError as e:
            # One damaged line must not abort the whole re-run.
            print(f"JSON 解析錯誤: {e}, 行內容: {stripped}")
            continue
        if not isinstance(entry, dict):
            continue
        qid = entry.get("question_id")
        question = entry.get("question")
        if qid and question:
            question_lookup[qid] = question

# === Questions to re-run ===
target_questions = []

for qid in specified_question_id:
    if qid in question_lookup:
        target_questions.append((qid, question_lookup[qid]))
    else:
        print(f"⚠️ 找不到對應內容: {qid}")


# === Run the experiment ===
system_name = "CQ_Solver"
model_name = "llama 3.3 70B"

for question_id, question in target_questions:
    print(f"\n--- 重新處理問題: {question_id} ---")

    # A fresh solver is constructed for every question.
    cqsolver_llama = CQ_Solver_llama(
        llm=model_name,
        system_prompt=CQ_SYSTEM_PROMPT,
        max_turns=9,
        summary_json="llama_CQ_Solver_summary_makeup2.json"
    )

    answer = None
    error_message = None
    try:
        answer = cqsolver_llama.run(question)
        print(f"  ✅ 回答: {answer}")
    except Exception as e:
        # Best effort: record the failure and continue with the next question.
        answer = None
        error_message = str(e)
        print(f"  💥 發生錯誤: {error_message}")

    result = {
        "system": system_name,
        "model": model_name,
        "question_id": question_id,
        "question": question,
        "answer": answer,
        "error": error_message
    }

    # === Persist immediately so an interrupted run keeps what it finished ===
    with open(output_file, 'a', encoding='utf-8') as f_out:
        json.dump(result, f_out, ensure_ascii=False)
        f_out.write('\n')

print(f"\n✅ 重新處理指定問題已完成。結果已更新至 {output_file}")


--- 重新處理問題: question_02 ---
Error processing https://www.thoughtco.com/increased-minimum-wage-impact-4019618: HTTPSConnectionPool(host='www.thoughtco.com', port=443): Read timed out. (read timeout=5)
Error processing https://www.investopedia.com/ask/answers/052815/does-raising-minimum-wage-increase-inflation.asp: HTTPSConnectionPool(host='www.investopedia.com', port=443): Read timed out. (read timeout=5)
  ✅ 回答: The minimum wage increases due to a combination of factors including economic growth, inflation, geographical differences, and the need to maintain the purchasing power of low-wage workers. The main factors contributing to minimum wage increases are interconnected and influenced by various economic and social conditions. Inflation, in particular, plays a critical role as it directly affects the purchasing power of the minimum wage, necessitating adjustments to protect the standard of living of low-wage workers. Understanding these causes is essential for assessing the effects 

In [11]:
import json
from dotenv import load_dotenv

load_dotenv()

# Judge rubric: the fixed instruction text for the evaluating model. It describes
# the scoring criteria, the reasoning steps, the score bands, and the dictionary
# format in which the scores must be returned.
# NOTE(review): the text announces "5 criteria" but numbers six items; item 6 is a
# remark about answer length rather than a criterion. The string is left unchanged
# because editing it would change what the judge is asked to do.
eval_prompt_p1 = """You are asked to assess the quality of an AI assistant's answer to a user's question as an impartial judge. Since the type of answer you are evaluating is [Solve Professional Problem], you need to evaluate the answer in the following 5 criteria:
1. Factuality: Whether the information provided is accurate and based on reliable facts and data.
2. User Satisfaction: Whether the response meets the user's question and needs and provides a comprehensive and appropriate answer to the question.
3. Clarity: Whether the response is clear and understandable, and whether it uses concise language and structure so that the user can easily understand it.
4. Logical Coherence: Whether the response maintains overall consistency and logical coherence between different sections, avoiding self-contradiction.
5. Completeness: Whether the response provides sufficient information and details to meet the user's needs, and whether it avoids omitting important aspects.
6. Note that a longer answer is not always better, the answer that is concise and meets the above requirements is the best.

We will provide you with the user's question, an 8-score reference answer, and answers from the AI assistant that needs your assessment. When starting your evaluation, you need to follow the reasoning steps below:
1. Compare the AI assistant's answer with the reference answer, point out any shortcomings in the AI assistant's answer, and explain further.
2. Evaluate the AI assistant's answer in terms of the different criteria, giving each criterion a score from 1 to 10 after the evaluation of each.
3. Finally, combine the evaluations from each criterion and give the AI assistant's answer a composite score of 1 to 10.
4. Your scoring needs to be as rigorous as possible and adhere to the following scoring rules: in general, the higher the quality of the model's answers, the higher the score.
The two most important criteria are factual correctness and fulfillment of user needs, and the scores for these two dimensions dominate the final composite score.

When the model answer has irrelevance to the question, or intrinsically factually incorrect, or generates harmful content, the total score should be 1 to 2;
When the model answer has no serious errors and is largely harmless, but is of low quality and does not meet user requirements, the total score must be 3 to 4;
When the model answer basically meets the user's needs but performs poorly on some criteria and is of medium quality, the total score can be 5 to 6;
When the quality of the model response is similar to the reference answer and performs well in all criteria, the total score should be 7 to 8;
A score of 9 to 10 can only be achieved if the model significantly exceeds the quality of the reference answer, adequately addresses the user's question and all the needs, and is close to a perfect score on all criteria. As an example, the reference answer would receive a score of 8.

You need to evaluate and explain before you score. Your explanation of each criterion needs to be followed by the scoring. After that, at the end of your answer, return all of your scores in the following dictionary format, including the curly brackets, and make sure that your scores are integers:
{'Dimension 1': scoring, 'Dimension 2': scoring, ... , 'Final Score': Score}, e.g. {'Factuality': 9, 'User Satisfaction': 6, ... , 'Final Score': 7}.
"""


def compare_answers(filename="experiment_results.jsonl"):
    """
    Group the gpt-4o answers in a JSONL results file by question_id.

    Only records whose "model" is "gpt-4o" are considered. For each
    question_id the "MindSearch" answer is treated as the reference answer;
    the "ReAct" and "CQ_Solver" answers, when present, are returned next to
    it for comparison. Questions without a "MindSearch" record are omitted.
    If a system appears more than once for a question, the last record wins.

    Blank lines are skipped. Lines that are not valid JSON are reported on
    stdout and skipped. Lines that are valid JSON but not an object are
    skipped.

    Args:
        filename (str): Path of the JSONL file, read as UTF-8.

    Returns:
        dict: Maps question_id to a dict holding the question, the reference
              answer and the answers to compare, e.g.
              {
                  "question_498": {
                      "question": "how does case management improve patient outcomes",
                      "reference_answer": "Case management...",
                      "ReAct_answer": "...",
                      "CQ_Solver_answer": "..."
                  },
                  ...
              }
    """
    results = {}
    data_by_question_id = {}

    # The results files are written as UTF-8 with ensure_ascii=False, so the
    # encoding must be pinned rather than left to the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {stripped} - {e}")
                continue
            if not isinstance(record, dict):
                # Valid JSON but not an object: nothing to read from it.
                continue

            question_id = record.get("question_id")
            if not question_id or record.get("model") != "gpt-4o":
                continue

            per_question = data_by_question_id.setdefault(
                question_id, {"question": record.get("question")})
            per_question[record.get("system")] = record.get("answer")

    for question_id, data in data_by_question_id.items():
        if "MindSearch" not in data:
            continue
        comparison_data = {
            "question": data.get("question"),
            "reference_answer": data["MindSearch"],
        }
        if "ReAct" in data:
            comparison_data["ReAct_answer"] = data["ReAct"]
        if "CQ_Solver" in data:
            comparison_data["CQ_Solver_answer"] = data["CQ_Solver"]
        results[question_id] = comparison_data

    return results



In [None]:
import json
import os
from openai import OpenAI

import json
import os
from openai import OpenAI

def compare_answers_for_batch(filename="experiment_results.jsonl", output_batch_file="batch_input.jsonl"):
    """
    Build a Batch API input file that evaluates the llama 3.3 70B answers.

    For every record whose model is "llama 3.3 70B", one chat-completion
    request is written that asks gpt-4o to judge that answer against the
    reference answer for the same question_id. The reference answer is the
    answer of the record with model "gpt-4o" and system "MindSearch".

    A record is skipped when either its own answer or the reference answer
    is missing (None, or the literal "N/A"). Failed runs are stored with a
    None answer, and without this check they would be sent to the judge as
    the text "None".

    Args:
        filename (str): JSONL results file to read (UTF-8).
        output_batch_file (str): Path of the Batch API input file to write,
            one request object per line.

    Reads the module-level `eval_prompt_p1` as the first message of each
    request.
    """
    records = []
    reference_answers = {}
    questions = {}

    # Parse the file once. The results files are written as UTF-8 with
    # ensure_ascii=False, so the encoding is pinned.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {stripped} - {e}")
                continue
            if not isinstance(record, dict) or not record.get("question_id"):
                continue
            records.append(record)

            question_id = record["question_id"]
            questions[question_id] = record.get("question")
            if record.get("model") == "gpt-4o" and record.get("system") == "MindSearch":
                reference_answers[question_id] = record.get("answer")

    batch_requests = []
    for record in records:
        if record.get("model") != "llama 3.3 70B":
            continue

        question_id = record["question_id"]
        system = record.get("system")
        answer = record.get("answer")
        reference_answer = reference_answers.get(question_id)

        # Nothing useful to judge without both an answer and a reference.
        if answer is None or answer == "N/A":
            continue
        if reference_answer is None or reference_answer == "N/A":
            continue

        question = questions.get(question_id, "N/A")
        eval_prompt_p2 = (
            f'Question: "{question}"\n'
            "<Reference Answer>\n"
            f"{reference_answer}\n"
            "</Reference Answer>\n"
            "\n"
            "<AI assistant's answer>\n"
            f"{answer}\n"
            "</AI assistant's answer>"
        )

        batch_requests.append({
            "custom_id": f"{question_id}-{system}-llama-3.3-70B",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [{"role": "developer", "content": eval_prompt_p1},
                             {"role": "user", "content": eval_prompt_p2}]
            }
        })

    # Write the requests to the Batch input file, one JSON object per line.
    with open(output_batch_file, 'w', encoding='utf-8') as f:
        for req in batch_requests:
            f.write(json.dumps(req) + '\n')

# Upload an already-generated Batch input file and create a batch job from it.
# The call that generates the file is commented out below, so
# "batch_input_llama.jsonl" must already exist when this cell runs.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    output_batch_file = "batch_input_llama.jsonl"
    # compare_answers_for_batch(output_batch_file=output_batch_file)
    # print(f"Batch input file created: {output_batch_file}")

    # Step 2: upload the Batch input file
    try:
        with open(output_batch_file, "rb") as f:
            batch_input_file = client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded with ID: {batch_input_file.id}")
        input_file_id = batch_input_file.id

        # Step 3: create the Batch
        batch = client.batches.create(
            input_file_id=input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "Evaluation batch job for llama 3.3 70B"}
        )
        print(f"Batch created with ID: {batch.id}")

        # **Follow-up steps: poll the status, retrieve the results, parse and record them**
        # These steps still need to be implemented, as in the earlier workflow.

    except Exception as e:
        # Any failure (missing file, upload or batch creation error) is only printed.
        print(f"Error during Batch API interaction: {e}")


Batch input file uploaded with ID: file-SGCN91hLjPgga6QnDukym7
Batch created with ID: batch_67ffd0c550d08190b556a03fbedc3665


In [None]:
import json
import os
from openai import OpenAI

def prepare_cq_solver_re_evaluation_batch(cq_solver_results_file="experiment_results_2.jsonl",
                                          mindsearch_results_file="experiment_results.jsonl",
                                          output_batch_file="batch_input_cq_reEval.jsonl"):
    """
    Build a Batch API input file that re-evaluates the gpt-4o CQ_Solver answers.

    The answers to judge are the records with system "CQ_Solver" and model
    "gpt-4o" in `cq_solver_results_file`. The reference answers are the
    records with system "MindSearch" and model "gpt-4o" in
    `mindsearch_results_file`, matched by question_id. If a question_id
    occurs more than once in a file, the last record wins.

    A question is skipped when either its CQ_Solver answer or its reference
    answer is missing (None, or the literal "N/A"). Failed runs are stored
    with a None answer, and without this check they would be sent to the
    judge as the text "None".

    Args:
        cq_solver_results_file (str): JSONL file with the answers to judge (UTF-8).
        mindsearch_results_file (str): JSONL file with the reference answers (UTF-8).
        output_batch_file (str): Path of the Batch API input file to write,
            one request object per line.

    Reads the module-level `eval_prompt_p1` as the first message of each
    request.
    """
    batch_requests = []
    mindsearch_answers = {}
    cq_solver_data = {}

    # Load the MindSearch answers used as references.
    with open(mindsearch_results_file, 'r', encoding='utf-8') as f_ms:
        for line in f_ms:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding MindSearch results: {e}")
                continue
            if not isinstance(record, dict):
                continue
            if (record.get("system") == "MindSearch"
                    and record.get("model") == "gpt-4o"
                    and record.get("question_id")):
                mindsearch_answers[record["question_id"]] = record.get("answer")

    # Load the CQ_Solver questions and answers.
    with open(cq_solver_results_file, 'r', encoding='utf-8') as f_cq:
        for line in f_cq:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding CQ_Solver results: {e}")
                continue
            if not isinstance(record, dict):
                continue
            if (record.get("system") == "CQ_Solver"
                    and record.get("model") == "gpt-4o"
                    and record.get("question_id")):
                cq_solver_data[record["question_id"]] = {
                    "question": record.get("question"),
                    "answer": record.get("answer")
                }

    # Build one Batch API request per question that has both answers.
    for question_id, cq_solver_info in cq_solver_data.items():
        question = cq_solver_info.get("question", "N/A")
        cq_solver_answer = cq_solver_info.get("answer")
        reference_answer = mindsearch_answers.get(question_id)

        # Nothing useful to judge without both an answer and a reference.
        if cq_solver_answer is None or cq_solver_answer == "N/A":
            continue
        if reference_answer is None or reference_answer == "N/A":
            continue

        eval_prompt_p2_cq_solver = (
            f'Question: "{question}"\n'
            "<Reference Answer>\n"
            f"{reference_answer}\n"
            "</Reference Answer>\n"
            "\n"
            "<AI assistant's answer>\n"
            f"{cq_solver_answer}\n"
            "</AI assistant's answer>"
        )
        batch_requests.append({
            "custom_id": f"{question_id}-cq-solver-re-eval",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [{"role": "developer", "content": eval_prompt_p1},
                             {"role": "user", "content": eval_prompt_p2_cq_solver}]
            }
        })

    # Write the requests to the Batch input file, one JSON object per line.
    with open(output_batch_file, 'w', encoding='utf-8') as f:
        for req in batch_requests:
            f.write(json.dumps(req) + '\n')

# Upload an already-generated Batch input file and create the CQ_Solver
# re-evaluation batch job from it. The call that generates the file is
# commented out below, so "batch_input_cq_reEval.jsonl" must already exist
# when this cell runs.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    output_batch_file = "batch_input_cq_reEval.jsonl"
    # prepare_cq_solver_re_evaluation_batch(output_batch_file=output_batch_file)
    # print(f"Batch input file created: {output_batch_file}")

    # Step 2: upload the Batch input file
    try:
        with open(output_batch_file, "rb") as f:
            batch_input_file = client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded with ID: {batch_input_file.id}")
        input_file_id = batch_input_file.id

        # Step 3: create the Batch
        batch = client.batches.create(
            input_file_id=input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "Re-evaluation batch job for CQ_Solver"}
        )
        print(f"Batch created with ID: {batch.id}")

        # **Follow-up steps: poll the status, retrieve the results, parse and record them**
        # These steps still need to be implemented, as in the earlier workflow.

    except Exception as e:
        # Any failure (missing file, upload or batch creation error) is only printed.
        print(f"Error during Batch API interaction: {e}")

In [16]:
import json
import os
from openai import OpenAI

def compare_answers_for_batch_makeup(input_filenames, output_batch_file="batch_input_makeup.jsonl",
                                     reference_filename="experiment_results.jsonl"):
    """
    Build a Batch API input file that evaluates llama 3.3 70B answers taken
    from one or more JSONL result files.

    Reference answers and question texts come from `reference_filename`:
    the reference for a question_id is the answer of the record with model
    "gpt-4o" and system "MindSearch". Each record in the input files whose
    model is "llama 3.3 70B" and whose answer is not None becomes one
    request, provided a reference answer exists for its question_id.
    Records with a None answer are skipped.

    Every request gets a running five-digit counter appended to its
    custom_id, so the same question/system appearing in several input files
    still yields distinct IDs.

    Args:
        input_filenames (list | str): JSONL files with the llama results. A
            single path is accepted as well and treated as a one-item list.
        output_batch_file (str): Path of the Batch API input file to write,
            one request object per line.
        reference_filename (str): JSONL file that supplies the reference
            answers and question texts. Defaults to the file that was
            previously hard-coded.

    Reads the module-level `eval_prompt_p1` as the first message of each
    request.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(input_filenames, (str, os.PathLike)):
        input_filenames = [input_filenames]

    batch_requests = []
    reference_answers = {}
    questions = {}
    request_counter = 0

    # Pass 1: collect reference answers and question texts. Files are read
    # as UTF-8 because the results are written with ensure_ascii=False.
    with open(reference_filename, 'r', encoding='utf-8') as f_ref:
        for line in f_ref:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line in {reference_filename}: {stripped} - {e}")
                continue
            if not isinstance(record, dict):
                continue

            question_id = record.get("question_id")
            answer = record.get("answer")
            if question_id and answer is not None:
                questions[question_id] = record.get("question")
                if record.get("model") == "gpt-4o" and record.get("system") == "MindSearch":
                    reference_answers[question_id] = answer

    # Pass 2: turn each usable llama record into an evaluation request.
    for filename in input_filenames:
        with open(filename, 'r', encoding='utf-8') as f_llama:
            for line in f_llama:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON line in {filename}: {stripped} - {e}")
                    continue
                if not isinstance(record, dict):
                    continue

                question_id = record.get("question_id")
                system = record.get("system")
                answer = record.get("answer")

                if not question_id or record.get("model") != "llama 3.3 70B" or answer is None:
                    continue

                reference_answer = reference_answers.get(question_id)
                if reference_answer is None or reference_answer == "N/A":
                    continue

                question = questions.get(question_id, "N/A")
                eval_prompt_p2 = (
                    f'User Question: "{question}"\n'
                    "<Reference Answer>\n"
                    f"{reference_answer}\n"
                    "</Reference Answer>\n"
                    "\n"
                    "<AI assistant's answer>\n"
                    f"{answer}\n"
                    "</AI assistant's answer>"
                )

                custom_id = f"{question_id}-{system}-llama-3.3-70B-{request_counter:05d}"
                batch_requests.append({
                    "custom_id": custom_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o",
                        "messages": [{"role": "developer", "content": eval_prompt_p1},
                                     {"role": "user", "content": eval_prompt_p2}]
                    }
                })
                request_counter += 1

    # Write the requests to the Batch input file, one JSON object per line.
    with open(output_batch_file, 'w', encoding='utf-8') as f:
        for req in batch_requests:
            f.write(json.dumps(req) + '\n')

# Upload an already-generated Batch input file and create the makeup
# evaluation batch job from it. The lines that define the input files and
# generate the batch file are commented out below, so
# "batch_input_llama_makeup.jsonl" must already exist when this cell runs.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    # input_filenames = ['llama_experiment_results_makeup_exam.jsonl', 'llama_experiment_results_makeup_exam2.jsonl']
    output_batch_file = "batch_input_llama_makeup.jsonl"
    # compare_answers_for_batch_makeup(input_filenames, output_batch_file=output_batch_file)
    # print(f"Batch input file created: {output_batch_file}")

    # Step 2: upload the Batch input file
    try:
        with open(output_batch_file, "rb") as f:
            batch_input_file = client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded with ID: {batch_input_file.id}")
        input_file_id = batch_input_file.id

        # Step 3: create the Batch
        batch = client.batches.create(
            input_file_id=input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "Evaluation batch job for llama 3.3 70B"}
        )
        print(f"Batch created with ID: {batch.id}")

        # **Follow-up steps: poll the status, retrieve the results, parse and record them**
        # These steps still need to be implemented, as in the earlier workflow.

    except Exception as e:
        # Any failure (missing file, upload or batch creation error) is only printed.
        print(f"Error during Batch API interaction: {e}")

Batch input file uploaded with ID: file-7TK6JChQSNGtP4kYKXKbWN
Batch created with ID: batch_68009ccb03408190aa0c38c35c0e6b75


In [20]:
from openai import OpenAI
# Look up a batch job by its ID and print the returned object.
client = OpenAI()

# The ID is hard-coded; it is the one printed when the makeup batch was created.
batch = client.batches.retrieve("batch_68009ccb03408190aa0c38c35c0e6b75")
print(batch)

Batch(id='batch_68009ccb03408190aa0c38c35c0e6b75', completion_window='24h', created_at=1744870603, endpoint='/v1/chat/completions', input_file_id='file-7TK6JChQSNGtP4kYKXKbWN', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1744870782, error_file_id=None, errors=None, expired_at=None, expires_at=1744957003, failed_at=None, finalizing_at=1744870700, in_progress_at=1744870604, metadata={'description': 'Evaluation batch job for llama 3.3 70B'}, output_file_id='file-6VfPFHpsB1fP2e6GhS6MuL', request_counts=BatchRequestCounts(completed=491, failed=0, total=491))


In [21]:
from openai import OpenAI
# Download the content of a stored file by its (hard-coded) ID and save it locally.
client = OpenAI()

file_response = client.files.content("file-6VfPFHpsB1fP2e6GhS6MuL")
output_filename = "llama_makeup_evaluation_results.jsonl"
# Pin the encoding: the judge's text may contain non-ASCII characters, and the
# platform default encoding could then raise UnicodeEncodeError on write.
with open(output_filename, 'w', encoding='utf-8') as outfile:
    outfile.write(file_response.text)