In [1]:
# import libraries
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
import time
import asyncio
from datetime import datetime
import json
import os
import re
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Define chat models
# Candidate models. Only phi, qwen1_5, vicuna, llama2, llama3_1 and gemma3 get a
# processor in the cells visible here; o3_mini, claude_haiku and deepseek_r1 are
# instantiated but not referenced by the visible workflow.
o3_mini = init_chat_model("openai:o3-mini")
claude_haiku = init_chat_model("anthropic:claude-3-haiku-20240307", temperature=0)
phi = init_chat_model("ollama:phi:latest", temperature=0)
qwen1_5 = init_chat_model("ollama:qwen:0.5b", temperature=0)
vicuna = init_chat_model("ollama:vicuna:7b", temperature=0)
llama2 = init_chat_model("ollama:llama2:latest", temperature=0)
llama3_1 = init_chat_model("ollama:llama3.1:latest", temperature=0)
deepseek_r1 = init_chat_model("ollama:deepseek-r1:8b", temperature=0)
gemma3 = init_chat_model("ollama:gemma3:12b", temperature=0)
# Helper models: `answerer` replies to the candidates' clarifying questions,
# `evaluator` scores the final responses against the rubric.
answerer = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)
evaluator = init_chat_model("openai:gpt-4-turbo-2024-04-09", temperature=0)

In [3]:
# Preparing benchmark data and prompts, rubric

# Benchmark file to load; its base name is also reused when naming the output files.
bench_path = "biggen_bench_test_4instance.json"

def load_benchmark_data(bench_path: str) -> List[Dict]:
    """Read the UTF-8 JSON file at ``bench_path`` and return its parsed content."""
    with open(bench_path, mode='r', encoding='utf-8') as benchmark_file:
        return json.load(benchmark_file)

# Load the benchmark once; later cells read this module-level list directly.
benchmark_data = load_benchmark_data(bench_path)

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Project each benchmark item onto the fields a candidate model may see.

    ``reference_answer`` and ``score_rubric`` are deliberately left out so the
    returned prompts carry no grading information. A missing required field
    raises ``KeyError``.
    """
    visible_fields = ("id", "capability", "task", "instance_idx", "system_prompt", "input")
    return [
        {field: entry[field] for field in visible_fields}
        for entry in benchmark_data
    ]

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Collect the grading data (id, reference answer, score rubric) of every item.

    A missing required field raises ``KeyError``.
    """
    grading_fields = ("id", "reference_answer", "score_rubric")
    return [
        {field: entry[field] for field in grading_fields}
        for entry in benchmark_data
    ]

# Split the benchmark into model-facing prompts (no answers) and rubric records
# (reference answer + score rubric).
prompts = prepare_prompts(benchmark_data)
rubrics = prepare_rubric(benchmark_data)

In [4]:
# Define the state structure
class State(TypedDict):
    """Shared LangGraph state passed between the workflow nodes.

    Each ``<model>_results`` list is filled by that model's processing node;
    ``evaluation_results`` is filled by the evaluation node.
    """
    prompts: List[Dict[str, Any]]
    rubrics: Dict[str, Dict[str, Any]] 
#    processed_count: int
    phi_results: List[Dict[str, Any]]
    qwen1_5_results: List[Dict[str, Any]]
    vicuna_results: List[Dict[str, Any]]
    llama2_results: List[Dict[str, Any]]
    llama3_1_results: List[Dict[str, Any]]
    gemma3_results: List[Dict[str, Any]]
#    deepseek_r1_results: List[Dict[str, Any]]
    evaluation_results: List[Dict[str, Any]]
    timestamp: str  # timestamp shared across the whole workflow (used in output file names)

In [5]:
# QuestionAugmentedProcessor Class to replace the create_model_processor function
class QuestionAugmentedProcessor:
    def __init__(self, model, model_name, answerer, benchmark_data):
        self.model = model
        self.model_name = model_name
        self.answerer = answerer
        # Create a mapping from prompt ID to benchmark item for easy lookup
        self.benchmark_map = {item["id"]: item for item in benchmark_data}
        print(f"Initialized QuestionAugmentedProcessor for {self.model_name} with {len(self.benchmark_map)} benchmark items.")

    def _extract_content(self, response: Any) -> str:
        """Helper function to reliably extract string content from model responses."""
        if isinstance(response, BaseMessage) and hasattr(response, 'content'):
            return response.content.strip()
        elif isinstance(response, str):
            return response.strip()
        elif isinstance(response, dict) and 'content' in response:
            return response['content'].strip()
        elif hasattr(response, 'text'): # Handle potential other response objects
             return response.text.strip()
        else:
            # Fallback: attempt to convert to string, might not be ideal
            print(f"Warning: Unexpected response type {type(response)}. Attempting string conversion.")
            return str(response).strip()

    def generate_uncertainty_check(self, system_prompt: str, user_input: str) -> str | None:
        """Check if the model needs clarification before answering."""
        uncertainty_prompt_text = f"""
        Review the task: "{user_input}"

        Is additional information or clarification NEEDED to provide a comprehensive response?
        - If NO: Output ONLY the single word: None
        - If YES: Output ONLY a brief phrase (max 10 words) describing the main uncertainty.

        Examples:
        Input Task: Write a poem about cats.
        Output: None

        Input Task: Summarize the document.
        Output: Need document context.

        Your response MUST be either 'None' or a short phrase (max 10 words). Do not include anything else.
        """
        try:
            uncertainty_response = self.model.invoke(
                uncertainty_prompt_text,
                config={"system_prompt": system_prompt} if system_prompt else None
            )
            uncertainty_text = self._extract_content(uncertainty_response)
            cleaned_uncertainty = uncertainty_text.strip().strip('.').lower()

            if cleaned_uncertainty == "none":
                print(f"  [{self.model_name}] Uncertainty check: Strictly 'None'.")
                return None # Return Python None
            # 길이 제한 추가 (예: 15단어 이상이면 무효 처리)
            elif len(uncertainty_text.split()) > 15:
                print(f"  [{self.model_name}] Uncertainty check: Response too long. Assuming None. Response: '{uncertainty_text}'")
                return None
            # 입력 포함 여부 체크 추가 (간단한 예시)
            elif user_input[:30] in uncertainty_text: # 입력 시작 부분이 포함되어 있다면 무효 처리 (더 정교한 체크 필요 가능성 있음)
                print(f"  [{self.model_name}] Uncertainty check: Response seems to contain input. Assuming None. Response: '{uncertainty_text}'")
                return None
            elif not cleaned_uncertainty:
                print(f"  [{self.model_name}] Uncertainty check: Empty response. Assuming None.")
                return None
            else:
                # 유효한 불확실성 구문으로 판단될 경우 (짧고, 입력 미포함)
                print(f"  [{self.model_name}] Uncertainty check: Found uncertainty - '{uncertainty_text}'")
                return uncertainty_text.strip()

        except Exception as e:
            print(f"  [{self.model_name}] Error checking uncertainty: {e}")
            return None # Return None on error


    def generate_question(self, system_prompt: str, user_input: str, uncertainty: str, question_num: int, inquiring_history: List[Dict]) -> str | None:
        """Generate a clarifying question based on uncertainty."""

        # prompt with history and instructions
        question_prompt_text = f"""
        Task: "{user_input}"
        Detected Uncertainty: "{uncertainty}"
       
        Generate a single, clear question to clarify the Detected Uncertainty.
        
        - Output ONLY the question itself.
        - Do not add any other text or comments.

        Example:
        Uncertainty: Need scope clarification.  
        Output: What is the specific scope for this task?
        """
        try:
            question_response = self.model.invoke(question_prompt_text)
            question = self._extract_content(question_response)

            # Check if the model indicates no more questions are needed
            if question.lower().strip().strip('.').strip() == "none":
                return None # Signal to stop asking questions
            else:
                return question # Return the new question

        except Exception as e:
            print(f"  [{self.model_name}] Error generating context-aware question {question_num+1}: {e}")
            return None # Signal to stop on error to prevent infinite loops or error cascades

    def generate_answer(self, prompt_id: str, question: str, user_input: str, question_num: int) -> str:
        """Generate an answer to a clarifying question using the answerer model."""
        benchmark_item = self.benchmark_map.get(prompt_id)
        if not benchmark_item:
            print(f"  [Answerer] Error: Benchmark data not found for prompt ID {prompt_id}.")
            return "Error: Could not find reference data."

        reference_answer = benchmark_item.get("reference_answer", "No reference answer available.")

        # More robust answerer prompt
        answer_prompt_text = f"""
        You are an assistant providing clarification to another AI.
        The original task given to the AI was:
        Original Task: {user_input}
        Another AI's question: {question}

        Provide a *brief* and *focused* answer to the AI's specific question.
        Use the context from the 'Original Task' and the 'Reference Answer' 
        Reference Answer (Context Only - Do Not Reveal Directly): 
        {reference_answer}

        - *Do not* solve the original task. 
        - *Do not* reveal the reference answer directly.
        - Just provide the information needed to answer the AI's specific question concisely.
        """
        try:
            # Use the dedicated answerer model
            answer_response = self.answerer.invoke(answer_prompt_text)
            answer = self._extract_content(answer_response)
            print(f"  [Answerer] Generated Answer {question_num+1}: {answer}")
            return answer

        except Exception as e:
            print(f"  [Answerer] Error generating answer for question {question_num+1}: {e}")
            return f"Error generating answer to question {question_num + 1}."

    def generate_final_response(self, system_prompt: str, user_input: str, inquiring_list: List[Dict]) -> str:
        """Generate the final response based on original input and answers in inquiring Q&A"""

        context_section = "\nNo clarification was requested or provided.\n"

        # check if inquiring_list has items and if the first item is a dictionary
        if inquiring_list and isinstance(inquiring_list[0], dict):
            first_entry = inquiring_list[0]
            clarification_answer = first_entry.get("answer_0") # check if answer_0 exists

            if clarification_answer:
                # if clarification_answer exists, add it to the context_section
                context_section = "\n--- Clarifications Provided During Process ---\n{clarification_answer}\n--- 추가 정보 끝 ---\n"
            # in case initial uncertainty is detected but model decided not to ask a question ('None' from generate_question)
            elif first_entry.get("uncertainty") is not None and first_entry.get("question_0") is None:
                 context_section = "\n--- initial uncertainty was detected but model decided not to ask a question. ---\n"
            # in case initial uncertainty is not detected
            elif first_entry.get("uncertainty") is None:
                 context_section = "\n--- No additional question was needed ---\n"
            # other cases (e.g., question was generated but answer generation failed) -> keep the default value or handle separately

        final_prompt_text = f"""
        You are tasked with completing the following request:
        Original Request: {user_input}

        Additional clarifications provided during the process (if any):
        {context_section}

        Based *only* on the Original Request and the Clarifications Provided above, generate the complete and final response.
        Adhere strictly to the requirements of the Original Request.
        """
        try:
            final_response_obj = self.model.invoke(
                final_prompt_text,
                config={"system_prompt": system_prompt} if system_prompt else None
            )
            final_response = self._extract_content(final_response_obj)
            print(f"  [{self.model_name}] Generated Final Response (using answers as context).")
            return final_response

        except Exception as e:
            print(f"  [{self.model_name}] Error generating final response: {e}")
            return "Error generating final response."

    def process(self, state: State) -> Dict[str, Any]:
        """Main processor method that handles the entire QAG workflow for this model."""
        results = []
        model_results_key = f"{self.model_name}_results"
        print(f"--- Processing model: {self.model_name} with QAG ---")

        prompts_to_process = state.get("prompts", [])
        if not prompts_to_process:
            print(f"  No prompts found for {self.model_name}.")
            return {model_results_key: []} # Return empty list for this model

        for prompt in prompts_to_process:
            prompt_id = prompt.get('id', f'unknown_id_{time.time()}')
            system_prompt = prompt.get('system_prompt', '')
            user_input = prompt.get('input', '')

            print(f"\n  Processing Prompt ID: {prompt_id} for {self.model_name}")

            if not user_input:
                print(f"  Skipping prompt {prompt_id} for {self.model_name} due to empty input.")
                result = {
                    "id": prompt_id,
                    "model_name": self.model_name,
                    "inquiring": [], # Keep structure consistent
                    "response": None,
                    "error": "Skipped due to empty input",
                    "latency": 0
                }
                results.append(result)
                continue

            start_time = time.time()
            inquiring_list = [] # Initialize list to store Q&A history *for this prompt*
            final_response = None
            error_message = None

            try:
                # 1. Initial Uncertainty Check (Done only ONCE)
                initial_uncertainty = self.generate_uncertainty_check(system_prompt, user_input)

                if initial_uncertainty is None:
                     # Case: No uncertainty, generate response directly
                     print(f"  [{self.model_name}] No initial uncertainty detected. Proceeding to final response.")
                     inquiring_list.append({"uncertainty": None}) # Keep minimal entry
                     final_response = self.generate_final_response(system_prompt, user_input, [])

                else:
                    # Case: Uncertainty detected, make a single question
                    print(f"  [{self.model_name}] Initial uncertainty detected: '{initial_uncertainty}'. Asking a question.")
                    question_count = 0
                    print(f"\n  [{self.model_name}] Attempting Q&A Iteration {question_count + 1}")

                    # Generate Question based on the detected uncertainty
                    question = self.generate_question(
                        system_prompt,
                        user_input,
                        initial_uncertainty, # Pass the initially identified uncertainty
                        question_count,
                        [] 
                        )

                    # Check if generate_question returned None (meaning stop)
                    if question is None:
                        print(f"  [{self.model_name}] Generated Question {question_count+1}: {question}")
                        inquiry_step = {
                            "uncertainty": initial_uncertainty,
                            f"question_{question_count}": question
                        }

                        # Answer genertation
                        answer = self.generate_answer(prompt_id, question, user_input, question_count)
                        inquiry_step[f"answer_{question_count}"] = answer
                        inquiring_list.append(inquiry_step)

                    else: # If no uncertainty was detected, generate final response directly
                        print(f"  [{self.model_name}] Model did not generate a question (returned None). Skipping answer generation.")
                        inquiring_list.append({"uncertainty": initial_uncertainty, f"question_{question_count}": None, f"answer_{question_count}": None})

                # After clarifying Q&A attempt, generate final response immediately and pass the Q&A history
                print(f"  [{self.model_name}] Generating final response after clarifying Q&A attempt.")
                final_response = self.generate_final_response(system_prompt, user_input, inquiring_list)

            # Error handling
            except Exception as e:
                 error_message = f"Unhandled error during processing prompt {prompt_id} for {self.model_name}: {type(e).__name__}: {e}"
                 print(f"  {error_message}")
                 if not inquiring_list: inquiring_list = [] # Ensure it's a list

            # latency calculation 
            end_time = time.time()
            latency = int((end_time - start_time) * 1000)

            # Assemble final result object for this prompt
            result = {
                "id": prompt_id,
                "model_name": self.model_name,
                "inquiring": inquiring_list,
                "response": final_response,
                "latency": latency,
                "error": error_message
            }
            results.append(result)
            print(f"  Finished processing prompt {prompt_id} for {self.model_name}. Latency: {latency}ms. Errors: {'Yes' if error_message else 'No'}")

        # Return results for this model
        return {model_results_key: results}

In [6]:
# Create one QuestionAugmentedProcessor per candidate model.
# All processors share the same `answerer` model and the module-level
# `benchmark_data` loaded in an earlier cell.
phi_processor = QuestionAugmentedProcessor(phi, "phi", answerer, benchmark_data)
qwen1_5_processor = QuestionAugmentedProcessor(qwen1_5, "qwen1_5", answerer, benchmark_data)
vicuna_processor = QuestionAugmentedProcessor(vicuna, "vicuna", answerer, benchmark_data)
llama2_processor = QuestionAugmentedProcessor(llama2, "llama2", answerer, benchmark_data)
llama3_1_processor = QuestionAugmentedProcessor(llama3_1, "llama3_1", answerer, benchmark_data)
gemma3_processor = QuestionAugmentedProcessor(gemma3, "gemma3", answerer, benchmark_data)   
# process_deepseek_r1 = QuestionAugmentedProcessor(deepseek_r1, "deepseek_r1", answerer, benchmark_data) # Keep commented if not used

# Define the process functions that will be used as nodes
def process_phi(state: State) -> Dict[str, Any]:
    """LangGraph node: run the phi processor over ``state["prompts"]``.

    Returns the partial state update produced by ``phi_processor.process``
    (a dict with the single key ``"phi_results"``).
    """
    return phi_processor.process(state)

def process_qwen1_5(state: State) -> Dict[str, Any]:
    """LangGraph node: run the qwen1_5 processor over ``state["prompts"]``.

    Returns the partial state update produced by ``qwen1_5_processor.process``
    (a dict with the single key ``"qwen1_5_results"``).
    """
    return qwen1_5_processor.process(state)

def process_vicuna(state: State) -> Dict[str, Any]:
    """LangGraph node: run the vicuna processor over ``state["prompts"]``.

    Returns the partial state update produced by ``vicuna_processor.process``
    (a dict with the single key ``"vicuna_results"``).
    """
    return vicuna_processor.process(state)

def process_llama2(state: State) -> Dict[str, Any]:
    """LangGraph node: run the llama2 processor over ``state["prompts"]``.

    Returns the partial state update produced by ``llama2_processor.process``
    (a dict with the single key ``"llama2_results"``).
    """
    return llama2_processor.process(state)

def process_llama3_1(state: State) -> Dict[str, Any]:
    """LangGraph node: run the llama3_1 processor over ``state["prompts"]``.

    Returns the partial state update produced by ``llama3_1_processor.process``
    (a dict with the single key ``"llama3_1_results"``).
    """
    return llama3_1_processor.process(state)

def process_gemma3(state: State) -> Dict[str, Any]:
    """LangGraph node: run the gemma3 processor over ``state["prompts"]``.

    Returns the partial state update produced by ``gemma3_processor.process``
    (a dict with the single key ``"gemma3_results"``).
    """
    return gemma3_processor.process(state)

# The remaining workflow nodes (save_intermediate_results, evaluate_responses)
# are defined in the following cells; run those cells before building the graph.

Initialized QuestionAugmentedProcessor for phi with 4 benchmark items.
Initialized QuestionAugmentedProcessor for qwen1_5 with 4 benchmark items.
Initialized QuestionAugmentedProcessor for vicuna with 4 benchmark items.
Initialized QuestionAugmentedProcessor for llama2 with 4 benchmark items.
Initialized QuestionAugmentedProcessor for llama3_1 with 4 benchmark items.
Initialized QuestionAugmentedProcessor for gemma3 with 4 benchmark items.


In [7]:
# --- define save intermediate results node ---
def save_intermediate_results(state: State) -> Dict:
    """Write every model's generated responses to a JSON file before evaluation.

    The file is created under ``_output`` and named from the state's timestamp
    and the benchmark file's base name. Write failures are reported with a
    printed message, not raised.

    Returns:
        An empty dict, because this node does not modify the state.
    """
    print("--- Saving Intermediate Results (Responses Only) ---")
    # Take the timestamp from the state, falling back to "now" when it is absent or empty
    fallback_format = "%Y%m%d%H%M%S_fallback"
    timestamp = state.get("timestamp", datetime.now().strftime(fallback_format))
    if not timestamp:
        print("  Warning: Timestamp not found in state. Generating a new one for fallback.")
        timestamp = datetime.now().strftime(fallback_format)

    # Make sure the output directory exists
    output_dir = "_output"
    os.makedirs(output_dir, exist_ok=True)

    # Keep only the non-empty per-model result lists (evaluation results are excluded)
    data_to_save = {
        key: state[key]
        for key in state
        if key.endswith("_results") and key != "evaluation_results" and state.get(key)
    }
    if not data_to_save:
        print("  No response data found to save.")
        return {}

    # File name uses the benchmark file's base name without its extension
    base_name = os.path.splitext(os.path.basename(bench_path))[0]
    output_file_path = os.path.join(
        output_dir, f"{timestamp}_{base_name}_quag_responses_only.json"
    )

    try:
        with open(output_file_path, 'w', encoding='utf-8') as out_file:
            json.dump(data_to_save, out_file, ensure_ascii=False, indent=4)
        print(f"  Intermediate results successfully saved to {output_file_path}")
    except Exception as e:
        print(f"  Error saving intermediate results: {e}")

    return {}

In [8]:
# --- Define Evaluation node ---

# Evaluation prompt template (based on the biggen_bench item structure).
# Template variables: task_description, user_query, reference_answer, ai_response,
# criteria and score1_desc..score5_desc.
evaluation_prompt_template = ChatPromptTemplate.from_messages([
    ("system", """You are an expert evaluator. Your task is to evaluate an AI assistant's response based on the provided user query, reference answer, and a detailed scoring rubric.
Focus ONLY on the provided information and rubric. Assign a score from 1 to 5, where 5 is the best, according to the descriptions.
Provide your output strictly in the specified format."""),
    ("human", """
**Evaluation Context:**

* **Task Type:** {task_description}
* **User Query:**
    ```
    {user_query}
    ```
* **Reference Answer:**
    ```
    {reference_answer}
    ```
* **AI Response to Evaluate:**
    ```
    {ai_response}
    ```

**Scoring Rubric:**

* **Criteria:** {criteria}
* **Score 1 Description:** {score1_desc}
* **Score 2 Description:** {score2_desc}
* **Score 3 Description:** {score3_desc}
* **Score 4 Description:** {score4_desc}
* **Score 5 Description:** {score5_desc}

**Instructions:**

1.  Carefully compare the "AI Response to Evaluate" against the "Reference Answer" and the "Scoring Rubric".
2.  Determine the score (1-5) that best reflects the quality of the AI Response according to the rubric descriptions.
3.  Provide a brief rationale explaining *why* you chose that score, referencing specific aspects of the rubric descriptions and the AI response.

**Output Format (MUST follow exactly):**
Score: [Your score between 1-5]
Rationale: [Your concise explanation based on the rubric]
""")
])

# --- How this template is used inside the evaluate_responses function ---

def _parse_evaluation_output(raw_output: str):
    """Extract ``(score, rationale)`` from the evaluator's raw text output.

    The score is accepted with optional brackets or markdown emphasis around
    it (``Score: 4``, ``Score: [4]``, ``**Score:** 4``) and only when it lies
    in the rubric range 1-5; otherwise ``None`` is returned for it. The
    rationale is everything after ``Rationale:`` (empty string when absent).
    """
    score = None
    score_match = re.search(r"Score\s*:\s*[\[(*\s]*(\d+)", raw_output)
    if score_match:
        candidate = int(score_match.group(1))
        if 1 <= candidate <= 5:
            score = candidate

    rationale = ""
    rationale_match = re.search(r"Rationale\s*:\s*(.*)", raw_output, re.DOTALL)
    if rationale_match:
        # Drop markdown emphasis left over from a "**Rationale:**" heading
        rationale = rationale_match.group(1).strip().lstrip("*").strip()
    return score, rationale


def evaluate_responses(state: State) -> Dict[str, Any]:
    """Score every model response in ``state`` with the evaluator model.

    For each ``<model>_results`` list in the state, each response is sent to
    the evaluator together with the benchmark item's input, reference answer
    and score rubric (looked up in the module-level ``benchmark_data``).

    Returns:
        ``{"evaluation_results": [...]}`` with one record per evaluated
        response (keys ``id``, ``agent_name``, ``score``, ``rationale``,
        ``error`` and, for evaluator calls, ``latency`` in seconds). Responses
        that carried an error get ``score=None``; empty responses get
        ``score=0``; responses whose id is not in the benchmark are skipped
        with a warning.
    """
    print("--- Starting Evaluation ---")
    all_evaluations = []
    # The full benchmark items are read from the module-level benchmark_data,
    # which is assumed to be in scope here
    benchmark_map_full = {item["id"]: item for item in benchmark_data}

    parser = StrOutputParser()
    evaluation_chain = evaluation_prompt_template | evaluator | parser

    model_result_keys = [key for key in state if key.endswith("_results") and key != "evaluation_results"]

    for key in model_result_keys:
        print(f"  Evaluating results from: {key}")
        model_results = state.get(key, [])
        for response_item in model_results:
            prompt_id = response_item.get("id")
            model_name = response_item.get("model_name")
            response_content = response_item.get("response")
            error = response_item.get("error")

            if error:
                eval_result = {"id": prompt_id, "agent_name": model_name, "score": None, "rationale": f"Skipped due to error: {error}", "error": True}
                all_evaluations.append(eval_result)
                continue

            if not prompt_id or prompt_id not in benchmark_map_full:
                print(f"    Warning: Missing full benchmark data for prompt ID {prompt_id}. Skipping.")
                continue

            benchmark_item = benchmark_map_full[prompt_id] # full benchmark record for this id

            if not response_content:
                eval_result = {"id": prompt_id, "agent_name": model_name, "score": 0, "rationale": "Empty response", "error": False}
                all_evaluations.append(eval_result)
                continue

            # Map the benchmark item and the model response onto the variable
            # names used by evaluation_prompt_template
            score_rubric = benchmark_item.get("score_rubric") or {}
            input_data = {
                "task_description": benchmark_item.get("task", "N/A"),
                "user_query": benchmark_item.get("input", "N/A"),
                "reference_answer": benchmark_item.get("reference_answer", "N/A"),
                "ai_response": response_content,
                "criteria": score_rubric.get("criteria", "N/A"),
            }
            for level in range(1, 6):
                input_data[f"score{level}_desc"] = score_rubric.get(f"score{level}_description", "N/A")

            try:
                start_time = time.time()
                evaluation_output_str = evaluation_chain.invoke(input_data)
                end_time = time.time()

                score, rationale = _parse_evaluation_output(evaluation_output_str)

                if score is None or not rationale:
                    print(f"    Warning: Could not parse score/rationale for prompt {prompt_id}. Raw output: {evaluation_output_str}")
                    rationale = f"Parsing Warning. Raw Output: {evaluation_output_str}"

                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": score,
                    "rationale": rationale, "latency": end_time - start_time, "error": False
                }
                print(f"    Evaluated prompt {prompt_id} from {model_name} in {eval_result['latency']:.2f}s. Score: {eval_result['score']}")

            except Exception as e:
                print(f"    Error evaluating prompt {prompt_id} from {model_name}: {e}")
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Evaluation failed: {str(e)}", "latency": 0, "error": True
                }
            all_evaluations.append(eval_result)

    print("--- Evaluation Finished ---")
    return {"evaluation_results": all_evaluations}

In [9]:
# --- define workflow ---

# 1. Create the timestamp once, before the workflow starts, so all outputs of this run share it
current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# 2. Initial state. Rubrics are keyed by benchmark id for direct lookup from the
#    state (evaluate_responses itself reads the benchmark data loaded from file).
initial_state = {
    "prompts": prompts,
    "rubrics": {item["id"]: item for item in rubrics},
    # One empty result list per candidate model
    **{
        f"{model_key}_results": []
        for model_key in ("phi", "qwen1_5", "vicuna", "llama2", "llama3_1", "gemma3")
    },
    "evaluation_results": [],
    "timestamp": current_timestamp,  # shared with every node through the state
}

# create workflow
workflow = StateGraph(State)

# Nodes in execution order: each model processor, then saving, then evaluation
pipeline_nodes = [
    ("process_phi", process_phi),
    ("process_qwen1_5", process_qwen1_5),
    ("process_vicuna", process_vicuna),
    ("process_llama2", process_llama2),
    ("process_llama3", process_llama3_1),
    ("process_gemma3", process_gemma3),
    ("save_responses", save_intermediate_results),
    ("evaluate", evaluate_responses),
]

# add nodes
for node_name, node_fn in pipeline_nodes:
    workflow.add_node(node_name, node_fn)

# connect edges: a single linear chain from the first node to END
node_names = [name for name, _ in pipeline_nodes]
workflow.set_entry_point(node_names[0])
for upstream, downstream in zip(node_names, node_names[1:] + [END]):
    workflow.add_edge(upstream, downstream)

# compile workflow
app = workflow.compile()


In [10]:
# run workflow and save the final state
print("--- Starting Workflow ---")
final_state = app.invoke(initial_state)
print("--- Workflow Finished ---")

# --- save final results (using timestamp from final_state) ---
final_output_dir = "_output"
os.makedirs(final_output_dir, exist_ok=True)

# 3. Reuse the timestamp carried in the final state so this file name matches
#    the intermediate file written by save_intermediate_results
final_timestamp = final_state.get("timestamp", datetime.now().strftime("%Y%m%d%H%M%S_final_fallback"))
if not final_timestamp:
    print("  Warning: Timestamp not found in final state. Generating a new one for fallback.")
    final_timestamp = datetime.now().strftime("%Y%m%d%H%M%S_final_fallback")


final_base_name, _ = os.path.splitext(os.path.basename(bench_path)) # strip the extension from bench_path when building the file name
final_filename = f"{final_timestamp}_{final_base_name}_quag_with_evaluation.json" 
final_output_file_path = os.path.join(final_output_dir, final_filename)

# The whole final state (prompts, rubrics, per-model results, evaluations) is dumped as JSON
try:
    with open(final_output_file_path, 'w', encoding='utf-8') as f:
        json.dump(final_state, f, ensure_ascii=False, indent=4)
    print(f"\nFinal results (with evaluation) successfully saved to {final_output_file_path}")
except Exception as e:
    print(f"Error saving final results: {e}")
    print(f"Final state type: {type(final_state)}")


--- Starting Workflow ---
--- Processing model: phi with QAG ---

  Processing Prompt ID: instruction_following_multi_task_inference_0 for phi
  [phi] Uncertainty check: Response too long. Assuming None. Response: 'The text mentions Elon Musk's involvement in controversies and scandals, but it does not provide enough information to determine the impact on his portfolio. Additional context is needed to provide a comprehensive response.'
  [phi] No initial uncertainty detected. Proceeding to final response.
  [phi] Generated Final Response (using answers as context).
  [phi] Generating final response after clarifying Q&A attempt.
  [phi] Generated Final Response (using answers as context).
  Finished processing prompt instruction_following_multi_task_inference_0 for phi. Latency: 9115ms. Errors: No

  Processing Prompt ID: theory_of_mind_thinking_for_doing_0 for phi
  [phi] Uncertainty check: Response too long. Assuming None. Response: 'Emma will put away her blocks after having a snack,