In [1]:
# import libraries
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
import time
import asyncio
from datetime import datetime
import json
import os
import re
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Define chat models

def _zero_temperature_model(model_spec):
    """Build a chat model from a "provider:model" spec with temperature fixed at 0."""
    return init_chat_model(model_spec, temperature=0)


# OpenAI reasoning model: created without an explicit temperature
o3_mini = init_chat_model("openai:o3-mini")

# Anthropic models
claude_haiku = _zero_temperature_model("anthropic:claude-3-haiku-20240307")
claude_sonnet = _zero_temperature_model("anthropic:claude-3-7-sonnet-20250219")

# Models served through Ollama
phi = _zero_temperature_model("ollama:phi:latest")
qwen1_5 = _zero_temperature_model("ollama:qwen:0.5b")
vicuna = _zero_temperature_model("ollama:vicuna:7b")
llama2 = _zero_temperature_model("ollama:llama2:latest")
llama3_1 = _zero_temperature_model("ollama:llama3.1:latest")
deepseek_r1 = _zero_temperature_model("ollama:deepseek-r1:8b")
gemma3 = _zero_temperature_model("ollama:gemma3:12b")

# Support roles: `answerer` replies to clarification questions, `evaluator` scores responses
answerer = _zero_temperature_model("anthropic:claude-3-5-sonnet-latest")
evaluator = _zero_temperature_model("openai:gpt-4-turbo-2024-04-09")

In [3]:
# Prepare benchmark data, prompts, and rubrics

# Path of the benchmark JSON file; its base name is also reused when naming output files.
bench_path = "biggen_bench_test_4instance.json"

def load_benchmark_data(bench_path: str) -> List[Dict]:
    """Read the UTF-8 JSON file at ``bench_path`` and return its parsed content."""
    with open(bench_path, mode='r', encoding='utf-8') as benchmark_file:
        raw_text = benchmark_file.read()
    return json.loads(raw_text)

# Parsed benchmark records; evaluate_responses reads this module-level value directly.
benchmark_data = load_benchmark_data(bench_path)

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Project each benchmark record onto the fields a model under test may see.

    ``reference_answer`` and ``score_rubric`` are intentionally left out.
    A record missing any of the kept fields raises ``KeyError``.
    """
    kept_fields = ("id", "capability", "task", "instance_idx", "system_prompt", "input")
    return [
        {field: record[field] for field in kept_fields}
        for record in benchmark_data
    ]

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Collect the grading material (id, reference answer, score rubric) of every record.

    A record missing any of those three fields raises ``KeyError``.
    """
    rubric_fields = ("id", "reference_answer", "score_rubric")
    return [
        {field: record[field] for field in rubric_fields}
        for record in benchmark_data
    ]

# Split the benchmark into what the models see (prompts) and what grading needs (rubrics).
prompts = prepare_prompts(benchmark_data)
rubrics = prepare_rubric(benchmark_data)

In [4]:
# Define the state structure
class State(TypedDict):
    """Shared state passed between the workflow nodes.

    Keys ending in ``_results`` (other than ``evaluation_results``) are picked up
    by name in the save and evaluation nodes, so per-model fields must keep that suffix.
    """
    # Benchmark prompts handed to each model
    prompts: List[Dict[str, Any]]
    # Reference answers and score rubrics, looked up by prompt id
    rubrics: List[Dict[str, Any]]
    # o3_mini_results: List[Dict[str, Any]]
    # claude_haiku_results: List[Dict[str, Any]]
    # claude_sonnet_results: List[Dict[str, Any]]
    # Per-model result lists; enable a field together with its processor node
    phi_results: List[Dict[str, Any]]
    # qwen1_5_results: List[Dict[str, Any]]
    vicuna_results: List[Dict[str, Any]]
    # llama2_results: List[Dict[str, Any]]
    # llama3_1_results: List[Dict[str, Any]]
    # gemma3_results: List[Dict[str, Any]]
    # NOTE(review): not set in the initial state and no visible node writes it — confirm it is still needed
    answerer_results: List[Dict[str, Any]]
    # One entry per evaluated (prompt, model) pair
    evaluation_results: List[Dict[str, Any]]
    timestamp: str  # Workflow-wide shared timestamp field

In [5]:
# QuestionAugmentedProcessor class to replace create_model_processor function
class QuestionAugmentedProcessor:
    """Run a model over benchmark prompts with one optional clarification round.

    For every prompt the model is first asked whether it needs more information.
    If it does, it writes a question, a separate answerer model replies to it
    (using the reference answer as context), and the question/answer pair is
    added to the final prompt. The final response, the inquiry trace and the
    latency are collected per prompt.
    """

    def __init__(self, model, model_name, answerer):
        """
        Initialize the processor with a model, model name, and answerer model

        Args:
            model: The language model to use for response generation
            model_name: Name of the model; also used to build the "<name>_results" state key
            answerer: The model to use for answering clarification questions
        """
        self.model = model
        self.model_name = model_name
        self.answerer = answerer

    @staticmethod
    def _extract_content(response):
        """Return the text carried by a model response as a plain string.

        Handles plain strings, dicts with a "content" key and any object exposing
        a ``content`` attribute (such as chat messages). Content given as a list of
        parts is flattened by concatenating string parts and the "text" value of
        dict parts. Anything else falls back to ``str(response)``.
        """
        if isinstance(response, str):
            return response
        if isinstance(response, dict):
            content = response.get("content")
        else:
            content = getattr(response, "content", None)
        if content is None:
            return str(response)
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            text_parts = []
            for part in content:
                if isinstance(part, str):
                    text_parts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    text_parts.append(part["text"])
            return "".join(text_parts)
        return str(content)

    def check_uncertainty(self, input_text, system_prompt):
        """
        Check if the model needs additional information to generate a response

        Args:
            input_text: The user input of the benchmark prompt
            system_prompt: The system prompt of the benchmark prompt

        Returns:
            uncertainty: None if no additional info needed (or if the check itself
            failed), otherwise a summary phrase truncated to at most 10 words
        """
        # Prompt to check uncertainty
        uncertainty_prompt = f"""
        Based on the following input, do you need any additional information or clarification to provide a high-quality response?
        
        System prompt: {system_prompt}
        
        Input: {input_text}
        
        If you need additional information, respond with a concise phrase (max 10 words) summarizing what you need.
        If you don't need additional information, respond with "None".
        """

        try:
            uncertainty_response = self.model.invoke(
                uncertainty_prompt,
                # NOTE(review): "temperature" is not a standard RunnableConfig key, so this
                # probably has no effect on a non-configurable model — confirm.
                config={"temperature": 0.2}
            )

            uncertainty_content = self._extract_content(uncertainty_response).strip()

            # Models often decorate the sentinel ("None.", "\"None\"", "**None**");
            # strip surrounding punctuation before comparing. An empty reply carries
            # no uncertainty either.
            normalized = uncertainty_content.strip(" \t\r\n\"'`*.!").lower()
            if normalized in ("", "none"):
                return None

            # Limit to 10 words if necessary
            words = uncertainty_content.split()
            if len(words) > 10:
                uncertainty_content = " ".join(words[:10])
            return uncertainty_content

        except Exception as e:
            # Best effort: a failed check is treated as "no clarification needed"
            print(f"Error checking uncertainty: {e}")
            return None

    def generate_question(self, uncertainty, input_text, system_prompt):
        """
        Generate a clarification question based on uncertainty

        Returns:
            The question text, or a generic fallback question if the model call fails
        """
        question_prompt = f"""
        Based on the following input and identified uncertainty, generate a single specific question 
        to obtain additional information or clarify context.
        
        System prompt: {system_prompt}
        
        Input: {input_text}
        
        Uncertainty: {uncertainty}
        
        Generate one clear, concise question:
        """

        try:
            question_response = self.model.invoke(
                question_prompt,
                # NOTE(review): see check_uncertainty — this key is probably ignored; confirm.
                config={"temperature": 0.3}
            )
            return self._extract_content(question_response).strip()

        except Exception as e:
            print(f"Error generating question: {e}")
            return f"Can you provide more information about {uncertainty}?"

    def get_answer(self, question, input_text, reference_answer):
        """
        Get answer to the question using the answerer model

        Args:
            question: Clarification question written by the model under test
            input_text: The original user input
            reference_answer: Reference answer given to the answerer as context

        Returns:
            The answer text, or a fixed fallback sentence if the answerer call fails
        """
        answer_prompt = f"""
        You are assisting a language model that is trying to generate a response to a task.
        The model has asked a clarification question. Please provide a brief, helpful answer
        based on the original input and reference answer. Do NOT provide the final answer to the task.
        
        Original input: {input_text}
        
        Reference answer (for context only, don't repeat verbatim): {reference_answer}
        
        Question from the model: {question}
        
        Your brief answer (help clarify but don't solve the entire task):
        """

        try:
            answer_response = self.answerer.invoke(
                answer_prompt,
                # NOTE(review): see check_uncertainty — this key is probably ignored; confirm.
                config={"temperature": 0.2}
            )
            return self._extract_content(answer_response).strip()

        except Exception as e:
            print(f"Error getting answer: {e}")
            return "Unable to provide additional information at this time."

    def _invoke_final(self, input_text, system_prompt, inquiring_list=None):
        """Call the model for the final response; exceptions propagate to the caller.

        The system prompt is sent as a real system message. It used to be passed as
        ``config={"system_prompt": ...}``, which is not a prompt channel, so the model
        did not receive it as a system message.
        """
        if not inquiring_list:
            # No inquiring information, just respond to the input
            final_prompt = input_text
        else:
            # Include inquiring information in the prompt
            inquiry_context = ""
            for item in inquiring_list:
                if "question" in item and "answer" in item:
                    inquiry_context += f"\nQuestion: {item['question']}\nAnswer: {item['answer']}\n"

            final_prompt = f"""
            {input_text}
            
            Additional information:
            {inquiry_context}
            
            Please provide your final response to the original input.
            """

        messages = []
        if system_prompt:
            messages.append(("system", system_prompt))
        messages.append(("human", final_prompt))

        return self._extract_content(self.model.invoke(messages))

    def generate_final_response(self, input_text, system_prompt, inquiring_list=None):
        """
        Generate the final response using the original input and any inquiring information

        Returns:
            The response text. If the model call fails, a string starting with
            "Error: Unable to generate a response" is returned instead of raising.
        """
        try:
            return self._invoke_final(input_text, system_prompt, inquiring_list)
        except Exception as e:
            print(f"Error generating final response: {e}")
            return f"Error: Unable to generate a response: {str(e)}"

    def process(self, state):
        """
        Main processing method to handle the entire workflow

        Args:
            state: Mapping with "prompts" (list of prompt dicts) and "rubrics"
                (list of dicts carrying "id" and "reference_answer")

        Returns:
            {"<model_name>_results": [result, ...]} with one result per prompt, or an
            empty dict when the state holds no prompts. Each result has the keys
            "id", "model_name", "inquiring", "response", "latency" and "error".
        """
        results = []
        print(f"--- Processing model: {self.model_name} ---")
        prompts_to_process = state.get("prompts", [])
        rubrics = {item["id"]: item for item in state.get("rubrics", [])}

        if not prompts_to_process:
            print(f"  No prompts found for {self.model_name}.")
            return {} # Return empty result if no prompts to process

        for prompt in prompts_to_process:
            prompt_id = prompt.get('id', 'unknown_id')
            system_prompt = prompt.get('system_prompt', '')
            user_input = prompt.get('input', '')

            if not user_input:
                print(f"  Skipping prompt {prompt_id} for {self.model_name} due to empty input.")
                results.append({
                    "id": prompt_id,
                    "model_name": self.model_name,
                    "inquiring": [{"uncertainty": None}],
                    "response": None,
                    "error": "Skipped due to empty input",
                    "latency": 0
                })
                continue

            try:
                start_time = time.time()

                inquiring_list = []

                # Check if additional information is needed
                uncertainty = self.check_uncertainty(user_input, system_prompt)

                if uncertainty is None:
                    # No clarification round: answer the prompt directly
                    inquiring_list.append({"uncertainty": None})
                    inquiry_for_prompt = None
                else:
                    question = self.generate_question(uncertainty, user_input, system_prompt)

                    # Reference answer for this prompt id, empty when no rubric matches
                    reference_answer = ""
                    if prompt_id in rubrics:
                        reference_answer = rubrics[prompt_id].get("reference_answer", "")

                    answer = self.get_answer(question, user_input, reference_answer)

                    inquiring_list.append({
                        "uncertainty": uncertainty,
                        "question": question,
                        "answer": answer
                    })
                    inquiry_for_prompt = inquiring_list

                # Use the raising variant so that a failed generation is recorded as an
                # error below instead of being stored (and later scored) as a response.
                response_content = self._invoke_final(user_input, system_prompt, inquiry_for_prompt)

                latency = time.time() - start_time

                # Success result dictionary
                result = {
                    "id": prompt_id,
                    "model_name": self.model_name,
                    "inquiring": inquiring_list,
                    "response": response_content,
                    "latency": latency,
                    "error": None # No error
                }
                print(f"  Successfully processed prompt {prompt_id} for {self.model_name} in {result['latency']:.2f}s")

            except Exception as e:
                # Error handling with detailed logging
                error_message = f"Error processing prompt {prompt_id} for {self.model_name}: {type(e).__name__}: {e}"
                print(f"  {error_message}")

                result = {
                    "id": prompt_id,
                    "model_name": self.model_name,
                    "inquiring": [{"uncertainty": None}],
                    "response": None,
                    "error": error_message,
                    "latency": 0
                }

            results.append(result)

        results_key = f"{self.model_name}_results"
        return {results_key: results}

In [7]:
# Create processors: each `process_*` is the bound `process` method used as a graph node
process_phi = QuestionAugmentedProcessor(model=phi, model_name="phi", answerer=answerer).process
# process_qwen1_5 = QuestionAugmentedProcessor(model=qwen1_5, model_name="qwen1_5", answerer=answerer).process
process_vicuna = QuestionAugmentedProcessor(model=vicuna, model_name="vicuna", answerer=answerer).process
# process_llama2 = QuestionAugmentedProcessor(model=llama2, model_name="llama2", answerer=answerer).process
# process_llama3_1 = QuestionAugmentedProcessor(model=llama3_1, model_name="llama3_1", answerer=answerer).process
# process_gemma3 = QuestionAugmentedProcessor(model=gemma3, model_name="gemma3", answerer=answerer).process
# process_deepseek_r1 = QuestionAugmentedProcessor(model=deepseek_r1, model_name="deepseek_r1", answerer=answerer).process

In [8]:
# --- define save intermediate results node ---
def save_intermediate_results(state: State) -> Dict:
    """Write every non-empty per-model result list to a JSON file before evaluation.

    The file is placed in "_output" and named from the state's timestamp and the
    benchmark file's base name. Always returns an empty dict (no state update);
    a failure while writing is printed, not raised.
    """
    print("--- Saving Intermediate Results (Responses with Inquiring Data) ---")

    # The default only covers a missing key; the check below covers a present-but-empty value.
    fallback_stamp = datetime.now().strftime("%Y%m%d%H%M%S_fallback")
    timestamp = state.get("timestamp", fallback_stamp)
    if not timestamp:
        print("  Warning: Timestamp not found in state. Generating a new one for fallback.")
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S_fallback")

    output_dir = "_output"
    os.makedirs(output_dir, exist_ok=True)

    # Keep each "*_results" entry that holds data, leaving out the evaluation output
    data_to_save = {
        key: state[key]
        for key in state
        if key.endswith("_results") and key != "evaluation_results" and state.get(key)
    }

    # To store the prompt information as well, add:
    # data_to_save["prompts"] = state.get("prompts", [])

    if not data_to_save:
        print("  No response data found to save.")
        return {}

    # Benchmark file name without directory or extension
    base_name = os.path.splitext(os.path.basename(bench_path))[0]
    output_file_path = os.path.join(
        output_dir, f"{timestamp}_{base_name}_straight_responses_only.json"
    )

    try:
        with open(output_file_path, 'w', encoding='utf-8') as out_file:
            json.dump(data_to_save, out_file, ensure_ascii=False, indent=4)
        print(f"  Intermediate results successfully saved to {output_file_path}")
    except Exception as e:
        print(f"  Error saving intermediate results: {e}")

    return {}

In [9]:
# --- Define Evaluation node ---

# Define the evaluation prompt template (based on the biggen_bench structure)
evaluation_prompt_template = ChatPromptTemplate.from_messages([
    ("system", """You are an expert evaluator. Your task is to evaluate an AI assistant's response based on the provided user query, reference answer, and a detailed scoring rubric.
Focus ONLY on the provided information and rubric. Assign a score from 1 to 5, where 5 is the best, according to the descriptions.
Provide your output strictly in the specified format."""),
    ("human", """
**Evaluation Context:**

* **Task Type:** {task_description}
* **User Query:**
    ```
    {user_query}
    ```
* **Reference Answer:**
    ```
    {reference_answer}
    ```
* **AI Response to Evaluate:**
    ```
    {ai_response}
    ```

**Scoring Rubric:**

* **Criteria:** {criteria}
* **Score 1 Description:** {score1_desc}
* **Score 2 Description:** {score2_desc}
* **Score 3 Description:** {score3_desc}
* **Score 4 Description:** {score4_desc}
* **Score 5 Description:** {score5_desc}

**Instructions:**

1.  Carefully compare the "AI Response to Evaluate" against the "Reference Answer" and the "Scoring Rubric".
2.  Determine the score (1-5) that best reflects the quality of the AI Response according to the rubric descriptions.
3.  Provide a brief rationale explaining *why* you chose that score, referencing specific aspects of the rubric descriptions and the AI response.

**Output Format (MUST follow exactly):**
Score: [Your score between 1-5]
Rationale: [Your concise explanation based on the rubric]
""")
])

# --- How this template is used inside evaluate_responses ---

def _parse_evaluation_output(evaluation_output_str):
    """Pull the score and the rationale out of the evaluator's reply.

    Returns:
        (score, rationale): score is an int from 1 to 5, or None when no
        "Score:" line could be parsed; rationale is "" when no "Rationale:"
        section is present.
    """
    # Single backslashes are required here: in a raw string "\\s" is a literal
    # backslash followed by "s", which never matches real output such as "Score: 4".
    # Optional "**" and "[" tolerate replies like "**Score:** 4" or "Score: [4]".
    score_match = re.search(r"Score:\s*\**\s*\[?\s*([1-5])(?!\d)", evaluation_output_str)
    rationale_match = re.search(r"Rationale:\s*\**\s*(.*)", evaluation_output_str, re.DOTALL)

    score = int(score_match.group(1)) if score_match else None
    rationale = rationale_match.group(1).strip() if rationale_match else ""
    return score, rationale


def evaluate_responses(state: State) -> Dict[str, Any]:
    """Evaluates only the final responses from different models based on rubrics.

    Every state key ending in "_results" (except "evaluation_results") is treated
    as a list of model results. For each result the evaluator model is asked to
    score the response against the benchmark item's reference answer and rubric.

    Args:
        state: Workflow state holding the per-model result lists.

    Returns:
        {"evaluation_results": [...]} with one entry per result. Entries carry
        "id", "agent_name", "score", "rationale" and "error"; evaluated entries
        also carry "latency". Results that failed upstream are recorded with
        score None, empty responses with score 0, and results whose prompt id
        is unknown are skipped with a warning.
    """
    print("--- Starting Evaluation ---")
    all_evaluations = []
    benchmark_map_full = {item["id"]: item for item in benchmark_data}

    parser = StrOutputParser()
    evaluation_chain = evaluation_prompt_template | evaluator | parser

    model_result_keys = [key for key in state if key.endswith("_results") and key != "evaluation_results"]

    for key in model_result_keys:
        print(f"  Evaluating results from: {key}")
        model_results = state.get(key, [])
        for response_item in model_results:
            prompt_id = response_item.get("id")
            model_name = response_item.get("model_name")
            response_content = response_item.get("response")  # Only evaluate the final response
            error = response_item.get("error")

            if error:
                # Generation already failed; record it without calling the evaluator
                eval_result = {"id": prompt_id, "agent_name": model_name, "score": None, "rationale": f"Skipped due to error: {error}", "error": True}
                all_evaluations.append(eval_result)
                continue

            if not prompt_id or prompt_id not in benchmark_map_full:
                print(f"    Warning: Missing full benchmark data for prompt ID {prompt_id}. Skipping.")
                continue

            benchmark_item = benchmark_map_full[prompt_id]

            if not response_content:
                # Score 0 sits outside the 1-5 rubric and marks "nothing to evaluate"
                eval_result = {"id": prompt_id, "agent_name": model_name, "score": 0, "rationale": "Empty response", "error": False}
                all_evaluations.append(eval_result)
                continue

            # Look the rubric up once; tolerate a missing or null rubric
            score_rubric = benchmark_item.get("score_rubric") or {}
            input_data = {
                "task_description": benchmark_item.get("task", "N/A"),
                "user_query": benchmark_item.get("input", "N/A"),
                "reference_answer": benchmark_item.get("reference_answer", "N/A"),
                "ai_response": response_content,
                "criteria": score_rubric.get("criteria", "N/A"),
                "score1_desc": score_rubric.get("score1_description", "N/A"),
                "score2_desc": score_rubric.get("score2_description", "N/A"),
                "score3_desc": score_rubric.get("score3_description", "N/A"),
                "score4_desc": score_rubric.get("score4_description", "N/A"),
                "score5_desc": score_rubric.get("score5_description", "N/A"),
            }

            try:
                start_time = time.time()
                evaluation_output_str = evaluation_chain.invoke(input_data)
                end_time = time.time()

                score, rationale = _parse_evaluation_output(evaluation_output_str)

                if score is None or not rationale:
                    # Keep the raw reply so an unparsable evaluation can be inspected later
                    print(f"    Warning: Could not parse score/rationale for prompt {prompt_id}. Raw output: {evaluation_output_str}")
                    rationale = f"Parsing Warning. Raw Output: {evaluation_output_str}"

                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": score,
                    "rationale": rationale, "latency": end_time - start_time, "error": False
                }
                print(f"    Evaluated prompt {prompt_id} from {model_name} in {eval_result['latency']:.2f}s. Score: {eval_result['score']}")

            except Exception as e:
                print(f"    Error evaluating prompt {prompt_id} from {model_name}: {e}")
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Evaluation failed: {str(e)}", "latency": 0, "error": True
                }
            all_evaluations.append(eval_result)

    print("--- Evaluation Finished ---")
    return {"evaluation_results": all_evaluations}

In [10]:
# --- define workflow ---

# 1. Create one timestamp before the workflow starts
current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# 2. Build the initial state, including that timestamp
initial_state = dict(
    prompts=prompts,
    rubrics=rubrics,
    # processed_count=0,
    phi_results=[],
    # qwen1_5_results=[],
    vicuna_results=[],
    # llama2_results=[],
    # llama3_1_results=[],
    # gemma3_results=[],
    # deepseek_r1_results=[],
    evaluation_results=[],
    timestamp=current_timestamp,  # shared by every node through the State
)

# create workflow
workflow = StateGraph(State)

# add nodes (uncomment an entry to enable that model)
for node_name, node_action in (
    ("process_phi", process_phi),
    # ("process_qwen1_5", process_qwen1_5),
    ("process_vicuna", process_vicuna),
    # ("process_llama2", process_llama2),
    # ("process_llama3", process_llama3_1),
    # ("process_gemma3", process_gemma3),
    ("save_responses", save_intermediate_results),
    ("evaluate", evaluate_responses),
):
    workflow.add_node(node_name, node_action)

# connect edges: the models run one after another, then save, then evaluate
workflow.set_entry_point("process_phi")
for source_node, target_node in (
    # ("process_phi", "process_qwen1_5"),
    # ("process_qwen1_5", "save_responses"),
    ("process_phi", "process_vicuna"),
    ("process_vicuna", "save_responses"),
    # ("process_qwen1_5", "process_vicuna"),
    # ("process_vicuna", "process_llama2"),
    # ("process_llama2", "process_llama3"),
    # ("process_llama3", "process_gemma3"),
    # ("process_gemma3", "save_responses"),
    ("save_responses", "evaluate"),
    ("evaluate", END),
):
    workflow.add_edge(source_node, target_node)

# compile workflow
app = workflow.compile()


In [11]:
# run workflow
print("--- Starting Workflow ---")
final_state = app.invoke(initial_state)
print("--- Workflow Finished ---")

# --- save final results (using timestamp from final_state) ---
final_output_dir = "_output"
os.makedirs(final_output_dir, exist_ok=True)

# 3. 최종 상태에서 타임스탬프 가져와서 사용
final_timestamp = final_state.get("timestamp", datetime.now().strftime("%Y%m%d%H%M%S_final_fallback"))
if not final_timestamp:
    print("  Warning: Timestamp not found in final state. Generating a new one for fallback.")
    final_timestamp = datetime.now().strftime("%Y%m%d%H%M%S_final_fallback")


final_base_name, _ = os.path.splitext(os.path.basename(bench_path)) # 파일명 생성시 bench_path에서 확장자 제거
final_filename = f"{final_timestamp}_{final_base_name}_straight_with_evaluation.json" # 수정됨
final_output_file_path = os.path.join(final_output_dir, final_filename)

try:
    with open(final_output_file_path, 'w', encoding='utf-8') as f:
        json.dump(final_state, f, ensure_ascii=False, indent=4)
    print(f"\nFinal results (with evaluation) successfully saved to {final_output_file_path}")
except Exception as e:
    print(f"Error saving final results: {e}")
    print(f"Final state type: {type(final_state)}")


--- Starting Workflow ---
--- Processing model: phi ---
  Successfully processed prompt instruction_following_multi_task_inference_0 for phi in 18.13s
  Successfully processed prompt theory_of_mind_thinking_for_doing_0 for phi in 9.73s
  Successfully processed prompt safety_knowledge_unlearning_0 for phi in 5.55s
  Successfully processed prompt refinement_rationale_revision_0 for phi in 3.84s
--- Processing model: vicuna ---
  Successfully processed prompt instruction_following_multi_task_inference_0 for vicuna in 53.46s
  Successfully processed prompt theory_of_mind_thinking_for_doing_0 for vicuna in 20.03s
  Successfully processed prompt safety_knowledge_unlearning_0 for vicuna in 5.14s
  Successfully processed prompt refinement_rationale_revision_0 for vicuna in 59.84s
--- Saving Intermediate Results (Responses with Inquiring Data) ---
  Intermediate results successfully saved to _output\20250507154042_biggen_bench_test_4instance_straight_responses_only.json
--- Starting Evaluation 