In [1]:
# import libraries
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
import time
import asyncio
from datetime import datetime
import json
import os
import re
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Define chat models
# llama2 / llama3_1 are the models under test (wrapped by the QAG processors below),
# `answerer` replies to their clarifying questions, and `evaluator` scores the final responses.
# The commented-out entries are alternative models that are currently disabled.
# o3_mini = init_chat_model("openai:o3-mini")
# claude_haiku = init_chat_model("anthropic:claude-3-haiku-20240307", temperature=0)
#gemma3 = init_chat_model("ollama:gemma3:12b", temperature=0)
llama2 = init_chat_model("ollama:llama2:latest", temperature=0)
llama3_1 = init_chat_model("ollama:llama3.1:latest", temperature=0)
#deepseek_r1 = init_chat_model("ollama:deepseek-r1:8b", temperature=0)
answerer = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)
evaluator = init_chat_model("anthropic:claude-3-7-sonnet-20250219", temperature=0)

In [3]:
# Prepare benchmark data, prompts and rubrics.

# Benchmark JSON file, resolved relative to the working directory.
# Its base name is also reused when naming the output files written later in the notebook.
bench_path = "biggen_bench_instruction_idx0.json"

def load_benchmark_data(bench_path: str) -> List[Dict]:
    """Read the UTF-8 JSON file at ``bench_path`` and return its parsed content."""
    with open(bench_path, mode="r", encoding="utf-8") as benchmark_file:
        return json.load(benchmark_file)

# Full benchmark records (including reference answers and rubrics), loaded once and shared by later cells.
benchmark_data = load_benchmark_data(bench_path)

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Build the prompt records handed to the models under test.

    Each record keeps only the fields listed below; ``reference_answer`` and
    ``score_rubric`` are deliberately left out. A benchmark item missing any
    listed field raises ``KeyError``.
    """
    exposed_fields = ("id", "capability", "task", "instance_idx", "system_prompt", "input")
    return [
        {field: entry[field] for field in exposed_fields}
        for entry in benchmark_data
    ]

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Build the grading records: one ``id`` / ``reference_answer`` / ``score_rubric`` dict per item.

    A benchmark item missing any of those three fields raises ``KeyError``.
    """
    grading_fields = ("id", "reference_answer", "score_rubric")
    return [
        {field: entry[field] for field in grading_fields}
        for entry in benchmark_data
    ]

# Split the benchmark into model-facing prompts and grader-facing rubrics (both are lists, one entry per item).
prompts = prepare_prompts(benchmark_data)
rubrics = prepare_rubric(benchmark_data)

In [4]:
# Define the state structure
class State(TypedDict):
    """Shared state passed between the nodes of the benchmark workflow."""
    # Model-facing prompt records (output of prepare_prompts).
    prompts: List[Dict[str, Any]]
    # Grading records (output of prepare_rubric): a list with one
    # {"id", "reference_answer", "score_rubric"} dict per benchmark item.
    rubrics: List[Dict[str, Any]]
    # NOTE(review): declared but not populated by the initial state in this notebook.
    processed_count: int
    # gemma3_results: List[Dict[str, Any]]
    # Per-model result lists; the key must be "<model_name>_results".
    llama2_results: List[Dict[str, Any]]
    llama3_1_results: List[Dict[str, Any]]
    # deepseek_r1_results: List[Dict[str, Any]]
    evaluation_results: List[Dict[str, Any]]
    timestamp: str  # Timestamp shared across the whole workflow (used in output file names)

In [5]:
# QuestionAugmentedProcessor Class to replace the create_model_processor function
class QuestionAugmentedProcessor:
    """Run a Question Augmented Generation (QAG) loop for one chat model.

    For every prompt the model under test is asked whether it needs
    clarification. While it does (at most ``max_questions`` rounds) it
    formulates a question, the answerer model replies using the benchmark's
    reference answer as hidden context, and the exchange is recorded. The model
    then produces its final response with all recorded exchanges in context.

    Args:
        model: Chat model under test; must expose ``invoke(messages)``.
        model_name: Label used in log lines, in result records and in the
            ``"<model_name>_results"`` key returned by :meth:`process`.
        answerer: Chat model that answers the clarifying questions.
        benchmark_data: Benchmark items; each must have an ``"id"`` and is
            looked up for its ``"reference_answer"``.
        max_questions: Maximum number of clarification rounds per prompt
            (default 3, the cap the original loop enforced).
    """

    # Replies (compared lower-cased, surrounding quotes/punctuation removed)
    # that mean "no clarification needed".
    _NO_UNCERTAINTY_REPLIES = frozenset({
        "none",
        "no",
        "no uncertainty",
        "i don't need any additional information",
    })

    def __init__(self, model, model_name, answerer, benchmark_data, max_questions=3):
        self.model = model
        self.model_name = model_name
        self.answerer = answerer
        self.benchmark_data = benchmark_data
        self.benchmark_map = {item["id"]: item for item in benchmark_data}
        self.max_questions = max_questions

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _build_messages(system_prompt, user_prompt):
        """Return the chat input as (role, content) pairs.

        The system prompt has to travel as a system message: an unknown key in
        ``invoke(..., config=...)`` is not part of the prompt, so the benchmark's
        system prompt would never reach the model that way.
        """
        messages = []
        if system_prompt:
            messages.append(("system", system_prompt))
        messages.append(("human", user_prompt))
        return messages

    @staticmethod
    def _extract_text(response):
        """Return the text of a model response (message object, dict or str)."""
        if isinstance(response, str):
            return response
        if isinstance(response, dict) and "content" in response:
            content = response["content"]
        elif hasattr(response, "content"):
            content = response.content
        else:
            return str(response)

        # Message content may be a list of content blocks instead of a plain string.
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and isinstance(block.get("text"), str):
                    parts.append(block["text"])
            return "".join(parts)
        return content if isinstance(content, str) else str(content)

    @staticmethod
    def _format_inquiries(inquiring_list):
        """Render recorded question/answer pairs as prompt text ('' when there are none)."""
        rendered = ""
        for i, inquiry in enumerate(inquiring_list or []):
            question_key = f"question_{i}"
            answer_key = f"answer_{i}"
            if question_key in inquiry and answer_key in inquiry:
                rendered += f"\nQuestion {i+1}: {inquiry[question_key]}\n"
                rendered += f"Answer {i+1}: {inquiry[answer_key]}\n"
        return rendered

    def _ask_model(self, system_prompt, user_prompt):
        """Send one system+human exchange to the model under test and return the stripped text."""
        response = self.model.invoke(self._build_messages(system_prompt, user_prompt))
        return self._extract_text(response).strip()

    # --------------------------------------------------------------- QAG steps

    def generate_uncertainty_check(self, system_prompt, user_input, inquiring_list=None):
        """Check if the model needs additional clarification before answering.

        Args:
            system_prompt: System prompt of the benchmark item (may be empty).
            user_input: The task text.
            inquiring_list: Clarifications gathered so far. They are included in
                the prompt so the model can decide whether anything is *still*
                unclear instead of repeating its first assessment.

        Returns:
            A short description of the missing information, or ``None`` when the
            model reports no uncertainty or the call fails.
        """
        clarifications = self._format_inquiries(inquiring_list)
        context_section = ""
        if clarifications:
            context_section = f"""
        You have already asked clarifying questions and received these answers:
        {clarifications}
        Only report an uncertainty that these answers do not already resolve.
        """

        uncertainty_prompt = f"""
        Given the following task:
        
        "{user_input}"
        {context_section}
        Your goal is to determine if you need any additional information or clarification to provide a comprehensive response.
        
        If you have all the information needed to respond confidently, reply with "None".
        If you need clarification, provide a brief phrase describing what specific information you need.
        Keep your response brief and focused.
        """

        try:
            uncertainty = self._ask_model(system_prompt, uncertainty_prompt)
        except Exception as e:
            # Best effort: a failed check is treated as "no clarification needed".
            print(f"Error checking uncertainty: {e}")
            return None

        # Models often answer 'None.' or '"None"'; ignore quotes and punctuation around the reply.
        normalized = uncertainty.strip(" \t\r\n\"'`*.!").lower()
        if not normalized or normalized in self._NO_UNCERTAINTY_REPLIES:
            return None
        return uncertainty

    def generate_question(self, system_prompt, user_input, uncertainty, question_num):
        """Generate a clarifying question based on uncertainty.

        ``question_num`` is zero-based; the prompt shows it one-based. Returns
        the question text, or a placeholder error string when the call fails.
        """
        question_prompt = f"""
        Given the following task:
        
        "{user_input}"
        
        You have identified the following uncertainty: "{uncertainty}"
        
        Formulate a clear, specific question (Question #{question_num+1}) that would help resolve this uncertainty.
        Your question should be brief, focused, and directly related to the uncertainty.
        """

        try:
            return self._ask_model(system_prompt, question_prompt)
        except Exception as e:
            print(f"Error generating question: {e}")
            return f"Error generating question {question_num+1}."

    def generate_answer(self, prompt_id, question, user_input):
        """Generate an answer to a clarifying question using the answerer model.

        The benchmark's reference answer for ``prompt_id`` (empty when the id is
        unknown) is given to the answerer as hidden context. Returns the answer
        text, or a placeholder error string when the call fails.
        """
        benchmark_item = self.benchmark_map.get(prompt_id, {})
        reference_answer = benchmark_item.get("reference_answer", "")

        answer_prompt = f"""
        You are providing clarification to an AI that is working on the following task:
        
        "{user_input}"
        
        The AI has asked: "{question}"
        
        Provide a brief, focused answer to this question that offers useful clarification without solving the entire task.
        Use the following reference information to inform your answer, but don't simply state the final answer:
        
        REFERENCE (DO NOT EXPLICITLY SHARE THIS): {reference_answer}
        
        Your response should be helpful but not provide the complete solution directly.
        """

        try:
            answer_response = self.answerer.invoke(answer_prompt)
            return self._extract_text(answer_response).strip()
        except Exception as e:
            print(f"Error generating answer: {e}")
            return "Error generating answer to question."

    def generate_final_response(self, system_prompt, user_input, inquiring_list):
        """Generate the final response based on original input and inquiring results.

        Returns the response text, or a placeholder error string when the call fails.
        """
        inquiring_context = self._format_inquiries(inquiring_list)

        final_prompt = f"""
        Given the following task:
        
        "{user_input}"
        
        And the following additional context from clarifying questions and answers:
        {inquiring_context if inquiring_context else "No additional context available."}
        
        Provide your complete and final response to the original task.
        """

        try:
            return self._ask_model(system_prompt, final_prompt)
        except Exception as e:
            print(f"Error generating final response: {e}")
            return "Error generating final response."

    # ------------------------------------------------------------- entry point

    def _process_prompt(self, prompt):
        """Run the full QAG loop for a single prompt record and return its result dict."""
        prompt_id = prompt.get('id', 'unknown_id')
        system_prompt = prompt.get('system_prompt', '')
        user_input = prompt.get('input', '')

        if not user_input:
            print(f"  Skipping prompt {prompt_id} for {self.model_name} due to empty input.")
            return {
                "id": prompt_id,
                "model_name": self.model_name,
                "inquiring": [],
                "response": None,
                "error": "Skipped due to empty input",
                "latency": 0
            }

        try:
            start_time = time.time()

            # One entry per clarification round: uncertainty, question_<n>, answer_<n>.
            inquiring_list = []
            uncertainty = self.generate_uncertainty_check(system_prompt, user_input)

            question_count = 0
            while uncertainty and question_count < self.max_questions:
                question = self.generate_question(system_prompt, user_input, uncertainty, question_count)
                answer = self.generate_answer(prompt_id, question, user_input)
                inquiring_list.append({
                    "uncertainty": uncertainty,
                    f"question_{question_count}": question,
                    f"answer_{question_count}": answer,
                })
                question_count += 1

                # Re-check with what has been learned so far; skip the extra
                # model call once the round limit is reached.
                if question_count < self.max_questions:
                    uncertainty = self.generate_uncertainty_check(system_prompt, user_input, inquiring_list)
                else:
                    uncertainty = None

            final_response = self.generate_final_response(system_prompt, user_input, inquiring_list)
            latency = time.time() - start_time

            result = {
                "id": prompt_id,
                "model_name": self.model_name,
                "inquiring": inquiring_list,
                "response": final_response,
                "latency": latency,
                "error": None
            }
            print(f"  Successfully processed prompt {prompt_id} for {self.model_name} in {result['latency']:.2f}s with {len(inquiring_list)} inquiries")
            return result

        except Exception as e:
            error_message = f"Error processing prompt {prompt_id} for {self.model_name}: {type(e).__name__}: {e}"
            print(f"  {error_message}")
            return {
                "id": prompt_id,
                "model_name": self.model_name,
                "inquiring": [],  # Empty inquiring list on error
                "response": None,
                "error": error_message,
                "latency": 0
            }

    def process(self, state):
        """Main processor method that handles the entire QAG workflow.

        Args:
            state: Mapping with a ``"prompts"`` list of prompt records.

        Returns:
            ``{"<model_name>_results": [...]}`` with one result record per
            prompt, or ``{}`` when the state contains no prompts.
        """
        print(f"--- Processing model: {self.model_name} with Question Augmented Generation ---")

        prompts_to_process = state.get("prompts", [])
        if not prompts_to_process:
            print(f"  No prompts found for {self.model_name}.")
            return {}  # Return empty if no prompts

        results = [self._process_prompt(prompt) for prompt in prompts_to_process]
        return {f"{self.model_name}_results": results}

In [6]:

# Create one QAG processor per model under test. The second argument is the
# model name; process() returns its results under "<model_name>_results",
# which must match a field of State.
llama2_processor = QuestionAugmentedProcessor(llama2, "llama2", answerer, benchmark_data)
llama3_1_processor = QuestionAugmentedProcessor(llama3_1, "llama3_1", answerer, benchmark_data)
# process_deepseek_r1 = create_model_processor(deepseek_r1, "deepseek_r1")

# Define the process functions that will be used as nodes
def process_llama2(state):
    """Workflow node: run the llama2 QAG processor on ``state`` and return its state update."""
    state_update = llama2_processor.process(state)
    return state_update

def process_llama3_1(state):
    """Workflow node: run the llama3.1 QAG processor on ``state`` and return its state update."""
    state_update = llama3_1_processor.process(state)
    return state_update

In [7]:
# --- define save intermediate results node ---
def save_intermediate_results(state: State) -> Dict:
    """Saves the generated responses before evaluation using the timestamp from the state.

    Every non-empty ``*_results`` entry of ``state`` except ``evaluation_results``
    is written to ``_output/<timestamp>_<benchmark base name>_responses_only.json``.
    Saving is best effort: a write failure is printed, not raised.

    Args:
        state: Current workflow state.

    Returns:
        Always ``{}``; this node does not modify the state.
    """
    print("--- Saving Intermediate Results (Responses Only) ---")

    # Take the timestamp from the state. A missing key and an empty value are
    # handled the same way, so the warning is printed in both cases.
    timestamp = state.get("timestamp")
    if not timestamp:
        print("  Warning: Timestamp not found in state. Generating a new one for fallback.")
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S_fallback")

    output_dir = "_output"
    os.makedirs(output_dir, exist_ok=True)  # make sure the output directory exists

    # Keep only the per-model response lists that actually contain data.
    data_to_save = {
        key: value
        for key, value in state.items()
        if key.endswith("_results") and key != "evaluation_results" and value
    }

    # To store the prompts alongside the responses, add:
    # data_to_save["prompts"] = state.get("prompts", [])

    if not data_to_save:
        print("  No response data found to save.")
        return {}  # nothing to save

    # File name is built from the benchmark file's base name without its extension.
    base_name, _ = os.path.splitext(os.path.basename(bench_path))
    intermediate_filename = f"{timestamp}_{base_name}_responses_only.json"
    output_file_path = os.path.join(output_dir, intermediate_filename)

    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_to_save, f, ensure_ascii=False, indent=4)
        print(f"  Intermediate results successfully saved to {output_file_path}")
    except Exception as e:
        print(f"  Error saving intermediate results: {e}")

    # This node does not change the state, so it returns an empty dict.
    return {}

In [8]:
# --- Define Evaluation node ---

# Evaluation prompt template (based on the structure of biggen_bench_instruction_idx0.json).
# The {placeholders} are filled from the input_data dict built in evaluate_responses.
evaluation_prompt_template = ChatPromptTemplate.from_messages([
    ("system", """You are an expert evaluator. Your task is to evaluate an AI assistant's response based on the provided user query, reference answer, and a detailed scoring rubric.
Focus ONLY on the provided information and rubric. Assign a score from 1 to 5, where 5 is the best, according to the descriptions.
Provide your output strictly in the specified format."""),
    ("human", """
**Evaluation Context:**

* **Task Type:** {task_description}
* **User Query:**
    ```
    {user_query}
    ```
* **Reference Answer:**
    ```
    {reference_answer}
    ```
* **AI Response to Evaluate:**
    ```
    {ai_response}
    ```

**Scoring Rubric:**

* **Criteria:** {criteria}
* **Score 1 Description:** {score1_desc}
* **Score 2 Description:** {score2_desc}
* **Score 3 Description:** {score3_desc}
* **Score 4 Description:** {score4_desc}
* **Score 5 Description:** {score5_desc}

**Instructions:**

1.  Carefully compare the "AI Response to Evaluate" against the "Reference Answer" and the "Scoring Rubric".
2.  Determine the score (1-5) that best reflects the quality of the AI Response according to the rubric descriptions.
3.  Provide a brief rationale explaining *why* you chose that score, referencing specific aspects of the rubric descriptions and the AI response.

**Output Format (MUST follow exactly):**
Score: [Your score between 1-5]
Rationale: [Your concise explanation based on the rubric]
""")
])

# --- How the template above is used inside evaluate_responses ---

def _parse_evaluation_output(evaluation_output_str):
    """Extract ``(score, rationale)`` from the evaluator's raw text.

    The score must be a single digit 1-5 after ``Score:``. Markdown bold markers
    and the square brackets shown in the prompt's format example are tolerated
    (``Score: [4]``, ``**Score:** 4``); digits outside 1-5 and multi-digit
    numbers are rejected. ``score`` is ``None`` and ``rationale`` is ``""``
    when the respective part cannot be found.
    """
    score = None
    rationale = ""
    score_match = re.search(r"Score:\**\s*\**\s*\[?\s*([1-5])(?!\d)", evaluation_output_str)
    # An optional closing bold marker directly after the colon is dropped; the rest is kept verbatim.
    rationale_match = re.search(r"Rationale:(?:\*\*)?\s*(.*)", evaluation_output_str, re.DOTALL)

    if score_match:
        score = int(score_match.group(1))
    if rationale_match:
        rationale = rationale_match.group(1).strip()
    return score, rationale


def evaluate_responses(state: State) -> Dict[str, Any]:
    """Evaluates responses from different models based on rubrics.

    Every ``*_results`` list in ``state`` (except ``evaluation_results``) is
    scored by the evaluator model against the benchmark item's reference answer
    and rubric, looked up by prompt id in the module-level ``benchmark_data``.

    Args:
        state: Workflow state holding the per-model result lists.

    Returns:
        ``{"evaluation_results": [...]}`` with one record per evaluated
        response. Each record has ``id``, ``agent_name``, ``score``,
        ``rationale``, ``latency`` and ``error``. Responses whose prompt id is
        not in the benchmark are skipped with a warning and produce no record.
    """
    print("--- Starting Evaluation ---")
    all_evaluations = []
    # Full benchmark records keyed by id (benchmark_data is a module-level variable).
    benchmark_map_full = {item["id"]: item for item in benchmark_data}

    parser = StrOutputParser()
    evaluation_chain = evaluation_prompt_template | evaluator | parser

    model_result_keys = [key for key in state if key.endswith("_results") and key != "evaluation_results"]

    for key in model_result_keys:
        print(f"  Evaluating results from: {key}")
        model_results = state.get(key, [])
        for response_item in model_results:
            prompt_id = response_item.get("id")
            model_name = response_item.get("model_name")
            response_content = response_item.get("response")
            error = response_item.get("error")

            if error:
                # Generation already failed; record it without calling the evaluator.
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Skipped due to error: {error}", "latency": 0, "error": True
                }
                all_evaluations.append(eval_result)
                continue

            if not prompt_id or prompt_id not in benchmark_map_full:
                print(f"    Warning: Missing full benchmark data for prompt ID {prompt_id}. Skipping.")
                continue

            benchmark_item = benchmark_map_full[prompt_id]  # full benchmark record for this id

            if not response_content:
                # Empty responses get 0, which is below the rubric's 1-5 range, without an evaluator call.
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": 0,
                    "rationale": "Empty response", "latency": 0, "error": False
                }
                all_evaluations.append(eval_result)
                continue

            # Map the benchmark record and the model response onto the
            # placeholder names of evaluation_prompt_template.
            score_rubric = benchmark_item.get("score_rubric", {})
            input_data = {
                "task_description": benchmark_item.get("task", "N/A"),
                "user_query": benchmark_item.get("input", "N/A"),
                "reference_answer": benchmark_item.get("reference_answer", "N/A"),
                "ai_response": response_content,
                "criteria": score_rubric.get("criteria", "N/A"),
                "score1_desc": score_rubric.get("score1_description", "N/A"),
                "score2_desc": score_rubric.get("score2_description", "N/A"),
                "score3_desc": score_rubric.get("score3_description", "N/A"),
                "score4_desc": score_rubric.get("score4_description", "N/A"),
                "score5_desc": score_rubric.get("score5_description", "N/A"),
            }

            try:
                start_time = time.time()
                evaluation_output_str = evaluation_chain.invoke(input_data)
                end_time = time.time()

                score, rationale = _parse_evaluation_output(evaluation_output_str)

                if score is None or not rationale:
                    # Keep the raw output so an unparseable evaluation can be inspected later.
                    print(f"    Warning: Could not parse score/rationale for prompt {prompt_id}. Raw output: {evaluation_output_str}")
                    rationale = f"Parsing Warning. Raw Output: {evaluation_output_str}"

                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": score,
                    "rationale": rationale, "latency": end_time - start_time, "error": False
                }
                print(f"    Evaluated prompt {prompt_id} from {model_name} in {eval_result['latency']:.2f}s. Score: {eval_result['score']}")

            except Exception as e:
                print(f"    Error evaluating prompt {prompt_id} from {model_name}: {e}")
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Evaluation failed: {str(e)}", "latency": 0, "error": True
                }
            all_evaluations.append(eval_result)

    print("--- Evaluation Finished ---")
    return {"evaluation_results": all_evaluations}

In [None]:
# --- define workflow and run ---

# 1. Create the timestamp before the workflow starts
current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# 2. Build the initial state, including the timestamp created above
initial_state = {
    "prompts": prompts,
    "rubrics": rubrics,
#    "processed_count": 0,
    "llama2_results": [],
    "llama3_1_results": [],
    # "deepseek_r1_results": [],
    "evaluation_results": [],
    "timestamp": current_timestamp  # share the timestamp with every node through the state
}

# create workflow
workflow = StateGraph(State)

# Add nodes
# workflow.add_node("process_gemma3", process_gemma3)
workflow.add_node("process_llama2", process_llama2)
workflow.add_node("process_llama3", process_llama3_1)
workflow.add_node("save_responses", save_intermediate_results)
workflow.add_node("evaluate", evaluate_responses)

# Connect edges: the nodes run strictly in sequence
# (llama2 -> llama3 -> save responses -> evaluate -> END)
workflow.set_entry_point("process_llama2")
workflow.add_edge("process_llama2", "process_llama3")
workflow.add_edge("process_llama3", "save_responses")
workflow.add_edge("save_responses", "evaluate")
workflow.add_edge("evaluate", END)

# compile workflow
app = workflow.compile()

# run workflow
print("--- Starting Workflow ---")
final_state = app.invoke(initial_state)
print("--- Workflow Finished ---")


In [None]:
# --- save final results (using timestamp from final_state) ---
final_output_dir = "_output"
os.makedirs(final_output_dir, exist_ok=True)

# 3. Reuse the workflow timestamp from the final state. A missing key and an
#    empty value are handled the same way, so the warning is printed in both cases.
final_timestamp = final_state.get("timestamp")
if not final_timestamp:
    print("  Warning: Timestamp not found in final state. Generating a new one for fallback.")
    final_timestamp = datetime.now().strftime("%Y%m%d%H%M%S_final_fallback")


# File name is built from the benchmark file's base name without its extension.
final_base_name, _ = os.path.splitext(os.path.basename(bench_path))
final_filename = f"{final_timestamp}_{final_base_name}_with_evaluation.json"
final_output_file_path = os.path.join(final_output_dir, final_filename)

# Best effort: a failure to serialize or write the state is reported, not raised.
try:
    with open(final_output_file_path, 'w', encoding='utf-8') as f:
        json.dump(final_state, f, ensure_ascii=False, indent=4)
    print(f"\nFinal results (with evaluation) successfully saved to {final_output_file_path}")
except Exception as e:
    print(f"Error saving final results: {e}")
    print(f"Final state type: {type(final_state)}")
    print(f"Final state type: {type(final_state)}")
