In [1]:
# import libraries
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
import time
import asyncio
from datetime import datetime
import json
import os
import re
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Define chat models
# Every handle below is created with init_chat_model; the string argument appears to
# follow a "<provider>:<model>" form (see the init_chat_model documentation to confirm).
# All handles except o3_mini are created with temperature=0.

# Hosted models. NOTE(review): o3_mini, claude_haiku and claude_sonnet are not referenced
# anywhere else in the visible part of this notebook — confirm whether they are still needed.
o3_mini = init_chat_model("openai:o3-mini")
claude_haiku = init_chat_model("anthropic:claude-3-haiku-20240307", temperature=0)
claude_sonnet = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)
# Ollama models. phi, qwen1_5, vicuna, llama2, llama3_1 and gemma3 are wrapped into
# processor nodes further down; the deepseek_r1 processor is currently commented out.
phi = init_chat_model("ollama:phi:latest", temperature=0)
qwen1_5 = init_chat_model("ollama:qwen:0.5b", temperature=0)
vicuna = init_chat_model("ollama:vicuna:7b", temperature=0)
llama2 = init_chat_model("ollama:llama2:latest", temperature=0)
llama3_1 = init_chat_model("ollama:llama3.1:latest", temperature=0)
deepseek_r1 = init_chat_model("ollama:deepseek-r1:8b", temperature=0)
gemma3 = init_chat_model("ollama:gemma3:12b", temperature=0)
# Judge model: evaluate_responses pipes the evaluation prompt into this handle.
evaluator = init_chat_model("openai:gpt-4-turbo-2024-04-09", temperature=0)

In [3]:
# Preparing benchmark data and prompts, rubric

# Relative path of the benchmark JSON file. Besides being loaded below, its base name
# (without extension) is reused when building the output file names.
bench_path = "biggen_bench_test_4instance.json"

def load_benchmark_data(bench_path: str) -> List[Dict]:
    """Read the UTF-8 JSON file at ``bench_path`` and return its parsed content.

    Args:
        bench_path: Path of the benchmark JSON file.

    Returns:
        Whatever the JSON document decodes to (the callers in this notebook
        iterate over it as a list of dicts).
    """
    with open(bench_path, mode='r', encoding='utf-8') as bench_file:
        raw_text = bench_file.read()
    return json.loads(raw_text)

# Loaded once at module level; evaluate_responses reads this global directly
# (it builds its id -> item lookup from it) rather than taking it from the graph state.
benchmark_data = load_benchmark_data(bench_path)

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Build the model-facing prompt records from the benchmark items.

    Only the fields a model is allowed to see are copied; ``reference_answer``
    and ``score_rubric`` are deliberately left out.

    Args:
        benchmark_data: Benchmark items; each must contain every copied field.

    Returns:
        One dict per item with the keys id, capability, task, instance_idx,
        system_prompt and input (in that order).

    Raises:
        KeyError: If an item lacks one of the copied fields.
    """
    visible_fields = ("id", "capability", "task", "instance_idx", "system_prompt", "input")
    return [
        {field: entry[field] for field in visible_fields}
        for entry in benchmark_data
    ]

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Build the grading records (reference answer + scoring rubric) per benchmark item.

    Args:
        benchmark_data: Benchmark items; each must contain ``id``,
            ``reference_answer`` and ``score_rubric``.

    Returns:
        One dict per item with exactly those three keys, in that order.

    Raises:
        KeyError: If an item lacks one of the three fields.
    """
    grading_fields = ("id", "reference_answer", "score_rubric")
    return [
        {field: entry[field] for field in grading_fields}
        for entry in benchmark_data
    ]

# Split the benchmark into what the models see (prompts) and what is kept for grading (rubrics).
# Both are placed into the workflow's initial state further down.
prompts = prepare_prompts(benchmark_data)
rubrics = prepare_rubric(benchmark_data)

In [4]:
# Define the state structure
class State(TypedDict):
    """Shared LangGraph state passed between the workflow nodes.

    Each ``<model>_results`` field is written by the processor node created for
    that model name; ``evaluation_results`` is written by the evaluation node.
    """
    # Model-facing prompt records (as produced by prepare_prompts).
    prompts: List[Dict[str, Any]]
    # Grading records. prepare_rubric returns a list (one record per benchmark
    # item), so this is annotated as a list rather than a dict keyed by id.
    rubrics: List[Dict[str, Any]]
#    processed_count: int
    phi_results: List[Dict[str, Any]]
    qwen1_5_results: List[Dict[str, Any]]
    vicuna_results: List[Dict[str, Any]]
    llama2_results: List[Dict[str, Any]]
    llama3_1_results: List[Dict[str, Any]]
    gemma3_results: List[Dict[str, Any]]
#    deepseek_r1_results: List[Dict[str, Any]]
    evaluation_results: List[Dict[str, Any]]
    timestamp: str  # Timestamp shared across the whole workflow (used in output file names)

In [5]:
# processor_model function (node)

def create_model_processor(model, model_name):
    """Create a LangGraph node function that runs every prompt in the state through one model.

    Args:
        model: Chat model object exposing ``invoke``; called once per prompt.
        model_name: Short label used in log lines, stored in every result record,
            and used to build the state key ``f"{model_name}_results"``.

    Returns:
        A ``process_model(state)`` function. It returns
        ``{f"{model_name}_results": [...]}`` with one record per prompt
        (keys: id, model_name, response, latency, error), or ``{}`` when the
        state holds no prompts. Per-prompt failures are recorded in the
        ``error`` field instead of being raised.
    """
    def process_model(state: State) -> Dict[str, Any]:
        results = []
        print(f"--- Processing model: {model_name} ---")
        prompts_to_process = state.get("prompts", [])
        if not prompts_to_process:
            print(f"  No prompts found for {model_name}.")
            return {}  # Nothing to process: leave the state untouched

        for prompt in prompts_to_process:
            prompt_id = prompt.get('id', 'unknown_id')  # Extract the id first so it can be logged
            system_prompt = prompt.get('system_prompt', '')
            user_input = prompt.get('input', '')

            if not user_input:
                print(f"  Skipping prompt {prompt_id} for {model_name} due to empty input.")
                results.append({
                    "id": prompt_id,
                    "model_name": model_name,
                    "response": None,
                    "error": "Skipped due to empty input",
                    "latency": 0
                })
                continue

            # The system prompt has to travel as a chat message. Passing it through
            # ``config`` does not work: ``config`` carries run configuration
            # (callbacks, tags, ...) and its content is never sent to the model.
            messages = [("human", user_input)]
            if system_prompt:
                messages.insert(0, ("system", system_prompt))

            try:
                # perf_counter is monotonic, so the latency cannot be skewed by clock adjustments
                start_time = time.perf_counter()
                response = model.invoke(messages)
                latency = time.perf_counter() - start_time

                # Extract the response text (handles several possible response types)
                response_content = None
                if isinstance(response, BaseMessage) and hasattr(response, 'content'):
                    response_content = response.content
                elif isinstance(response, str):
                    response_content = response
                elif isinstance(response, dict) and 'content' in response:  # e.g. a raw dict response
                    response_content = response['content']
                else:
                    # Unexpected response type: fall back to its string representation
                    try:
                        response_content = str(response)
                        print(f"  Warning: Unexpected response type for {prompt_id} from {model_name}. Type: {type(response)}. Content extracted as string.")
                    except Exception as str_err:
                        print(f"  Error converting unexpected response type to string for {prompt_id} from {model_name}: {str_err}")
                        raise ValueError(f"Unexpected response type and failed to convert to string: {type(response)}")

                # Record for a successful call
                result = {
                    "id": prompt_id,
                    "model_name": model_name,
                    "response": response_content,
                    "latency": latency,
                    "error": None  # No error on success
                }
                print(f"  Successfully processed prompt {prompt_id} for {model_name} in {result['latency']:.2f}s")

            except Exception as e:
                # Log the failure in detail and keep going with the next prompt
                error_message = f"Error processing prompt {prompt_id} for {model_name}: {type(e).__name__}: {e}"
                print(f"  {error_message}")

                result = {
                    "id": prompt_id,
                    "model_name": model_name,
                    "response": None,  # No response on failure
                    "error": error_message,
                    "latency": 0  # Latency is reported as 0 on failure
                }
            results.append(result)

        results_key = f"{model_name}_results"
        return {results_key: results}  # Return only the state field this node changed
    return process_model



In [10]:
# Create processors
# One node function per model. The second argument is the model label: it determines the
# state key the node writes to ("<label>_results"), so it must match a field of State.
process_phi = create_model_processor(phi, "phi")
process_qwen1_5 = create_model_processor(qwen1_5, "qwen1_5")
process_vicuna = create_model_processor(vicuna, "vicuna")
process_llama2 = create_model_processor(llama2, "llama2")
process_llama3_1 = create_model_processor(llama3_1, "llama3_1")
process_gemma3 = create_model_processor(gemma3, "gemma3")
# deepseek_r1 is currently disabled (its State field is commented out as well).
# process_deepseek_r1 = create_model_processor(deepseek_r1, "deepseek_r1")

In [7]:
# --- define save intermediate results node ---
def save_intermediate_results(state: State) -> Dict:
    """Save the generated model responses to a JSON file before evaluation.

    Writes every non-empty ``*_results`` state field except ``evaluation_results``
    to ``_output/<timestamp>_<benchmark base name>_straight_responses_only.json``.
    Failures while saving are reported on stdout and do not stop the workflow.

    Args:
        state: Current workflow state; ``timestamp`` is used in the file name.

    Returns:
        An empty dict: this node never modifies the state.
    """
    print("--- Saving Intermediate Results (Responses Only) ---")
    # Take the timestamp from the state. Fetching it without a default makes the
    # warning below fire both when the key is missing and when it is empty.
    timestamp = state.get("timestamp")
    if not timestamp:
        print("  Warning: Timestamp not found in state. Generating a new one for fallback.")
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S_fallback")

    output_dir = "_output"
    os.makedirs(output_dir, exist_ok=True)  # Make sure the output directory exists

    # Collect only the model response fields that actually contain data
    data_to_save = {
        key: state[key]
        for key in state
        if key.endswith("_results") and key != "evaluation_results" and state.get(key)
    }

    # To store the prompts alongside the responses, add them here:
    # data_to_save["prompts"] = state.get("prompts", [])

    if not data_to_save:
        print("  No response data found to save.")
        return {}  # Nothing to save

    base_name, _ = os.path.splitext(os.path.basename(bench_path))  # Benchmark file name without extension
    intermediate_filename = f"{timestamp}_{base_name}_straight_responses_only.json"
    output_file_path = os.path.join(output_dir, intermediate_filename)

    try:
        # Serialize before opening the file so a serialization error cannot
        # leave a truncated JSON file behind.
        serialized = json.dumps(data_to_save, ensure_ascii=False, indent=4)
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
        print(f"  Intermediate results successfully saved to {output_file_path}")
    except Exception as e:
        # Best effort: saving is a side output, so report and continue
        print(f"  Error saving intermediate results: {e}")

    # This node does not change the state, so it returns an empty dict
    return {}

In [8]:
# --- Define Evaluation node ---

# Evaluation prompt template (based on the biggen_bench item structure).
# Template variables, all supplied by evaluate_responses: task_description, user_query,
# reference_answer, ai_response, criteria, score1_desc ... score5_desc.
# The "Output Format" section matters: evaluate_responses extracts the result by
# searching the judge's reply for the "Score:" and "Rationale:" labels.
evaluation_prompt_template = ChatPromptTemplate.from_messages([
    ("system", """You are an expert evaluator. Your task is to evaluate an AI assistant's response based on the provided user query, reference answer, and a detailed scoring rubric.
Focus ONLY on the provided information and rubric. Assign a score from 1 to 5, where 5 is the best, according to the descriptions.
Provide your output strictly in the specified format."""),
    ("human", """
**Evaluation Context:**

* **Task Type:** {task_description}
* **User Query:**
    ```
    {user_query}
    ```
* **Reference Answer:**
    ```
    {reference_answer}
    ```
* **AI Response to Evaluate:**
    ```
    {ai_response}
    ```

**Scoring Rubric:**

* **Criteria:** {criteria}
* **Score 1 Description:** {score1_desc}
* **Score 2 Description:** {score2_desc}
* **Score 3 Description:** {score3_desc}
* **Score 4 Description:** {score4_desc}
* **Score 5 Description:** {score5_desc}

**Instructions:**

1.  Carefully compare the "AI Response to Evaluate" against the "Reference Answer" and the "Scoring Rubric".
2.  Determine the score (1-5) that best reflects the quality of the AI Response according to the rubric descriptions.
3.  Provide a brief rationale explaining *why* you chose that score, referencing specific aspects of the rubric descriptions and the AI response.

**Output Format (MUST follow exactly):**
Score: [Your score between 1-5]
Rationale: [Your concise explanation based on the rubric]
""")
])

# --- How the template above is used inside evaluate_responses ---

def _parse_evaluation_output(evaluation_output_str):
    """Extract ``(score, rationale)`` from the judge's raw reply.

    The score must be a single digit from 1 to 5 after the "Score:" label.
    Decoration between the label and the digit (spaces, ``[``, ``**``) is
    tolerated, so "Score: [4]" and "**Score:** 4" both parse, while "Score: 10"
    or "Score: 0" do not. ``score`` is ``None`` and ``rationale`` is ``""``
    when the respective part cannot be found.
    """
    score_match = re.search(r"Score:\W*([1-5])(?!\d)", evaluation_output_str)
    # Leading whitespace / markdown asterisks after the label are dropped
    rationale_match = re.search(r"Rationale:[\s*]*(.*)", evaluation_output_str, re.DOTALL)

    score = int(score_match.group(1)) if score_match else None
    rationale = rationale_match.group(1).strip() if rationale_match else ""
    return score, rationale


def evaluate_responses(state: State) -> Dict[str, Any]:
    """Score every model response in the state with the judge model.

    For each record in every ``*_results`` state field (except
    ``evaluation_results``) the matching benchmark item is looked up by id in the
    module-level ``benchmark_data``, the evaluation prompt is filled in, and the
    judge's reply is parsed into a score and a rationale.

    Args:
        state: Workflow state holding the per-model result lists.

    Returns:
        ``{"evaluation_results": [...]}`` with one record per evaluated response
        (keys: id, agent_name, score, rationale, latency, error). Responses that
        errored upstream get ``score=None`` and ``error=True``; empty responses
        get ``score=0``; records whose id is unknown are skipped with a warning.
    """
    print("--- Starting Evaluation ---")
    all_evaluations = []
    # Full benchmark items (including reference answer and rubric), keyed by id.
    # Read from the module-level benchmark_data rather than from the state.
    benchmark_map_full = {item["id"]: item for item in benchmark_data}

    evaluation_chain = evaluation_prompt_template | evaluator | StrOutputParser()

    model_result_keys = [key for key in state if key.endswith("_results") and key != "evaluation_results"]

    for key in model_result_keys:
        print(f"  Evaluating results from: {key}")
        for response_item in state.get(key) or []:
            prompt_id = response_item.get("id")
            model_name = response_item.get("model_name")
            response_content = response_item.get("response")
            error = response_item.get("error")

            if error:
                # The model call failed upstream, so there is nothing to judge
                all_evaluations.append({
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Skipped due to error: {error}", "latency": 0, "error": True
                })
                continue

            if not prompt_id or prompt_id not in benchmark_map_full:
                print(f"    Warning: Missing full benchmark data for prompt ID {prompt_id}. Skipping.")
                continue

            benchmark_item = benchmark_map_full[prompt_id]  # Full benchmark item for this id

            if not response_content:
                all_evaluations.append({
                    "id": prompt_id, "agent_name": model_name, "score": 0,
                    "rationale": "Empty response", "latency": 0, "error": False
                })
                continue

            # Map the benchmark item and the model response onto the variable
            # names used by evaluation_prompt_template. ``or {}`` also covers a
            # rubric that is present but null.
            score_rubric = benchmark_item.get("score_rubric") or {}
            input_data = {
                "task_description": benchmark_item.get("task", "N/A"),
                "user_query": benchmark_item.get("input", "N/A"),
                "reference_answer": benchmark_item.get("reference_answer", "N/A"),
                "ai_response": response_content,
                "criteria": score_rubric.get("criteria", "N/A"),
                "score1_desc": score_rubric.get("score1_description", "N/A"),
                "score2_desc": score_rubric.get("score2_description", "N/A"),
                "score3_desc": score_rubric.get("score3_description", "N/A"),
                "score4_desc": score_rubric.get("score4_description", "N/A"),
                "score5_desc": score_rubric.get("score5_description", "N/A"),
            }

            try:
                start_time = time.time()
                evaluation_output_str = evaluation_chain.invoke(input_data)
                end_time = time.time()

                score, rationale = _parse_evaluation_output(evaluation_output_str)

                if score is None or not rationale:
                    # Keep the raw reply so the record can be inspected by hand
                    print(f"    Warning: Could not parse score/rationale for prompt {prompt_id}. Raw output: {evaluation_output_str}")
                    rationale = f"Parsing Warning. Raw Output: {evaluation_output_str}"

                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": score,
                    "rationale": rationale, "latency": end_time - start_time, "error": False
                }
                print(f"    Evaluated prompt {prompt_id} from {model_name} in {eval_result['latency']:.2f}s. Score: {eval_result['score']}")

            except Exception as e:
                print(f"    Error evaluating prompt {prompt_id} from {model_name}: {e}")
                eval_result = {
                    "id": prompt_id, "agent_name": model_name, "score": None,
                    "rationale": f"Evaluation failed: {str(e)}", "latency": 0, "error": True
                }
            all_evaluations.append(eval_result)

    print("--- Evaluation Finished ---")
    return {"evaluation_results": all_evaluations}

In [11]:
# --- define workflow ---

# 1. Create the run timestamp once, before the workflow is built
current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# 2. Initial state: inputs, one empty result list per model, and the shared timestamp
initial_state = {
    "prompts": prompts,
    "rubrics": rubrics,
    # "processed_count": 0,
    "phi_results": [],
    "qwen1_5_results": [],
    "vicuna_results": [],
    "llama2_results": [],
    "llama3_1_results": [],
    "gemma3_results": [],
    # "deepseek_r1_results": [],
    "evaluation_results": [],
    "timestamp": current_timestamp,  # Carried through the state so every node uses the same value
}

# create workflow
workflow = StateGraph(State)

# Nodes listed in execution order as (node name, node callable)
_pipeline_nodes = [
    ("process_phi", process_phi),
    ("process_qwen1_5", process_qwen1_5),
    ("process_vicuna", process_vicuna),
    ("process_llama2", process_llama2),
    ("process_llama3", process_llama3_1),
    ("process_gemma3", process_gemma3),
    ("save_responses", save_intermediate_results),
    ("evaluate", evaluate_responses),
]

# add nodes
for _node_name, _node_fn in _pipeline_nodes:
    workflow.add_node(_node_name, _node_fn)

# connect edges: a straight chain from the first node to END
_node_order = [_name for _name, _fn in _pipeline_nodes]
workflow.set_entry_point(_node_order[0])
for _upstream, _downstream in zip(_node_order, _node_order[1:]):
    workflow.add_edge(_upstream, _downstream)
workflow.add_edge(_node_order[-1], END)

# compile workflow
app = workflow.compile()


In [None]:
# run workflow
print("--- Starting Workflow ---")
final_state = app.invoke(initial_state)
print("--- Workflow Finished ---")

# --- save final results (using timestamp from final_state) ---
final_output_dir = "_output"
os.makedirs(final_output_dir, exist_ok=True)

# 3. Reuse the timestamp carried in the final state. It is fetched without a default
#    so the warning below fires both when the key is missing and when it is empty.
final_timestamp = final_state.get("timestamp")
if not final_timestamp:
    print("  Warning: Timestamp not found in final state. Generating a new one for fallback.")
    final_timestamp = datetime.now().strftime("%Y%m%d%H%M%S_final_fallback")


final_base_name, _ = os.path.splitext(os.path.basename(bench_path))  # Benchmark file name without extension
final_filename = f"{final_timestamp}_{final_base_name}_straight_with_evaluation.json"
final_output_file_path = os.path.join(final_output_dir, final_filename)

try:
    # Serialize before opening the file so a serialization error cannot
    # leave a truncated JSON file behind.
    final_json = json.dumps(final_state, ensure_ascii=False, indent=4)
    with open(final_output_file_path, 'w', encoding='utf-8') as f:
        f.write(final_json)
    print(f"\nFinal results (with evaluation) successfully saved to {final_output_file_path}")
except Exception as e:
    # Best effort: report the problem and the state type to help diagnose it
    print(f"Error saving final results: {e}")
    print(f"Final state type: {type(final_state)}")


# Visualization

In [12]:
# # import libraries
# from IPython.display import display, Markdown, Image
# import pydot # Mermaid 생성 시 내부적으로 필요할 수 있음
# import graphviz # PNG 등 이미지 생성 시 시스템 설치 필요


In [None]:
# # --- visualization workflow ---

# print("--- LangGraph Workflow Visualization ---")

# # app 변수가 workflow.compile()을 통해 생성되었다고 가정
# if 'app' in locals() and app is not None:
#     try:
#         # Mermaid 다이어그램 생성
#         mermaid_string = app.get_graph().draw_mermaid()

#         # Jupyter Notebook에서 Mermaid 렌더링
#         print("Displaying Mermaid diagram:")
#         display(Markdown(f"```mermaid\n{mermaid_string}\n```"))

#         # # (선택 사항) PNG 이미지 생성 및 표시
#         # print("\n--- LangGraph Workflow Structure (PNG) ---")
#         # try:
#         #     output_filename = "workflow_graph_test.png"
#         #     # PNG 파일로 직접 저장 시도
#         #     app.get_graph().draw_png(path=output_filename)
#         #     print(f"Attempted to save PNG to {output_filename}. Check if the file exists in your notebook's directory.")

#         #     # 저장된 파일로부터 이미지 표시 시도 (선택 사항)
#         #     from IPython.display import Image, display
#         #     if os.path.exists(output_filename):
#         #         display(Image(filename=output_filename))
#         #     else:
#         #         print(f"File '{output_filename}' was not created.")

#         # except ImportError:
#         #     print("PNG generation requires 'pydot' and 'graphviz' Python libraries.")
#         #     print("Also ensure Graphviz is installed on your system and added to PATH.")
#         #     print("(conda install python-graphviz pydot / pip install pydot graphviz)")
#         # except Exception as img_err:
#         #     print(f"Could not generate PNG: {img_err}")
#         #     print("Please check Graphviz installation.")

#     except Exception as viz_err:
#         print(f"An error occurred during visualization: {viz_err}")
# else:
#     print("Error: Workflow has not been compiled yet (variable 'app' not found or is None).")
#     print("Please run the cell containing 'app = workflow.compile()' first.")