In [1]:
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
import time
import asyncio
from datetime import datetime
import json

In [2]:
# Define models

o3_mini = init_chat_model("openai:o3-mini")
claude_sonnet = init_chat_model("anthropic:claude-3-5-sonnet-latest", temperature=0)
gemma3 = init_chat_model("ollama:gemma3:12b", temperature=0)
llama3_1 = init_chat_model("ollama:llama3.1:latest", temperature=0)
deepseek_r1 = init_chat_model("ollama:deepseek-r1:8b", temperature=0)
evaluator = init_chat_model("anthropic:claude-3-7-sonnet-20250219", temperature=0)

In [3]:
# Preparing benchmark data

bench_path = "biggen_bench_instruction_idx0.json"

def load_benchmark_data(bench_path: str) -> List[Dict]:
    """Load benchmark data from a JSON file."""
    with open(bench_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

benchmark_data = load_benchmark_data(bench_path)

In [4]:

def prepare_prompts(benchmark_data: List[Dict]) -> List[Dict]:
    """Prepare prompts by excluding reference_answer and score_rubric."""
    prompts = []
    for item in benchmark_data:
        prompt = {
            "id": item["id"],
            "capability": item["capability"],
            "task": item["task"],
            "instance_idx": item["instance_idx"],
            "system_prompt": item["system_prompt"],
            "input": item["input"],
            # Exclude reference_answer and score_rubric
        }
        prompts.append(prompt)
    return prompts

def prepare_rubric(benchmark_data: List[Dict]) -> List[Dict]:
    """Prepare rubric including reference_answer and score_rubric."""
    rubric = []
    for item in benchmark_data:
        prompt = {
            "id": item["id"],
            "reference_answer": item["reference_answer"],
            "score_rubric": item["score_rubric"]
        }
        rubric.append(prompt)
    return rubric

prompts = prepare_prompts(benchmark_data)
rubric = prepare_rubric(benchmark_data)

In [5]:
# Define the state structure
class State(TypedDict):
    prompts: List[Dict[str, Any]]
    processed_count: int
    gemma3_results: List[Dict[str, Any]]
    llama3_1_results: List[Dict[str, Any]]
    deepseek_r1_results: List[Dict[str, Any]]
    evaluator_results: List[Dict[str, Any]]

In [6]:
def create_model_processor(model, model_name):
    def process_model(state: State) -> State:
        results = []
        
        for prompt in state["prompts"]:
            system_prompt = prompt.get('system_prompt', '')
            user_input = prompt.get('input', '')
            
            try:
                response = model.invoke(
                    user_input,
                    config={"system_prompt": system_prompt}
                )
                
                response_content = response.content if hasattr(response, 'content') else str(response)

                result = {
                    "id": prompt.get('id', ''),
                    "model_name": model_name,
                    "response": response_content
                }
            except Exception as e:
                result = {
                    "id": prompt.get('id', ''),
                    "model_name": model_name,
                    "error": str(e)
                }
                
            results.append(result)
        
        results_key = f"{model_name}_results"
        new_state = state.copy()
        new_state[results_key] = results
        return new_state
    
    return process_model

# create model processor variables
process_gemma3 = create_model_processor(gemma3, "gemma3")
process_llama3_1 = create_model_processor(llama3_1, "llama3_1")
# process_deepseek_r1 = create_model_processor(deepseek_r1, "deepseek_r1")

In [7]:
# initial state setting
initial_state = {
    "prompts": prompts,
    "processed_count": 0,
    "gemma3_results": [],
    "llama3_1_results": [],
#    "deepseek_r1_results": []
}

# create workflow
workflow = StateGraph(State)
workflow.add_node("process_gemma3", process_gemma3)
workflow.add_node("process_llama3", process_llama3_1)
#workflow.add_node("process_deepseek", process_deepseek_r1)

# connect nodes
workflow.set_entry_point("process_gemma3")
workflow.add_edge("process_gemma3", "process_llama3")
#workflow.add_edge("process_llama3", "process_deepseek")

# compile workflow
app = workflow.compile()

# run workflow
final_state = app.invoke(initial_state)

In [8]:
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

In [9]:
# check final_state whether dict 
if isinstance(final_state, dict):
    # save final_state as JSON file
    output_file_path = f"_output/{timestamp}_{bench_path}_response.json"
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(final_state, f, ensure_ascii=False, indent=4) # ensure_ascii=False는 한글 깨짐 방지, indent는 가독성 향상
        print(f"결과가 {output_file_path} 에 성공적으로 저장되었습니다.")
    except TypeError as e:
        print(f"JSON 직렬화 오류: {e}")

else:
    print(f"오류: 최종 결과의 타입이 dict가 아닙니다. 타입: {type(final_state)}")

결과가 _output/20250505131906_biggen_bench_instruction_idx0.json_response.json 에 성공적으로 저장되었습니다.


# Evaluation

In [11]:
response_path = f"_output/{timestamp}_{bench_path}_response.json"

def load_benchmark_data(response_path: str) -> List[Dict]:
    """Load benchmark data from a JSON file."""
    with open(response_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

response_data = load_benchmark_data(response_path)

In [19]:
def evaluate_responses_with_langgraph(response_data: Dict, benchmark_data: List[Dict], evaluator) -> List[Dict]:
    """Evaluate responses using the langgraph Evaluator model and return scores."""
    
    benchmark_map = {item["id"]: item for item in benchmark_data}
    evaluations = []
    
    # gemma3_results와 llama3_1_results에서 응답 평가
    for model_results_key in ['gemma3_results', 'llama3_1_results']:
        if model_results_key in response_data:
            for response in response_data[model_results_key]:
                prompt_id = response["id"]
                benchmark_item = benchmark_map.get(prompt_id)
                if benchmark_item:
                    evaluation = evaluator.evaluate(
                        response=response["response"],
                        reference=benchmark_item["reference_answer"],
                        rubric=benchmark_item["score_rubric"]
                    )
                    evaluations.append({
                        "id": prompt_id,
                        "agent_name": response["model_name"],
                        "score": evaluation["score"],
                        "rationale": evaluation["rationale"]
                    })
    
    return evaluations

evaluations = evaluate_responses_with_langgraph(response_data, benchmark_data, evaluator)

for evaluation in evaluations:
    print(evaluation)

AttributeError: 'ChatAnthropic' object has no attribute 'evaluate'