In [1]:
import os

from dotenv import load_dotenv

load_dotenv()

# if not os.getenv("OPENAI_API_KEY"):
#     raise ValueError("OPENAI_API_KEY is not set in .env file")
# Abort early when a provider credential is absent from the environment
# (load_dotenv() is called above). Keys are checked in the listed order and
# the first missing one raises.
_REQUIRED_KEY_ERRORS = {
    "ANTHROPIC_API_KEY": "ANTHROPIC_API_KEY is not set in .env file",
    "GOOGLE_API_KEY": "GOOGLE_API_KEY is not set in .env file",
    "DEEPSEEK_API_KEY": "DEEPSEEK_API_KEY is not set in .env file",
}
for _env_name, _error_text in _REQUIRED_KEY_ERRORS.items():
    if not os.getenv(_env_name):
        raise ValueError(_error_text)

In [2]:
import sys
import json
from datetime import datetime
import pandas as pd

# LangChain関連
from langchain_anthropic import ChatAnthropic
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI

# OpenAI Evals関連

# Make the repository root importable so the local packages imported below
# (generator, prompts, llmmj, evaluator) resolve.
# `__file__` is not defined inside a notebook; the previous code took the
# dirname of the *string* "__file__", which is always "", i.e. it resolved
# ".." against the current working directory. Spell that out directly.
project_root = os.path.abspath(os.pardir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from generator.generator import MahjongQuestionGenerator
from prompts.prompts import (
    generate_question_with_tools_prompt_template,
)
from llmmj.llmmj import calculate_score
from evaluator.evaluator import MultiModelEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Emit INFO-and-above log records so that error messages from the libraries
# used below show up in the notebook output.
import logging

# Root logger handle; basicConfig() below configures this same logger.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)

In [4]:
# Model setup (API keys are required).
# Every client is created with temperature=0.
try:
    # # OpenAI GPT-4
    # gpt4 = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Claude (requires an Anthropic API key)
    claude_sonnet = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0)
    # claude_opus = ChatAnthropic(model="claude-opus-4-20250514", temperature=0)
    claude_3_7_sonnet = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0)

    # Google Gemini
    gemini = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-preview-05-20", temperature=0
    )

    # DeepSeek
    deepseek = ChatDeepSeek(model="deepseek-chat", temperature=0)

except Exception as e:
    # Report the failure in the cell output, then propagate it unchanged.
    # A bare `raise` re-raises the active exception with its original
    # traceback instead of re-raising through the bound name.
    print(f"モデル初期化エラー: {e}")
    raise

In [5]:
# # 評価システム初期化
# generator = MahjongQuestionGenerator(
#     model=claude_3_7_sonnet, query_template=generate_question_prompt_template
# )

# # Use the async version in Jupyter notebook
# generator.generate_question(query="答えが2翻20符になる問題を作ってください")

In [6]:
# 評価システム初期化(with tools)
# generator = MahjongQuestionGenerator(
#     model=gemini,
#     use_tools=True,
#     query_template=generate_question_with_tools_prompt_template,
# )

# # Use the async version in Jupyter notebook
# result = generator.generate_question(query="答えが2翻70符になる問題を作ってください")
# print(result)

In [7]:
# from entity.entity import Hand
# from llmmj.llmmj import calculate_score


# if isinstance(result, dict):
#     hand = Hand(**result)
# elif isinstance(result, Hand):
#     hand = result

# calculate_score(hand)

In [8]:
# dataset = [
#     {
#         "query": "答えが2翻40符になる問題を作ってください",
#         "answer": {
#             "fu": 40,
#             "han": 2
#         }
#     }
# ]

In [9]:
# evaluator = MahjongEvaluator(generator=generator)
# df = evaluator.evals(dataset=dataset)
# df

In [10]:
# e = MultiModelEvaluator(models=[claude_sonnet], query_template=generate_question_prompt_template)
# df = e.evals(dataset)
# df

In [11]:
# with open("../dataset/queries.json", "r") as f:
#     dataset = json.load(f)

In [12]:
# Load the evaluation queries from the `.min` dataset file
# (presumably a reduced version of queries.json -- confirm).
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform locale encoding, which breaks on non-UTF-8 locales when the file
# holds non-ASCII text (the queries echoed by the evaluation run are Japanese).
with open("../dataset/queries.min.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

In [13]:
# dataset

In [14]:
# Label -> chat model instance. The label is what the (currently commented
# out) evaluation loops embed in the output CSV file name.
models = dict(
    claude_sonnet=claude_sonnet,
    gemini=gemini,
    deepseek=deepseek,
)

In [15]:
# from prompts.prompts import generate_question_prompt_template

# for k, m in models.items():
#     e = MultiModelEvaluator(
#         models=[m], query_template=generate_question_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/zeroshot/evals-gen-question-raw-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )
#     print(f"{k} done :)")

In [16]:
# from prompts.prompts import generate_question_with_cot_prompt_template

# for k, m in models.items():
#     e = MultiModelEvaluator(
#         models=[m], query_template=generate_question_with_cot_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/cot/evals-gen-question-with-cot-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )
#     print(f"{k} done :)")

In [17]:
# from prompts.prompts import generate_question_with_cot_and_rule_prompt_template

# for k, v in models.items():
#     e = MultiModelEvaluator(
#         models=[v], query_template=generate_question_with_cot_and_rule_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )

In [18]:
# models = {"claude_sonnet": claude_sonnet, "gemini": gemini, "deepseek": deepseek}

In [19]:
# from prompts.prompts import generate_question_with_tools_prompt_template

# for k, v in models.items():
#     e = MultiModelEvaluator(
#         models=[v],
#         query_template=generate_question_with_tools_prompt_template,
#         use_tools=True,
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/tools/evals-gen-question-with-tools-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )

In [20]:
# Multi-agent evaluation: run the sequential agent evaluator over `dataset`.
from evaluator.agents_evaluator import MahjongEvaluatorSequential

# NOTE(review): the meaning of runner_type="loop" is defined in
# evaluator.agents_evaluator and is not visible from this notebook -- confirm.
evaluator = MahjongEvaluatorSequential(runner_type="loop")

# The result is written out with `df.to_csv(...)` in the next cell, so it is
# presumably a pandas DataFrame -- confirm against the evaluator.
df = evaluator.evals(dataset)

INFO:google_adk.google.adk.models.registry:Updating LLM class for gemini-.* from <class 'google.adk.models.google_llm.Gemini'> to <class 'google.adk.models.google_llm.Gemini'>
INFO:google_adk.google.adk.models.registry:Updating LLM class for projects\/.+\/locations\/.+\/endpoints\/.+ from <class 'google.adk.models.google_llm.Gemini'> to <class 'google.adk.models.google_llm.Gemini'>
INFO:google_adk.google.adk.models.registry:Updating LLM class for projects\/.+\/locations\/.+\/publishers\/google\/models\/gemini.+ from <class 'google.adk.models.google_llm.Gemini'> to <class 'google.adk.models.google_llm.Gemini'>
INFO:google_adk.google.adk.models.registry:Updating LLM class for gemini-.* from <class 'google.adk.models.google_llm.Gemini'> to <class 'google.adk.models.google_llm.Gemini'>
INFO:google_adk.google.adk.models.registry:Updating LLM class for projects\/.+\/locations\/.+\/endpoints\/.+ from <class 'google.adk.models.google_llm.Gemini'> to <class 'google.adk.models.google_llm.Gemini'

✅ New InMemorySessionService created for state demonstration.
✅ Agent 'mahjong_score_question_generator_agent' redefined.
✅ Agent 'mahjong_score_question_checker_agent' redefined.
✅ Root Agent 'mahjong_supervisor_agent' created using stateful tool and output_key.
✅ Agent 'final_output_json_generator_agent' redefined.
📝 Agent interaction logs are being saved to: agent_logs/agent_interactions_20250701_014401.log

>>> User Query: 答えが2飜30符になる問題を作ってください


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_generator_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:44:03 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=1377","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_generator_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_versi

pon!! CheckStatusAndEscalate: 2, 30
✅ New InMemorySessionService created for state demonstration.
✅ Agent 'mahjong_score_question_generator_agent' redefined.
✅ Agent 'mahjong_score_question_checker_agent' redefined.
✅ Root Agent 'mahjong_supervisor_agent' created using stateful tool and output_key.
✅ Agent 'final_output_json_generator_agent' redefined.

>>> User Query: 答えが3飜50符になる問題を作ってください


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_generator_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:44:26 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=1785","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_generator_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_versi

pon!! CheckStatusAndEscalate: 3, 50
✅ New InMemorySessionService created for state demonstration.
✅ Agent 'mahjong_score_question_generator_agent' redefined.
✅ Agent 'mahjong_score_question_checker_agent' redefined.
✅ Root Agent 'mahjong_supervisor_agent' created using stateful tool and output_key.
✅ Agent 'final_output_json_generator_agent' redefined.

>>> User Query: 答えが3飜70符になる問題を作ってください


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_generator_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:46:30 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=2653","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_generator_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_versi

pon!! CheckStatusAndEscalate: 5, 70


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_checker_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:46:57 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=2350","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_checker_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_version":

pon!! CheckStatusAndEscalate: 5, 70


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_checker_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:47:39 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=5031","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_checker_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_version":

pon!! CheckStatusAndEscalate: 5, 70


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_checker_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 16:53:21 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=2802","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_checker_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_version":

pon!! CheckStatusAndEscalate: 5, 70


INFO:google_adk.google.adk.models.google_llm:
LLM Response:
-----------------------------------------------------------
Text:
None
-----------------------------------------------------------
Function calls:
name: transfer_to_agent, args: {'agent_name': 'mahjong_score_question_checker_agent'}
-----------------------------------------------------------
Raw response:
{"sdk_http_response":{"headers":{"Content-Type":"application/json; charset=UTF-8","Vary":"Referer","Content-Encoding":"gzip","Date":"Mon, 30 Jun 2025 17:00:09 GMT","Server":"scaffolding on HTTPServer2","X-XSS-Protection":"0","X-Frame-Options":"SAMEORIGIN","X-Content-Type-Options":"nosniff","Server-Timing":"gfet4t7; dur=1742","Alt-Svc":"h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000","Transfer-Encoding":"chunked"}},"candidates":[{"content":{"parts":[{"function_call":{"args":{"agent_name":"mahjong_score_question_checker_agent"},"name":"transfer_to_agent"}}],"role":"model"},"finish_reason":"STOP","index":0}],"model_version":

pon!! CheckStatusAndEscalate: 5, 70


In [21]:
# Persist the sequential-agent evaluation results, one file per calendar day
# (the file name carries today's date as YYYYMMDD, so a second run on the same
# day overwrites the earlier file).
output_dir = "../dist/multi_agents"
# to_csv does not create missing parent directories; make sure the target
# exists so the results of the evaluation run are not lost to an OSError.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(
    f"{output_dir}/evals-gen-question-with-sequential-agent-{datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)

In [22]:
import glob


def result_by_model(name: str):
    """Print per-model accuracy and error-rate tables for one experiment.

    Reads every ``../dist/<name>/*.csv`` result file, tags its rows with a
    model label taken from the file name, concatenates all rows and prints two
    summaries grouped by that label: one over the ``correct`` column and one
    over the ``is_error`` column (count, sum and mean, rounded to 3 decimals).

    The model label is the second-to-last ``-``-separated token of the file
    name, e.g. ``...-prompt-claude_sonnet-20250629.csv`` -> ``claude_sonnet``.
    File names without a model token are grouped under whatever that token
    happens to be (``...-sequential-agent-20250701.csv`` -> ``agent``).

    Args:
        name: Sub-directory of ``../dist`` that holds the CSV files.

    Returns:
        None. All results are printed.
    """
    # Collect every CSV under ../dist/<name>/. Sorted so that the listing and
    # the concatenation order do not depend on file-system ordering.
    csv_files = sorted(glob.glob(f"../dist/{name}/*.csv"))
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {file}")

    # pd.concat raises ValueError on an empty list; report and stop instead.
    if not csv_files:
        print(f"No CSV files under ../dist/{name}/ - nothing to summarise.")
        return

    # Load each file and tag its rows with the model label.
    frames = []
    for file in csv_files:
        frame = pd.read_csv(file)
        # Work on the bare file name so hyphens in directory names can never
        # leak into the label.
        stem = os.path.splitext(os.path.basename(file))[0]
        tokens = stem.split("-")
        frame["model"] = tokens[-2] if len(tokens) >= 2 else stem
        frames.append(frame)

    # Combine all runs into one table.
    combined = pd.concat(frames, ignore_index=True)
    by_model = combined.groupby("model")

    accuracy_by_model = by_model["correct"].agg(["count", "sum", "mean"]).round(3)
    accuracy_by_model.columns = ["総問題数", "正解数", "正解率"]
    print("\n=== モデルごとの正解率 ===")
    print(accuracy_by_model)

    error_by_model = by_model["is_error"].agg(["count", "sum", "mean"]).round(3)
    error_by_model.columns = ["総問題数", "エラー数", "エラー率"]
    print("\n=== モデルごとのエラー率 ===")
    print(error_by_model)

In [23]:
# Per-model accuracy and error rate for the result files in ../dist/zeroshot/.
result_by_model("zeroshot")

Found 3 CSV files:
  - ../dist/zeroshot/evals-gen-question-raw-prompt-gemini-20250629.csv
  - ../dist/zeroshot/evals-gen-question-raw-prompt-claude_sonnet-20250629.csv
  - ../dist/zeroshot/evals-gen-question-raw-prompt-deepseek-20250629.csv

=== モデルごとの正解率 ===
               総問題数  正解数   正解率
model                         
claude_sonnet    20    1  0.05
deepseek         20    0  0.00
gemini           20    6  0.30

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20     0   0.0
deepseek         20     0   0.0
gemini           20     0   0.0


In [24]:
# Per-model accuracy and error rate for the result files in ../dist/cot/.
result_by_model("cot")

Found 3 CSV files:
  - ../dist/cot/evals-gen-question-with-cot-prompt-gemini-20250629.csv
  - ../dist/cot/evals-gen-question-with-cot-prompt-claude_sonnet-20250629.csv
  - ../dist/cot/evals-gen-question-with-cot-prompt-deepseek-20250629.csv

=== モデルごとの正解率 ===
               総問題数  正解数  正解率
model                        
claude_sonnet    20    0  0.0
deepseek         20    0  0.0
gemini           20    2  0.1

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20    18   0.9
deepseek         20     0   0.0
gemini           20     0   0.0


In [25]:
# Per-model accuracy and error rate for the result files in ../dist/cot_and_rule/.
result_by_model("cot_and_rule")

Found 3 CSV files:
  - ../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-deepseek-20250629.csv
  - ../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-gemini-20250629.csv
  - ../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-claude_sonnet-20250629.csv

=== モデルごとの正解率 ===
               総問題数  正解数  正解率
model                        
claude_sonnet    20    0  0.0
deepseek         20    0  0.0
gemini           20    6  0.3

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20    18   0.9
deepseek         20     0   0.0
gemini           20     0   0.0


In [26]:
# Per-model accuracy and error rate for the result files in ../dist/tools/.
result_by_model("tools")

Found 3 CSV files:
  - ../dist/tools/evals-gen-question-with-tools-prompt-claude_sonnet-20250629.csv
  - ../dist/tools/evals-gen-question-with-tools-prompt-deepseek-20250629.csv
  - ../dist/tools/evals-gen-question-with-tools-prompt-gemini-20250629.csv

=== モデルごとの正解率 ===
               総問題数  正解数  正解率
model                        
claude_sonnet    20    2  0.1
deepseek         20    0  0.0
gemini           20    4  0.2

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20    13  0.65
deepseek         20     0  0.00
gemini           20     0  0.00


In [27]:
# Accuracy and error rate for the result files in ../dist/multi_agents/.
# These file names carry no model token, so result_by_model labels every row
# with the second-to-last "-" token ("agent") and pools all dated files into
# that single group.
result_by_model("multi_agents")

Found 2 CSV files:
  - ../dist/multi_agents/evals-gen-question-with-sequential-agent-20250630.csv
  - ../dist/multi_agents/evals-gen-question-with-sequential-agent-20250701.csv

=== モデルごとの正解率 ===
       総問題数  正解数  正解率
model                
agent     6    0  0.0

=== モデルごとのエラー率 ===
       総問題数  エラー数   エラー率
model                   
agent     6     4  0.667
