In [1]:
import os

from dotenv import load_dotenv

load_dotenv()

# Fail fast when a required provider key is absent from the environment.
# OPENAI_API_KEY and DEEPSEEK_API_KEY are currently not checked here.
for required_key in ("ANTHROPIC_API_KEY", "GOOGLE_API_KEY"):
    if not os.getenv(required_key):
        raise ValueError(f"{required_key} is not set in .env file")

In [None]:
import sys
import json
import pandas as pd

# LangChain chat-model integrations
from langchain_anthropic import ChatAnthropic
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI

# OpenAI Evals related (no imports at present)

# Put the parent of the notebook's working directory on the import path
# (presumably where the generator / prompts / evaluator packages live).
# Notebooks do not define __file__, so the path is derived from the current
# working directory; this resolves to the same location as before.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    # Re-running the cell must not pile up duplicate sys.path entries.
    sys.path.append(project_root)

In [3]:
# Configure logging to see error messages
import logging

# Root logger: show warnings and anything more severe.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger()

# Same threshold exported through the environment
# (presumably consumed by Google ADK -- confirm).
os.environ["GOOGLE_ADK_LOG_LEVEL"] = "WARNING"

In [4]:
# Model setup (provider API keys are required).
try:
    # # OpenAI GPT-4
    # gpt4 = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Claude (requires the Anthropic API key)
    claude_sonnet = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0)
    # claude_opus = ChatAnthropic(model="claude-opus-4-20250514", temperature=0)
    claude_3_7_sonnet = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0)

    # Google Gemini
    gemini = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-preview-05-20", temperature=0
    )

    # DeepSeek is optional: its key is not enforced by the environment check
    # and the model is not part of the evaluated set, so only build the client
    # when a key is actually available. Otherwise `deepseek` is None.
    deepseek = (
        ChatDeepSeek(model="deepseek-chat", temperature=0)
        if os.getenv("DEEPSEEK_API_KEY")
        else None
    )

except Exception as e:
    print(f"モデル初期化エラー: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

In [None]:
# Notebook-wide switches.
# USE_MIN_DATA: load ../dataset/queries.min.json instead of ../dataset/queries.json.
# DebugExec: run the single-question smoke-test cells.
USE_MIN_DATA = True
DebugExec = True

In [5]:
# Smoke test: one question from claude_3_7_sonnet using
# generate_question_prompt_template. Runs only when DebugExec is set.
if DebugExec:
    from generator.generator import MahjongQuestionGenerator
    from prompts.prompts import generate_question_prompt_template

    debug_query = (
        "Please give me a mahjong scoring problem. "
        "Give me a problem where the answer is 3 han and 50 fu."
    )
    generator = MahjongQuestionGenerator(
        query_template=generate_question_prompt_template,
        model=claude_3_7_sonnet,
    )
    result = generator.generate_question(query=debug_query)
    print(result)

In [6]:
# Smoke test: one question from gemini using
# generate_question_with_cot_and_rule_prompt_template. Runs only when
# DebugExec is set.
if DebugExec:
    from generator.generator import MahjongQuestionGenerator
    from prompts.prompts import generate_question_with_cot_and_rule_prompt_template

    debug_query = (
        "Please give me a mahjong scoring problem. "
        "Give me a problem where the answer is 3 han and 50 fu."
    )
    generator = MahjongQuestionGenerator(
        query_template=generate_question_with_cot_and_rule_prompt_template,
        model=gemini,
    )
    result = generator.generate_question(query=debug_query)
    print(result)

In [7]:
# Smoke test: one question from gemini with use_tools=True and
# generate_question_with_tools_prompt_template. Runs only when DebugExec is set.
if DebugExec:
    from generator.generator import MahjongQuestionGenerator
    from prompts.prompts import generate_question_with_tools_prompt_template

    debug_query = (
        "Please give me a mahjong scoring problem. "
        "Give me a problem where the answer is 3 han and 50 fu."
    )
    generator = MahjongQuestionGenerator(
        query_template=generate_question_with_tools_prompt_template,
        use_tools=True,
        model=gemini,
    )

    # NOTE(review): an earlier comment here referred to an async variant, but
    # generate_question is called directly without await -- confirm this is the
    # intended entry point when use_tools=True.
    result = generator.generate_question(query=debug_query)
    print(result)

In [9]:
# Pick the query set: the reduced file when USE_MIN_DATA is set, the full
# one otherwise.
dataset_path = (
    "../dataset/queries.min.json" if USE_MIN_DATA else "../dataset/queries.json"
)

# JSON text is UTF-8; state it explicitly so the load does not depend on the
# platform's default locale encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

In [12]:
# Models to evaluate, keyed by the label that ends up in each result file name.
# DeepSeek is left out for now.
models = dict(
    claude_sonnet=claude_sonnet,
    gemini=gemini,
)

In [None]:
from datetime import datetime

from evaluator.evaluator import MultiModelEvaluator
from prompts.prompts import generate_question_prompt_template


# Evaluate each model with generate_question_prompt_template and write one
# CSV per model under ../dist/zeroshot.
# The directory is created up front: DataFrame.to_csv does not create missing
# directories, and failing after the evaluation would throw the run away.
os.makedirs("../dist/zeroshot", exist_ok=True)

for k, m in models.items():
    e = MultiModelEvaluator(
        models=[m], query_template=generate_question_prompt_template
    )
    df = e.evals(dataset)

    # File name: ...-<model label>-<YYYYMMDD>.csv (date taken at save time).
    df.to_csv(
        f"../dist/zeroshot/evals-gen-question-raw-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
        index=False,
    )
    print(f"{k} done :)")

In [None]:
from prompts.prompts import generate_question_with_cot_and_rule_prompt_template

# Evaluate each model with generate_question_with_cot_and_rule_prompt_template
# and write one CSV per model under ../dist/cot_and_rule.
# The directory is created up front: DataFrame.to_csv does not create missing
# directories, and failing after the evaluation would throw the run away.
os.makedirs("../dist/cot_and_rule", exist_ok=True)

for k, v in models.items():
    e = MultiModelEvaluator(
        models=[v], query_template=generate_question_with_cot_and_rule_prompt_template
    )
    df = e.evals(dataset)

    # File name: ...-<model label>-<YYYYMMDD>.csv (date taken at save time).
    df.to_csv(
        f"../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
        index=False,
    )

In [None]:
from prompts.prompts import generate_question_with_tools_prompt_template

# Evaluate each model with generate_question_with_tools_prompt_template and
# use_tools=True, writing one CSV per model under ../dist/tools.
# The directory is created up front: DataFrame.to_csv does not create missing
# directories, and failing after the evaluation would throw the run away.
os.makedirs("../dist/tools", exist_ok=True)

for k, v in models.items():
    e = MultiModelEvaluator(
        models=[v],
        query_template=generate_question_with_tools_prompt_template,
        use_tools=True,
    )
    df = e.evals(dataset)

    # File name: ...-<model label>-<YYYYMMDD>.csv (date taken at save time).
    df.to_csv(
        f"../dist/tools/evals-gen-question-with-tools-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
        index=False,
    )

In [None]:
from evaluator.agents_evaluator import MahjongMultiAgentsEvaluator

# Run the multi-agent evaluator with the "sequential" runner over the loaded
# dataset. The result replaces the global `df`, which the following save cell
# writes to disk.
evaluator = MahjongMultiAgentsEvaluator(runner_type="sequential")

df = evaluator.evals(dataset)

In [None]:
# Save the sequential multi-agent results. This writes whatever `df` currently
# holds, so it assumes the sequential evaluation cell ran immediately before.
# The directory name is spelled "sequencial_multi_agents" on disk; it is kept
# as is so existing result files stay in one place.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../dist/sequencial_multi_agents", exist_ok=True)
df.to_csv(
    f"../dist/sequencial_multi_agents/evals-gen-question-with-sequential-agent-{datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)

In [None]:
from evaluator.agents_evaluator import MahjongMultiAgentsEvaluator

# Run the multi-agent evaluator with the "loop" runner over the loaded
# dataset. The result replaces the global `df`, which the following save cell
# writes to disk.
evaluator = MahjongMultiAgentsEvaluator(runner_type="loop")

df = evaluator.evals(dataset)

In [19]:
# Save the loop multi-agent results. This writes whatever `df` currently
# holds, so it assumes the loop evaluation cell ran immediately before.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../dist/loop_multi_agents", exist_ok=True)
df.to_csv(
    f"../dist/loop_multi_agents/evals-gen-question-with-loop-multi-agents-{datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)

In [20]:
import glob
import os


def result_by_model(name: str):
    """Print per-model accuracy and error-rate tables for one experiment.

    Reads every CSV under ``../dist/{name}/``, tags each row with a model
    label taken from the file name (the second-to-last ``-``-separated token,
    e.g. ``claude_sonnet`` in ``...-claude_sonnet-20250101.csv``), and prints
    the count / sum / mean of the ``correct`` and ``is_error`` columns grouped
    by that label.

    Args:
        name: Sub-directory of ``../dist`` that holds the result CSVs.

    Returns:
        None. The file list and both summary tables are written to stdout.
        When no CSV is found a short notice is printed and nothing else
        happens.

    Raises:
        KeyError: If a loaded CSV lacks the ``correct`` or ``is_error`` column.
    """
    # Sort so the listing and the concatenation order are reproducible.
    csv_files = sorted(glob.glob(f"../dist/{name}/*.csv"))
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {file}")

    if not csv_files:
        # pd.concat raises ValueError on an empty list; report and stop instead.
        print("No CSV files to aggregate; skipping.")
        return

    # Load each file and attach the model label parsed from its name.
    frames = []
    for file in csv_files:
        frame = pd.read_csv(file)
        # Parse the bare file name so directory components never leak into
        # the label; fall back to the whole stem when there is no "-" token.
        stem = os.path.splitext(os.path.basename(file))[0]
        tokens = stem.split("-")
        frame["model"] = tokens[-2] if len(tokens) >= 2 else stem
        frames.append(frame)

    # Combine all runs into a single frame.
    combined = pd.concat(frames, ignore_index=True)

    # (column to aggregate, output column labels, heading printed above it)
    reports = (
        ("correct", ["総問題数", "正解数", "正解率"], "\n=== モデルごとの正解率 ==="),
        ("is_error", ["総問題数", "エラー数", "エラー率"], "\n=== モデルごとのエラー率 ==="),
    )
    for column, labels, heading in reports:
        summary = (
            combined.groupby("model")[column].agg(["count", "sum", "mean"]).round(3)
        )
        summary.columns = labels
        print(heading)
        print(summary)

In [None]:
# Summarise the CSVs under ../dist/zeroshot.
result_by_model("zeroshot")

In [None]:
# Summarise the CSVs under ../dist/cot_and_rule.
result_by_model("cot_and_rule")

In [None]:
# Summarise the CSVs under ../dist/tools.
result_by_model("tools")

In [None]:
# Disabled: summary of the sequential multi-agent results.
# result_by_model("sequencial_multi_agents")

In [None]:
# Summarise the CSVs under ../dist/loop_multi_agents. These file names carry
# no model token, so all rows end up grouped under the label "agents".
result_by_model("loop_multi_agents")