## 点数計算問題生成評価

In [12]:
import os
import sys
from datetime import datetime

# "__file__" is a string literal here, not the module attribute (presumably
# because a notebook kernel does not define __file__ — confirm). dirname() of
# that literal is "", so the joined path is just "..": the parent of the
# current working directory, made absolute.
project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))
# Make that directory importable; the cells below import `exceptions`,
# `generator`, `prompts` and `evaluator`, presumably from there.
sys.path.append(project_root)

In [None]:
from exceptions import HandValidationError, ScoreCalculationError, JSONParseError

In [3]:
import os

from dotenv import load_dotenv

# Load the .env file before looking up the API keys below.
load_dotenv()

# Fail fast when a required provider key is missing. Only the Anthropic and
# Google keys are enforced; the OpenAI and DeepSeek checks are disabled.
# NOTE(review): a later cell still constructs ChatDeepSeek, which may need
# DEEPSEEK_API_KEY — confirm whether that check should be re-enabled.
# if not os.getenv("OPENAI_API_KEY"):
#     raise ValueError("OPENAI_API_KEY is not set in .env file")
if not os.getenv("ANTHROPIC_API_KEY"):
    raise ValueError("ANTHROPIC_API_KEY is not set in .env file")
if not os.getenv("GOOGLE_API_KEY"):
    raise ValueError("GOOGLE_API_KEY is not set in .env file")
# if not os.getenv("DEEPSEEK_API_KEY"):
#     raise ValueError("DEEPSEEK_API_KEY is not set in .env file")

In [4]:
import sys
import json
import pandas as pd

# LangChain関連
from langchain_anthropic import ChatAnthropic
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
)  # Use the async version in Jupyter notebook

# Same expression as in the first cell: the absolute path of the parent of the
# current working directory. It is only assigned here, not added to sys.path.
project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Logging setup: only ERROR-level (and above) records are shown.
import logging

# Root logger handle, kept for use by later cells.
logger = logging.getLogger()
logging.basicConfig(level=logging.ERROR)

# Presumably read by Google ADK to limit its own log output — confirm.
os.environ.update(GOOGLE_ADK_LOG_LEVEL="ERROR")

In [6]:
# Model setup (the providers' API keys must be available).
# Every model is created with temperature=0.
try:
    # # OpenAI GPT-4
    # gpt4 = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Claude (requires the Anthropic API key)
    claude_sonnet = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0)
    # claude_opus = ChatAnthropic(model="claude-opus-4-20250514", temperature=0)
    claude_3_7_sonnet = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0)

    # Google Gemini
    gemini = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash-preview-05-20", temperature=0
    )

    # DeepSeek
    deepseek = ChatDeepSeek(model="deepseek-chat", temperature=0)

except Exception as e:
    # Report which initialisation failed, then let the error stop the notebook.
    print(f"モデル初期化エラー: {e}")
    # Bare `raise` re-raises the active exception as-is (idiomatic form of
    # `raise e` inside an except block).
    raise

### 実行環境設定

In [7]:
# When True, the single-question smoke-test cells in the next section run.
DebugExec = False
# Must be True for the experiment cells; a guard cell raises ValueError otherwise.
EvalExec = True
# When True, ../dataset/queries.min.json is loaded instead of ../dataset/queries.json.
USE_MIN_DATA = False

### 動作確認

In [7]:
if DebugExec:
    from generator.generator import MahjongQuestionGenerator
    from prompts.prompts import generate_question_prompt_template

    # Smoke test: generate one question with claude_3_7_sonnet and
    # generate_question_prompt_template, then print it.
    generator = MahjongQuestionGenerator(
        model=claude_3_7_sonnet, query_template=generate_question_prompt_template
    )
    try:
        result = generator.generate_question(
            query="Please give me a mahjong scoring problem. Give me a problem where the answer is 3 han and 50 fu."
        )
        print(result)
    # The handled types must be a tuple. `A | B | C` builds a types.UnionType,
    # which an except clause rejects with TypeError as soon as an exception has
    # to be matched against it. Any other exception propagates unchanged.
    except (JSONParseError, HandValidationError, ScoreCalculationError) as e:
        print(e)

In [8]:
if DebugExec:
    from generator.generator import MahjongQuestionGenerator
    from prompts.prompts import generate_question_with_cot_and_rule_prompt_template

    # Smoke test: generate one question with gemini and
    # generate_question_with_cot_and_rule_prompt_template, then print it.
    generator = MahjongQuestionGenerator(
        model=gemini, query_template=generate_question_with_cot_and_rule_prompt_template
    )
    try:
        result = generator.generate_question(
            query="Please give me a mahjong scoring problem. Give me a problem where the answer is 3 han and 50 fu."
        )
        # Print only on success: after a handled failure `result` would be
        # undefined (or a stale value left over from an earlier cell).
        print(result)
    # The handled types must be a tuple. `A | B | C` builds a types.UnionType,
    # which an except clause rejects with TypeError as soon as an exception has
    # to be matched against it. Any other exception propagates unchanged.
    except (JSONParseError, HandValidationError, ScoreCalculationError) as e:
        print(e)

In [9]:
# if DebugExec:
#     from generator.generator import MahjongQuestionGenerator
#     from prompts.prompts import generate_question_with_tools_prompt_template

#     generator = MahjongQuestionGenerator(
#         model=gemini,
#         use_tools=True,
#         query_template=generate_question_with_tools_prompt_template,
#     )

#     try:
#         result = generator.generate_question(
#             query="Please give me a mahjong scoring problem. Give me a problem where the answer is 3 han and 50 fu."
#         )
#         print(result)
#     except (JSONParseError, HandValidationError, ScoreCalculationError) as e:
#         logger.error(e)
#         print(e)

In [10]:
# Guard cell: raising here keeps a top-to-bottom run from reaching the
# experiment cells below unless EvalExec was switched on.
if not EvalExec:
    raise ValueError("EvalExec is not set to True")

### 実験

In [9]:
# Select the query file: the `.min` variant when USE_MIN_DATA is set,
# otherwise the regular one.
dataset_path = (
    "../dataset/queries.min.json" if USE_MIN_DATA else "../dataset/queries.json"
)

# Decode as UTF-8 explicitly. Without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII JSON.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

In [12]:
# Models used by the per-model experiment loops. Each key becomes part of the
# output CSV file name, and result_by_model() later recovers it by splitting
# that name on "-", so keys must not contain a dash.
models = {
    "claude_sonnet": claude_sonnet,
    "gemini": gemini,
    # "deepseek": deepseek
}

In [None]:
from evaluator.evaluator import MultiModelEvaluator
from prompts.prompts import generate_question_prompt_template


# Evaluate each model on its own with generate_question_prompt_template and
# write one CSV per model into ../dist/zeroshot.
zeroshot_dir = "../dist/zeroshot"
# to_csv does not create missing directories; make sure the target exists so a
# finished evaluation run is not lost to a write error.
os.makedirs(zeroshot_dir, exist_ok=True)

for k, m in models.items():
    e = MultiModelEvaluator(
        models=[m], query_template=generate_question_prompt_template
    )
    df = e.evals(dataset)

    # File name layout: <prefix>-<model key>-<YYYYMMDD>.csv
    df.to_csv(
        f"{zeroshot_dir}/evals-gen-question-raw-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
        index=False,
    )
    print(f"{k} done :)")

ERROR:evaluator.libs:Error validating hand: Invalid win tile in hand. win_tile is not in tiles


claude_sonnet done :)
gemini done :)


In [14]:
from prompts.prompts import generate_question_with_cot_and_rule_prompt_template

# Evaluate each model on its own with
# generate_question_with_cot_and_rule_prompt_template and write one CSV per
# model into ../dist/cot_and_rule.
cot_and_rule_dir = "../dist/cot_and_rule"
# to_csv does not create missing directories; make sure the target exists so a
# finished evaluation run is not lost to a write error.
os.makedirs(cot_and_rule_dir, exist_ok=True)

for k, v in models.items():
    e = MultiModelEvaluator(
        models=[v], query_template=generate_question_with_cot_and_rule_prompt_template
    )
    df = e.evals(dataset)

    # File name layout: <prefix>-<model key>-<YYYYMMDD>.csv
    df.to_csv(
        f"{cot_and_rule_dir}/evals-gen-question-with-cot-and-rule-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
        index=False,
    )
    # Per-model progress marker, matching the zero-shot loop.
    print(f"{k} done :)")

ERROR:evaluator.libs:Error validating hand: Invalid tile count in hand. tiles is less than 14
ERROR:evaluator.libs:Error validating hand: Invalid tile count in hand. tiles is less than 14
ERROR:evaluator.libs:Error validating hand: Invalid tile count in hand. tiles is less than 14


In [15]:
# from prompts.prompts import generate_question_with_tools_prompt_template

# for k, v in models.items():
#     e = MultiModelEvaluator(
#         models=[v],
#         query_template=generate_question_with_tools_prompt_template,
#         use_tools=True,
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/tools/evals-gen-question-with-tools-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )

In [16]:
from evaluator.agents_evaluator import MahjongMultiAgentsEvaluator

# Multi-agent evaluation with runner_type="sequential", run over `dataset`.
evaluator = MahjongMultiAgentsEvaluator(runner_type="sequential")

# NOTE(review): `df` is reused by every experiment cell, so the save cell that
# follows writes whichever result was assigned last — run the cells in order.
df = evaluator.evals(dataset)

In [17]:
# to_csv does not create missing directories; make sure the target exists so
# the evaluation result is not lost to a write error.
# The directory spelling ("sequencial") is kept as-is because
# result_by_model("sequencial_multi_agents") reads from the same path.
os.makedirs("../dist/sequencial_multi_agents", exist_ok=True)
df.to_csv(
    f"../dist/sequencial_multi_agents/evals-gen-question-with-sequential-agent-{datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)

In [10]:
from evaluator.agents_evaluator import MahjongMultiAgentsEvaluator

# Multi-agent evaluation with runner_type="loop", run over `dataset`.
evaluator = MahjongMultiAgentsEvaluator(runner_type="loop")

# NOTE(review): `df` is reused by every experiment cell, so the save cell that
# follows writes whichever result was assigned last — run the cells in order.
df = evaluator.evals(dataset)

In [13]:
# to_csv does not create missing directories; make sure the target exists so
# the evaluation result is not lost to a write error.
os.makedirs("../dist/loop_multi_agents", exist_ok=True)
df.to_csv(
    f"../dist/loop_multi_agents/evals-gen-question-with-loop-multi-agents-{datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)

### 評価

In [None]:
import glob
import os


def _model_name_from_path(path: str) -> str:
    """Return the model label encoded in a result file name.

    Result files are named ``<prefix>-<model>-<YYYYMMDD>.csv``, so the label is
    the second-to-last dash-separated field of the file name. Only the base
    name is inspected, so a dash in a parent directory cannot shift the field.
    Falls back to the whole stem when the name has fewer than two fields.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    fields = stem.split("-")
    return fields[-2] if len(fields) >= 2 else stem


def result_by_model(name: str):
    """Summarise the evaluation CSVs stored under ``../dist/<name>/``.

    Every ``*.csv`` file in that directory is loaded, tagged with the model
    label taken from its file name, and aggregated per model. The CSVs must
    provide the columns ``correct``, ``is_error`` and ``error_type``.

    The list of files found and the resulting table are also printed.

    Args:
        name: Sub-directory of ``../dist`` to read, e.g. ``"zeroshot"``.

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``Total``, ``Correct``, ``Accuracy``, ``Errors``, ``Error_Rate``,
        followed by alphabetically sorted ``Error_<type>_Count`` /
        ``Error_<type>_Rate`` columns for every error type seen (0 where a
        model never produced that type). The frame has no rows when the
        directory contains no CSV files.
    """
    base_cols = ["Model", "Total", "Correct", "Accuracy", "Errors", "Error_Rate"]

    # glob returns matches in arbitrary order; sort so that the row order of
    # the summary is the same on every run and platform.
    csv_files = sorted(glob.glob(f"../dist/{name}/*.csv"))
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {file}")

    if not csv_files:
        # pd.concat raises ValueError on an empty list; report "nothing to
        # summarise" as an empty table with the standard columns instead.
        return pd.DataFrame(columns=base_cols)

    # Load each file and tag its rows with the model label from the file name.
    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)
        df["model"] = _model_name_from_path(file)
        dfs.append(df)

    # Combine all files into a single frame.
    df = pd.concat(dfs, ignore_index=True)

    # One summary record per model.
    results = []

    for model in df["model"].unique():
        model_df = df[df["model"] == model]

        # Basic statistics. Every model taken from unique() has at least one
        # row, so the divisions below cannot hit a zero total.
        total_count = len(model_df)
        correct_count = model_df["correct"].sum()
        error_count = model_df["is_error"].sum()

        result = {
            "Model": model,
            "Total": total_count,
            "Correct": correct_count,
            "Accuracy": round(correct_count / total_count, 3),
            "Errors": error_count,
            "Error_Rate": round(error_count / total_count, 3),
        }

        # Count and rate per error type, relative to all rows of the model.
        error_rows = model_df[model_df["is_error"].eq(True)]
        for error_type, count in error_rows["error_type"].value_counts().items():
            result[f"Error_{error_type}_Count"] = count
            result[f"Error_{error_type}_Rate"] = round(count / total_count, 3)

        results.append(result)

    result_df = pd.DataFrame(results)

    # Fixed columns first, then the per-error-type columns alphabetically.
    error_cols = [col for col in result_df.columns if col not in base_cols]
    result_df = result_df[base_cols + sorted(error_cols)]

    # A model that never produced a given error type has NaN there; show 0.
    result_df = result_df.fillna(0)

    print("\n=== Results Summary ===")
    print(result_df)

    return result_df

In [15]:
result_by_model("zeroshot")

Found 2 CSV files:
  - ../dist/zeroshot/evals-gen-question-raw-prompt-gemini-20250707.csv
  - ../dist/zeroshot/evals-gen-question-raw-prompt-claude_sonnet-20250707.csv

=== モデルごとの正解率 ===
               総問題数  正解数   正解率
model                         
claude_sonnet    20    1  0.05
gemini           20    6  0.30

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20     2   0.1
gemini           20     2   0.1


In [16]:
result_by_model("cot_and_rule")

Found 2 CSV files:
  - ../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-claude_sonnet-20250707.csv
  - ../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-gemini-20250707.csv

=== モデルごとの正解率 ===
               総問題数  正解数   正解率
model                         
claude_sonnet    20    0  0.00
gemini           20    9  0.45

=== モデルごとのエラー率 ===
               総問題数  エラー数  エラー率
model                          
claude_sonnet    20     4   0.2
gemini           20     0   0.0


In [17]:
# result_by_model("tools")

In [18]:
result_by_model("sequencial_multi_agents")

Found 1 CSV files:
  - ../dist/sequencial_multi_agents/evals-gen-question-with-sequential-agent-20250707.csv

=== モデルごとの正解率 ===
       総問題数  正解数   正解率
model                 
agent    20   11  0.55

=== モデルごとのエラー率 ===
       総問題数  エラー数  エラー率
model                  
agent    20     1  0.05


In [20]:
result_by_model("loop_multi_agents")

Found 1 CSV files:
  - ../dist/loop_multi_agents/evals-gen-question-with-loop-multi-agents-20250707.csv

=== モデルごとの正解率 ===
        総問題数  正解数  正解率
model                 
agents    20   18  0.9

=== モデルごとのエラー率 ===
        総問題数  エラー数  エラー率
model                   
agents    20     0   0.0
