In [1]:
import os
from dotenv import load_dotenv

# Pull variables from the local .env file into the process environment.
load_dotenv()

# Provider keys that must be present, checked in this order; the first one
# that is missing (or empty) aborts the notebook with a ValueError.
for required_key in (
    # "OPENAI_API_KEY",  # check currently disabled
    "ANTHROPIC_API_KEY",
    "GOOGLE_API_KEY",
    "DEEPSEEK_API_KEY",
):
    if not os.getenv(required_key):
        raise ValueError(f"{required_key} is not set in .env file")

In [None]:
import sys
import json
import asyncio
from typing import List, Dict, Any
from datetime import datetime
import pandas as pd

# LangChain関連
from langchain.llms import OpenAI
from langchain.schema import HumanMessage
from langchain.callbacks import get_openai_callback
from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI

# OpenAI Evals関連
from evals.eval import Eval
from evals.registry import Registry
from evals.data import get_jsonl

# Add the project root to the Python path so the local packages imported
# below (`generator`, `prompts`, `evalutor`) can be resolved.
# `__file__` is not defined inside a notebook, so the root is taken to be the
# parent of the current working directory.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:  # keep the cell idempotent across re-runs
    sys.path.append(project_root)
from generator.generator import MahjongQuestionGenerator
from prompts.prompts import (
    generate_question_prompt_template,
    generate_question_with_cot_prompt_template,
    generate_question_with_cot_and_rule_prompt_template,
    generate_question_with_tools_prompt_template
)
from evalutor.evals import MahjongEvaluator, MultiModelEvaluator

In [3]:
# Surface INFO-level (and more severe) log records in the notebook output so
# error messages are visible.
import logging

logger = logging.getLogger()  # no name given -> the root logger
logging.basicConfig(level=logging.INFO)

In [4]:
# Model setup (API keys are required).
# All clients are created with temperature=0. Any construction failure is
# printed and then re-raised so the notebook stops here.
try:
    # OpenAI GPT-4 (currently disabled)
    # gpt4 = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Claude (requires the Anthropic API key)
    claude_sonnet = ChatAnthropic(model="claude-sonnet-4-20250514", temperature=0)
    # claude_opus = ChatAnthropic(model="claude-opus-4-20250514", temperature=0)
    claude_3_7_sonnet = ChatAnthropic(model="claude-3-7-sonnet-20250219", temperature=0)

    # Google Gemini
    gemini = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=0)

    # DeepSeek
    deepseek = ChatDeepSeek(model="deepseek-chat", temperature=0)

except Exception as init_error:
    print(f"モデル初期化エラー: {init_error}")
    raise init_error

In [5]:
# # 評価システム初期化
# generator = MahjongQuestionGenerator(
#     model=claude_3_7_sonnet, query_template=generate_question_prompt_template
# )

# # Use the async version in Jupyter notebook
# generator.generate_question(query="答えが2翻20符になる問題を作ってください")

In [None]:
# Initialize the question generator with tools enabled, backed by Gemini and
# the tool-oriented prompt template.
generator = MahjongQuestionGenerator(
    model=gemini,
    use_tools=True,
    query_template=generate_question_with_tools_prompt_template,
)

# Request a problem whose answer is 2 han / 70 fu; the return value is shown
# as the cell output.
# NOTE(review): the previous comment said "use the async version in Jupyter",
# but this call is not awaited - confirm which variant is intended.
generator.generate_question(query="答えが2翻70符になる問題を作ってください")

In [7]:
# dataset = [
#     {
#         "query": "答えが2翻40符になる問題を作ってください",
#         "answer": {
#             "fu": 40,
#             "han": 2
#         }
#     }
# ]

In [8]:
# evaluator = MahjongEvaluator(generator=generator)
# df = evaluator.evals(dataset=dataset)
# df

In [9]:
# e = MultiModelEvaluator(models=[claude_sonnet], query_template=generate_question_prompt_template)
# df = e.evals(dataset)
# df

In [10]:
# Load the evaluation queries from the dataset directory.
# The encoding is pinned to UTF-8: without it, open() uses the platform's
# locale encoding, which can fail on or garble non-ASCII text (the queries
# are presumably Japanese, like the examples in this notebook - confirm).
with open("../dataset/queries.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

In [11]:
# dataset

In [12]:
# Models to process, keyed by a short label (the commented-out evaluation
# loops in this notebook use that label in the output CSV file name).
# Currently disabled entries: claude_opus, deepseek.
models = dict(
    claude_sonnet=claude_sonnet,
    # claude_opus=claude_opus,
    gemini=gemini,
    # deepseek=deepseek,
)

In [13]:
# for k, m in models.items():
#     e = MultiModelEvaluator(
#         models=[m], query_template=generate_question_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/zeroshot/evals-gen-question-raw-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )
#     print(f"{k} done :)")

In [14]:
# for k, m in models.items():
#     e = MultiModelEvaluator(
#         models=[m], query_template=generate_question_with_cot_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/cot/evals-gen-question-with-cot-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )
#     print(f"{k} done :)")

In [15]:
# for k, v in models.items():
#     e = MultiModelEvaluator(
#         models=[v], query_template=generate_question_with_cot_and_rule_prompt_template
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/cot_and_rule/evals-gen-question-with-cot-and-rule-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )

In [16]:
# for k, v in models.items():
#     e = MultiModelEvaluator(
#         models=[v], query_template=generate_question_with_tools_prompt_template, use_tools=True
#     )
#     df = e.evals(dataset)

#     df.to_csv(
#         f"../dist/tools/evals-gen-question-with-tools-prompt-{k}-{datetime.now().strftime('%Y%m%d')}.csv",
#         index=False,
#     )

In [17]:
import glob
import pandas as pd

def result_by_model(name: str):
    """Print per-model accuracy and error-rate tables for one result directory.

    Reads every CSV file under ``../dist/{name}/`` (all files, regardless of
    the date in the file name), tags each row with the model name parsed from
    its file name, concatenates the rows, and prints two summaries grouped by
    model: one over the ``correct`` column and one over the ``is_error``
    column (count, sum and mean, rounded to 3 decimals).

    Args:
        name: Sub-directory of ``../dist`` to summarize (e.g. ``"zeroshot"``).

    Returns:
        None. The listing and both tables are printed. When the directory
        contains no CSV files a short notice is printed instead of raising.
    """
    # Sorted so the listing and concatenation order do not depend on the
    # order in which the filesystem happens to return entries.
    csv_files = sorted(glob.glob(f"../dist/{name}/*.csv"))
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {file}")

    if not csv_files:
        # pd.concat raises ValueError on an empty list; report and stop.
        print(f"No CSV files under ../dist/{name}/ - nothing to summarize.")
        return

    # Load each file and label its rows with the model it belongs to.
    frames = []
    for file in csv_files:
        frame = pd.read_csv(file)
        # The model name is taken as the second-to-last dash-separated field
        # of the path, e.g. "...-claude_sonnet-20250619.csv" -> "claude_sonnet".
        frame['model'] = file.split('-')[-2]
        frames.append(frame)

    # Combine all rows into a single frame.
    df = pd.concat(frames, ignore_index=True)

    accuracy_by_model = df.groupby('model')['correct'].agg(['count', 'sum', 'mean']).round(3)
    accuracy_by_model.columns = ['総問題数', '正解数', '正解率']
    print("\n=== モデルごとの正解率 ===")
    print(accuracy_by_model)

    error_rate_by_model = df.groupby('model')['is_error'].agg(['count', 'sum', 'mean']).round(3)
    error_rate_by_model.columns = ['総問題数', 'エラー数', 'エラー率']
    print("\n=== モデルごとのエラー率 ===")
    print(error_rate_by_model)


In [None]:
# Print the per-model accuracy and error-rate tables for the CSVs in ../dist/zeroshot/.
result_by_model("zeroshot")

In [None]:
# Print the per-model accuracy and error-rate tables for the CSVs in ../dist/cot/.
result_by_model("cot")

In [None]:
# Print the per-model accuracy and error-rate tables for the CSVs in ../dist/cot_and_rule/.
result_by_model("cot_and_rule")

In [None]:
# Print the per-model accuracy and error-rate tables for the CSVs in ../dist/tools/.
result_by_model("tools")