In [1]:

import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast if any provider credential is missing (or empty) once
# load_dotenv() has run. Keys are checked in this fixed order, so the first
# missing one is the one reported.
for _required_key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY"):
    if not os.getenv(_required_key):
        raise ValueError(f"{_required_key} is not set in .env file")



In [None]:


import sys
import json
import asyncio
from typing import List, Dict, Any
from datetime import datetime
import pandas as pd

# LangChain関連
from langchain.llms import OpenAI
from langchain.schema import HumanMessage
from langchain.callbacks import get_openai_callback
from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI

# OpenAI Evals関連
from evals.eval import Eval
from evals.registry import Registry
from evals.data import get_jsonl

# Add the project root directory to the Python import path so the
# project-local `libs` package can be imported below.
# The root is taken to be the parent of the current working directory; the
# path is resolved from os.getcwd() explicitly rather than from a '__file__'
# string literal, whose dirname is always the empty string.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from libs.generator import MahjongQuestionGenerator, generate_question_prompt_template
from libs.evals import MahjongEvaluator, MultiModelEvaluator

In [3]:
# Model setup (each client needs its provider's API key).
try:
    # OpenAI GPT-4o
    gpt4 = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Anthropic Claude (requires the Anthropic API key)
    claude = ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0)

    # Google Gemini
    gemini = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

except Exception as e:
    # Show the failure in the cell output, then re-raise the active exception
    # unchanged (bare `raise`) so the notebook stops here.
    print(f"モデル初期化エラー: {e}")
    raise

In [4]:
# Initialise the evaluation system: a question generator driven by the Claude
# client and the shared question prompt template.
generator = MahjongQuestionGenerator(
    model=claude,
    query_template=generate_question_prompt_template,
)

# generator.generate_question(query="答えが2翻20符になる問題を作ってください")

In [5]:
# dataset = [
#     {
#         "query": "答えが2翻40符になる問題を作ってください",
#         "answer": {
#             "fu": 40,
#             "han": 2
#         }
#     }
# ]

In [6]:
# evaluator = MahjongEvaluator(generator=generator)
# df = evaluator.evals(dataset=dataset)
# df

In [None]:
# e = MultiModelEvaluator(models=[claude, gemini], query_template=generate_question_prompt_template)
# df = e.evals(dataset)
# df

In [9]:
# Load the evaluation queries from the dataset directory next to this
# notebook's folder.
# The encoding is passed explicitly because the queries contain Japanese text
# and the platform default encoding is not guaranteed to be UTF-8
# (assumes queries.json is UTF-8, the JSON default — TODO confirm).
# NOTE(review): the rendered `dataset` output in this notebook shows a record
# whose query asks for 3 han / 70 fu while its answer is
# {'fu': 70, 'han': 5} — verify that entry in queries.json.
with open('../dataset/queries.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [10]:
dataset

[{'query': '答えが2飜30符になる問題を作ってください', 'answer': {'fu': 30, 'han': 2}},
 {'query': '答えが3飜50符になる問題を作ってください', 'answer': {'fu': 50, 'han': 3}},
 {'query': '答えが4飜60符になる問題を作ってください', 'answer': {'fu': 60, 'han': 4}},
 {'query': '答えが3飜70符になる問題を作ってください', 'answer': {'fu': 70, 'han': 5}},
 {'query': '答えが1飜10符になる問題を作ってください', 'answer': {'fu': 10, 'han': 1}}]

In [None]:
# Run the multi-model evaluation (Claude and Gemini, sharing one prompt
# template) over `dataset`, then display the returned result as cell output.
e = MultiModelEvaluator(
    models=[claude, gemini],
    query_template=generate_question_prompt_template,
)
df = e.evals(dataset)
df

In [None]:
# Accuracy by model: mean of the 'correct' column within each 'model' group.
accuracy_by_model = df.groupby('model')['correct'].mean()
print(accuracy_by_model)