In [3]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any client is built.
# NOTE(review): presumably this is where the Langfuse / OpenAI credentials come from — confirm.
load_dotenv()

True

In [4]:
import os
# Model name used by the evaluator LLMs; read back later via os.environ.get('EVAL_MODEL').
os.environ["EVAL_MODEL"] = "gpt-4o-mini"

# LangChain evaluation criteria. A truthy flag enables scoring for that criterion;
# set a flag to False to skip it.
EVAL_TYPES = dict(
    hallucination=True,
    conciseness=True,
    relevance=True,
    coherence=True,
    harmfulness=True,
    maliciousness=True,
    helpfulness=True,
    controversiality=True,
    misogyny=True,
    criminality=True,
    insensitivity=True,
)

In [5]:
from langfuse import Langfuse
 
# Shared Langfuse client used by the fetch and scoring cells below.
# NOTE(review): constructed with no arguments, so credentials/host presumably come
# from environment variables — confirm.
langfuse = Langfuse()

# Check that the client can authenticate; as the last expression of the cell,
# the result is displayed as the cell output.
langfuse.auth_check()

True

データの取得

In [6]:
def fetch_all_pages(name=None, user_id=None, limit=50):
    """Collect generations from Langfuse by walking the paginated API.

    Args:
        name: Forwarded to ``langfuse.get_generations`` as ``name``.
        user_id: Forwarded to ``langfuse.get_generations`` as ``user_id``.
        limit: Page size requested on every call.

    Returns:
        list: The ``data`` items of every non-empty page, in page order.
    """

    def _page_items(page_number):
        # One request per page; only the ``data`` payload is of interest here.
        return langfuse.get_generations(
            name=name, limit=limit, user_id=user_id, page=page_number
        ).data

    collected = []
    page_number = 1  # pages are requested starting from 1
    while True:
        batch = _page_items(page_number)
        if not batch:
            # The first empty page marks the end of the result set.
            return collected
        collected.extend(batch)
        page_number += 1

In [7]:
# Fetch all generations with no name/user filter.
# To restrict the run to a single user instead: fetch_all_pages(user_id='session-1234')
generations = fetch_all_pages()


In [11]:
# Preview only the first few records: displaying the whole list floods the cell
# output with every prompt and completion that was fetched.
generations[:3]

[ObservationsView(id='cdd4ab68-89e2-4011-9165-de4e785b0cdb', trace_id='509c5557-a118-47e3-ae5a-d7bd69eb7a27', type='GENERATION', name='ChatOpenAI', start_time=datetime.datetime(2024, 8, 17, 11, 36, 43, 851000, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2024, 8, 17, 11, 36, 46, 15000, tzinfo=datetime.timezone.utc), completion_start_time=None, model='gpt-4o-mini', model_parameters={'temperature': '0.0'}, input=[{'role': 'system', 'content': "You are an assistant for question-answering tasks.     Use the following pieces of retrieved context to answer the question.     If you don't know the answer, just say that you don't know.     Use three sentences maximum and keep the answer concise.\n    The advanced camera system on iPhone 15 and iPhone 15 Plus helps users capture everyday moments and cherished memories. A 48MP Main camera shoots sharp photos and videos while capturing fine details, with a quad-pixel sensor and 100 percent Focus Pixels for fast autofocus. Using the po

In [None]:
# Number of fetched generations. `generations` is a list, and list.count() requires
# an argument (it counts occurrences of a value), so len() is the correct call here.
len(generations)

評価関数を設定する

In [10]:
from langchain.evaluation import load_evaluator
from langchain_openai import OpenAI, ChatOpenAI
from langchain.evaluation.criteria import LabeledCriteriaEvalChain
 
def get_evaluator_for_key(key: str):
    """Build a LangChain "criteria" evaluator for a single criterion.

    Args:
        key: Criterion name passed to ``load_evaluator`` as ``criteria``.

    Returns:
        The evaluator returned by ``load_evaluator``, backed by a ``ChatOpenAI``
        model whose name is read from the ``EVAL_MODEL`` environment variable.
    """
    model_name = os.environ.get('EVAL_MODEL')
    # Temperature 0 is requested for the judging model.
    judge_llm = ChatOpenAI(temperature=0, model=model_name)
    return load_evaluator("criteria", criteria=key, llm=judge_llm)
 
def get_hallucination_eval():
    """Build the labeled-criteria chain used for the hallucination check.

    Returns:
        A ``LabeledCriteriaEvalChain`` created with a single custom
        "hallucination" criterion and a ``ChatOpenAI`` model whose name is read
        from the ``EVAL_MODEL`` environment variable.
    """
    judge_llm = ChatOpenAI(temperature=0, model=os.environ.get('EVAL_MODEL'))

    # Custom criterion: the question the judging model is asked about each submission.
    hallucination_question = (
        "Does this submission contain information not present in the input or reference?"
    )
    hallucination_criteria = {"hallucination": hallucination_question}

    return LabeledCriteriaEvalChain.from_llm(llm=judge_llm, criteria=hallucination_criteria)

評価を実行する

In [11]:
def execute_eval_and_score():
    """Score every fetched generation against each enabled criterion.

    The "hallucination" criterion is excluded here because it is evaluated
    separately with a reference-aware chain. For each generation/criterion pair
    the evaluation result is printed and, when it carries both a score and a
    reasoning text, recorded in Langfuse as a score on the generation.
    """
    # Neither the enabled criteria nor their evaluators depend on the generation,
    # so build each evaluator once instead of once per generation.
    evaluators = {
        criterion: get_evaluator_for_key(criterion)
        for criterion, enabled in EVAL_TYPES.items()
        if enabled and criterion != "hallucination"
    }

    for generation in generations:
        for criterion, evaluator in evaluators.items():
            eval_result = evaluator.evaluate_strings(
                prediction=generation.output,
                input=generation.input,
            )
            print(eval_result)

            # Same guard as eval_hallucination: do not send a score to Langfuse
            # when the evaluator produced no score or no reasoning.
            if (
                eval_result is None
                or eval_result.get("score") is None
                or eval_result.get("reasoning") is None
            ):
                continue

            langfuse.score(
                name=criterion,
                trace_id=generation.trace_id,
                observation_id=generation.id,
                value=eval_result["score"],
                comment=eval_result["reasoning"],
            )


execute_eval_and_score()
 

{'reasoning': 'To assess whether the submission meets the criteria of conciseness, I will evaluate the following:\n\n1. **Understanding the Task**: The user input specifies that the assistant should respond with only the name of the capital city of the country provided. In this case, the user input is "Italy," and the expected response is the capital city of Italy.\n\n2. **Evaluating the Submission**: The assistant\'s response is "Rome." This is indeed the correct capital of Italy.\n\n3. **Conciseness**: The criterion of conciseness requires that the response be direct and to the point, without any unnecessary information. The assistant\'s response consists solely of the name of the capital city, "Rome," without any additional commentary or extraneous details.\n\n4. **Conclusion**: Since the assistant\'s response is exactly what was requested—just the name of the capital city, and nothing more—it is both accurate and concise.\n\nBased on this reasoning, the submission meets the criteri

KeyboardInterrupt: 

In [12]:
# hallucination
 
 
def eval_hallucination():
    """Run the hallucination check over every fetched generation.

    Each generation's output is evaluated with its own input supplied both as
    the input and as the reference. The result is printed, and recorded in
    Langfuse under the name 'hallucination' when it carries both a score and a
    reasoning text.
    """
    hallucination_chain = get_hallucination_eval()

    for gen in generations:
        result = hallucination_chain.evaluate_strings(
            prediction=gen.output,
            input=gen.input,
            reference=gen.input,
        )
        print(result)

        # Nothing to record without a score and its reasoning.
        if result is None or result["score"] is None or result["reasoning"] is None:
            continue

        langfuse.score(
            name='hallucination',
            trace_id=gen.trace_id,
            observation_id=gen.id,
            value=result["score"],
            comment=result['reasoning'],
        )
 

In [13]:
# Run the hallucination pass only when its flag is enabled. A plain truthiness
# test is used, matching how the other criteria flags in EVAL_TYPES are checked.
if EVAL_TYPES.get("hallucination"):
    eval_hallucination()

{'reasoning': 'To assess whether the submission meets the criteria regarding hallucination, I will analyze the provided data step by step.\n\n1. **Understanding the Input**: The input consists of a system message instructing the assistant to respond with the name of the capital city when a country is provided. The user input is "Italy".\n\n2. **Understanding the Submission**: The assistant\'s response is "Rome", which is the capital of Italy. The submission also includes an additional_kwargs field with \'refusal\' set to None, indicating that the assistant did not refuse to answer.\n\n3. **Criteria for Hallucination**: The criterion for hallucination is whether the submission contains information not present in the input or reference. In this case, the input only specifies the country "Italy".\n\n4. **Evaluating the Submission Against the Input**: The assistant\'s response "Rome" is a factual statement that corresponds directly to the user input of "Italy". There is no additional infor

KeyboardInterrupt: 