In [15]:
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Langfuse and OpenAI credentials
# used by the cells below — confirm against your .env file.
load_dotenv()

True

In [16]:
# import
from langfuse import Langfuse
import openai
 
# init: create the Langfuse client that every later cell uses.
# No arguments are passed here.
# NOTE(review): presumably keys/host are picked up from the environment — confirm.
langfuse = Langfuse()

In [17]:
# Create the Langfuse dataset that the items below are uploaded into.
# The trailing ";" suppresses the cell output for the returned object.
langfuse.create_dataset(name="capital_cities");

In [18]:
# Example dataset items. Each entry pairs an "input" ({"country": ...}) with the
# "expected_output" (the capital as a plain string) that the evaluation compares
# the model answer against. The expected output could also be JSON instead of a string.
local_items = [
    {"input": {"country": "Italy"}, "expected_output": "Rome"},
    {"input": {"country": "Spain"}, "expected_output": "Madrid"},
    {"input": {"country": "Brazil"}, "expected_output": "Brasília"},
    {"input": {"country": "Japan"}, "expected_output": "Tokyo"},
    {"input": {"country": "India"}, "expected_output": "New Delhi"},
    {"input": {"country": "Canada"}, "expected_output": "Ottawa"},
    {"input": {"country": "South Korea"}, "expected_output": "Seoul"},
    {"input": {"country": "Argentina"}, "expected_output": "Buenos Aires"},
    {"input": {"country": "South Africa"}, "expected_output": "Pretoria"},
    {"input": {"country": "Egypt"}, "expected_output": "Cairo"},
]

In [19]:
# Display the items as the cell result to sanity-check them before uploading.
local_items

[{'input': {'country': 'Italy'}, 'expected_output': 'Rome'},
 {'input': {'country': 'Spain'}, 'expected_output': 'Madrid'},
 {'input': {'country': 'Brazil'}, 'expected_output': 'Brasília'},
 {'input': {'country': 'Japan'}, 'expected_output': 'Tokyo'},
 {'input': {'country': 'India'}, 'expected_output': 'New Delhi'},
 {'input': {'country': 'Canada'}, 'expected_output': 'Ottawa'},
 {'input': {'country': 'South Korea'}, 'expected_output': 'Seoul'},
 {'input': {'country': 'Argentina'}, 'expected_output': 'Buenos Aires'},
 {'input': {'country': 'South Africa'}, 'expected_output': 'Pretoria'},
 {'input': {'country': 'Egypt'}, 'expected_output': 'Cairo'}]

In [20]:
# Upload to Langfuse: one dataset item is created per local example.
# Both `input` and `expected_output` accept any Python object or value;
# `expected_output` is optional.
for item in local_items:
  langfuse.create_dataset_item(
      dataset_name="capital_cities",
      input=item["input"],
      expected_output=item["expected_output"],
  )

### アプリケーションを定義し、実験を実行する

In [21]:
# A deliberately minimal evaluator for this demo; any eval library can be used instead.
# See https://langfuse.com/docs/scores/model-based-evals for details.
def simple_evaluation(output, expected_output):
  """Return the result of comparing `output` to `expected_output` with `==`.

  No normalisation is applied: differences in case, whitespace or
  punctuation make the comparison fail.
  """
  is_exact_match = output == expected_output
  return is_exact_match

カスタムアプリケーション

In [22]:
from datetime import datetime
 
def run_my_custom_llm_app(input, system_prompt, model="gpt-4o-mini"):
  """Ask the chat model about one country and log the call as a Langfuse generation.

  Args:
    input: Mapping with a "country" key; its value is sent as the user message.
    system_prompt: Text sent as the system message.
    model: Chat model name. The same value is used for the OpenAI request and
      for the Langfuse generation record, so the two cannot drift apart.

  Returns:
    A tuple `(completion_text, langfuse_generation)`: the content of the first
    choice's message, and the object returned by `langfuse.generation(...)`.
  """
  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": input["country"]},
  ]

  # Taken before the request so the logged generation covers the whole model call.
  generation_start_time = datetime.now()

  response = openai.chat.completions.create(model=model, messages=messages)
  completion_text = response.choices[0].message.content

  langfuse_generation = langfuse.generation(
    name="guess-countries",
    input=messages,
    output=completion_text,
    model=model,
    start_time=generation_start_time,
    end_time=datetime.now(),
  )

  return completion_text, langfuse_generation

In [23]:
def run_experiment(experiment_name, system_prompt):
  """Run the custom app over every item of the "capital_cities" dataset.

  For each dataset item this calls `run_my_custom_llm_app` with the item's
  input, links the returned generation to the item under `experiment_name`,
  and attaches an "exact_match" score computed by `simple_evaluation`.

  Args:
    experiment_name: Run name passed to `item.link`.
    system_prompt: System prompt forwarded to `run_my_custom_llm_app`.
  """
  capital_cities = langfuse.get_dataset("capital_cities")

  for dataset_item in capital_cities.items:
    answer, generation = run_my_custom_llm_app(dataset_item.input, system_prompt)

    # Pass the observation/generation object (or its id) to tie it to this run.
    dataset_item.link(generation, experiment_name)

    is_match = simple_evaluation(answer, dataset_item.expected_output)
    generation.score(name="exact_match", value=is_match)

In [24]:
# (experiment name, system prompt) pairs, executed in the listed order.
# NOTE(review): the second sentence of the last prompt and its trailing "。."
# look like translation artifacts of "State only the name of the city." —
# kept verbatim here; confirm the intended wording.
custom_app_experiments = [
    ("famous_city", "ユーザーは国名を入力し、その国で最も有名な都市名を回答する。"),
    ("directly_ask", "次の国の首都はどこですか??"),
    ("asking_specifically", "ユーザーは国名を入力し、首都名のみで応答する"),
    (
        "asking_specifically_2nd_try",
        "ユーザーは国名を入力し、首都名のみで回答します。州名は都市名のみを記載します。.",
    ),
]

for experiment_name, system_prompt in custom_app_experiments:
  run_experiment(experiment_name, system_prompt)

LangChainアプリケーション

In [25]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage
 
def run_my_langchain_llm_app(input, system_message, callback_handler):
  """Send one human message through a `prompt | ChatOpenAI` chain.

  Args:
    input: Content of the single HumanMessage that fills the "messages" placeholder.
    system_message: Text of the system message at the top of the prompt.
    callback_handler: Callback passed to the invocation via `config["callbacks"]`.

  Returns:
    Whatever `chain.invoke(...)` returns, unmodified.
  """
  prompt_template = ChatPromptTemplate.from_messages([
      ("system", system_message),
      MessagesPlaceholder(variable_name="messages"),
  ])
  llm = ChatOpenAI(model="gpt-4o-mini")
  pipeline = prompt_template | llm

  chain_inputs = {"messages": [HumanMessage(content=input)]}
  run_config = {"callbacks": [callback_handler]}
  return pipeline.invoke(chain_inputs, config=run_config)

In [26]:
def run_langchain_experiment(experiment_name, system_message):
  """Run the LangChain app over every item of the "capital_cities" dataset.

  For each dataset item this obtains a LangChain callback handler from the
  item (run name = `experiment_name`), invokes `run_my_langchain_llm_app` with
  the item's country, and scores the handler's trace with "exact_match".

  Args:
    experiment_name: Run name passed to `item.get_langchain_handler`.
    system_message: System message forwarded to `run_my_langchain_llm_app`.
  """
  dataset = langfuse.get_dataset("capital_cities")

  for item in dataset.items:
    handler = item.get_langchain_handler(run_name=experiment_name)

    completion = run_my_langchain_llm_app(item.input["country"], system_message, handler)

    # The chain ends in a chat model, so `completion` is a message object, not
    # a string. Comparing that object to the expected string never matches, so
    # evaluate the message text instead. A plain-string completion is passed
    # through unchanged.
    completion_text = getattr(completion, "content", completion)

    handler.trace.score(
      name="exact_match",
      value=simple_evaluation(completion_text, item.expected_output)
    )

In [28]:
# (experiment name, system message) pairs, executed in the listed order.
langchain_experiments = [
    (
        "langchain_famous_city",
        "The user will input countries, respond with the most famous city in this country",
    ),
    (
        "langchain_directly_ask",
        "What is the capital of the following country?",
    ),
    (
        "langchain_asking_specifically",
        "The user will input countries, respond with only the name of the capital",
    ),
    (
        "langchain_asking_specifically_2nd_try",
        "The user will input countries, respond with only the name of the capital. State only the name of the city.",
    ),
]

for experiment_name, system_message in langchain_experiments:
  run_langchain_experiment(experiment_name, system_message)