#### Langfuse Evaluation

In [46]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment
# so that later cells can pick up API keys and hosts from os.environ.
load_dotenv()

True

In [47]:
from langfuse import Langfuse
from langfuse.langchain import CallbackHandler

# Credentials come from the environment (populated by load_dotenv() above)
# rather than being hardcoded, so secrets are never stored in the notebook.
# NOTE: the key pair that used to be committed here should be rotated.
langfuse = Langfuse(
    public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
    secret_key=os.environ["LANGFUSE_SECRET_KEY"],
    # Fall back to the local instance this notebook was originally pointed at.
    host=os.getenv("LANGFUSE_HOST", "http://localhost:3000"),
)

# Callback handler that can be passed to LangChain invocations.
langfuse_handler = CallbackHandler()

In [48]:
from langfuse import get_client

langfuse = get_client()

# Verify connection: report whether the client can authenticate against the host.
print(
    "Langfuse client is authenticated and ready!"
    if langfuse.auth_check()
    else "Authentication failed. Please check your credentials and host."
)

Langfuse client is authenticated and ready!


In [None]:
# Create the "capital_cities" dataset in Langfuse; later cells upload items
# to it by name and fetch it again to run experiments.
langfuse.create_dataset(name="capital_cities")

In [None]:
# Evaluation examples: each item maps a country (the app input) to the capital
# that the model is expected to answer with.
# NOTE(review): the "Westeros" -> "Meereen" pair is not a real country/capital;
# presumably an intentional trick/negative case -- confirm before relying on it.
local_items = [
    {"input": {"country": country}, "expected_output": capital}
    for country, capital in (
        ("Italy", "Rome"),
        ("Spain", "Madrid"),
        ("Brazil", "Brasília"),
        ("Japan", "Tokyo"),
        ("India", "New Delhi"),
        ("Canada", "Ottawa"),
        ("South Korea", "Seoul"),
        ("Argentina", "Buenos Aires"),
        ("South Africa", "Pretoria"),
        ("Egypt", "Cairo"),
        ("Westeros", "Meereen"),
    )
]

In [51]:
# Upload to Langfuse
for item in local_items:
    # A deterministic id makes this cell safe to re-run: Langfuse upserts dataset
    # items on their id, so re-execution updates the existing item instead of
    # appending a duplicate. Ids are project-wide, hence the dataset-name prefix.
    country_slug = item["input"]["country"].lower().replace(" ", "-")
    langfuse.create_dataset_item(
        dataset_name="capital_cities",
        id=f"capital_cities-{country_slug}",
        # any python object or value
        input=item["input"],
        # any python object or value, optional
        expected_output=item["expected_output"],
    )

In [52]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI


def run_my_langchain_llm_app(input, system_message, callback_handler):
    """Send one user message to the chat model under the given system prompt.

    Args:
        input: Text of the user message (the experiments pass a country name).
        system_message: System prompt text; it is sent verbatim.
        callback_handler: LangChain callback handler attached to the invocation
            (the experiments pass the Langfuse handler).

    Returns:
        The ``content`` of the model's response message.
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            # Use a concrete message instead of a ("system", text) template
            # tuple: template strings are parsed for {placeholders}, so a system
            # prompt containing literal braces would fail at invoke time with a
            # missing-variable error.
            SystemMessage(content=system_message),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    chat = ChatOpenAI(model="gpt-4o-mini")
    chain = prompt | chat

    res = chain.invoke(
        {"messages": [HumanMessage(content=input)]}, config={"callbacks": [callback_handler]}
    )

    return res.content

In [53]:
def simple_evaluation(output, expected_output):
    """Return whether ``output`` equals ``expected_output`` (strict ``==`` comparison)."""
    is_exact_match = output == expected_output
    return is_exact_match

In [54]:
from langfuse.langchain import CallbackHandler


def run_langchain_experiment(experiment_name, system_prompt, dataset_name="capital_cities"):
    """Run the LangChain app over every item of a Langfuse dataset and score it.

    For each dataset item, the app is called inside ``item.run(...)`` so the
    resulting trace is linked to the run, and an ``exact_match`` score comparing
    the output with the item's expected output is attached to that trace.

    Args:
        experiment_name: Run name under which the item traces are recorded.
        system_prompt: System prompt passed to the app for every item.
        dataset_name: Name of the Langfuse dataset to evaluate. Defaults to
            "capital_cities"; items are assumed to carry a "country" key in
            their input.
    """
    dataset = langfuse.get_dataset(dataset_name)

    # Initialize the Langfuse handler (shared by all items of this run)
    langfuse_handler = CallbackHandler()

    for item in dataset.items:
        # Use the item.run() context manager
        with item.run(
            run_name=experiment_name,
        ) as root_span:  # root_span is the root span of the new trace for this item and run.
            # All subsequent langfuse operations within this block are part of this trace.

            # Call your application logic
            output = run_my_langchain_llm_app(
                item.input["country"], system_prompt, langfuse_handler
            )

            # Optionally, score the result against the expected output
            root_span.score_trace(
                name="exact_match", value=simple_evaluation(output, item.expected_output)
            )

    print(f"\nFinished processing dataset '{dataset_name}' for run '{experiment_name}'.")

In [None]:
# Each pair is (run name, system prompt); the runs execute in the order listed
# so the prompt variants can be compared against the same dataset.
for run_name, prompt_text in (
    (
        "langchain_famous_city",
        "The user will input countries, respond with the most famous city in this country",
    ),
    ("langchain_directly_ask", "What is the capital of the following country?"),
    (
        "langchain_asking_specifically",
        "The user will input countries, respond with only the name of the capital",
    ),
    (
        "langchain_asking_specifically_2nd_try",
        "The user will input countries, respond with only the name of the capital. State only the name of the city.",
    ),
):
    run_langchain_experiment(run_name, prompt_text)


Finished processing dataset 'capital_cities' for run 'langchain_famous_city'.

Finished processing dataset 'capital_cities' for run 'langchain_directly_ask'.

Finished processing dataset 'capital_cities' for run 'langchain_asking_specifically'.

Finished processing dataset 'capital_cities' for run 'langchain_asking_specifically_2nd_try'.
