https://docs.smith.langchain.com/evaluation/tutorials/evaluation

# Evaluate a chatbot

# Setup

In [7]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

# Create a dataset

In [8]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "QA Example Dataset"
dataset = client.create_dataset(dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
            "inputs": {"question": "What is LangChain?"},
            "outputs": {"answer": "A framework for building LLM applications"},
        },
        {
            "inputs": {"question": "What is LangSmith?"},
            "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        },
        {
            "inputs": {"question": "What is OpenAI?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        },
        {
            "inputs": {"question": "What is Google?"},
            "outputs": {"answer": "A technology company known for search"},
        },
        {
            "inputs": {"question": "What is Mistral?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        }
    ]
)

{'example_ids': ['93847b75-9645-42c9-ba7d-904688d7bab3',
  '5b5d2e2a-e1ed-4472-9297-43b211993077',
  '294389ea-b75e-4348-b1a9-c0e3057fd0bc',
  '8276b532-d357-40f4-b80a-314c056a2f6c',
  'ff235f8a-85da-4569-82b8-ae622fbe4de4'],
 'count': 5}

# Define metrics

In [None]:
import openai
from langsmith import wrappers

openai_client = wrappers.wrap_openai(openai.OpenAI())

eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    user_content = f"""You are grading the following question:
                {inputs['question']}
                Here is the real answer:
                {reference_outputs['answer']}
                You are grading the following predicted answer:
                {outputs['response']}
                Respond with CORRECT or INCORRECT:
                Grade:
                """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).choices[0].message.content
    return response == "CORRECT"

In [10]:
def concision(outputs: dict, reference_outputs: dict) -> bool:
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

# Run Evaluations

In [11]:
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."

def my_app(question: str, model: str = "gpt-4o-mini", instructions: str = default_instructions) -> str:
    return openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    ).choices[0].message.content

In [12]:
def ls_target(inputs: str) -> dict:
    return {"response": my_app(inputs["question"])}

In [13]:
experiment_results = client.evaluate(
    ls_target, # Your AI system
    data=dataset_name, # The data to predict and grade over
    evaluators=[concision, correctness], # The evaluators to score the results
    experiment_prefix="openai-4o-mini", # A prefix for your experiment names to easily identify them
)

View the evaluation results for experiment: 'openai-4o-mini-e72c1e05' at:
https://smith.langchain.com/o/cb610865-43f4-4b0e-98d4-572b2702d7c7/datasets/53c78873-415e-4223-bb88-9c41ce2c4d00/compare?selectedSessions=b969e863-068b-44ad-8b43-8557790ec927




0it [00:00, ?it/s]

In [14]:
def ls_target_v2(inputs: str) -> dict:
    return {"response": my_app(inputs["question"], model="gpt-4-turbo")}

experiment_results = client.evaluate(
    ls_target_v2,
    data=dataset_name,
    evaluators=[concision, correctness],
    experiment_prefix="openai-4-turbo",
)

View the evaluation results for experiment: 'openai-4-turbo-e201ad13' at:
https://smith.langchain.com/o/cb610865-43f4-4b0e-98d4-572b2702d7c7/datasets/53c78873-415e-4223-bb88-9c41ce2c4d00/compare?selectedSessions=50aaef30-5064-4332-83b9-9d4447ff8c13




0it [00:00, ?it/s]

Error running target function: Error code: 403 - {'error': {'message': 'Project `proj_WGWSvViJeuMybNZuPlpT6up4` does not have access to model `gpt-4-turbo`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Traceback (most recent call last):
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/evaluation/_runner.py", line 1915, in _forward
    fn(
    ~~^
        *args,
        ^^^^^^
        langsmith_extra=langsmith_extra,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/run_helpers.py", line 634, in wrapper
    raise e
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/run_helpers.py", line 631, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "/tmp/ipykernel_222072/1529853405.py", line 2, in ls_target_v2


In [15]:
instructions_v3 = "Respond to the users question in a short, concise manner (one short sentence). Do NOT use more than ten words."

def ls_target_v3(inputs: str) -> dict:
    response = my_app(
        inputs["question"], 
        model="gpt-4-turbo",
        instructions=instructions_v3
    )
    return {"response": response}


experiment_results = client.evaluate(
    ls_target_v3,
    data=dataset_name,
    evaluators=[concision, correctness],
    experiment_prefix="strict-openai-4-turbo",
)

View the evaluation results for experiment: 'strict-openai-4-turbo-bf74528f' at:
https://smith.langchain.com/o/cb610865-43f4-4b0e-98d4-572b2702d7c7/datasets/53c78873-415e-4223-bb88-9c41ce2c4d00/compare?selectedSessions=8adabec8-df54-4b2e-94c9-df55d284812d




0it [00:00, ?it/s]

Error running target function: Error code: 403 - {'error': {'message': 'Project `proj_WGWSvViJeuMybNZuPlpT6up4` does not have access to model `gpt-4-turbo`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Traceback (most recent call last):
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/evaluation/_runner.py", line 1915, in _forward
    fn(
    ~~^
        *args,
        ^^^^^^
        langsmith_extra=langsmith_extra,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/run_helpers.py", line 634, in wrapper
    raise e
  File "/home/keny/Programs/Python/RAG/langsmitsh_tutorial/.venv/lib/python3.13/site-packages/langsmith/run_helpers.py", line 631, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
  File "/tmp/ipykernel_222072/1433557006.py", line 4, in ls_target_v3
