**Make sure you load the API keys for cloud providers!**

You can set the API keys as environment variables yourself or load them with a helper script, as the next cell does via `set_environment()` from the `config` module. Because the keys are private, they are not included in the repository.

In [13]:
# Load the provider API keys into environment variables.
import sys
import os

# Put the parent of the current working directory first on sys.path so that
# the `config` module imported below can be found there
# (presumably that is where it lives - verify against the repository layout).
sys.path.insert(0, os.path.abspath('..'))

from config import set_environment
# Presumably exports the API keys as environment variables, as explained
# early in chapter 2 - confirm in the config module.
set_environment()

# Chain-of-Thought Evaluation

In [2]:
from langchain.evaluation import load_evaluator

# The question the agent was asked; passed to the evaluator as `input`.
question = "What is the current Federal Reserve interest rate and why does it matter?"

# Simulated chain-of-thought answer produced by the agent (the text being graded).
agent_reasoning = (
    "The current interest rate is 0.25%. I determined this by recalling that recent monetary policies have aimed "
    "to stimulate economic growth by keeping borrowing costs low. A rate of 0.25% is consistent with the ongoing "
    "trend of low rates, which encourages consumer spending and business investment."
)

# Reference description of what a good answer should contain.
expected_reasoning = (
    "An ideal reasoning should mention that the Federal Reserve has maintained a low interest rate—around 0.25%—to "
    "support economic growth, and it should briefly explain the implications for borrowing costs and consumer spending."
)

# Build the "cot_qa" evaluator and grade the agent's answer against the reference.
cot_evaluator = load_evaluator("cot_qa")
result_reasoning = cot_evaluator.evaluate_strings(
    prediction=agent_reasoning,
    reference=expected_reasoning,
    input=question,
)

# Heading on its own line (preceded by a blank line), then the result dict.
print("\nChain-of-Thought Reasoning Evaluation:", result_reasoning, sep="\n")


Chain-of-Thought Reasoning Evaluation:
{'reasoning': "The student correctly identifies the current Federal Reserve interest rate as 0.25%. They also correctly explain that this low rate is intended to stimulate economic growth by keeping borrowing costs low. They further explain that this encourages consumer spending and business investment, which aligns with the context provided. Therefore, the student's answer is factually accurate and complete.\nGRADE: CORRECT", 'value': 'CORRECT', 'score': 1}


# Agent Trajectory Evaluation

In [16]:
from langsmith import Client
def trajectory_subsequence(outputs: dict, reference_outputs: dict) -> float:
    """Score how much of the reference trajectory the agent followed, in order.

    The agent's trajectory is scanned once and the reference steps are matched
    greedily as an ordered subsequence: extra agent steps between matches are
    allowed, but reference steps must appear in the reference order.

    Args:
        outputs: Agent outputs; must contain a ``'trajectory'`` sequence.
        reference_outputs: Expected outputs; must contain a ``'trajectory'``
            sequence listing the desired steps in the desired order.

    Returns:
        The fraction (0.0 to 1.0) of reference steps matched in order.
        Returns 0.0 when the agent took fewer steps than the reference lists
        (no partial credit in that case), and 1.0 when the reference
        trajectory is empty, since there is nothing left to match.

    Raises:
        KeyError: If either mapping has no ``'trajectory'`` key.
    """
    expected = reference_outputs['trajectory']
    actual = outputs['trajectory']

    # An empty reference imposes no requirements; without this guard the final
    # division would raise ZeroDivisionError.
    if len(expected) == 0:
        return 1.0

    # A trajectory shorter than the reference can never contain all of it.
    if len(expected) > len(actual):
        return 0.0

    matched = 0
    for step in actual:
        if expected[matched] == step:
            matched += 1
            # Every reference step has been found; no need to keep scanning.
            if matched == len(expected):
                break

    return matched / len(expected)

# Create (or reuse) the example dataset with expected trajectories.
# Dataset names must be unique, so calling create_dataset unconditionally makes
# this cell fail with a conflict when it is re-run. Look the dataset up first
# so the cell stays idempotent.
client = Client()
DATASET_NAME = "Healthcare Agent Trajectory Evaluation"

if client.has_dataset(dataset_name=DATASET_NAME):
    trajectory_dataset = client.read_dataset(dataset_name=DATASET_NAME)
else:
    trajectory_dataset = client.create_dataset(
        DATASET_NAME,
        description="Evaluates agent trajectory for medication queries"
    )

# Add the example with its expected trajectory only when the dataset has no
# examples yet; otherwise every re-run would append a duplicate and skew the
# averaged evaluation score.
existing_example = next(
    iter(client.list_examples(dataset_id=trajectory_dataset.id)), None
)
if existing_example is None:
    client.create_example(
        inputs={
            "question": "What is the recommended dosage of ibuprofen for an adult?"
        },
        outputs={
            "trajectory": [
                "intent_classifier",
                "healthcare_agent",
                "MedicalDatabaseSearch",
                "format_response"
            ],
            "response": "Typically, 200-400mg every 4-6 hours, not exceeding 3200mg per day."
        },
        dataset_id=trajectory_dataset.id
    )

## Run evaluation with our custom trajectory evaluator

In [17]:
# Run the experiment over the trajectory dataset, scoring each run with the
# custom trajectory_subsequence evaluator.
# `aevaluate` is awaited directly: top-level `await` works in a notebook cell;
# in a plain script, wrap this call in an async function and run it with asyncio.
# NOTE(review): `run_graph_with_trajectory` is not defined in any visible cell -
# confirm it is defined before this cell when running on a fresh kernel.
experiment_results = await client.aevaluate(
    run_graph_with_trajectory,
    data=trajectory_dataset.id,
    evaluators=[trajectory_subsequence],
    experiment_prefix="healthcare-agent-trajectory",
    num_repetitions=1,
    max_concurrency=4,
)

# The results are summarized in the next cell via experiment_results.to_pandas();
# the evaluator's scores are read from the 'feedback.trajectory_subsequence'
# column (not a bare 'trajectory_subsequence' column).

View the evaluation results for experiment: 'healthcare-agent-trajectory-b9022eb9' at:
https://smith.langchain.com/o/750cc20f-7859-4635-9c3e-f4be486b0bea/datasets/fd143397-c3d1-4dbf-8dff-42b30e2560a2/compare?selectedSessions=5573ca38-e4e1-475f-a549-cd946e1aee31




0it [00:00, ?it/s]

In [18]:
# Convert the experiment results to a DataFrame and report the mean of the
# 'feedback.trajectory_subsequence' column, which holds the scores returned by
# the trajectory_subsequence evaluator.
results_df = experiment_results.to_pandas()
print(f"Average trajectory match score: {results_df['feedback.trajectory_subsequence'].mean()}")

Average trajectory match score: 1.0
