# 08. Evaluation and Testing

## Install dependencies

In [None]:
%uv pip install langchain~=0.3 langchain-core~=0.3 langchain-community~=0.3 langchain-openai~=0.3 langgraph~=0.6

In [None]:
%uv pip install python-dotenv~=1.1

## Utility class

In [None]:
import os

import dotenv
from langchain_openai import ChatOpenAI
import langsmith


class Config:
    """Settings for the notebook, loaded from a ``.env`` file and the environment.

    Attributes:
        api_key: Value of ``OPENAI_API_KEY``.
        base_url: Value of ``OPENAI_API_BASE_URL``.
        model: Value of ``OPENAI_MODEL``.
        langsmith_api_key: Value of ``LANGSMITH_API_KEY``, or ``None`` when it
            is not set (only required by ``new_langsmith_client``).
    """

    def __init__(self):
        """Load the ``.env`` file and read the required settings.

        Raises:
            ValueError: If no ``.env`` file is found, or if any of
                ``OPENAI_API_KEY``, ``OPENAI_API_BASE_URL`` or ``OPENAI_MODEL``
                is missing or empty.
        """
        # usecwd=True asks find_dotenv to search from the current working
        # directory. By default load_dotenv does not override variables that
        # are already present in the environment.
        env_file = dotenv.find_dotenv(usecwd=True)
        if not env_file:
            raise ValueError("No .env file found")
        dotenv.load_dotenv(dotenv_path=env_file)

        # Checked in this order so the first missing variable is the one reported.
        self.api_key = self._require_env("OPENAI_API_KEY")
        self.base_url = self._require_env("OPENAI_API_BASE_URL")
        self.model = self._require_env("OPENAI_MODEL")

        # Optional at construction time; validated lazily in new_langsmith_client.
        self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")

    @staticmethod
    def _require_env(name: str) -> str:
        """Return the value of environment variable *name*, or raise if unset/empty."""
        value = os.getenv(name)
        if not value:
            raise ValueError(f"{name} is not set")
        return value

    def new_openai_like(self, **kwargs) -> ChatOpenAI:
        """Build a ``ChatOpenAI`` client from the configured key, base URL and model.

        Args:
            **kwargs: Extra keyword arguments forwarded unchanged to ``ChatOpenAI``.

        Returns:
            A new ``ChatOpenAI`` instance.
        """
        # Reference: https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # Reference: https://help.aliyun.com/zh/model-studio/models
        # ChatOpenAI documentation: https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI
        return ChatOpenAI(
            api_key=self.api_key,
            base_url=self.base_url,
            model=self.model,
            **kwargs,
        )

    def new_langsmith_client(self, **kwargs) -> langsmith.Client:
        """Build a LangSmith client using the configured API key.

        Args:
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``langsmith.Client``.

        Returns:
            A new ``langsmith.Client`` instance.

        Raises:
            ValueError: If ``LANGSMITH_API_KEY`` was not set when this
                ``Config`` was created.
        """
        key = self.langsmith_api_key
        if not key:
            raise ValueError("LANGSMITH_API_KEY is not set")

        return langsmith.Client(api_key=key, **kwargs)

## Why evaluation matters
### Safety and alignment
### Performance and efficiency
### User and stakeholder value
### Building consensus for LLM evaluation
## What we evaluate: core agent capabilities
### Task performance evaluation
### Tool usage evaluation
### RAG evaluation
### Planning and reasoning evaluation
## How we evaluate: methodologies and approaches
### Automated evaluation approaches
### Human-in-the-loop evaluation
### System-level evaluation
## Evaluating LLM agents in practice
### Evaluating the correctness of results

In [None]:
from langchain.evaluation import ExactMatchStringEvaluator

# Question the model was asked. Kept for context only: the exact-match
# evaluator calls below receive just the prediction and the reference.
prompt = "What is the current Federal Reserve interest rate?"
reference_answer = "0.25%"  # Suppose this is the correct answer.

# Example predictions:
prediction_correct = "0.25%"
prediction_incorrect = "0.50%"

# Initialize an Exact Match evaluator that ignores case differences.
exact_evaluator = ExactMatchStringEvaluator(ignore_case=True)

# Evaluate the correct prediction (identical to the reference string).
exact_result_correct = exact_evaluator.evaluate_strings(
    prediction=prediction_correct, reference=reference_answer
)
print("Exact match result (correct answer):", exact_result_correct)

# Evaluate an incorrect prediction (differs from the reference string).
exact_result_incorrect = exact_evaluator.evaluate_strings(
    prediction=prediction_incorrect, reference=reference_answer
)
print("Exact match result (incorrect answer):", exact_result_incorrect)

In [None]:
from langchain.evaluation.scoring import ScoreStringEvalChain

# Initialize the evaluator LLM
llm = Config().new_openai_like()

# Create the ScoreStringEvalChain from the LLM
chain = ScoreStringEvalChain.from_llm(llm=llm)

# Define the finance-related input and prediction.
# NOTE: finance_reference is assigned here but is NOT passed to the
# evaluate_strings call below, which receives only the input and prediction.
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."
finance_reference = "The Federal Reserve's current interest rate is 0.25%."

# Evaluate the prediction using the scoring chain (no reference answer supplied)
result_finance = chain.evaluate_strings(
    input=finance_input,
    prediction=finance_prediction,
)

print("Finance Evaluation Result:")
print(result_finance)

In [None]:
from langchain.evaluation.scoring import LabeledScoreStringEvalChain

# Initialize the evaluator LLM
llm = Config().new_openai_like()

# Create the evaluation chain that can use reference answers
labeled_chain = LabeledScoreStringEvalChain.from_llm(llm=llm)

# Define the finance-related input, prediction, and reference answer
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."
finance_reference = "The Federal Reserve's current interest rate is 0.25%."

# Evaluate the prediction against the reference: here the reference answer is
# passed to the chain alongside the input and the prediction.
labeled_result = labeled_chain.evaluate_strings(
    input=finance_input,
    prediction=finance_prediction,
    reference=finance_reference,
)

print("Finance Evaluation Result (with reference):")
print(labeled_result)

### Evaluating tone and conciseness

In [None]:
# LLM handed to the criteria evaluators in this section as their `llm` argument.
evaluation_llm = Config().new_openai_like()

In [None]:
# Question put to the healthcare assistant.
prompt_health = "What is a healthy blood pressure range for adults?"

# A sample LLM output from your healthcare assistant:
# (two adjacent string literals, concatenated into one answer)
prediction_health = (
    "A normal blood pressure reading is typically around 120/80 mmHg. "
    "It's important to follow your doctor's advice for personal health management!"
)

In [None]:
from langchain.evaluation import EvaluatorType, load_evaluator

# Load a criteria evaluator for the named "conciseness" criterion, using
# evaluation_llm as the LLM.
conciseness_evaluator = load_evaluator(
    EvaluatorType.CRITERIA, criteria="conciseness", llm=evaluation_llm
)
# Evaluate the sample answer together with the question it responds to.
conciseness_result = conciseness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Conciseness evaluation result:", conciseness_result)

In [None]:
# Evaluate friendliness with a custom criterion.
# The dict maps the criterion name to the question describing it.
custom_friendliness = {
    "friendliness": "Is the response written in a friendly and approachable tone?"
}
friendliness_evaluator = load_evaluator(
    EvaluatorType.CRITERIA, criteria=custom_friendliness, llm=evaluation_llm
)
# Same sample question/answer pair as the conciseness evaluation.
friendliness_result = friendliness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Friendliness evaluation result:", friendliness_result)

### Evaluating the output format

In [None]:
from langchain.evaluation import JsonValidityEvaluator

# Initialize the JSON validity evaluator.
json_validator = JsonValidityEvaluator()

# Two sample payloads: the second differs from the first only by a trailing
# comma before the closing brace.
valid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000}'
invalid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000,}'

# Evaluate the valid JSON.
valid_result = json_validator.evaluate_strings(prediction=valid_json_output)
print("JSON validity result (valid):", valid_result)

# Evaluate the invalid JSON.
invalid_result = json_validator.evaluate_strings(prediction=invalid_json_output)
print("JSON validity result (invalid):", invalid_result)

### Evaluating agent trajectory

In [None]:
%uv pip install langsmith~=0.3

In [None]:
def trajectory_subsequence(outputs: dict, reference_outputs: dict) -> float:
    """Score how much of the reference trajectory the agent followed, in order.

    The reference steps are matched against the agent's steps as an ordered
    subsequence: extra agent steps in between are allowed, but matched steps
    must appear in the reference order.

    Args:
        outputs: Run outputs; ``outputs["trajectory"]`` is the sequence of
            steps the agent actually took.
        reference_outputs: Expected outputs;
            ``reference_outputs["trajectory"]`` is the desired sequence of steps.

    Returns:
        The fraction (0.0 to 1.0) of reference steps matched in order.
        Returns 1.0 when the reference trajectory is empty (there is nothing
        the agent was required to do), and 0.0 when the agent took fewer steps
        than the reference contains.

    Raises:
        KeyError: If either mapping has no ``"trajectory"`` key.
    """
    expected = reference_outputs["trajectory"]
    actual = outputs["trajectory"]

    # An empty reference would otherwise fall through to a division by zero.
    if not expected:
        return 1.0

    # A run shorter than the reference cannot contain it; score it as a miss
    # rather than awarding partial credit.
    if len(expected) > len(actual):
        return 0.0

    matched = 0
    for step in actual:
        if matched == len(expected):
            break  # every reference step has already been found
        if step == expected[matched]:
            matched += 1

    return matched / len(expected)


# Create example dataset with expected trajectories
# NOTE(review): re-running this cell calls create_dataset again with the same
# name; presumably LangSmith rejects duplicate dataset names — confirm, and
# guard or delete the old dataset before re-running if so.
client = Config().new_langsmith_client()
trajectory_dataset = client.create_dataset(
    "Healthcare Agent Trajectory Evaluation",
    description="Evaluates agent trajectory for medication queries",
)

# Add example with expected trajectory.
# The "trajectory" key under outputs is the one trajectory_subsequence reads
# from its reference_outputs argument.
client.create_example(
    inputs={"question": "What is the recommended dosage of ibuprofen for an adult?"},
    outputs={
        "trajectory": [
            "intent_classifier",
            "healthcare_agent",
            "MedicalDatabaseSearch",
            "format_response",
        ],
        "response": "Typically, 200-400mg every 4-6 hours, not exceeding 3200mg per day.",
    },
    dataset_id=trajectory_dataset.id,
)

In [None]:
# Function to run graph with trajectory tracking (example implementation)
async def run_graph_with_trajectory(inputs: dict) -> dict:
    """Stand-in for a real graph run: return a fixed trajectory and response.

    Args:
        inputs: The example inputs. Unused by this placeholder; a real
            implementation would feed them to the graph.

    Returns:
        A dict with two keys: ``"trajectory"`` (the list of step names taken)
        and ``"response"`` (the final answer text).
    """
    # Placeholder: a real implementation would execute the graph here and
    # record each step it visits. For the example, a canned result is returned.
    return {
        "trajectory": [
            "intent_classifier",
            "healthcare_agent",
            "MedicalDatabaseSearch",
            "format_response",
        ],
        "response": (
            "Typically, 200-400mg every 4-6 hours, not exceeding 3200mg per day."
        ),
    }


# Note: This is an async function, so in a notebook you'd need to use await
# `client` and `trajectory_dataset` are the objects bound in the
# dataset-creation cell; trajectory_subsequence is the custom evaluator applied
# to each run.
experiment_results = await client.aevaluate(
    run_graph_with_trajectory,
    data=trajectory_dataset.id,
    evaluators=[trajectory_subsequence],
    experiment_prefix="healthcare-agent-trajectory",
    num_repetitions=1,
    max_concurrency=4,
)

In [None]:
%uv pip install pandas~=2.3

In [None]:
# Convert the experiment results to a pandas DataFrame and report the mean of
# the column holding the trajectory_subsequence evaluator's feedback.
results_df = experiment_results.to_pandas()
print(
    f"Average trajectory match score: {results_df['feedback.trajectory_subsequence'].mean()}"
)

### Evaluating CoT reasoning

In [None]:
from langchain.evaluation import load_evaluator, EvaluatorType

# Simulated chain-of-thought reasoning provided by the agent:
agent_reasoning = (
    "The current interest rate is 0.25%. I determined this by recalling that recent monetary policies have aimed "
    "to stimulate economic growth by keeping borrowing costs low. A rate of 0.25% is consistent with the ongoing "
    "trend of low rates, which encourages consumer spending and business investment."
)

# Expected reasoning reference:
# (a description of what good reasoning should contain, used as the reference)
expected_reasoning = (
    "An ideal reasoning should mention that the Federal Reserve has maintained a low interest rate—around 0.25%—to "
    "support economic growth, and it should briefly explain the implications for borrowing costs and consumer spending."
)

# LLM passed to the evaluator as its `llm` argument.
llm = Config().new_openai_like()

# Load the chain-of-thought evaluator.
cot_evaluator = load_evaluator(EvaluatorType.COT_QA, llm=llm)

# Evaluate the agent's reasoning against the reference for the given question.
result_reasoning = cot_evaluator.evaluate_strings(
    input="What is the current Federal Reserve interest rate and why does it matter?",
    prediction=agent_reasoning,
    reference=expected_reasoning,
)

print("\nChain-of-Thought Reasoning Evaluation:")
print(result_reasoning)

## Offline evaluation
### Evaluating RAG systems

In [None]:
# Rebinds the notebook-level `client` to a fresh LangSmith client.
client = Config().new_langsmith_client()

# Sample financial examples.
# Each entry has an "inputs" dict (the question plus the topics it needs
# context on) and an "outputs" dict (reference answer, key points, documents).
financial_examples = [
    {
        "inputs": {
            "question": "What are the tax implications of early 401(k) withdrawal?",
            "context_needed": ["retirement", "taxation", "penalties"],
        },
        "outputs": {
            "answer": "Early withdrawals from a 401(k) typically incur a 10% penalty if you're under 59½ years old, in addition to regular income taxes. However, certain hardship withdrawals may qualify for penalty exemptions.",
            "key_points": ["10% penalty", "income tax", "hardship exemptions"],
            "documents": ["IRS publication 575", "Retirement plan guidelines"],
        },
    },
    {
        "inputs": {
            "question": "How does dollar-cost averaging compare to lump-sum investing?",
            "context_needed": [
                "investment strategy",
                "risk management",
                "market timing",
            ],
        },
        "outputs": {
            "answer": "Dollar-cost averaging spreads investments over time to reduce timing risk, while lump-sum investing typically outperforms in rising markets due to longer market exposure. DCA may provide psychological benefits through reduced volatility exposure.",
            "key_points": ["timing risk", "market exposure", "psychological benefits"],
            "documents": ["Investment strategy comparisons", "Market timing research"],
        },
    },
]

# Create dataset in LangSmith
# NOTE(review): re-running this cell calls create_dataset again with the same
# name; presumably LangSmith rejects duplicate dataset names — confirm.
dataset_name = "Financial Advisory RAG Evaluation"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Evaluation dataset for financial advisory RAG systems covering retirement, investments, and tax planning.",
)

# Add examples to the dataset (one create_example call per entry)
for example in financial_examples:
    client.create_example(
        inputs=example["inputs"], outputs=example["outputs"], dataset_id=dataset.id
    )
print(f"Created evaluation dataset with {len(financial_examples)} examples")

### Evaluating a benchmark in LangSmith

In [None]:
# Example configuration for LangSmith:
import os

# Turn tracing on and pick the project name via environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "My Project"
# Left commented out: Config reads LANGSMITH_API_KEY from the environment
# rather than having the key hard-coded in the notebook.
# os.environ["LANGSMITH_API_KEY"] = "lsv2_pt_xxxxxxx"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"

In [None]:
# TODO: no trace for this call showed up in LangSmith when this was run.
# NOTE(review): things to check — whether LANGSMITH_API_KEY is present in the
# process environment at call time, and whether the tracing settings were
# already read before the environment variables were set in this
# session — confirm.

# Create a simple LLM call that will be traced in LangSmith
llm = Config().new_openai_like()

response = llm.invoke("Hello, world!")
print(f"Model response: {response.content}")
# These messages are printed unconditionally; they do not verify that a trace
# was actually recorded.
print("\nThis run has been logged to LangSmith.")
print("You can view it in the LangSmith UI: https://smith.langchain.com")

In [None]:
# Define evaluation configuration
from langchain.smith import RunEvalConfig

# Define evaluation criteria specific to RAG systems.
# Each entry is a "criteria"-type evaluator carrying one custom criterion,
# expressed as {criterion_name: question describing it}.
evaluation_config = RunEvalConfig(
    evaluators=[
        {
            "criteria": {
                "factual_accuracy": "Does the response contain only factually correct information consistent with the reference answer?"
            },
            "evaluator_type": "criteria",
        },
        {
            "criteria": {
                "groundedness": "Is the response fully supported by the retrieved documents without introducing unsupported information?"
            },
            "evaluator_type": "criteria",
        },
        {
            "criteria": {
                "retrieval_relevance": "Are the retrieved documents relevant to answering the question?"
            },
            "evaluator_type": "criteria",
        },
    ]
)

In [None]:
# TODO: provide a real implementation
def construct_chain():
    """Placeholder factory for the RAG chain to be evaluated.

    Returns:
        ``None`` for now; a real implementation would build and return the
        chain (or model) to run on each dataset example.
    """
    # This would be your actual RAG implementation
    # For example: return RAGChain(...)
    return None

In [None]:
from langchain.smith import run_on_dataset

# Run the chain produced by construct_chain over the dataset and apply
# evaluation_config to the results.
# `client` and `dataset_name` are notebook-level names; they refer to whichever
# dataset cell was executed most recently.
# NOTE(review): construct_chain is still a stub that returns None, so this call
# presumably cannot succeed until it is implemented — confirm.
results = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
)

### Evaluating a benchmark with HF datasets and Evaluate

In [None]:
%uv pip install datasets~=3.4 evaluate~=0.4

In [None]:
import os

# Opt-in flag for Hugging Face's code-evaluation metric, which executes the
# candidate code it is given — presumably required before `code_eval` will
# run; only enable it for code you trust.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

In [None]:
from datasets import load_dataset
from evaluate import load

# from langchain_core.messages import HumanMessage


# Load the HumanEval test split and the code_eval metric.
# NOTE: human_eval is loaded here but not used by the rest of this cell, which
# evaluates a hand-written toy example instead.
human_eval = load_dataset("openai_humaneval", split="test")
code_eval_metric = load("code_eval")

# One reference test and, for it, two candidate implementations: the first
# multiplies instead of adding, the second adds.
test_cases = ["assert add(2,3)==5"]
candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]

# NOTE: this rebinds the notebook-level name `results`, replacing the value
# assigned by the earlier run_on_dataset cell.
pass_at_k, results = code_eval_metric.compute(
    references=test_cases, predictions=candidates, k=[1, 2]
)
print(pass_at_k)

### Evaluating email extraction

In [None]:
# Define a list of synthetic insurance claim examples.
# Each entry is a (claim_text, expected_fields) tuple: the free-text claim and
# the dict of field values expected to be extracted from it.
example_inputs = [
    (
        "I was involved in a car accident on 2023-08-15. My name is Jane Smith, Claim ID INS78910, "
        "Policy Number POL12345, and the damage is estimated at $3500.",
        {
            "claimant_name": "Jane Smith",
            "claim_id": "INS78910",
            "policy_number": "POL12345",
            "claim_amount": "$3500",
            "accident_date": "2023-08-15",
            "accident_description": "Car accident causing damage",
            "status": "pending",
        },
    ),
    (
        "My motorcycle was hit in a minor collision on 2023-07-20. I am John Doe, with Claim ID INS112233 "
        "and Policy Number POL99887. The estimated damage is $1500.",
        {
            "claimant_name": "John Doe",
            "claim_id": "INS112233",
            "policy_number": "POL99887",
            "claim_amount": "$1500",
            "accident_date": "2023-07-20",
            "accident_description": "Minor motorcycle collision",
            "status": "pending",
        },
    ),
]

In [None]:
# Rebinds the notebook-level `client` and `dataset_name`; later cells that
# read those names will see these values.
client = Config().new_langsmith_client()

dataset_name = "Insurance Claims"

# Create the dataset in LangSmith
# NOTE(review): re-running this cell calls create_dataset again with the same
# name; presumably LangSmith rejects duplicate dataset names — confirm.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Synthetic dataset for insurance claim extraction tasks",
)

# Store examples in the dataset: the claim text goes under inputs["input"] and
# the expected field dict under outputs["output"].
for input_text, expected_output in example_inputs:
    client.create_example(
        inputs={"input": input_text},
        outputs={"output": expected_output},
        metadata={"source": "Synthetic"},
        dataset_id=dataset.id,
    )

In [None]:
# Define the extraction schema
from pydantic import BaseModel, Field


# Schema of the fields to extract from an insurance-claim text.
# Every field is a string; all are required except `status`, which defaults to
# "pending".
# NOTE: documented with comments rather than a class docstring on purpose —
# pydantic uses a model's docstring as the description in its JSON schema, so
# adding one would change any text generated from this schema.
class InsuranceClaim(BaseModel):
    claimant_name: str = Field(description="The name of the claimant")
    claim_id: str = Field(description="The unique insurance claim identifier")
    policy_number: str = Field(
        description="The policy number associated with the claim",
    )
    claim_amount: str = Field(description="The claimed amount (e.g., '$5000')")
    accident_date: str = Field(description="The date of the accident (YYYY-MM-DD)")
    accident_description: str = Field(
        description="A brief description of the accident",
    )
    status: str = Field(
        default="pending",
        description="The current status of the claim",
    )

In [None]:
# Create extraction chain
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Set up a parser + inject instructions into the prompt template.
output_parser = JsonOutputParser(pydantic_object=InsuranceClaim)

# The template has two placeholders: {format_instructions} is filled in once,
# up front, from the parser; {input} is supplied on each invocation.
instructions = PromptTemplate(
    template=(
        "Extract the following structured information from the insurance claim text: "
        "claimant_name, claim_id, policy_number, claim_amount, accident_date, "
        "accident_description, and status.\n"
        "{format_instructions}\n"
        "{input}\n"
    ),
    input_variables=["input"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

llm = Config().new_openai_like()

# Pipeline: fill the prompt -> call the LLM -> parse the reply as JSON.
extraction_chain = instructions | llm | output_parser

# Test the extraction chain
sample_claim_text = (
    "I was involved in a car accident on 2023-08-15. My name is Jane Smith, "
    "Claim ID INS78910, Policy Number POL12345, and the damage is estimated at $3500. "
    "Please process my claim."
)

result = extraction_chain.invoke({"input": sample_claim_text})
print("Extraction Result:")
print(result)