# 08. Evaluation and Testing

## ÂÆâË£Ö‰æùËµñ

In [6]:
%uv pip install langchain~=1.0 langchain-core~=1.0 langgraph~=1.0

[2mAudited [1m3 packages[0m [2min 2ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
%uv pip install langchain-classic~=1.0 langchain-community~=0.4 langchain-openai~=1.0

[2mAudited [1m3 packages[0m [2min 3ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
%uv pip install python-dotenv~=1.1

[2mAudited [1m1 package[0m [2min 1ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


Â∑•ÂÖ∑Á±ª

In [4]:
import os

import dotenv
from langchain_openai import ChatOpenAI
import langsmith


class Config:
    def __init__(self):
        # By default, load_dotenv doesn't override existing environment variables and looks for a .env file in same directory as python script or searches for it incrementally higher up.
        dotenv_path = dotenv.find_dotenv(usecwd=True)
        if not dotenv_path:
            raise ValueError("No .env file found")
        dotenv.load_dotenv(dotenv_path=dotenv_path)

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY is not set")

        base_url = os.getenv("OPENAI_API_BASE_URL")
        if not base_url:
            raise ValueError("OPENAI_API_BASE_URL is not set")

        model = os.getenv("OPENAI_MODEL")
        if not model:
            raise ValueError("OPENAI_MODEL is not set")

        self.api_key = api_key
        self.base_url = base_url
        self.model = model

        self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")

    def new_openai_like(self, **kwargs) -> ChatOpenAI:
        # ÂèÇËÄÉÔºöhttps://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # ÂèÇËÄÉÔºöhttps://help.aliyun.com/zh/model-studio/models
        # ChatOpenAI ÊñáÊ°£ÂèÇËÄÉÔºöhttps://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI
        return ChatOpenAI(
            api_key=self.api_key, base_url=self.base_url, model=self.model, **kwargs
        )

    def new_langsmith_client(self, **kwargs) -> langsmith.Client:
        if not self.langsmith_api_key:
            raise ValueError("LANGSMITH_API_KEY is not set")

        return langsmith.Client(api_key=self.langsmith_api_key, **kwargs)

## Why evaluation matters
### Safety and alignment
### Performance and efficiency
### User and stakeholder value
### Building consensus for LLM evaluation
## What we evaluate: core agent capabilities
### Task performance evaluation
### Tool usage evaluation
### RAG evaluation
### Planning and reasoning evaluation
## How we evaluate: methodologies and approaches
### Automated evaluation approaches
### Human-in-the-loop evaluation
### System-level evaluation
## Evaluating LLM agents in practice
### Evaluating the correctness of results

In [8]:
from langchain_classic.evaluation import ExactMatchStringEvaluator

prompt = "What is the current Federal Reserve interest rate?"
reference_answer = "0.25%"  # Suppose this is the correct answer.

# Example predictions:
prediction_correct = "0.25%"
prediction_incorrect = "0.50%"

# Initialize an Exact Match evaluator that ignores case differences.
exact_evaluator = ExactMatchStringEvaluator(ignore_case=True)

# Evaluate the correct prediction.
exact_result_correct = exact_evaluator.evaluate_strings(
    prediction=prediction_correct, reference=reference_answer
)
print("Exact match result (correct answer):", exact_result_correct)

# Evaluate an incorrect prediction.
exact_result_incorrect = exact_evaluator.evaluate_strings(
    prediction=prediction_incorrect, reference=reference_answer
)
print("Exact match result (incorrect answer):", exact_result_incorrect)

Exact match result (correct answer): {'score': 1}
Exact match result (incorrect answer): {'score': 0}


In [9]:
from langchain_classic.evaluation.scoring import ScoreStringEvalChain

# Initialize the evaluator LLM
llm = Config().new_openai_like()

# Create the ScoreStringEvalChain from the LLM
chain = ScoreStringEvalChain.from_llm(llm=llm)

# Define the finance-related input, prediction, and reference answer
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."
finance_reference = "The Federal Reserve's current interest rate is 0.25%."

# Evaluate the prediction using the scoring chain
result_finance = chain.evaluate_strings(
    input=finance_input,
    prediction=finance_prediction,
)

print("Finance Evaluation Result:")
print(result_finance)

Finance Evaluation Result:
{'reasoning': "The assistant's response states that the current Federal Reserve interest rate is 0.25%. However, this figure is outdated and factually incorrect as of 2024. The Federal Reserve adjusts its target federal funds rate regularly in response to economic conditions, and as of mid-2024, the rate has been significantly higher‚Äîranging between 5.25% and 5.50%‚Äîas part of efforts to combat inflation. Because the answer provides an inaccurate value without context or a date reference, it fails on correctness. It also lacks helpfulness and depth, offering no explanation of what the rate represents or how to find the most up-to-date information. While the response is relevant to the question in intent, its factual inaccuracy severely undermines its quality.\n\nRating: [[2]]", 'score': 2}


Âü∫‰∫éÂèÇËÄÉÁ≠îÊ°àÁöÑËØÑ‰º∞ÔºàÈÄÇÂêàÂ∑≤ÊúâÊòéÁ°Æ„ÄÅÁúüÂÆûÁ≠îÊ°àÁöÑÂú∫ÊôØÔºâ

In [10]:
from langchain_classic.evaluation.scoring import LabeledScoreStringEvalChain

# Initialize the evaluator LLM
llm = Config().new_openai_like()

# Create the evaluation chain that can use reference answers
labeled_chain = LabeledScoreStringEvalChain.from_llm(llm=llm)

# Define the finance-related input, prediction, and reference answer
finance_input = "What is the current Federal Reserve interest rate?"
finance_prediction = "The current interest rate is 0.25%."
finance_reference = "The Federal Reserve's current interest rate is 0.25%."

# Evaluate the prediction against the reference
labeled_result = labeled_chain.evaluate_strings(
    input=finance_input,
    prediction=finance_prediction,
    reference=finance_reference,
)

print("Finance Evaluation Result (with reference):")
print(labeled_result)

Finance Evaluation Result (with reference):
{'reasoning': "The assistant's response directly states that the current Federal Reserve interest rate is 0.25%, which aligns exactly with the provided ground truth. The answer is concise, accurate, and relevant to the user's question. While it lacks elaboration or context (such as the date of the rate or potential implications), the question only asks for the rate itself, so additional detail is not strictly necessary. The response meets the criteria of correctness, relevance, and helpfulness for the given query. Depth is minimal but appropriate given the straightforward nature of the question.\n\nRating: [[10]]", 'score': 10}


### Evaluating tone and conciseness

In [11]:
evaluation_llm = Config().new_openai_like()

In [12]:
prompt_health = "What is a healthy blood pressure range for adults?"

# A sample LLM output from your healthcare assistant:
prediction_health = (
    "A normal blood pressure reading is typically around 120/80 mmHg. "
    "It's important to follow your doctor's advice for personal health management!"
)

In [13]:
from langchain_classic.evaluation import EvaluatorType, load_evaluator

conciseness_evaluator = load_evaluator(
    EvaluatorType.CRITERIA, criteria="conciseness", llm=evaluation_llm
)
conciseness_result = conciseness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Conciseness evaluation result:", conciseness_result)

Conciseness evaluation result: {'reasoning': 'To evaluate whether the submission meets the "conciseness" criterion, we need to assess if it is brief and directly addresses the question without unnecessary elaboration.\n\nStep 1: Identify the core information requested.  \nThe input asks for a healthy blood pressure range for adults. The ideal answer should state the typical healthy range (e.g., less than 120/80 mmHg) clearly and succinctly.\n\nStep 2: Examine the submission content.  \nThe submission states: "A normal blood pressure reading is typically around 120/80 mmHg. It\'s important to follow your doctor\'s advice for personal health management!"\n\nStep 3: Assess conciseness.  \nThe first sentence directly answers the question and is concise. However, the second sentence adds general advice that, while well-intentioned, is not required to answer the specific question about the healthy range. This extra sentence makes the response slightly less concise than it could be.\n\nStep 4

In [14]:
# Evaluate friendliness with custom criterion
custom_friendliness = {
    "friendliness": "Is the response written in a friendly and approachable tone?"
}
friendliness_evaluator = load_evaluator(
    EvaluatorType.CRITERIA, criteria=custom_friendliness, llm=evaluation_llm
)
friendliness_result = friendliness_evaluator.evaluate_strings(
    prediction=prediction_health, input=prompt_health
)
print("Friendliness evaluation result:", friendliness_result)

Friendliness evaluation result: {'reasoning': 'To assess whether the submission meets the "friendliness" criterion, we need to evaluate if the tone is friendly and approachable.\n\nStep 1: Examine the language used. The submission states: "A normal blood pressure reading is typically around 120/80 mmHg." This is factual and neutral, but not unfriendly.\n\nStep 2: Look at the second sentence: "It\'s important to follow your doctor\'s advice for personal health management!" This uses an exclamation mark, which often conveys warmth or encouragement. The phrase "your doctor\'s advice" is personalized ("your"), making it feel more conversational and caring.\n\nStep 3: Consider overall tone. The response avoids clinical coldness by adding a supportive, advisory note that encourages the reader to consult a professional, framed in a positive and considerate way.\n\nStep 4: Determine if this qualifies as "friendly and approachable." Yes‚Äîthe use of inclusive language ("your"), the exclamation 

### Evaluating the output format

In [15]:
from langchain_classic.evaluation import JsonValidityEvaluator

# Initialize the JSON validity evaluator.
json_validator = JsonValidityEvaluator()

valid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000}'
invalid_json_output = '{"company": "Acme Corp", "revenue": 1000000, "profit": 200000,}'

# Evaluate the valid JSON.
valid_result = json_validator.evaluate_strings(prediction=valid_json_output)
print("JSON validity result (valid):", valid_result)

# Evaluate the invalid JSON.
invalid_result = json_validator.evaluate_strings(prediction=invalid_json_output)
print("JSON validity result (invalid):", invalid_result)

JSON validity result (valid): {'score': 1}
JSON validity result (invalid): {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes: line 1 column 63 (char 62)'}


### Evaluating agent trajectory

In [16]:
%uv pip install langsmith~=0.4

[2mAudited [1m1 package[0m [2min 2ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
def trajectory_subsequence(outputs: dict, reference_outputs: dict) -> float:
    """Check how many of the desired steps the agent took."""
    if len(reference_outputs["trajectory"]) > len(outputs["trajectory"]):
        return 0.0

    i = j = 0
    while i < len(reference_outputs["trajectory"]) and j < len(outputs["trajectory"]):
        if reference_outputs["trajectory"][i] == outputs["trajectory"][j]:
            i += 1
        j += 1

    return i / len(reference_outputs["trajectory"])


# Create example dataset with expected trajectories
client = Config().new_langsmith_client()
trajectory_dataset = client.create_dataset(
    "Healthcare Agent Trajectory Evaluation",
    description="Evaluates agent trajectory for medication queries",
)

# Add example with expected trajectory
client.create_example(
    inputs={"question": "What is the recommended dosage of ibuprofen for an adult?"},
    outputs={
        "trajectory": [
            "intent_classifier",
            "healthcare_agent",
            "MedicalDatabaseSearch",
            "format_response",
        ],
        "response": "Typically, 200-400mg every 4-6 hours, not exceeding 3200mg per day.",
    },
    dataset_id=trajectory_dataset.id,
)

<class 'langsmith.schemas.Example'>(id=d55aff8b-f268-48ce-8a34-0965e7e03f23, dataset_id=db679684-73d5-461c-a354-d91d8b10415c, link='https://smith.langchain.com/o/87dee905-f3ea-4b4d-b51e-bfa1e3588b48/datasets/db679684-73d5-461c-a354-d91d8b10415c/e/d55aff8b-f268-48ce-8a34-0965e7e03f23')

In [19]:
# Function to run graph with trajectory tracking (example implementation)
async def run_graph_with_trajectory(inputs: dict) -> dict:
    """Run graph and track the trajectory it takes along with the final response."""
    trajectory = []
    final_response = ""
    # Here you would implement your actual graph execution
    # For the example, we'll just return a sample result
    trajectory = [
        "intent_classifier",
        "healthcare_agent",
        "MedicalDatabaseSearch",
        "format_response",
    ]
    final_response = (
        "Typically, 200-400mg every 4-6 hours, not exceeding 3200mg per day."
    )
    return {"trajectory": trajectory, "response": final_response}


# Note: This is an async function, so in a notebook you'd need to use await
experiment_results = await client.aevaluate(
    run_graph_with_trajectory,
    data=trajectory_dataset.id,
    evaluators=[trajectory_subsequence],
    experiment_prefix="healthcare-agent-trajectory",
    num_repetitions=1,
    max_concurrency=4,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'healthcare-agent-trajectory-aee10872' at:
https://smith.langchain.com/o/87dee905-f3ea-4b4d-b51e-bfa1e3588b48/datasets/db679684-73d5-461c-a354-d91d8b10415c/compare?selectedSessions=314ed5d3-50f7-47f8-9305-96d264199f35




1it [00:00,  1.70it/s]


In [20]:
%uv pip install pandas~=2.3

[2K[2mResolved [1m6 packages[0m [2min 104ms[0m[0m                                         [0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m3 packages[0m [2min 107ms[0m[0m                               [0m
 [32m+[39m [1mpandas[0m[2m==2.3.3[0m
 [32m+[39m [1mpytz[0m[2m==2025.2[0m
 [32m+[39m [1mtzdata[0m[2m==2025.2[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
# For demonstration without async:
results_df = experiment_results.to_pandas()
print(
    f"Average trajectory match score: {results_df['feedback.trajectory_subsequence'].mean()}"
)

Average trajectory match score: 1.0


### Evaluating CoT reasoning

In [22]:
from langchain_classic.evaluation import load_evaluator, EvaluatorType

# Simulated chain-of-thought reasoning provided by the agent:
agent_reasoning = (
    "The current interest rate is 0.25%. I determined this by recalling that recent monetary policies have aimed "
    "to stimulate economic growth by keeping borrowing costs low. A rate of 0.25% is consistent with the ongoing "
    "trend of low rates, which encourages consumer spending and business investment."
)

# Expected reasoning reference:
expected_reasoning = (
    "An ideal reasoning should mention that the Federal Reserve has maintained a low interest rate‚Äîaround 0.25%‚Äîto "
    "support economic growth, and it should briefly explain the implications for borrowing costs and consumer spending."
)

llm = Config().new_openai_like()

# Load the chain-of-thought evaluator.
cot_evaluator = load_evaluator(EvaluatorType.COT_QA, llm=llm)

result_reasoning = cot_evaluator.evaluate_strings(
    input="What is the current Federal Reserve interest rate and why does it matter?",
    prediction=agent_reasoning,
    reference=expected_reasoning,
)

print("\nChain-of-Thought Reasoning Evaluation:")
print(result_reasoning)


Chain-of-Thought Reasoning Evaluation:
{'reasoning': 'EXPLANATION:  \nStep 1: Check whether the student correctly states the current Federal Reserve interest rate according to the context. The context specifies that the rate is ‚Äúaround 0.25%,‚Äù and the student states it as ‚Äú0.25%.‚Äù This aligns with the provided context.\n\nStep 2: Evaluate whether the student explains why the rate matters, as required by the question and supported by the context. The context indicates the low rate is maintained ‚Äúto support economic growth‚Äù and should mention implications for ‚Äúborrowing costs and consumer spending.‚Äù\n\nStep 3: The student explains that the low rate ‚Äúaims to stimulate economic growth by keeping borrowing costs low,‚Äù which directly corresponds to the context‚Äôs point about supporting economic growth and lowering borrowing costs.\n\nStep 4: The student also notes that this ‚Äúencourages consumer spending and business investment,‚Äù which aligns with the context‚Äôs men

## Offline evaluation
### Evaluating RAG systems

In [23]:
client = Config().new_langsmith_client()

# Sample financial examples
financial_examples = [
    {
        "inputs": {
            "question": "What are the tax implications of early 401(k) withdrawal?",
            "context_needed": ["retirement", "taxation", "penalties"],
        },
        "outputs": {
            "answer": "Early withdrawals from a 401(k) typically incur a 10% penalty if you're under 59¬Ω years old, in addition to regular income taxes. However, certain hardship withdrawals may qualify for penalty exemptions.",
            "key_points": ["10% penalty", "income tax", "hardship exemptions"],
            "documents": ["IRS publication 575", "Retirement plan guidelines"],
        },
    },
    {
        "inputs": {
            "question": "How does dollar-cost averaging compare to lump-sum investing?",
            "context_needed": [
                "investment strategy",
                "risk management",
                "market timing",
            ],
        },
        "outputs": {
            "answer": "Dollar-cost averaging spreads investments over time to reduce timing risk, while lump-sum investing typically outperforms in rising markets due to longer market exposure. DCA may provide psychological benefits through reduced volatility exposure.",
            "key_points": ["timing risk", "market exposure", "psychological benefits"],
            "documents": ["Investment strategy comparisons", "Market timing research"],
        },
    },
]

# Create dataset in LangSmith
dataset_name = "Financial Advisory RAG Evaluation"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Evaluation dataset for financial advisory RAG systems covering retirement, investments, and tax planning.",
)

# Add examples to the dataset
for example in financial_examples:
    client.create_example(
        inputs=example["inputs"], outputs=example["outputs"], dataset_id=dataset.id
    )
print(f"Created evaluation dataset with {len(financial_examples)} examples")

Created evaluation dataset with 2 examples


### Evaluating a benchmark in LangSmith

In [24]:
# Example configuration for LangSmith:
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "My Project"
# os.environ["LANGSMITH_API_KEY"] = "lsv2_pt_xxxxxxx"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"

In [26]:
# TODO: Ê≤°Âú® langsmith ÁúãÂà∞ËøΩË∏™‰ø°ÊÅØ

# Create a simple LLM call that will be traced in LangSmith
llm = Config().new_openai_like()

response = llm.invoke("Hello, world!")
print(f"Model response: {response.content}")
print("\nThis run has been logged to LangSmith.")
print("You can view it in the LangSmith UI: https://smith.langchain.com")

Model response: Hello! How can I help you today? üòä

This run has been logged to LangSmith.
You can view it in the LangSmith UI: https://smith.langchain.com


In [27]:
# Define evaluation configuration
from langchain_classic.smith import RunEvalConfig

# Define evaluation criteria specific to RAG systems
evaluation_config = RunEvalConfig(
    evaluators=[
        {
            "criteria": {
                "factual_accuracy": "Does the response contain only factually correct information consistent with the reference answer?"
            },
            "evaluator_type": "criteria",
        },
        {
            "criteria": {
                "groundedness": "Is the response fully supported by the retrieved documents without introducing unsupported information?"
            },
            "evaluator_type": "criteria",
        },
        {
            "criteria": {
                "retrieval_relevance": "Are the retrieved documents relevant to answering the question?"
            },
            "evaluator_type": "criteria",
        },
    ]
)

In [28]:
# TODO: Ë°•ÂÖÖÂÆûÁé∞
def construct_chain(input, **kwargs):
    # This would be your actual RAG implementation
    # For example: return RAGChain(...)
    pass

In [29]:
from langchain_classic.smith import run_on_dataset

results = run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
)

View the evaluation results for project 'formal-bill-74' at:
https://smith.langchain.com/o/87dee905-f3ea-4b4d-b51e-bfa1e3588b48/datasets/a27a5894-a32a-4bd6-8b1c-bac2e87e1bcf/compare?selectedSessions=72eb9db1-5999-4867-8de4-37b76e4b054e

View all tests for Dataset Financial Advisory RAG Evaluation at:
https://smith.langchain.com/o/87dee905-f3ea-4b4d-b51e-bfa1e3588b48/datasets/a27a5894-a32a-4bd6-8b1c-bac2e87e1bcf
[>                                                 ] 0/2

Error evaluating run 45b09524-2365-4ca0-aac2-ea0fd82aa2f7
Traceback (most recent call last):
  File "/github.com/sammyne/generative-ai-with-lang-chain-2ed/.venv/lib/python3.12/site-packages/langchain_classic/smith/evaluation/string_run_evaluator.py", line 374, in evaluate_run
    result = self({"run": run, "example": example}, include_run_info=True)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    return wrapped(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/github.com/sammyne/generative-ai-with-lang-chain-2ed/.venv/lib/python3.12/site-packages/langchain_classic/chains/base.py", line 413, in __call__
    return self.invoke(
           ^^^^^^^^^^^^
  File "/github.com/sammyne/generative-ai-with-lang-chain-2ed/.venv/lib/python3.12/site-packages/langchain_classic/chains/base.py", line 167, in invoke
    self._call(inputs, run_manager=run_manager)
  File "/github.com/sammyne/generative-ai-with-lang-chain-2ed/.venv/lib/python3.12/site-packages/la

[------------------------------------------------->] 2/2


### Evaluating a benchmark with HF datasets and Evaluate

In [30]:
%uv pip install datasets~=3.4 evaluate~=0.4

[2K[2mResolved [1m41 packages[0m [2min 326ms[0m[0m                                        [0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m12 packages[0m [2min 192ms[0m[0m                              [0m
 [32m+[39m [1mclick[0m[2m==8.3.1[0m
 [32m+[39m [1mdatasets[0m[2m==3.6.0[0m
 [32m+[39m [1mdill[0m[2m==0.3.8[0m
 [32m+[39m [1mevaluate[0m[2m==0.4.6[0m
 [32m+[39m [1mfilelock[0m[2m==3.20.0[0m
 [32m+[39m [1mfsspec[0m[2m==2025.3.0[0m
 [32m+[39m [1mhf-xet[0m[2m==1.2.0[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==1.1.5[0m
 [32m+[39m [1mmultiprocess[0m[2m==0.70.16[0m
 [32m+[39m [1mpyarrow[0m[2m==22.0.0[0m
 [32m+[39m [1mshellingham[0m[2m==1.5.4[0m
 [32m+[39m [1mtyper-slim[0m[2m==0.20.0[0m
Note: you may need to restart the kernel to use updated packages.


In [31]:
import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"

In [32]:
from datasets import load_dataset
from evaluate import load

# from langchain_core.messages import HumanMessage


human_eval = load_dataset("openai_humaneval", split="test")
code_eval_metric = load("code_eval")

test_cases = ["assert add(2,3)==5"]
candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]

pass_at_k, results = code_eval_metric.compute(
    references=test_cases, predictions=candidates, k=[1, 2]
)
print(pass_at_k)

{'pass@1': np.float64(0.5), 'pass@2': np.float64(1.0)}


### Evaluating email extraction

In [33]:
# Define a list of synthetic insurance claim examples
example_inputs = [
    (
        "I was involved in a car accident on 2023-08-15. My name is Jane Smith, Claim ID INS78910, "
        "Policy Number POL12345, and the damage is estimated at $3500.",
        {
            "claimant_name": "Jane Smith",
            "claim_id": "INS78910",
            "policy_number": "POL12345",
            "claim_amount": "$3500",
            "accident_date": "2023-08-15",
            "accident_description": "Car accident causing damage",
            "status": "pending",
        },
    ),
    (
        "My motorcycle was hit in a minor collision on 2023-07-20. I am John Doe, with Claim ID INS112233 "
        "and Policy Number POL99887. The estimated damage is $1500.",
        {
            "claimant_name": "John Doe",
            "claim_id": "INS112233",
            "policy_number": "POL99887",
            "claim_amount": "$1500",
            "accident_date": "2023-07-20",
            "accident_description": "Minor motorcycle collision",
            "status": "pending",
        },
    ),
]

In [5]:
client = Config().new_langsmith_client()

dataset_name = "Insurance Claims"

# Create the dataset in LangSmith
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Synthetic dataset for insurance claim extraction tasks",
)

# Store examples in the dataset
for input_text, expected_output in example_inputs:
    client.create_example(
        inputs={"input": input_text},
        outputs={"output": expected_output},
        metadata={"source": "Synthetic"},
        dataset_id=dataset.id,
    )

In [34]:
# Define the extraction schema
from pydantic import BaseModel, Field


class InsuranceClaim(BaseModel):
    claimant_name: str = Field(..., description="The name of the claimant")
    claim_id: str = Field(..., description="The unique insurance claim identifier")
    policy_number: str = Field(
        ..., description="The policy number associated with the claim"
    )
    claim_amount: str = Field(..., description="The claimed amount (e.g., '$5000')")
    accident_date: str = Field(..., description="The date of the accident (YYYY-MM-DD)")
    accident_description: str = Field(
        ..., description="A brief description of the accident"
    )
    status: str = Field("pending", description="The current status of the claim")

In [35]:
# Create extraction chain
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Set up a parser + inject instructions into the prompt template.
output_parser = JsonOutputParser(pydantic_object=InsuranceClaim)

instructions = PromptTemplate(
    template=(
        "Extract the following structured information from the insurance claim text: "
        "claimant_name, claim_id, policy_number, claim_amount, accident_date, "
        "accident_description, and status.\n"
        "{format_instructions}\n"
        "{input}\n"
    ),
    input_variables=["input"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

llm = Config().new_openai_like()

extraction_chain = instructions | llm | output_parser

# Test the extraction chain
sample_claim_text = (
    "I was involved in a car accident on 2023-08-15. My name is Jane Smith, "
    "Claim ID INS78910, Policy Number POL12345, and the damage is estimated at $3500. "
    "Please process my claim."
)

result = extraction_chain.invoke({"input": sample_claim_text})
print("Extraction Result:")
print(result)

Extraction Result:
{'claimant_name': 'Jane Smith', 'claim_id': 'INS78910', 'policy_number': 'POL12345', 'claim_amount': '$3500', 'accident_date': '2023-08-15', 'accident_description': 'Car accident', 'status': 'pending'}
