### **Testing an Agent’s Final Response**

In [None]:
from langsmith import Client

# LangSmith client built with default arguments; reused below for the dataset
# lookup and creation calls.
# NOTE(review): presumably picks up the API key/endpoint from the environment — confirm.
client = Client()

# Question / reference-answer pairs used to seed the evaluation dataset.
# Each tuple is (question sent to the agent, expected final answer).
examples = [
    (
        "Which country's customers spent the most? And how much did they spend?",
        # Adjacent literals are concatenated so the reference answer stays on a
        # single line, without an embedded newline or source indentation.
        "The country whose customers spent the most is the USA, with a total "
        "expenditure of $523.06",
    ),
    (
        "What was the most purchased track of 2013?",
        "The most purchased track of 2013 was Hot Girl.",
    ),
    (
        "How many albums does the artist Led Zeppelin have?",
        "Led Zeppelin has 14 albums",
    ),
    (
        "What is the total price for the album “Big Ones”?",
        "The total price for the album 'Big Ones' is 14.85",
    ),
    (
        "Which sales agent made the most in sales in 2009?",
        "Steve Johnson made the most sales in 2009",
    ),
]

In [None]:
# Seed the dataset only once: nothing is created when a dataset with this
# name already exists.
dataset_name = "SQL Agent Response"
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    # Split the (question, answer) pairs into parallel input/output records.
    inputs, outputs = zip(
        *(
            ({"input": question}, {"output": answer})
            for question, answer in examples
        )
    )
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=dataset.id,
    )

## chain
def predict_sql_agent_answer(example: dict):
    """Run the SQL agent on one dataset example and return its final answer.

    Args:
        example: Dataset inputs; the question is read from ``example["input"]``.

    Returns:
        A dict whose ``"response"`` key holds the ``content`` of the last
        message in the graph's output.
    """
    question = example["input"]
    final_state = graph.invoke({"messages": ("user", question)}, config)
    last_message = final_state["messages"][-1]
    return {"response": last_message.content}

Now we can compare the agent's generated answer against the reference answer:

In [None]:
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langsmith.evaluation import evaluate

# Grade prompt: pulled from the hub by name once at module level;
# answer_evaluator below pipes it into the grader LLM.
grade_prompt_answer_accuracy = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """Grade the agent's answer against the reference answer with an LLM.

    Args:
        run: Evaluation run; the prediction is read from
            ``run.outputs["response"]``.
        example: Dataset example providing ``inputs["input"]`` (the question)
            and ``outputs["output"]`` (the reference answer).

    Returns:
        Feedback dict with key ``"answer_v_reference_score"`` whose ``score``
        is the ``"Score"`` entry of the grader's result.
    """
    # Fields the grading prompt is invoked with.
    grader_inputs = {
        "question": example.inputs["input"],
        "correct_answer": example.outputs["output"],
        "student_answer": run.outputs["response"],
    }

    # Grader model, run at temperature 0.
    grader_llm = AzureChatOpenAI(
        model="gpt-4o",
        temperature=0,
        azure_deployment="gpt-4o",
        api_version="2024-10-21",
    )

    # NOTE(review): indexing the result with "Score" assumes the hub prompt
    # yields dict-like structured output when piped into the model — confirm.
    grading = (grade_prompt_answer_accuracy | grader_llm).invoke(grader_inputs)

    return {"key": "answer_v_reference_score", "score": grading["Score"]}

## Run evaluation
# Runs predict_sql_agent_answer over the dataset named by dataset_name and
# scores each run with answer_evaluator; num_repetitions=3 is passed through
# to evaluate().
experiment_results = evaluate(
    predict_sql_agent_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    num_repetitions=3
)

**Testing a Single Step of an Agent**

Evaluating individual decisions made by an agent helps pinpoint performance issues.

**Typical Setup:**

1. **Inputs**  
   - User input to a specific step (e.g., user prompt, available tools)  
   - May include previous steps for context

2. **Output**  
   - LLM-generated response for this step  
   - Often includes tool call and input parameters

3. **Evaluator**  
   - Binary score for tool selection correctness  
   - Heuristic evaluation of the tool input’s accuracy

**Example:**  
Use a custom evaluator to verify if the agent selected the correct tool and provided valid inputs.


In [None]:
from langsmith.schemas import Example, Run

def predict_assistant(example: dict):
    """Call the assistant once so its first tool choice can be evaluated.

    Args:
        example: Dataset inputs; the question is read from ``example["input"]``.

    Returns:
        A dict whose ``"response"`` key holds whatever
        ``assistant_runnable.invoke`` returned.
    """
    user_turn = ("user", example["input"])
    reply = assistant_runnable.invoke({"messages": [user_turn]})
    return {"response": reply}

def check_specific_tool_call(root_run: Run, example: Example) -> dict:
    """
    Check if the first tool call in the response matches the expected tool call.

    Args:
        root_run: Run whose ``outputs["response"]`` is the assistant's reply.
        example: Dataset example (unused; part of the evaluator signature).

    Returns:
        Feedback dict with key ``"single_tool_call"`` and a score of 1 when
        the first tool call is named ``sql_db_list_tables``, otherwise 0.
    """
    # Expected tool call
    expected_tool_call = 'sql_db_list_tables'

    # Run
    response = root_run.outputs["response"]

    # A reply with no `tool_calls` attribute, or with it set to None, counts
    # as "no tool call" rather than crashing the evaluator.
    tool_calls = getattr(response, "tool_calls", None) or []

    # Get tool call
    try:
        tool_call = tool_calls[0]["name"]
    except (IndexError, KeyError, TypeError):
        # Empty list, entry without a "name", or a non-subscriptable entry.
        tool_call = None

    score = 1 if tool_call == expected_tool_call else 0
    return {"score": score, "key": "single_tool_call"}

# Runs predict_assistant over the same dataset and scores each run with
# check_specific_tool_call.
# NOTE(review): `metadata` is not defined anywhere in this section — confirm
# it is assigned in an earlier cell, otherwise this call raises NameError.
experiment_results = evaluate(
    predict_assistant,
    data=dataset_name,
    evaluators=[check_specific_tool_call],
    num_repetitions=3,
    metadata={"version": metadata},
)

**Testing an Agent’s Trajectory**

Analyzing the sequence of actions taken by an agent helps determine if the control flow aligns with expectations.

**Typical Setup:**

1. **Inputs**  
   - User input  
   - Optionally includes a predefined set of tools

2. **Output**  
   - Expected sequence of tool calls  
   - Can also allow for unordered tool calls

3. **Evaluator**  
   - Custom function applied to the agent’s steps  
   - Can use:
     - Binary score for exact match
     - Count of incorrect or extra steps
   - For LLM-as-a-judge: pass the full trajectory as a message list and ask for an evaluation

**Example:**  
Compare the actual sequence of tool calls with the expected reference trajectory using a custom evaluation function.

In [None]:
def predict_sql_agent(example: dict):
    """Run the SQL agent on one example and return the graph's full output.

    Unlike an answer-only prediction, nothing is extracted from the result:
    the whole value returned by ``graph.invoke`` is handed back so the
    evaluators can inspect every message.

    Args:
        example: Dataset inputs; the question is read from ``example["input"]``.

    Returns:
        A dict whose ``"response"`` key holds the value returned by
        ``graph.invoke``.
    """
    user_turn = ("user", example["input"])
    final_state = graph.invoke({"messages": user_turn}, config)
    return {"response": final_state}

def find_tool_calls(messages):
    """Collect the names of all tool calls found in the returned messages.

    Args:
        messages: Mapping with a ``"messages"`` entry holding the message
            objects to scan.

    Returns:
        List of tool-call names, in message order and then call order.
    """
    return [
        tool_call["name"]
        for message in messages["messages"]
        # Read `tool_calls` (the original looked up the misspelled
        # `tools_calls`, which never exists, so nothing was ever found).
        # Messages where it is missing, None or empty contribute nothing.
        for tool_call in getattr(message, "tool_calls", None) or []
    ]

def contains_all_tool_calls_any_order(
        root_run: Run, example: Example
) -> dict:
    """
    Check if the response contains all expected tool calls in any order.

    Args:
        root_run: Run whose ``outputs["response"]`` holds the agent's messages.
        example: Dataset example (unused; part of the evaluator signature).

    Returns:
        Feedback dict with key ``"multi_tool_call_any_order"`` and a score of
        1 when every expected tool name was called at least once, otherwise 0.
        Extra calls and repeated calls do not affect the score.
    """
    # Expected tool calls
    expected = [
        'sql_db_list_tables',
        'sql_db_schema',
        'sql_db_query_checker',
        'sql_db_query',
        'check_result'
    ]
    messages = root_run.outputs["response"]
    tool_calls = find_tool_calls(messages)
    # Order and multiplicity are irrelevant here, so compare as sets.
    score = 1 if set(expected).issubset(tool_calls) else 0
    # Feedback key spelling corrected (was "multi_tool_call_any_roder").
    return {"score": score, "key": "multi_tool_call_any_order"}

def contains_all_tool_calls_in_order(root_run: Run, example: Example) -> dict:
    """
    Check that the expected tools were called in the expected relative order.

    Other tool calls may appear before, between or after the expected ones;
    only the ordering of the expected names is checked.

    Returns a feedback dict with key ``"multi_tool_call_in_order"`` and a
    score of 1 on success, otherwise 0.
    """
    expected = [
        'sql_db_list_tables',
        'sql_db_schema',
        'sql_db_query_checker',
        'sql_db_query',
        'check_result'
    ]
    observed = find_tool_calls(root_run.outputs["response"])

    # A single shared iterator: each membership test consumes calls up to and
    # including the match, so the next expected name must occur later.
    remaining = iter(observed)
    in_order = True
    for wanted in expected:
        if wanted not in remaining:
            in_order = False
            break

    return {"score": 1 if in_order else 0, "key": "multi_tool_call_in_order"}

def contains_all_tool_calls_in_order_exact_match(
    root_run: Run, example: Example
) -> dict:
    """
    Check that the tool calls made are exactly the expected sequence.

    The score is 1 only when the list of called tool names equals the
    expected list: same names, same order, nothing extra. Otherwise it is 0.
    The feedback key is ``"multi_tool_call_in_exact_order"``.
    """
    observed = find_tool_calls(root_run.outputs["response"])
    is_exact = observed == [
        'sql_db_list_tables',
        'sql_db_schema',
        'sql_db_query_checker',
        'sql_db_query',
        'check_result'
    ]
    return {
        "score": 1 if is_exact else 0,
        "key": "multi_tool_call_in_exact_order",
    }

# Run the trajectory evaluation. The target is predict_sql_agent, the function
# defined in this section that returns the agent's full message state; the
# previous name, predict_sql_agent_messages, is not defined in this section.
experiment_results = evaluate(
    predict_sql_agent,
    data=dataset_name,
    evaluators=[
        contains_all_tool_calls_any_order,
        contains_all_tool_calls_in_order,
        contains_all_tool_calls_in_order_exact_match,
    ],
    num_repetitions=3,
)

**Implementation Breakdown**

This example includes several steps to evaluate an agent's trajectory:

1. **Invocation of the Agent**  
   - Uses `graph.invoke` to run a precompiled LangGraph agent with a specific prompt.

2. **Specialized Agent Configuration**  
   - Tools are embedded within the agent’s logic instead of being dynamically passed with the dataset input.

3. **Tool Call Extraction**  
   - Uses the function `find_tool_calls` to obtain the list of tools the agent used during execution.

4. **Tool Call Verification**  
   - Verifies whether:
     - All expected tools were called, in any order: `contains_all_tool_calls_any_order`
     - All expected tools were called in the expected relative order, with other tool calls allowed in between: `contains_all_tool_calls_in_order`
     - The tool calls match the expected sequence exactly, in the same order and with no additional calls: `contains_all_tool_calls_in_order_exact_match`