## Test a ReAct agent with Pytest and LangSmith

We will create a ReAct agent that answers questions about publicly traded stocks, and write a comprehensive test suite for it.

In [None]:
from dotenv import load_dotenv
# Load settings from a .env file before any client below is constructed.
# override=True lets values from the file replace variables that are already
# set in the process environment.
# NOTE(review): presumably the .env file holds the API keys for the model and
# tool providers used later in this notebook -- confirm.
load_dotenv(override=True)

In [None]:
# Build the candidate chat models up front. Every model is configured the same
# way (timeout=30, temperature=0) so they can be swapped into the agent
# without changing anything else.
from langchain.chat_models import init_chat_model

model_gemini_flash = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    timeout=30,
    temperature=0,
)
model_llama_groq = init_chat_model(
    "llama-3.1-8b-instant",
    model_provider="groq",
    timeout=30,
    temperature=0,
)
model_gpt_4o_mini = init_chat_model(
    "gpt-4o-mini",
    model_provider="openai",
    timeout=30,
    temperature=0,
)


In [None]:
from typing_extensions import Annotated, Literal, TypedDict
from langchain_community.tools import TavilySearchResults
from e2b_code_interpreter import Sandbox
from langchain_community.utilities.polygon import PolygonAPIWrapper
from langchain_community.tools.polygon.aggregates import PolygonAggregates
# Web search tool: return at most five results per query and ask for the raw
# page content to be included with each result.
search_tool = TavilySearchResults(max_results=5, include_raw_content=True)

# Define code tool
#
# Runs a snippet of Python inside a fresh sandbox and reports the outcome as a
# single string:
#   * "Error: ..."              when the execution reports an error
#   * "Results: ..., Logs: ..." otherwise
# The docstring is deliberately left short and unchanged.
# NOTE(review): presumably the agent framework exposes the docstring to the
# model as the tool description -- confirm before rewording it.
def code_tool(code: str) -> str:
    """Execute python code and return the result."""
    sbx = Sandbox()
    try:
        execution = sbx.run_code(code)

        if execution.error:
            return f"Error: {execution.error}"
        return f"Results: {execution.results}, Logs: {execution.logs}"
    finally:
        # A new sandbox is created on every call, so shut it down whether the
        # run succeeded, reported an error, or raised; otherwise each tool
        # call leaves a sandbox running.
        sbx.kill()


# Define input schema for stock ticker tool
# Every field is declared as Annotated[type, ..., description].
# NOTE(review): the `...` presumably marks the field as required and the
# trailing string is presumably surfaced to the model as the field
# description -- confirm against the LangChain tool-schema docs.
class TickerToolInput(TypedDict):
  """Input format for the ticker tool.
    The tool will pull data in aggregate blocks (timespan_multiplier * timespan) from the from_date to the to_date
  """
  # Symbol of the stock to look up.
  ticker: Annotated[str, ..., "The ticker symbol of the stock"]
  # Unit of one aggregation window; restricted to the listed literals.
  timespan: Annotated[Literal["minute", "hour", "day", "week", "month", "quarter", "year"], ..., "The size of the time window."]
  # Number of `timespan` units that make up one aggregate block.
  timespan_multiplier: Annotated[int, ..., "The multiplier for the time window"]
  # Start and end of the query range, both as YYYY-MM-DD strings (date only).
  from_date: Annotated[str, ..., "The date to start pulling data from, YYYY-MM-DD format - ONLY include the year month and day"]
  to_date: Annotated[str, ..., "The date to stop pulling data, YYYY-MM-DD format - ONLY include the year month and day"]

# Polygon client wrapper and the aggregates tool built on top of it.
# `polygon_aggregates` is the tool handed to the agent and the one whose
# `.name` the ticker test compares against.
# NOTE(review): PolygonAPIWrapper() is constructed without arguments, so it
# presumably reads its API key from the environment -- confirm.
api_wrapper = PolygonAPIWrapper()
polygon_aggregates = PolygonAggregates(api_wrapper=api_wrapper)

# Stock ticker tool: a plain-function front for the Polygon aggregates tool
# that takes the typed TickerToolInput payload. In the visible notebook the
# agent is given `polygon_aggregates` directly rather than this function.
def ticker_tool(query: TickerToolInput) -> str:
  """Pull data for the ticker."""
  aggregates = polygon_aggregates.invoke(query)
  return aggregates

### Define Agent

In [None]:
from langchain.agents import create_agent
# Structured response the agent is asked to produce (passed to create_agent
# as `response_format`). The tests read `numeric_answer` and `text_answer`
# from result["structured_response"].
class AgentOutputFormat(TypedDict):
    # Either answer may be None: the model fills in the one the user asked for.
    numeric_answer: Annotated[float | None, ..., "The numeric answer, if the user asked for one"]
    text_answer: Annotated[str | None, ..., "The text answer, if the user asked for one"]
    # Free-text justification; unlike the two answers it is not typed as optional.
    reasoning: Annotated[str, ..., "The reasoning behind the answer"]

# Assemble the agent under test: Gemini Flash as the model, three tools
# (sandboxed code execution, web search, Polygon aggregates), and a structured
# final answer in the AgentOutputFormat shape.
agent = create_agent(
    model=model_gemini_flash,
    system_prompt="You are a financial expert. Respond to the users query accurately",
    tools=[code_tool, search_tool, polygon_aggregates],
    response_format=AgentOutputFormat,
)



In [None]:
from langchain_core.messages import HumanMessage


# Smoke run: send an off-topic greeting through the full agent and print every
# message in the returned state so the trace is visible in the cell output.
response = agent.invoke({"messages":HumanMessage("Hello ! How are you doing?")})
for msg in response["messages"]:
    msg.pretty_print()

In [None]:
# Smoke run: ask a stock-price question through the full agent and print every
# message in the returned state so the trace is visible in the cell output.
response = agent.invoke({"messages":HumanMessage("What was the price of  Apple stock one month back?")})
for msg in response["messages"]:
    msg.pretty_print()

### Test 1: No tool should get called for off-topic chatter

In [None]:
from langsmith import testing as t
import pytest

@pytest.mark.langsmith
@pytest.mark.parametrize(
    # <-- Can still use all normal pytest markers
    "query",
    ["Hello!", "How are you doing?"],
)
def test_no_tools_on_offtopic_query(query: str) -> None:
    """Test that the agent does not use tools on offtopic queries.

    Only the model node is invoked, so the test observes the model's first
    decision without executing any tool.
    """
    # Log the test example
    t.log_inputs({"query": query})
    expected = []
    t.log_reference_outputs({"tool_calls": expected})
    # Call the agent's model node directly instead of running the ReACT loop.
    # The node name depends on how the graph was built: prefer "model" and
    # fall back to the "agent" name the test originally hard-coded, so the
    # lookup does not raise KeyError.
    # NOTE(review): "model" is believed to be the name used by
    # langchain.agents.create_agent -- confirm against the installed version.
    node_name = "model" if "model" in agent.nodes else "agent"
    result = agent.nodes[node_name].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )
    # This loop is only for activity tracing in the notebook cell output.
    for msg in result["messages"]:
        msg.pretty_print()

    # The first message returned by the node is the model's reply.
    actual = result["messages"][0].tool_calls
    t.log_outputs({"tool_calls": actual})
    # Check that no tool calls were made.
    assert actual == expected

### Test 2: Simple Tool Calling

For tool calling, we are going to verify that the agent calls the correct tool with the correct parameters.

In [None]:
@pytest.mark.langsmith
def test_searches_for_correct_ticker() -> None:
    """Test that the model looks up the correct ticker on simple query.

    Only the model node is invoked. The test passes when the first tool call
    targets the Polygon aggregates tool with ticker "AAPL".
    """
    # Log the test example
    query = "What is the price of Apple?"
    t.log_inputs({"query": query})
    expected = "AAPL"
    t.log_reference_outputs({"ticker": expected})
    # Call the agent's model node directly instead of running the full ReACT
    # loop. Prefer the "model" node and fall back to the "agent" name the test
    # originally hard-coded, so the lookup does not raise KeyError.
    # NOTE(review): "model" is believed to be the name used by
    # langchain.agents.create_agent -- confirm against the installed version.
    node_name = "model" if "model" in agent.nodes else "agent"
    result = agent.nodes[node_name].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )

    # This loop is only for activity tracing in the notebook cell output.
    for msg in result["messages"]:
        msg.pretty_print()

    tool_calls = result["messages"][0].tool_calls

    # Guard the empty case: if the model answered without calling any tool,
    # fail on the assertion below instead of an IndexError here. A call to the
    # right tool that lacks a "ticker" argument likewise yields None.
    actual = None
    if tool_calls and tool_calls[0]["name"] == polygon_aggregates.name:
        actual = tool_calls[0]["args"].get("ticker")

    # Record what was observed, as the other tests in this suite do.
    t.log_outputs({"ticker": actual})
    # Check that the right ticker was queried
    assert actual == expected

### Test 3 : Complex tool calling

With the coding tool, the inputs and outputs of the tool are much less constrained, and there are lots of ways to get to the right answer. In this case, it’s simpler to test that the tool is used correctly by running the full agent and asserting that it both calls the coding tool and that it ends up with the right answer.

In [None]:
@pytest.mark.langsmith
def test_executes_code_when_needed() -> None:
    """Test that the full agent uses the code tool and computes the right average."""
    query = (
        "In the past year Facebook stock went up by 66.76%, "
        "Apple by 25.24%, Google by 37.11%, Amazon by 47.52%, "
        "Netflix by 78.31%. Whats the avg return in the past "
        "year of the FAANG stocks, expressed as a percentage?"
    )

    t.log_inputs({"query": query})
    # Mean of the five percentages quoted in the query.
    expected = 50.988
    t.log_reference_outputs({"response": expected})

    # Run the complete agent loop, tools included.
    result = agent.invoke({"messages": [{"role": "user", "content": query}]})
    t.log_outputs({"result": result["structured_response"].get("numeric_answer")})

    # Collect the name of every tool the model asked for; messages that carry
    # no `tool_calls` attribute contribute nothing.
    messages = result["messages"]
    called_tools = []
    for message in messages:
        for call in getattr(message, "tool_calls", []):
            called_tools.append(call["name"])

    # Number of messages after the first one: a rough measure of how
    # efficiently the agent reached its answer.
    t.log_feedback(key="num_steps", score=len(messages) - 1)

    # The code tool must have been used ...
    assert "code_tool" in called_tools
    # ... a numeric answer must have been produced ...
    structured = result["structured_response"]
    assert structured.get("numeric_answer") is not None
    # ... and it must match the expected value within 0.01.
    assert abs(structured["numeric_answer"] - expected) <= 0.01


### Test 4: LLM-as-judge

We are going to ensure that the agent’s answer is grounded in the search results by running an LLM-as-a-judge evaluation. To trace the LLM-as-a-judge call separately from our agent, we will use the `trace_feedback()` context manager provided by LangSmith.

In [None]:
from typing_extensions import Annotated, TypedDict

# Structured verdict returned by the judge model: a single boolean.
# The class is passed to `with_structured_output` below, and the test reads
# the verdict back as grade["score"].
class Grade(TypedDict):
  """Evaluate the groundedness of an answer in source documents."""
  # True only when the answer is fully supported by the source documents.
  score: Annotated[
      bool,
      ...,
      "Return True if the answer is fully grounded in the source documents, otherwise False.",
  ]

# Judge model: Gemini Flash (the same model the agent uses), bound to return a
# Grade-shaped result.
judge_llm = model_gemini_flash.with_structured_output(Grade)

@pytest.mark.langsmith
def test_grounded_in_source_info() -> None:
    """Test that response is grounded in the tool outputs."""
    query = "How did Nvidia stock do in 2024 according to analysts?"
    t.log_inputs({"query": query})
    result = agent.invoke({"messages": [{"role": "user", "content": query}]})

    # Keep only the outputs of the search tool and join them into one blob.
    search_outputs = [
        msg.content
        for msg in result["messages"]
        if msg.type == "tool" and msg.name == search_tool.name
    ]
    search_results = "\n\n".join(search_outputs)

    structured = result["structured_response"]
    t.log_outputs(
        {
            "response": structured.get("text_answer"),
            "search_results": search_results,
        }
    )

    # Everything inside this context is traced as feedback, separately from
    # the agent run above.
    with t.trace_feedback():
        # System prompt for the LLM judge.
        instructions = (
            "Grade the following ANSWER. "
            "The ANSWER should be fully grounded in (i.e. supported by) the source DOCUMENTS. "
            "Return True if the ANSWER is fully grounded in the DOCUMENTS. "
            "Return False if the ANSWER is not grounded in the DOCUMENTS."
        )
        # User message: the agent's answer followed by the retrieved documents.
        answer_and_docs = (
            f"ANSWER: {structured.get('text_answer', '')}\n"
            f"DOCUMENTS:\n{search_results}"
        )
        judge_messages = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": answer_and_docs},
        ]
        # Run the judge and record its verdict as feedback.
        grade = judge_llm.invoke(judge_messages)
        t.log_feedback(key="groundedness", score=grade["score"])

    # The test passes only when the judge says the answer is grounded.
    assert grade["score"]