In [1]:
from dotenv import load_dotenv

# Read a local .env file into the process environment before any client is built.
# NOTE(review): presumably this supplies the OpenAI API key that the ChatOpenAI
# instances in later cells need — confirm against the project's .env template.
load_dotenv()

True

In [2]:
from langchain.evaluation import load_evaluator
from langchain_openai import ChatOpenAI

# Build the "trajectory" evaluator, backed by its own chat model.
# temperature=0 is passed so the grading model samples as little as possible.
evaluator = load_evaluator(
    "trajectory",
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
)

### Capturing Trajectory

In [5]:
import subprocess
from urllib.parse import urlparse

from langchain.agents import AgentType, initialize_agent
from langchain.tools import tool
from langchain_openai import ChatOpenAI
from pydantic import HttpUrl


@tool
def ping(url: str, return_error: bool) -> str:
    """Ping the fully specified url. Must include https:// in the url."""
    # The docstring above is kept verbatim on purpose: LangChain's @tool uses
    # the function docstring as the tool description shown to the model, so
    # maintainer-facing notes live in comments instead.
    #
    # url: absolute URL whose host should be pinged once.
    # return_error: when true, a failed ping returns stderr instead of stdout.
    # Returns the text captured from the `ping` command.

    # `.hostname` (unlike `.netloc`) drops a "user:pass@" prefix and a ":port"
    # suffix, neither of which `ping` accepts as part of its target.
    hostname = urlparse(url).hostname
    if not hostname or hostname.startswith("-"):
        # Either no host could be parsed (e.g. the scheme is missing) or the
        # value would be read by `ping` as a command-line option. Skip the
        # subprocess and mirror a failed run: message only if errors are wanted.
        if return_error:
            return f"ping: no valid host found in {url!r}\n"
        return ""

    completed_process = subprocess.run(
        ["ping", "-c", "1", hostname], capture_output=True, text=True
    )
    if return_error and completed_process.returncode != 0:
        return completed_process.stderr
    return completed_process.stdout


@tool
def trace_route(url: str, return_error: bool) -> str:
    """Trace the route to the specified url. Must include https:// in the url."""
    # The docstring above is kept verbatim on purpose: LangChain's @tool uses
    # the function docstring as the tool description shown to the model, so
    # maintainer-facing notes live in comments instead.
    #
    # url: absolute URL whose host should be traced.
    # return_error: when true, a failed run returns stderr instead of stdout.
    # Returns the text captured from the `traceroute` command.

    # `.hostname` (unlike `.netloc`) drops a "user:pass@" prefix and a ":port"
    # suffix, neither of which `traceroute` accepts as part of its target.
    hostname = urlparse(url).hostname
    if not hostname or hostname.startswith("-"):
        # Either no host could be parsed (e.g. the scheme is missing) or the
        # value would be read by `traceroute` as a command-line option. Skip
        # the subprocess and mirror a failed run: message only if errors are
        # wanted.
        if return_error:
            return f"traceroute: no valid host found in {url!r}\n"
        return ""

    completed_process = subprocess.run(
        ["traceroute", hostname], capture_output=True, text=True
    )
    if return_error and completed_process.returncode != 0:
        return completed_process.stderr
    return completed_process.stdout


# Chat model that drives the agent; temperature=0 keeps sampling to a minimum.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Agent that can call the two network tools defined above.
agent = initialize_agent(
    llm=llm,
    tools=[ping, trace_route],
    agent=AgentType.OPENAI_MULTI_FUNCTIONS,
    # IMPORTANT: without this the result has no "intermediate_steps" entry,
    # which the trajectory evaluation reads.
    return_intermediate_steps=True,
)

# Use `invoke` rather than calling the executor directly: the direct-call form
# (`agent("...")`) is deprecated in LangChain. The result still carries the
# "input", "output" and "intermediate_steps" keys.
result = agent.invoke({"input": "What's the latency like for https://langchain.com?"})

### Evaluate Trajectory

In [6]:
# Hand the recorded run to the evaluator: the question that was asked, the
# intermediate steps captured during the run, and the agent's final answer.
evaluation_result = evaluator.evaluate_agent_trajectory(
    input=result["input"],
    agent_trajectory=result["intermediate_steps"],
    prediction=result["output"],
)
# Bare last expression so the notebook renders the evaluator's result.
evaluation_result

{'score': 1.0,
 'reasoning': "Let's evaluate the AI language model's answer based on the provided criteria:\n\ni. **Is the final answer helpful?**\n   - Yes, the final answer provides a specific latency measurement (approximately 9.848 ms) for the website in question, along with information about packet loss, which is relevant and useful for understanding the site's responsiveness.\n\nii. **Does the AI language model use a logical sequence of tools to answer the question?**\n   - Yes, the model used the ping tool, which is appropriate for measuring latency. The sequence is logical as it directly addresses the user's question about latency.\n\niii. **Does the AI language model use the tools in a helpful way?**\n   - Yes, the model effectively utilized the ping tool to gather the necessary data about latency. The output from the ping command was correctly interpreted and presented in the final answer.\n\niv. **Does the AI language model use too many steps to answer the question?**\n   - 