In [None]:
import os
import sys

# NOTE(review): absolute, machine-specific path. It is inserted before the
# `pretty_print` import below, presumably so that local helper module resolves;
# the notebook will not run unchanged on another machine -- consider a path
# relative to the repository instead.
sys.path.insert(0, "/run/media/sahilsasane/Disk_1/Sahil/TMDC/test_repo/travel-agent/notebooks")

from dotenv import load_dotenv
from pretty_print import pretty_print_messages

# Pull settings from a .env file into the environment (presumably the API keys
# used by the model and LangSmith clients later on -- verify against the .env).
load_dotenv()
# Overrides any LANGSMITH_PROJECT value already present in the environment.
os.environ["LANGSMITH_PROJECT"] = "pr-definite-invite-58"

In [3]:
from typing import Annotated, Literal, TypedDict

from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode


class State(TypedDict):
    """State passed between the nodes of the graph built below."""

    # Conversation history. The `add_messages` annotation is the langgraph
    # reducer for this key (expected to merge node updates into the existing
    # list rather than replace it -- see the langgraph docs).
    messages: Annotated[list, add_messages]


@tool
def search(query: str) -> str:
    """Call to surf the web."""
    # The docstring above is left untouched on purpose: the `@tool` decorator
    # is expected to expose it to the model as the tool description.
    #
    # Canned stand-in for a real search: queries mentioning San Francisco
    # (case-insensitive "sf" or "san francisco") get the foggy report, every
    # other query gets the sunny one.
    lowered = query.lower()
    about_sf = any(marker in lowered for marker in ("sf", "san francisco"))
    return "It's 60 degrees and foggy." if about_sf else "It's 90 degrees and sunny."


# Tools offered to the model, and the graph node that runs them.
tools = [search]
tool_node = ToolNode(tools)
# Alternative chat-model backends (currently disabled):
# llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20")
# llm = ChatGroq(model="llama-3.3-70b-versatile")
llm = ChatOpenAI(model="gpt-4o-mini")
# `model` is the chat model with the tool list bound; the agent node invokes it.
model = llm.bind_tools(tools)


def should_continue(state: State) -> Literal["tools", END]:
    """Pick the next step after the agent node has produced a message.

    Returns "tools" when the most recent entry in ``state["messages"]`` has a
    truthy ``tool_calls``; otherwise returns ``END``.
    """
    newest = state["messages"][-1]
    return "tools" if newest.tool_calls else END


def call_model(state: State):
    """Agent node: invoke the tool-bound model on the message history.

    Returns a state update whose "messages" list contains only the model's
    reply.
    """
    reply = model.invoke(state["messages"])
    return {"messages": [reply]}


# Assemble the agent loop: START -> agent -> (tools -> agent)* -> END.
workflow = StateGraph(State)

# "agent" calls the model; "tools" executes whatever tool calls it requested.
workflow.add_node("agent", call_model)
workflow.add_node("tools", tool_node)

workflow.add_edge(START, "agent")

# After each agent step, `should_continue` chooses between "tools" and END.
workflow.add_conditional_edges(
    "agent",
    should_continue,
)

# Tool output always goes back to the agent.
workflow.add_edge("tools", "agent")

app = workflow.compile()

In [4]:
# from langfuse import Langfuse
# from langfuse.langchain import CallbackHandler

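# NOTE: never paste Langfuse keys into the notebook. Read them from the
# environment (e.g. a .env file); any key that was ever committed here must be
# treated as leaked and rotated.
# langfuse = Langfuse(
#     public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
#     secret_key=os.environ["LANGFUSE_SECRET_KEY"],
#     host=os.environ.get("LANGFUSE_HOST", "http://localhost:3000"),
# )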

# langfuse_handler = CallbackHandler()

# # <Your Langchain code here>

# # Add handler to run/invoke/call/chat

# # chain.invoke({"input": "<user_input>"}, config={"callbacks": [langfuse_handler]})
# state = await app.ainvoke(
#     {"messages": [{"role": "user", "content": "whats the weather in sf?"}]},
#     config={"callbacks": [langfuse_handler]},
# )

# for chunk in app.stream(
#     {
#         "messages": [
#             {
#                 "role": "user",
#                 "content": "whats the weather in sf?",
#             }
#         ]
#     },
#     config={"callbacks": [langfuse_handler]},
#     debug=False,
# ):
#     pretty_print_messages(chunk, last_message=True)

Update from node agent:


Tool Calls:
  search (call_o5NTwUdDgmA7yKXzw47ivUfY)
 Call ID: call_o5NTwUdDgmA7yKXzw47ivUfY
  Args:
    query: current weather in San Francisco


Update from node tools:


Name: search

It's 60 degrees and foggy.


Update from node agent:



The current weather in San Francisco is 60 degrees and foggy.




In [None]:
from langsmith import Client

# Evaluation questions and their reference answers, paired by position.
questions = [
    "what's the weather in sf",
    "whats the weather in san fran",
    "whats the weather in tangier",
]
answers = [
    "It's 60 degrees and foggy.",
    "It's 60 degrees and foggy.",
    "It's 90 degrees and sunny.",
]
assert len(questions) == len(answers), "every question needs a reference answer"

DATASET_NAME = "weather agent"

ls_client = Client()

# Re-running this cell must not fail or duplicate examples: only create and
# populate the dataset when it does not exist yet, otherwise reuse it.
if ls_client.has_dataset(dataset_name=DATASET_NAME):
    dataset = ls_client.read_dataset(dataset_name=DATASET_NAME)
else:
    dataset = ls_client.create_dataset(DATASET_NAME)
    # Build the examples from the lists above instead of repeating the literals.
    ls_client.create_examples(
        dataset_id=dataset.id,
        inputs=[{"question": question} for question in questions],
        outputs=[{"answer": answer} for answer in answers],
    )


In [6]:
# The judge reuses the same chat model object that powers the agent.
judge_llm = llm


async def correct(outputs: dict, reference_outputs: dict) -> bool:
    """LLM-as-judge evaluator: does the agent's final answer cover the reference?

    Args:
        outputs: Agent result; the content of the last entry in
            ``outputs["messages"]`` is taken as the actual answer.
        reference_outputs: Dataset reference; ``reference_outputs["answer"]``
            is the expected answer.

    Returns:
        True when the judge model's verdict is "CORRECT", False otherwise.
    """
    instructions = (
        "Given an actual answer and an expected answer, determine whether"
        " the actual answer contains all of the information in the"
        " expected answer. Respond with 'CORRECT' if the actual answer"
        " does contain all of the expected information and 'INCORRECT'"
        " otherwise. Do not include anything else in your response."
    )

    actual_answer = outputs["messages"][-1].content
    expected_answer = reference_outputs["answer"]
    user_msg = f"ACTUAL ANSWER: {actual_answer}\n\nEXPECTED ANSWER: {expected_answer}"
    response = await judge_llm.ainvoke(
        [{"role": "system", "content": instructions}, {"role": "user", "content": user_msg}]
    )
    # Judges often add a trailing newline, echo the quotes from the prompt or
    # end with a full stop; normalise before comparing so that formatting noise
    # does not flip a correct verdict to False. The comparison stays an exact
    # match, so "INCORRECT" can never pass.
    verdict = response.content.strip().strip("\"'.").strip().upper()
    return verdict == "CORRECT"

In [8]:
from langsmith import aevaluate


def example_to_state(inputs: dict) -> dict:
    """Turn a dataset example's inputs into the agent's initial graph state.

    The example's "question" becomes the single user message of the state.
    """
    user_turn = {"role": "user", "content": inputs["question"]}
    return {"messages": [user_turn]}


# Evaluation target: map a dataset example to graph state, then run the agent.
target = example_to_state | app

# Run the target over the "weather agent" dataset and score each result with
# the `correct` judge. Top-level `await` relies on the notebook's async support.
experiment_results = await aevaluate(
    target,
    data="weather agent",
    evaluators=[correct],
    max_concurrency=4,
)

View the evaluation results for experiment: 'dear-place-15' at:
https://smith.langchain.com/o/dd73791d-a177-49ce-88d8-6d38e552760c/datasets/7cf133cd-bda0-43f7-b64f-23937057e460/compare?selectedSessions=6e813bc3-fc17-4612-bd77-9bff01475b91




3it [00:03,  1.27s/it]


In [None]:
def right_tool(outputs: dict) -> bool:
    """Evaluator: was the agent's first tool call a call to ``search``?

    Args:
        outputs: Agent result. ``outputs["messages"][1]`` is inspected, i.e.
            the message that follows the initial user message.

    Returns:
        True when that message has tool calls and the first one is named
        "search". False otherwise, including when the transcript has fewer
        than two messages or the message carries no ``tool_calls`` attribute
        (instead of raising, which would surface as an evaluator error).
    """
    messages = outputs["messages"]
    if len(messages) < 2:
        return False
    tool_calls = getattr(messages[1], "tool_calls", None)
    return bool(tool_calls and tool_calls[0]["name"] == "search")


# Second experiment on the same dataset: answer correctness plus the
# tool-selection check computed from the final outputs.
experiment_results = await aevaluate(
    target,
    data="weather agent",
    evaluators=[correct, right_tool],
    # max_concurrency=4,  # optional
    # experiment_prefix="claude-3.5-baseline",  # optional
)

View the evaluation results for experiment: 'dependable-animal-5' at:
https://smith.langchain.com/o/dd73791d-a177-49ce-88d8-6d38e552760c/datasets/7cf133cd-bda0-43f7-b64f-23937057e460/compare?selectedSessions=c31d1a3c-24a3-4c17-aca9-27d89d6b5bc0




0it [00:00, ?it/s]

{'question': 'whats the weather in tangier'}
{'question': 'whats the weather in san fran'}
{'question': "what's the weather in sf"}


1it [00:00,  1.97it/s]

reference_outputs {'answer': "It's 60 degrees and foggy."}
outputs {'messages': [HumanMessage(content='whats the weather in san fran', additional_kwargs={}, response_metadata={}, id='5c99bb2d-07bd-4c74-9099-39970ecec9b6'), AIMessage(content='<function=search{"query": "San Francisco weather"}</function>', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 215, 'total_tokens': 230, 'completion_time': 0.079605657, 'prompt_time': 0.014459086, 'queue_time': 0.052128954000000005, 'total_time': 0.094064743}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_2ddfbb0da0', 'finish_reason': 'stop', 'logprobs': None}, id='run--ca889378-12ab-45e3-a216-e753f7d7cf4d-0', usage_metadata={'input_tokens': 215, 'output_tokens': 15, 'total_tokens': 230})]}
reference_outputs {'answer': "It's 90 degrees and sunny."}
outputs {'messages': [HumanMessage(content='whats the weather in tangier', additional_kwargs={}, response_metadata={}, id='1f1af0b

3it [00:01,  2.67it/s]


In [None]:
from langsmith.schemas import Example, Run


def right_tool_from_run(run: Run, example: Example) -> dict:
    """Evaluator: check tool selection using the traced run, not final outputs.

    Args:
        run: Root run of the target; its child runs are searched for the
            first one named "agent".
        example: Dataset example; unused, kept for the evaluator signature.

    Returns:
        ``{"key": "right_tool", "value": <bool>}``. The value is True when the
        last message output by the first "agent" run has tool calls and the
        first one is named "search". It is False otherwise, including when no
        "agent" child run exists (instead of raising StopIteration).
    """
    # Locate the first model step; `child_runs` may be empty or None.
    agent_run = next(
        (child for child in (run.child_runs or []) if child.name == "agent"),
        None,
    )
    if agent_run is None:
        return {"key": "right_tool", "value": False}

    tool_calls = agent_run.outputs["messages"][-1].tool_calls
    used_search = bool(tool_calls and tool_calls[0]["name"] == "search")
    return {"key": "right_tool", "value": used_search}


# Third experiment on the same dataset: answer correctness plus the
# tool-selection check computed from the run trace.
experiment_results = await aevaluate(
    target,
    data="weather agent",
    evaluators=[correct, right_tool_from_run],
    # max_concurrency=4,
)