In [1]:
from dotenv import dotenv_values

In [6]:
LANGCHAIN_TRACING_V2="true"

config = dotenv_values(".env")

In [7]:
import os
from uuid import uuid4

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"tracing_walkthrough"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = config["LANGSMITH_API_KEY"]  # Update to your API key

# Used by the agent in this tutorial
os.environ["OPENAI_API_KEY"] = config["OPEN_AI_KEY"]

In [8]:
from langsmith import Client

client = Client()

In [10]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI

# Fetches the latest version of this prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")

llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    temperature=0,
)

tools = [
    DuckDuckGoSearchResults(
        name="duck_duck_go"
    ),  # General internet search using DuckDuckGo
]

llm_with_tools = llm.bind_tools(tools)

runnable_agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [11]:
# We are running the agent concurrently on multiple inputs to reduce latency. Runs get logged to LangSmith in the background so execution latency is unaffected.

inputs = [
    "What is LangChain?",
    "What's LangSmith?",
    "When was Llama-v2 released?",
    "What is the langsmith cookbook?",
    "When did langchain first announce the hub?",
]

results = agent_executor.batch([{"input": x} for x in inputs], return_exceptions=True)

In [12]:
results[:2]

[{'input': 'What is LangChain?',
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangChain". Could you please provide more context or clarify your question?'},
 {'input': "What's LangSmith?",
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangSmith". It could be a company, a product, or a person. Can you provide more context or details about what you are referring to?'}]

In [13]:
# Evaluate Agents

# Create a LangSmith dataset
outputs = [
    "LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.",
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "July 18, 2023",
    "The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.",
    "September 5, 2023",
]

In [15]:
dataset_name = f"agent-qa-may_2_2024"

dataset = client.create_dataset(
    dataset_name,
    description="An example dataset of questions over the LangSmith documentation.",
)

client.create_examples(
    inputs=[{"input": query} for query in inputs],
    outputs=[{"output": answer} for answer in outputs],
    dataset_id=dataset.id,
)

In [16]:
from langchain import hub
from langchain.agents import AgentExecutor, AgentType, initialize_agent, load_tools
from langchain_openai import ChatOpenAI


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def create_agent(prompt, llm_with_tools):
    runnable_agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIToolsAgentOutputParser()
    )
    return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)

In [17]:
# Heuristic Evaluator
from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run


def check_not_idk(run: Run, example: Example):
    """Illustration of a custom evaluator."""
    agent_response = run.outputs["output"]
    if "don't know" in agent_response or "not sure" in agent_response:
        score = 0
    else:
        score = 1
    # You can access the dataset labels in example.outputs[key]
    # You can also access the model inputs in run.inputs[key]
    return EvaluationResult(
        key="not_uncertain",
        score=score,
    )

In [18]:
# Batch evaluator

from typing import List


def max_pred_length(runs: List[Run], examples: List[Example]):
    predictions = [len(run.outputs["output"]) for run in runs]
    return EvaluationResult(key="max_pred_length", score=max(predictions))

In [19]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    # Evaluators can either be an evaluator type (e.g., "qa", "criteria", "embedding_distance", etc.) or a configuration for that evaluator
    evaluators=[
        check_not_idk,
        # Measures whether a QA response is "Correct", based on a reference answer
        # You can also select via the raw string "qa"
        EvaluatorType.QA,
        # Measure the embedding distance between the output and the reference answer
        # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())
        EvaluatorType.EMBEDDING_DISTANCE,
        # Grade whether the output satisfies the stated criteria.
        # You can select a default one such as "helpfulness" or provide your own.
        RunEvalConfig.LabeledCriteria("helpfulness"),
        # The LabeledScoreString evaluator outputs a score on a scale from 1-10.
        # You can use default criteria or write our own rubric
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
            },
            normalize_by=10,
        ),
    ],
    batch_evaluators=[max_pred_length],
)

In [20]:
# Run agent and evaluators against dataset

from langchain import hub

# We will test this version of the prompt
prompt = hub.pull("wfh/langsmith-agent-prompt:798e7324")

In [22]:
import functools

from langchain.smith import arun_on_dataset, run_on_dataset

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=prompt, llm_with_tools=llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"tools-agent-test-5d466cbc-may_2_2024",
    # Project metadata communicates the experiment parameters,
    # Useful for reviewing the test results
    project_metadata={
        "env": "testing-notebook",
        "model": "gpt-3.5-turbo",
        "prompt": "5d466cbc",
    },
)

# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.
# These are logged as warnings here and captured as errors in the tracing UI.

View the evaluation results for project 'tools-agent-test-5d466cbc-may_2_2024' at:
https://smith.langchain.com/o/bbad3093-addd-5544-b896-000e8d59d9dd/datasets/4e0ed8b4-034a-49a4-90a7-cb9d6b212621/compare?selectedSessions=cb001395-8c8d-43e7-b861-1329c64694db

View all tests for Dataset agent-qa-may_2_2024 at:
https://smith.langchain.com/o/bbad3093-addd-5544-b896-000e8d59d9dd/datasets/4e0ed8b4-034a-49a4-90a7-cb9d6b212621
[>                                                 ] 0/5

  warn_deprecated(
Chain failed for example a308de7a-33fd-43e4-a351-3f18d12197a4 with inputs {'input': "What's LangSmith?"}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-6_1'.
Chain failed for example 0beffd64-3869-4df0-a901-adc95d75b88b with inputs {'input': 'When was Llama-v2 released?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-6_2'.


[------------------->                              ] 2/5

Chain failed for example 7aff3dbb-e2b1-45c1-b23c-40fd6484e39d with inputs {'input': 'What is the langsmith cookbook?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-6_3'.
Chain failed for example 6958fff2-2c6d-468e-8a65-29d3ee0855ca with inputs {'input': 'When did langchain first announce the hub?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-6_4'.


[--------------------------------------->          ] 4/5

Error running batch evaluator <function max_pred_length at 0x000001EDFF687D30>: 'NoneType' object is not subscriptable


[------------------------------------------------->] 5/5

In [24]:
chain_results.to_dataframe()

Unnamed: 0,inputs.input,outputs.input,outputs.output,reference.output,feedback.not_uncertain,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,error,execution_time,run_id
114ebe74-69b8-41cd-9324-d9cd230b44c1,What is LangChain?,What is LangChain?,LangChain is a decentralized blockchain platfo...,LangChain is an open-source framework for buil...,1.0,0.0,0.092592,0.0,0.1,,2.217575,93f0f655-874f-4e52-a281-aa89ddb54b58
a308de7a-33fd-43e4-a351-3f18d12197a4,What's LangSmith?,,,"LangSmith is a unified platform for debugging,...",,,,,,There is no current event loop in thread 'Thre...,1.305044,0ca8d22c-df61-4de6-bf09-8a88cd409e9e
0beffd64-3869-4df0-a901-adc95d75b88b,When was Llama-v2 released?,,,"July 18, 2023",,,,,,There is no current event loop in thread 'Thre...,1.360046,4cce8539-8185-4925-b552-94d38c863503
7aff3dbb-e2b1-45c1-b23c-40fd6484e39d,What is the langsmith cookbook?,,,The langsmith cookbook is a github repository ...,,,,,,There is no current event loop in thread 'Thre...,1.650587,f3fa0dca-95ef-4e67-8e65-9f5cd79566d9
6958fff2-2c6d-468e-8a65-29d3ee0855ca,When did langchain first announce the hub?,,,"September 5, 2023",,,,,,There is no current event loop in thread 'Thre...,1.640587,57601fa9-7ec8-4f8e-9d32-c94338b5fa31


In [26]:
# Compare to another prompt

candidate_prompt = hub.pull("wfh/langsmith-agent-prompt:39f3bbd0")

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=candidate_prompt, llm_with_tools=llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"tools-agent-test-39f3bbd0-may_5_2024",
    project_metadata={
        "env": "testing-notebook",
        "model": "gpt-3.5-turbo",
        "prompt": "39f3bbd0",
    },
)

View the evaluation results for project 'tools-agent-test-39f3bbd0-may_5_2024' at:
https://smith.langchain.com/o/bbad3093-addd-5544-b896-000e8d59d9dd/datasets/4e0ed8b4-034a-49a4-90a7-cb9d6b212621/compare?selectedSessions=9c679346-5159-42b3-8e87-615e5d390846

View all tests for Dataset agent-qa-may_2_2024 at:
https://smith.langchain.com/o/bbad3093-addd-5544-b896-000e8d59d9dd/datasets/4e0ed8b4-034a-49a4-90a7-cb9d6b212621
[>                                                 ] 0/5

Chain failed for example 6958fff2-2c6d-468e-8a65-29d3ee0855ca with inputs {'input': 'When did langchain first announce the hub?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-13_4'.
Chain failed for example 7aff3dbb-e2b1-45c1-b23c-40fd6484e39d with inputs {'input': 'What is the langsmith cookbook?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-13_3'.


[----------------------------->                    ] 3/5

Chain failed for example 0beffd64-3869-4df0-a901-adc95d75b88b with inputs {'input': 'When was Llama-v2 released?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-13_2'.
Chain failed for example 114ebe74-69b8-41cd-9324-d9cd230b44c1 with inputs {'input': 'What is LangChain?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-13_0'.
Chain failed for example a308de7a-33fd-43e4-a351-3f18d12197a4 with inputs {'input': "What's LangSmith?"}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-13_1'.


[------------------------------------------------->] 5/5

Error running batch evaluator <function max_pred_length at 0x000001EDFF687D30>: 'NoneType' object is not subscriptable


Unnamed: 0,error,execution_time,run_id
count,5,5.0,5
unique,5,,5
top,There is no current event loop in thread 'Thre...,,d9b91210-25c0-43f5-af09-82d7b740733c
freq,1,,1
mean,,23.658641,
std,,0.177201,
min,,23.429328,
25%,,23.58733,
50%,,23.638329,
75%,,23.729615,
