# Experiments

### Setup

In [2]:
!pip install --quiet -U langchain-google-genai langgraph langgraph-sdk langgraph-checkpoint-sqlite langsmith langchain-community langchain-core
!pip install --quiet notebook python-dotenv lxml scikit-learn pandas pyarrow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.5/378.5 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.0/444.0 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install --quiet python-dotenv

In [4]:
from dotenv import load_dotenv
load_dotenv(".env")

True

In [None]:
# You can set them inline
import os
os.environ["OPENAI_API_KEY"] = ""
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "langsmith-academy"

In [None]:
# Or you can use a .env file
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

Here is the RAG Application that we've been working with throughout this course

In [20]:
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.vectorstores import SKLearnVectorStore
# from langchain_openai import OpenAIEmbeddings
from langsmith import traceable
# from openai import OpenAI
from typing import List
import nest_asyncio

# TODO: Configure this model!
MODEL_NAME = "gemini-2.5-flash" # "gemini-2.5-flash-lite"
MODEL_PROVIDER = "google"
APP_VERSION = 1.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the latest question in the conversation.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
"""

import os
from google import genai
# openai_client = OpenAI()
g_client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))



from langchain_community.embeddings import HuggingFaceEmbeddings

def get_vector_db_retriever():
    """
    Creates or loads a scikit-learn based vector store retriever.

    This function replaces OpenAIEmbeddings with a local, open-source model
    from Hugging Face (sentence-transformers/all-MiniLM-L6-v2) for generating
    document embeddings.

    Returns:
        A retriever object for querying the vector store.
    """
    persist_path = os.path.join(tempfile.gettempdir(), "union_local.parquet")

    # Initialize a local, open-source embedding model from Hugging Face.
    # This model runs on your machine and does not require an API key.
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {'device': 'cpu'} # Use CPU for embedding
    encode_kwargs = {'normalize_embeddings': False}
    embd = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # If vector store exists, then load it
    if os.path.exists(persist_path):
        print(f"Loading existing vector store from: {persist_path}")
        vectorstore = SKLearnVectorStore(
            embedding=embd,
            persist_path=persist_path,
            serializer="parquet"
        )
        # lambda_mult=0 is used for Maximal Marginal Relevance (MMR) search.
        # It effectively disables MMR and performs a standard similarity search.
        return vectorstore.as_retriever(lambda_mult=0)

    # Otherwise, index LangSmith documents and create a new vector store
    print("No existing vector store found. Indexing documents...")
    ls_docs_sitemap_loader = SitemapLoader(
        web_path="https://docs.smith.langchain.com/sitemap.xml",
        continue_on_failure=True,
        # Optional: Filter URLs to only include relevant documentation pages
        # filter_urls=["https://docs.smith.langchain.com/"]
    )

    # Set a custom user-agent to be respectful when scraping
    ls_docs_sitemap_loader.headers = {
        "User-Agent": "LocalVectorDBBuilder/1.0 (https://example.com/bot-info)"
    }
    ls_docs = ls_docs_sitemap_loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=100 # Increased overlap for better context
    )
    doc_splits = text_splitter.split_documents(ls_docs)

    print(f"Created {len(doc_splits)} document splits. Creating vector store...")

    vectorstore = SKLearnVectorStore.from_documents(
        documents=doc_splits,
        embedding=embd,
        persist_path=persist_path,
        serializer="parquet"
    )

    print(f"Persisting vector store to: {persist_path}")
    vectorstore.persist()
    return vectorstore.as_retriever(lambda_mult=0)

nest_asyncio.apply()
retriever = get_vector_db_retriever()

"""
retrieve_documents
- Returns documents fetched from a vectorstore based on the user's question
"""
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return retriever.invoke(question)

"""
generate_response
- Calls `call_openai` to generate a model response after formatting inputs
"""
@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    # messages = [
    #     {
    #         "role": "system",
    #         "content": RAG_SYSTEM_PROMPT
    #     },
    #     {
    #         "role": "user",
    #         "content": f"Context: {formatted_docs} \n\n Question: {question}"
    #     }
    # ]
    messages = [
                  {
                      "role": "user",
                      "parts": [
                          {"text": RAG_SYSTEM_PROMPT},
                          {"text": f"Context: {formatted_docs} \n\n Question: {question}"}
                      ]
                  }
              ]
    return call_gemini(messages)

"""
call_gemini
- Returns the chat completion output from OpenAI
"""
@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_gemini(messages: List[dict]) -> str:
    # return openai_client.chat.completions.create(
    #     model=MODEL_NAME,
    #     messages=messages,
    # )
    return g_client.models.generate_content(
    model=MODEL_NAME, contents=messages
    )

"""
langsmith_rag
- Calls `retrieve_documents` to fetch documents
- Calls `generate_response` to generate a response based on the fetched documents
- Returns the model response
"""
@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    # return response.choices[0].message.content
    return response.candidates[0].content.parts[0].text


Loading existing vector store from: /tmp/union_local.parquet


### Experiment

Here is a code snippet that should look similar to what you see from the starter code!

There are a few important components here.

1. We have defined an Evaluator
2. We pipe our dataset examples (dict) to the shape of input that our function `langsmith_rag` takes (str) using a target function

In [21]:
from langsmith import evaluate, Client

ls_client = Client()
dataset_name = "RAG Application Golden Dataset"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gemini-2.5-flash-lite"
)

View the evaluation results for experiment: 'gemini-2.5-flash-lite-bec18b58' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/79400911-becd-44c0-87c1-3e13372ccf15/compare?selectedSessions=8e168f52-b85e-4bfc-a6fe-7f9c99bf2932




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Does LangSmith support offline evaluation?,"LangSmith supports online evaluations, which p...",,"Yes, LangSmith supports offline evaluation thr...",1,0.552721,0152c119-1cf5-4150-ac52-4fee77387f9a,b4686225-0787-4a26-975d-c7f2195c9b7e
1,Can LangSmith be used for finetuning and model...,"Based on the provided context, LangSmith is a ...",,"Yes, LangSmith can be used for fine-tuning and...",1,0.539152,1462cb35-590d-4e93-9258-a68b6c1fbbe0,24442a6f-7893-4ab5-afc1-011e145b6539
2,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluations, wh...",,"Yes, LangSmith supports online evaluation as a...",1,0.454146,6b3a736a-7a0e-48d7-a5ad-e752cc334d3f,0a433ca1-c7d0-4890-afeb-d59c5a766b9d
3,How can I trace with the @traceable decorator?,You can trace with the `@traceable` decorator ...,,To trace with the @traceable decorator in Pyth...,1,0.625389,b2822d2a-ae86-4f28-9d98-5777f097697d,fb23bf91-5b09-420b-9ee6-6a0ddc594560
4,Is there a Javascript Langsmith SDK?,"I am sorry, but I cannot answer your question....",,"Yes, there is a Javascript Langsmith SDK.",0,0.470025,9cad5240-b862-4a73-8aa6-1b4e1dccf713,22f12a0c-16bc-44be-ac88-cae6b0341b8c
5,What testing capabilities does LangSmith have?,LangSmith allows you to test prompts and run e...,,LangSmith offers capabilities for creating dat...,1,0.481816,1259cfcc-05ca-43f2-a82c-813e4064881d,b2c0ae47-a1e8-4fca-b350-a19216da5e26
6,How do I pass metadata in with @traceable?,You can pass metadata with the `@traceable` de...,,You can pass metadata with the @traceable deco...,1,0.687705,17faded4-3541-4b1c-8743-d6bcca435958,9fc41046-be08-4066-8040-c82b5db6f916
7,How do I create user feedback with the LangSmi...,The provided context does not contain informat...,,To create user feedback with the LangSmith SDK...,1,0.500749,2b5dc92e-e546-4970-9c43-725d0c4078d9,88e3763e-1a57-4539-a46c-9cf5acb98228
8,How do I set up tracing to LangSmith if I'm us...,To set up tracing with LangSmith when using La...,,To set up tracing to LangSmith while using Lan...,1,0.56428,7c858c38-14b0-4503-a836-b44efe38513c,53efe867-bc1d-4fcf-bfd8-10bd8998df8a
9,What is LangSmith used for in three sentences?,LangSmith is used for storing and processing t...,,LangSmith is a platform designed for the devel...,1,0.400476,b892ff56-efa7-4773-8b6d-68d83685036a,aaa1e1b6-d81d-465f-85f8-0a5ebc79c36b


### Modifying your Application

Now, let's change our model to gpt-35-turbo and see how it performs!

Make this change, and then run this code snippet!

In [17]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gemini-2.5-flash"
)

View the evaluation results for experiment: 'gemini-2.5-flash-85c27fa6' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/79400911-becd-44c0-87c1-3e13372ccf15/compare?selectedSessions=d77a1535-6098-4149-b607-88134f8ef2ed




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Does LangSmith support offline evaluation?,LangSmith does not explicitly mention support ...,,"Yes, LangSmith supports offline evaluation thr...",1,0.725592,0152c119-1cf5-4150-ac52-4fee77387f9a,d89513d9-41a9-41e8-8891-a47df3c563cb
1,Can LangSmith be used for finetuning and model...,"Based on the provided context, LangSmith is a ...",,"Yes, LangSmith can be used for fine-tuning and...",1,0.529775,1462cb35-590d-4e93-9258-a68b6c1fbbe0,3c29109d-3553-4e7a-af11-4a1fee9d6a9b
2,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluations, wh...",,"Yes, LangSmith supports online evaluation as a...",1,0.573426,6b3a736a-7a0e-48d7-a5ad-e752cc334d3f,41096b17-994d-4550-88be-2f89ae73cda6
3,How can I trace with the @traceable decorator?,You can trace with the `@traceable` decorator ...,,To trace with the @traceable decorator in Pyth...,1,0.583375,b2822d2a-ae86-4f28-9d98-5777f097697d,beca6e3e-1da5-4fac-886f-82ce8a4914d5
4,Is there a Javascript Langsmith SDK?,"I apologize, but the provided context does not...",,"Yes, there is a Javascript Langsmith SDK.",0,0.418032,9cad5240-b862-4a73-8aa6-1b4e1dccf713,8a029c9b-46a7-4305-960b-7402bd256563
5,What testing capabilities does LangSmith have?,LangSmith focuses on evaluating and tracing ap...,,LangSmith offers capabilities for creating dat...,1,0.483213,1259cfcc-05ca-43f2-a82c-813e4064881d,01a393b9-d03d-4e45-b5be-b11b2e5e9864
6,How do I pass metadata in with @traceable?,You can pass metadata to the `@traceable` deco...,,You can pass metadata with the @traceable deco...,1,0.555058,17faded4-3541-4b1c-8743-d6bcca435958,d82486eb-26c7-4075-a729-6dcc5d166285
7,How do I create user feedback with the LangSmi...,To create user feedback with the LangSmith SDK...,,To create user feedback with the LangSmith SDK...,1,0.654494,2b5dc92e-e546-4970-9c43-725d0c4078d9,04748b23-ca09-4195-8cd2-e85ba8de6255
8,How do I set up tracing to LangSmith if I'm us...,"To set up tracing to LangSmith with LangChain,...",,To set up tracing to LangSmith while using Lan...,1,0.457805,7c858c38-14b0-4503-a836-b44efe38513c,d68645a7-e26d-4f4d-ac5c-c1034e20b74a
9,What is LangSmith used for in three sentences?,"LangSmith is a platform for testing, debugging...",,LangSmith is a platform designed for the devel...,1,0.439613,b892ff56-efa7-4773-8b6d-68d83685036a,7a1954be-ab4e-4d16-9f15-3ae59b39da54


### Running over Different pieces of Data

##### Dataset Version

You can execute an experiment on a specific version of a dataset in the sdk by using the `as_of` parameter in `list_examples`

Let's try running on just our initial dataset.

In [26]:
evaluate(
    target_function,
    data=ls_client.list_examples(dataset_name=dataset_name, as_of="initial dataset"),   # We use as_of to specify a version
    evaluators=[is_concise_enough],
    experiment_prefix="initial dataset version"
)

View the evaluation results for experiment: 'initial dataset version-3b4a0db3' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/79400911-becd-44c0-87c1-3e13372ccf15/compare?selectedSessions=800cb812-ced8-4065-9578-efa6c508f469




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Does LangSmith support offline evaluation?,"Yes, LangSmith supports offline evaluation. Yo...",,"Yes, LangSmith supports offline evaluation thr...",1,0.776882,0152c119-1cf5-4150-ac52-4fee77387f9a,44870608-c853-4778-bdec-90cff8c1e3b2
1,Can LangSmith be used for finetuning and model...,"Based on the provided context, LangSmith is a ...",,"Yes, LangSmith can be used for fine-tuning and...",1,0.606937,1462cb35-590d-4e93-9258-a68b6c1fbbe0,83e1df6d-5bb1-4e54-82be-d6b2dc4e061e
2,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluations, wh...",,"Yes, LangSmith supports online evaluation as a...",1,0.547077,6b3a736a-7a0e-48d7-a5ad-e752cc334d3f,c8bfb64e-64ec-42e7-a0b8-d9c87eab1130
3,How can I trace with the @traceable decorator?,You can trace with the `@traceable` decorator ...,,To trace with the @traceable decorator in Pyth...,1,0.567268,b2822d2a-ae86-4f28-9d98-5777f097697d,590fa2ed-f224-4208-a0b1-d9f3771038db
4,Is there a Javascript Langsmith SDK?,I cannot answer your question. The provided te...,,"Yes, there is a Javascript Langsmith SDK.",0,0.646441,9cad5240-b862-4a73-8aa6-1b4e1dccf713,4addbf40-aa7f-4cd4-b95b-373e9fb4c722
5,What testing capabilities does LangSmith have?,LangSmith allows you to test prompts and run e...,,LangSmith offers capabilities for creating dat...,1,0.640603,1259cfcc-05ca-43f2-a82c-813e4064881d,3e49d620-e7ed-490c-a451-c588932f46e0
6,How do I pass metadata in with @traceable?,You can pass metadata to the `@traceable` deco...,,You can pass metadata with the @traceable deco...,1,0.614621,17faded4-3541-4b1c-8743-d6bcca435958,a95e0bb8-f95d-4555-87f5-eafd0e686e8b
7,How do I create user feedback with the LangSmi...,The provided context does not contain informat...,,To create user feedback with the LangSmith SDK...,1,0.427047,2b5dc92e-e546-4970-9c43-725d0c4078d9,dbafadbb-cea5-4ed4-8323-508506192d2b
8,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith when using Lang...,,To set up tracing to LangSmith while using Lan...,1,0.500855,7c858c38-14b0-4503-a836-b44efe38513c,8cbf0b63-a8f7-4d1e-9074-bc999fcd9070
9,What is LangSmith used for in three sentences?,LangSmith is used to store and process trace d...,,LangSmith is a platform designed for the devel...,1,0.504577,b892ff56-efa7-4773-8b6d-68d83685036a,acfd8e3e-21c3-420a-9844-1114972a5f49


##### Dataset Split

You can run an experiment on a specific split of your dataset, let's try running on the Crucial Examples split.

In [24]:
evaluate(
    target_function,
    data=ls_client.list_examples(dataset_name=dataset_name, splits=["Crucial Examples"]),  # We pass in a list of Splits
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial Examples split"
)

View the evaluation results for experiment: 'Crucial Examples split-4f5014f5' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/79400911-becd-44c0-87c1-3e13372ccf15/compare?selectedSessions=1d51fa94-ec71-4e3f-b681-8c240e8d2d59




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I create user feedback with the LangSmi...,The provided context does not contain informat...,,To create user feedback with the LangSmith SDK...,1,0.727501,2b5dc92e-e546-4970-9c43-725d0c4078d9,456d96f5-0afd-4ab6-bd6e-95a3366054d8
1,How do I set up tracing to LangSmith if I'm us...,To set up tracing to LangSmith when using Lang...,,To set up tracing to LangSmith while using Lan...,0,1.065922,7c858c38-14b0-4503-a836-b44efe38513c,ad1da197-bed2-4489-a199-f3454a02ef05
2,Is there a Javascript Langsmith SDK?,"I apologize, but the provided context does not...",,"Yes, there is a Javascript Langsmith SDK.",0,0.658622,9cad5240-b862-4a73-8aa6-1b4e1dccf713,42aee866-0dd4-4919-b567-25c25a48730b
3,What is LangSmith used for in three sentences?,LangSmith is used for storing and processing t...,,LangSmith is a platform designed for the devel...,1,0.406884,b892ff56-efa7-4773-8b6d-68d83685036a,206257d8-d726-4601-84ab-517f21d22cba


##### Specific Data Points

You can specify individual data points to run an experiment over as well

In [27]:
evaluate(
    target_function,
    data=ls_client.list_examples(
        dataset_name=dataset_name,
        example_ids=[   # We pass in a specific list of example_ids
            # TODO: You will need to paste in your own example ids for this to work!
            "9cad5240-b862-4a73-8aa6-1b4e1dccf713",
            "0152c119-1cf5-4150-ac52-4fee77387f9a"
        ]
    ),
    evaluators=[is_concise_enough],
    experiment_prefix="two specific example ids"
)

View the evaluation results for experiment: 'two specific example ids-857c9f57' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/79400911-becd-44c0-87c1-3e13372ccf15/compare?selectedSessions=8f440c3e-c93d-4251-b9ee-60f708d60306




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Does LangSmith support offline evaluation?,"No, LangSmith primarily supports online evalua...",,"Yes, LangSmith supports offline evaluation thr...",1,0.776592,0152c119-1cf5-4150-ac52-4fee77387f9a,953910e9-9aee-4201-84cd-f5ca0142e89a
1,Is there a Javascript Langsmith SDK?,"I'm sorry, but the provided context does not c...",,"Yes, there is a Javascript Langsmith SDK.",0,0.50922,9cad5240-b862-4a73-8aa6-1b4e1dccf713,5051325a-18c6-4302-accf-90ed2e6b09e5


### Other Parameters

##### Repetitions

You can run an experiment several times to make sure you have consistent results

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="two repetitions",
    num_repetitions=2   # This field defaults to 1
)

##### Concurrency
You can also kick off concurrent threads of execution to make your experiments finish faster!

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="concurrency",
    max_concurrency=3,  # This defaults to None, so this is an improvement!
)

##### Metadata

You can (and should) add metadata to your experiments, to make them easier to find in the UI

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="metadata added",
    metadata={  # We can pass custom metadata for the experiment, such as the model name
        "model_name": MODEL_NAME
    }
)