**Make sure you load the API keys for cloud providers!**

You can set your environment keys yourself or use a script. Please note that since keys are private, they are not included in the repository.

In [1]:
# Load the API keys into the environment before any provider client is created.
import os
import sys

# Put the parent directory at the front of the import path so that
# `from config import ...` below can resolve against it.
sys.path.insert(0, os.path.abspath('..'))

# set_environment() supplies the keys - as explained early in chapter 2.
from config import set_environment

set_environment()

In [16]:
import os

# LangSmith tracing settings. None of these three values is a secret, so they
# are set inline.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "My Project"
os.environ['LANGSMITH_ENDPOINT'] = "https://api.smith.langchain.com"

# The API key is deliberately NOT assigned here: a literal in the notebook would
# be committed with the file and would overwrite any key that is already present
# in the environment. Provide LANGSMITH_API_KEY through the environment instead.
if not os.environ.get("LANGSMITH_API_KEY"):
    print("LANGSMITH_API_KEY is not set - export it before running the traced cells below.")

In [10]:
from langchain_openai import ChatOpenAI

# Tracing smoke test: a single chat-model call made with the default
# ChatOpenAI settings, followed by a pointer to where the run can be inspected.
llm = ChatOpenAI()
response = llm.invoke("Hello, world!")

print(
    f"Model response: {response.content}",
    "\nThis run has been logged to LangSmith.",
    "You can view it in the LangSmith UI: https://smith.langchain.com",
    sep="\n",
)

Model response: Hello! How can I assist you today?

This run has been logged to LangSmith.
You can view it in the LangSmith UI: https://smith.langchain.com


# Creating an evaluation dataset

In [18]:
from langsmith import Client
# No arguments are passed, so the client presumably takes its endpoint and API
# key from the environment variables set earlier - TODO confirm.
client = Client()

# Sample financial examples
# Each entry pairs the "inputs" handed to the system under test with the
# reference "outputs" it is evaluated against:
#   inputs.question        - the user question
#   inputs.context_needed  - topic tags for the context the answer relies on
#   outputs.answer         - the reference answer text
#   outputs.key_points     - short phrases summarising the reference answer
#   outputs.documents      - names of the source documents for the answer
financial_examples = [
    {
        "inputs": {
            "question": "What are the tax implications of early 401(k) withdrawal?",
            "context_needed": ["retirement", "taxation", "penalties"]
        },
        "outputs": {
            "answer": "Early withdrawals from a 401(k) typically incur a 10% penalty if you're under 59½ years old, in addition to regular income taxes. However, certain hardship withdrawals may qualify for penalty exemptions.",
            "key_points": ["10% penalty", "income tax", "hardship exemptions"],
            "documents": ["IRS publication 575", "Retirement plan guidelines"]
        }
    },
    {
        "inputs": {
            "question": "How does dollar-cost averaging compare to lump-sum investing?",
            "context_needed": ["investment strategy", "risk management", "market timing"]
        },
        "outputs": {
            "answer": "Dollar-cost averaging spreads investments over time to reduce timing risk, while lump-sum investing typically outperforms in rising markets due to longer market exposure. DCA may provide psychological benefits through reduced volatility exposure.",
            "key_points": ["timing risk", "market exposure", "psychological benefits"],
            "documents": ["Investment strategy comparisons", "Market timing research"]
        }
    }
]

# Create the dataset in LangSmith, or reuse it when it already exists.
# Creating a dataset under a name that is already taken fails, so without the
# existence check this cell cannot be re-run (e.g. on Restart & Run All).
# Reusing the dataset also avoids appending the same examples a second time.
dataset_name = "Financial Advisory RAG Evaluation"
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f"Reusing existing evaluation dataset '{dataset_name}'")
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluation dataset for financial advisory RAG systems covering retirement, investments, and tax planning."
    )

    # Add examples to the newly created dataset
    for example in financial_examples:
        client.create_example(
            inputs=example["inputs"],
            outputs=example["outputs"],
            dataset_id=dataset.id
        )
    print(f"Created evaluation dataset with {len(financial_examples)} examples")

Created evaluation dataset with 2 examples


In [None]:
# Define evaluation configuration
from langchain.smith import RunEvalConfig

# Evaluation criteria specific to RAG systems.
# RunEvalConfig has no `LLM` evaluator. LLM-graded custom criteria are
# configured with `RunEvalConfig.Criteria` (graded without a reference) or
# `RunEvalConfig.LabeledCriteria` (graded against the reference answer).
evaluation_config = RunEvalConfig(
    evaluators=[
        # Correctness: compare the response to the reference answer
        RunEvalConfig.LabeledCriteria(
            criteria={
                "factual_accuracy": "Does the response contain only factually correct information consistent with the reference answer?"
            }
        ),
        # Groundedness: ensure the response is supported by retrieved context
        RunEvalConfig.Criteria(
            criteria={
                "groundedness": "Is the response fully supported by the retrieved documents without introducing unsupported information?"
            }
        ),
        # Retrieval quality: assess relevance of retrieved documents
        RunEvalConfig.Criteria(
            criteria={
                "retrieval_relevance": "Are the retrieved documents relevant to answering the question?"
            }
        ),
    ],
    # The dataset examples carry several input and output fields, so name the
    # ones the evaluators should read as the question and the reference answer.
    input_key="question",
    reference_key="answer",
)


## Function to construct your RAG chain (placeholder)

In [None]:
def construct_chain():
    """Placeholder factory for the RAG chain under evaluation.

    Replace the body with code that builds and returns your actual RAG
    implementation (for example: ``return RAGChain(...)``). Until that is
    done, the function returns ``None``.
    """
    return None


## Run evaluation on dataset

In [None]:
from langchain.smith import run_on_dataset

# Evaluate whatever construct_chain builds against the named dataset, using the
# evaluators declared in evaluation_config; the outcome is kept in `results`.
results = run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=construct_chain,
    dataset_name=dataset_name,
    client=client,
)

# Insurance Claim Extraction Evaluation Example

# HuggingFace evaluation for code generation

In [21]:
import os

# Opt-in flag for the Hugging Face `code_eval` metric used in the next cell,
# which scores candidate programs by running them against test assertions.
os.environ.update(HF_ALLOW_CODE_EVAL="1")

In [22]:
from datasets import load_dataset
from evaluate import load
from langchain_core.messages import HumanMessage

# Load the HumanEval test split and the code-execution metric.
human_eval = load_dataset("openai_humaneval", split="test")
code_eval_metric = load("code_eval")

# One toy problem: its test is a single assertion, and it has two candidate
# solutions - the first multiplies (fails the assertion), the second adds.
test_cases = ["assert add(2,3)==5"]
candidates = [
    [
        "def add(a,b): return a*b",
        "def add(a, b): return a+b",
    ]
]

# Score the candidates at k=1 and k=2 and show the pass@k summary.
pass_at_k, results = code_eval_metric.compute(
    predictions=candidates,
    references=test_cases,
    k=[1, 2],
)
print(pass_at_k)


{'pass@1': 0.5, 'pass@2': 1.0}
