In [4]:
from langsmith import traceable, wrappers
from openai import Client

openai = wrappers.wrap_openai(Client())

@traceable
def label_text(text):
    messages = [
        {
            "role": "system",
            "content": "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.",
        },
        {"role": "user", "content": text},
    ]
    result = openai.chat.completions.create(
        messages=messages, model="gpt-3.5-turbo", temperature=0
    )
    return result.choices[0].message.content

In [5]:
from langsmith import Client

client = Client()

# Create a dataset
examples = [
    ("Shut up, idiot", "Toxic"),
    ("You're a wonderful person", "Not toxic"),
    ("This is the worst thing ever", "Toxic"),
    ("I had a great day today", "Not toxic"),
    ("Nobody likes you", "Toxic"),
    ("This is unacceptable. I want to speak to the manager.", "Not toxic"),
]

dataset_name = "Toxic Queries"
dataset = client.create_dataset(dataset_name=dataset_name)
inputs, outputs = zip(
    *[({"text": text}, {"label": label}) for text, label in examples]
)
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

HTTPError: [Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets] {"detail":"Dataset with this name already exists."}

In [6]:
from langsmith.schemas import Example, Run

def correct_label(root_run: Run, example: Example) -> dict:
    score = root_run.outputs.get("output") == example.outputs.get("label")
    return {"score": int(score), "key": "correct_label"}

In [8]:
from langsmith.evaluation import evaluate
# from app import label_text
dataset_name = "Toxic Queries"

results = evaluate(
    lambda inputs: label_text(inputs["text"]),
    data=dataset_name,
    evaluators=[correct_label],
    experiment_prefix="Toxic Queries",
    description="Testing the baseline system.",  # optional
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Toxic Queries-f9fac829' at:
https://smith.langchain.com/o/b47753b3-6d84-5fa3-b240-64adbbe1c7be/datasets/beb87acc-ecc1-479d-9c75-7820debf98a7/compare?selectedSessions=21be7e58-c913-4f45-a522-c4dbd0d6671d




6it [00:04,  1.35it/s]


In [9]:
from langsmith.evaluation import evaluate
# from app import label_text
dataset_name = "Toxic Queries"

results = evaluate(
    lambda inputs: label_text(inputs["text"]),
    data=dataset_name,
    evaluators=[correct_label],
    experiment_prefix="Toxic Queries with repeatation",
    description="Testing the baseline system with three responses",  # optional
    num_repetitions=3,)

View the evaluation results for experiment: 'Toxic Queries with repeatation-edd2abcc' at:
https://smith.langchain.com/o/b47753b3-6d84-5fa3-b240-64adbbe1c7be/datasets/beb87acc-ecc1-479d-9c75-7820debf98a7/compare?selectedSessions=0fff2793-9ae3-44a1-be24-8c6656a32f88




18it [00:03,  4.75it/s]


In [7]:
from langsmith.schemas import Example, Run

def summary_eval(runs: list[Run], examples: list[Example]) -> dict:
    correct = 0
    for i, run in enumerate(runs):
        if run.outputs["output"] == examples[i].outputs["label"]:
            correct += 1
    if correct / len(runs) > 0.5:
        return {"key": "pass", "score": True}
    else:
        return {"key": "pass", "score": False}

In [9]:
results = evaluate(
    lambda inputs: label_text(inputs["text"]),
    data=dataset_name,
    evaluators=[correct_label],
    experiment_prefix="Toxic Queries with summary and toxix label with three repatations",
    num_repetitions=3
)

View the evaluation results for experiment: 'Toxic Queries with summary and toxix label with three repatations-85c6afc2' at:
https://smith.langchain.com/o/b47753b3-6d84-5fa3-b240-64adbbe1c7be/datasets/ebd5b144-7f60-4109-b36c-b0fe8f5a2cca/compare?selectedSessions=a094f573-88fd-44d6-81b4-4fa40bce1c10




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator summary_eval> on run 6cfc8af2-dcd0-4f02-a89a-2000e7e13918: AttributeError("'tuple' object has no attribute 'outputs'")
Traceback (most recent call last):
  File "c:\Users\shyams\Downloads\projects\rag with langsmith evaluation\.venv\Lib\site-packages\langsmith\evaluation\_runner.py", line 1231, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\shyams\Downloads\projects\rag with langsmith evaluation\.venv\Lib\site-packages\langsmith\evaluation\evaluator.py", line 278, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\shyams\Downloads\projects\rag with langsmith evaluation\.venv\Lib\site-packages\langsmith\run_helpers.py", line 568, in wrapper
    raise e
  File "c:\Users\shyams\Downloads\projects\rag with langsmith evaluation\.venv\Lib\site-packages\langsmith\run_helpers.py", line 565, in wrapper
    function_re

In [10]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt = ChatPromptTemplate.from_messages([
  ("system", "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't."),
  ("user", "{text}")
])
chat_model = ChatOpenAI()
output_parser = StrOutputParser()

chain = prompt | chat_model | output_parser

In [11]:
from langsmith.evaluation import evaluate

results = evaluate(
    chain.invoke,
    data=dataset_name,
    evaluators=[correct_label],
    experiment_prefix="Toxic Queries",
)

View the evaluation results for experiment: 'Toxic Queries-3d274b3f' at:
https://smith.langchain.com/o/b47753b3-6d84-5fa3-b240-64adbbe1c7be/datasets/ebd5b144-7f60-4109-b36c-b0fe8f5a2cca/compare?selectedSessions=e7714f5d-bdc0-4507-b9a0-e528a9cbc1a6




6it [00:03,  1.95it/s]


In [14]:
import openai
import wikipedia as wp

from langsmith import traceable
from langsmith.wrappers import wrap_openai

openai = wrap_openai(openai.Client())

@traceable
def generate_wiki_search(question):
    messages = [
        {"role": "system", "content": "Generate a search query to pass into wikipedia to answer the user's question. Return only the search query and nothing more. This will passed in directly to the wikipedia search engine."},
        {"role": "user", "content": question}
    ]
    result = openai.chat.completions.create(messages=messages, model="gpt-3.5-turbo", temperature=0)
    return result.choices[0].message.content

@traceable(run_type="retriever")
def retrieve(query):
    results = []
    for term in wp.search(query, results = 10):
        try:
            page = wp.page(term, auto_suggest=False)
            results.append({
                "page_content": page.summary,
                "type": "Document",
                "metadata": {"url": page.url}
            })
        except wp.DisambiguationError:
            pass
        if len(results) >= 2:
            return results

@traceable
def generate_answer(question, context):
    messages = [
        {"role": "system", "content": f"Answer the user's question based ONLY on the content below:\n\n{context}"},
        {"role": "user", "content": question}
    ]
    result = openai.chat.completions.create(messages=messages, model="gpt-3.5-turbo", temperature=0)
    return result.choices[0].message.content

@traceable
def rag_pipeline(question):
    query = generate_wiki_search(question)
    context = "\n\n".join([doc["page_content"] for doc in retrieve(query)])
    answer = generate_answer(question, context)
    return answer

In [15]:
from langsmith import Client

client = Client()

examples = [
    ("What is LangChain?", "LangChain is an open-source framework for building applications using large language models."),
    ("What is LangSmith?", "LangSmith is an observability and evaluation tool for LLM products, built by LangChain Inc.")
]

dataset_name = "Wikipedia RAG"
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    inputs, outputs = zip(
        *[({"input": input}, {"expected": expected}) for input, expected in examples]
    )
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

In [16]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from langsmith.schemas import Example, Run
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

def document_relevance(root_run: Run, example: Example) -> dict:
    """
    A very simple evaluator that checks to see if the input of the retrieval step exists
    in the retrieved docs.
    """
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline")
    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve")
    page_contents = "\n\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"])
    score = retrieve_run.inputs["query"] in page_contents
    return {"key": "simple_document_relevance", "score": score}

def hallucination(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks to see the answer is grounded in the documents
    """
    # Get documents and answer
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "rag_pipeline")
    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve")
    page_contents = "\n\n".join(doc["page_content"] for doc in retrieve_run.outputs["output"])
    generation = rag_pipeline_run.outputs["output"]

    # Data model
    class GradeHallucinations(BaseModel):
        """Binary score for hallucination present in generation answer."""

        binary_score: int = Field(description="Answer is grounded in the facts, 1 or 0")

    # LLM with function call
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
    structured_llm_grader = llm.with_structured_output(GradeHallucinations)

    # Prompt
    system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
        Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts."""
    hallucination_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
        ]
    )

    hallucination_grader = hallucination_prompt | structured_llm_grader
    score = hallucination_grader.invoke({"documents": page_contents, "generation": generation})
    return {"key": "answer_hallucination", "score": int(score.binary_score)}

In [17]:
from langsmith.evaluation import evaluate

experiment_results = evaluate(
    lambda inputs: rag_pipeline(inputs["input"]),
    data=dataset_name,
    evaluators=[document_relevance, hallucination],
    experiment_prefix="rag-wiki-oai"
)

View the evaluation results for experiment: 'rag-wiki-oai-81f3e1b2' at:
https://smith.langchain.com/o/b47753b3-6d84-5fa3-b240-64adbbe1c7be/datasets/b52af8f3-5d72-40d3-b961-68bce46a5ea8/compare?selectedSessions=8a1f9b78-0373-48d0-8769-dc03e0938864




2it [00:13,  6.94s/it]
