In [None]:
# Scratch cell: prints a fixed string (presumably a kernel smoke test — safe to delete).
print("Cool")

## Testing RAG Evaluation using LangSmith

### Indexing and Retrieval

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()

# List of URLs to load documents from
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Load documents from the URLs (each loader call returns a list of documents)
docs = [WebBaseLoader(url).load() for url in urls]
# Flatten the per-URL lists into a single list
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

# Add the document chunks to the "vector store" using OpenAIEmbeddings
vectorstore = InMemoryVectorStore.from_documents(
    documents=doc_splits,
    embedding=OpenAIEmbeddings(),
)

# With langchain we can easily turn any vector store into a retrieval component.
# The number of results has to be passed through `search_kwargs`: a bare `k=6`
# keyword is not a retriever field, so it would not change how many chunks are
# returned and the store's default would be used instead.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

### Generation

In [None]:
from langchain_openai import ChatOpenAI
from langsmith import traceable

# Generation model. NOTE(review): temperature=1 makes answers vary between runs,
# so evaluation scores are not exactly reproducible — confirm this is intended.
llm = ChatOpenAI(model="gpt-4o", temperature=1)

# Add decorator so this function is traced in LangSmith
@traceable()
def rag_bot(question: str) -> dict:
    """Answer ``question`` using chunks fetched from ``retriever``.

    Args:
        question: The user's question; used both as the retrieval query and
            as the user message sent to the model.

    Returns:
        A dict with ``"answer"`` (the model's reply text) and ``"documents"``
        (the retrieved documents, as returned by ``retriever.invoke``).
    """
    # LangChain retriever will be automatically traced
    docs = retriever.invoke(question)
    # Separate chunks with a blank line so the last word of one chunk is not
    # fused with the first word of the next; this matches how the evaluators
    # in this notebook join the same documents.
    docs_string = "\n\n".join(doc.page_content for doc in docs)
    instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions.
       Use the following source documents to answer the user's questions.
       If you don't know the answer, just say that you don't know.
       Use three sentences maximum and keep the answer concise.

Documents:
{docs_string}"""
    # langchain ChatModel will be automatically traced
    ai_msg = llm.invoke([
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    )
    return {"answer": ai_msg.content, "documents": docs}

### Dataset

In [None]:
from langsmith import Client

client = Client()

# Define the examples for the dataset
examples = [
    {
        "inputs": {"question": "How does the ReAct agent use self-reflection? "},
        "outputs": {"answer": "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs."},
    },
    {
        "inputs": {"question": "What are the types of biases that can arise with few-shot prompting?"},
        "outputs": {"answer": "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias."},
    },
    {
        "inputs": {"question": "What are five types of adversarial attacks?"},
        "outputs": {"answer": "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming."},
    },
]

# Create the dataset and examples in LangSmith.
# Re-running this cell must not fail or duplicate data: reuse the dataset when
# it already exists, and only upload the examples when it is first created.
dataset_name = "Lilian Weng Blogs Q&A"
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(
        dataset_id=dataset.id,
        examples=examples
    )

## Evaluation Metrics
### Correctness:
- Compares the final answer from the LLM against the reference answer in the **curated dataset**, scored on a scale of 0-1
### Relevance
- Compares the user's question with the final answer to check whether the answer addresses the question
- If it does, to what extent, scored on a scale of 0-1
### Groundedness
- Given the context retrieved by the RAG pipeline, is the answer supported by the documents, or is it making things up?
### Retrieval Relevance
- Are the retrieved documents relevant to the user's query?

In [None]:
### Correctness
from typing_extensions import Annotated, TypedDict

# Grade output schema for the correctness grader: a free-text explanation
# followed by a boolean verdict.
# NOTE(review): documented with `#` comments rather than a class docstring —
# a docstring may be forwarded to the model as the schema description; confirm
# before adding one.
class CorrectnessGrade(TypedDict):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt: system message for the correctness grader. It is sent to the
# model as-is, so its wording is part of the evaluator's behavior.
correctness_instructions = """You are a teacher grading a quiz. You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. (2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, with replies constrained to the
# CorrectnessGrade schema (strict JSON-schema structured output).
grader_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    CorrectnessGrade, method="json_schema", strict=True
)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a RAG answer for factual accuracy against the reference answer.

    Sends the question, the reference ("ground truth") answer and the generated
    answer to ``grader_llm`` and returns the ``correct`` field of its reply.
    """
    grading_request = "\n".join(
        (
            f"QUESTION: {inputs['question']}",
            f"GROUND TRUTH ANSWER: {reference_outputs['answer']}",
            f"STUDENT ANSWER: {outputs['answer']}",
        )
    )
    messages = [
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = grader_llm.invoke(messages)
    return verdict["correct"]

In [None]:
## Relevance: does the generated answer address the input question?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): documented with `#` comments rather than a class docstring —
# a docstring may be forwarded to the model as the schema description; confirm.
class RelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[
        bool, ..., "Provide the score on whether the answer addresses the question"
    ]

# Grade prompt: system message for the answer-relevance grader (sent to the
# model as-is, so its wording is part of the evaluator's behavior).
relevance_instructions = """You are a teacher grading a quiz. You will be given a QUESTION and a STUDENT ANSWER. Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to RelevanceGrade.
relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    RelevanceGrade, method="json_schema", strict=True
)

# Evaluator
def relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer addresses the question.

    Sends the question and the answer to ``relevance_llm`` and returns the
    ``relevant`` field of its reply.
    """
    grading_request = "\n".join(
        (
            f"QUESTION: {inputs['question']}",
            f"STUDENT ANSWER: {outputs['answer']}",
        )
    )
    messages = [
        {"role": "system", "content": relevance_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = relevance_llm.invoke(messages)
    return verdict["relevant"]

In [None]:
## Groundedness: is the answer based on the retrieved documents?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): the `grounded` field description talks about whether the answer
# "hallucinates", while the grader prompt defines True as "grounded" — the
# polarity of the description is ambiguous; confirm the model reads it as intended.
class GroundedGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    grounded: Annotated[
        bool, ..., "Provide the score on if the answer hallucinates from the documents"
    ]

# Grade prompt: system message for the groundedness grader (sent to the model
# as-is, so its wording is part of the evaluator's behavior).
grounded_instructions = """You are a teacher grading a quiz. You will be given FACTS and a STUDENT ANSWER. Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Grounded:
A grounded value of True means that the student's answer meets all of the criteria.
A grounded value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to GroundedGrade.
grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    GroundedGrade, method="json_schema", strict=True
)

# Evaluator
def groundedness(inputs: dict, outputs: dict) -> bool:
    """Grade whether the answer is supported by the retrieved documents.

    Joins the ``page_content`` of ``outputs["documents"]`` into a FACTS block,
    sends it with the answer to ``grounded_llm`` and returns the ``grounded``
    field of its reply. ``inputs`` is accepted but not read.
    """
    passages = [doc.page_content for doc in outputs["documents"]]
    facts = "\n\n".join(passages)
    grading_request = f"FACTS: {facts}\nSTUDENT ANSWER: {outputs['answer']}"
    messages = [
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = grounded_llm.invoke(messages)
    return verdict["grounded"]

In [None]:
## Retrieval relevance: are the retrieved documents relevant to the query?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): documented with `#` comments rather than a class docstring —
# a docstring may be forwarded to the model as the schema description; confirm.
class RetrievalRelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[
        bool,
        ...,
        "True if the retrieved documents are relevant to the question, False otherwise",
    ]

# Grade prompt: system message for the retrieval-relevance grader (sent to the
# model as-is, so its wording is part of the evaluator's behavior).
retrieval_relevance_instructions = """You are a teacher grading a quiz. You will be given a QUESTION and a set of FACTS provided by the student. Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met

Relevance:
A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
A relevance value of False means that the FACTS are completely unrelated to the QUESTION.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to
# RetrievalRelevanceGrade.
retrieval_relevance_llm = ChatOpenAI(
    model="gpt-4o", temperature=0
).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)

def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the retrieved documents relate to the question.

    Joins the ``page_content`` of ``outputs["documents"]`` into a FACTS block,
    sends it with the question to ``retrieval_relevance_llm`` and returns the
    ``relevant`` field of its reply.
    """
    passages = [doc.page_content for doc in outputs["documents"]]
    facts = "\n\n".join(passages)
    grading_request = f"FACTS: {facts}\nQUESTION: {inputs['question']}"
    messages = [
        {"role": "system", "content": retrieval_relevance_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = retrieval_relevance_llm.invoke(messages)
    return verdict["relevant"]

## Run the Experiment

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
def target(inputs: dict) -> dict:
    """Adapt a dataset example to ``rag_bot``.

    Pulls the ``"question"`` entry out of ``inputs`` and returns whatever
    ``rag_bot`` returns for it.
    """
    question = inputs["question"]
    return rag_bot(question)

# Run every example of the dataset named `dataset_name` through `target` and
# score the outputs with the four evaluators defined in the cells above.
# NOTE(review): the metadata label names gpt-4-0125-preview, but `llm` in this
# notebook is configured with model="gpt-4o" — confirm which label is intended.
experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness, groundedness, relevance, retrieval_relevance],
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "LCEL context, gpt-4-0125-preview"},
)

# Explore results locally as a dataframe if you have pandas installed
# experiment_results.to_pandas()

## Testing my own rag

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Question/answer pairs for the second dataset, in the
# {"inputs": {"question": ...}, "outputs": {"answer": ...}} shape that the next
# cell passes to `client.create_examples`.
# NOTE(review): this rebinds the name `examples` used by the earlier dataset cell.
examples = [
    {
        "inputs": {
            "question": "What is the main concept discussed in the article 'For a More Creative Brain Follow These 5 Steps'?"
        },
        "outputs": {
            "answer": "The main concept discussed in the article 'For a More Creative Brain Follow These 5 Steps' is the creative process and how it can be systematically approached through five steps. These steps are:\n\nGather New Material: Learn specific material related to your task and general material by exploring a wide range of concepts.\nWork Over the Materials: Examine and experiment with the learned material by looking at it from different angles.\nStep Away from the Problem: Take a break from the problem to do something else that excites and energizes you.\nLet Your Idea Return: Allow the idea to come back to you naturally with renewed insight and energy.\nShape and Develop Your Idea: Release your idea into the world, gather feedback, and adapt it as needed.\nThe article emphasizes that creativity involves connecting ideas and can be cultivated through practice and courage."
        },
    },
    {
        "inputs": {
            "question": "How does the story of Frederic Eugene Ives demonstrate the five stages of the creative process?"
        },
        "outputs": {
            "answer": "The story of Frederic Eugene Ives demonstrates the five stages of the creative process as follows:\n\nGather New Material: Ives would have gathered material by learning about existing printing technologies and the challenges faced by newspapers and printers in the 1870s, particularly the costly and time-consuming process of photoengraving.\n\nWork Over the Materials: He likely examined the problem of printing images from different angles, experimenting with various ideas and technologies to find a more efficient solution.\n\nStep Away from the Problem: Although not explicitly detailed in the story, stepping away from the problem would involve Ives taking breaks or engaging in other activities, allowing his mind to subconsciously process the information.\n\nLet Your Idea Return: Ives' breakthrough likely came as a flash of insight, where he conceived the method of 'halftone printing,' which involved breaking a photograph down into a series of tiny dots that blend together to create an image.\n\nShape and Develop Your Idea: Ives would have tested his halftone printing method, refined it based on feedback, and adapted it for practical use, ultimately revolutionizing the printing process by making it faster and more cost-effective.\n\nThis story illustrates how Ives applied the creative process to solve a significant problem in the printing industry."
        },
    },
    {
        "inputs": {
            "question": "What are the five stages of the creative process and how can they be applied in modern problem-solving?"
        },
        "outputs": {
            "answer": "The five stages of the creative process and their application in modern problem-solving are as follows:\n\nGather New Material:\n\nApplication: Start by researching and collecting information relevant to the problem you're facing. This includes both specific data related to the issue and broader knowledge that might provide new perspectives. In modern settings, this could involve reading articles, attending workshops, or exploring related fields.\n\nWork Over the Materials:\n\nApplication: Analyze and experiment with the information you've gathered. Look at the problem from different angles and try to connect disparate ideas. Use brainstorming sessions or mind mapping to explore various possibilities and combinations.\n\nStep Away from the Problem:\n\nApplication: Take a break from actively thinking about the problem. Engage in activities that relax or energize you, such as exercising, meditating, or pursuing a hobby. This allows your subconscious mind to process the information and can lead to unexpected insights.\n\nLet Your Idea Return:\n\nApplication: After stepping away, allow the solution or idea to come back to you naturally. This often happens when you're not actively trying to solve the problem, such as during a walk or while showering. Be open to these moments of insight and ready to capture them.\n\nShape and Develop Your Idea:\n\nApplication: Once you have an idea, test it in the real world. Gather feedback from others, adapt your approach based on this input, and refine your solution. This iterative process helps in developing a robust and effective solution to the problem.\n\nBy following these stages, individuals and organizations can approach modern problems with a structured yet flexible mindset, fostering innovation and effective problem-solving."
        },
    },


     {
        "inputs": {
            "question": "How can saying 'no' actually make you more successful rather than less involved?"
        },
        "outputs": {
            "answer": "Saying 'no' can make you more successful by helping you focus on what truly matters and protecting your most valuable asset: your time. Here’s how it works:\n\nFocus on Priorities: By saying no to distractions and tasks that don't align with your goals, you can concentrate your efforts on activities that drive success. This focused approach allows you to invest your time and energy in what truly matters.\n\nPrevent Overcommitment: Saying no helps prevent overcommitment, which can lead to stress and burnout. By managing your commitments wisely, you maintain the capacity to perform at your best.\n\nEnhance Productivity: Eliminating unnecessary tasks is often more effective than optimizing how you perform them. This means that saying no can lead to greater productivity by ensuring you only engage in meaningful activities.\n\nStrategic Decision-Making: As you progress in your career or personal projects, saying no becomes a strategic tool to refine your focus. It allows you to selectively say yes to opportunities that align with your long-term vision and goals.\n\nMaintain Quality: By not spreading yourself too thin, you can maintain a high standard of quality in your work, which is crucial for achieving success.\n\nIn essence, saying no is about making deliberate choices that align with your priorities and values, rather than being less involved. It’s a proactive strategy for achieving success by focusing on what truly matters."
        },
    },
    {
        "inputs": {
            "question": "What does the article suggest about the hidden cost of saying 'yes'?"
        },
        "outputs": {
            "answer": "The article suggests that the hidden cost of saying 'yes' lies in the potential waste of time and effort on activities that do not truly matter or align with one's goals. Here are some key points:\n\nLoss of Time: Saying yes can lead to time being taken away from more important tasks or goals. Time is a finite resource, and agreeing to too many commitments can dilute your focus and effectiveness.\n\nDistraction from Goals: By saying yes to too many things, you risk being distracted from your primary objectives. This can prevent you from making meaningful progress in areas that are truly important to you.\n\nIncreased Stress and Overcommitment: Agreeing to too many requests can lead to overcommitment, resulting in stress and a decrease in the quality of work due to spreading yourself too thin.\n\nOpportunity Cost: Every time you say yes to something, you are implicitly saying no to something else. This opportunity cost can mean missing out on more valuable or rewarding opportunities that align better with your long-term goals.\n\nInefficiency: More effort can be wasted on doing things that are not necessary or important, rather than optimizing and focusing on what truly needs to be done.\n\nOverall, the article highlights that saying yes too often can lead to inefficiencies and prevent you from focusing on what truly matters, thereby hindering success."
        },
    },
    {
        "inputs": {
            "question": "How can individuals upgrade their ability to say 'no' as they grow in their careers?"
        },
        "outputs": {
            "answer": "As individuals grow in their careers, upgrading their ability to say 'no' becomes crucial for maintaining focus and achieving success. Here are some strategies to enhance this skill:\n\nClarify Your Priorities: Clearly define your goals and priorities. Understanding what is most important to you helps in making informed decisions about what to accept and what to decline.\n\nSet Boundaries: Establish clear boundaries regarding your time and commitments. Communicate these boundaries to colleagues and stakeholders to manage expectations.\n\nEvaluate Opportunities: Before saying yes, evaluate whether the opportunity aligns with your long-term goals and values. Consider the potential impact on your time and resources.\n\nPractice Assertiveness: Develop the confidence to assertively and politely decline requests that do not align with your priorities. Use clear and respectful language to communicate your decision.\n\nOffer Alternatives: When possible, offer alternatives or compromises. For example, suggest a different timeline or delegate the task to someone else who might benefit from the opportunity.\n\nReflect on Past Decisions: Learn from past experiences where saying yes led to negative outcomes. Use these reflections to guide future decisions.\n\nBe Gracious and Direct: When saying no, be gracious and direct. Express appreciation for the opportunity and provide a brief explanation if appropriate, without over-justifying your decision.\n\nRegularly Reassess Commitments: Periodically review your commitments to ensure they still align with your goals. Be willing to renegotiate or exit commitments that no longer serve your interests.\n\nBy developing these strategies, individuals can effectively manage their commitments, focus on what truly matters, and continue to grow successfully in their career."
        },
    },
]

In [None]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "OMNIAI RAG Dataset"

# Resolve `dataset` from `dataset_name` instead of relying on a `dataset`
# variable left over from an earlier cell: with the creation call commented
# out, `dataset.id` pointed at the previously created dataset and the examples
# were uploaded to the wrong place (or the cell failed on a fresh kernel).
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)

# Upload the examples only when the dataset is still empty, so re-running the
# cell does not create duplicates.
if next(iter(client.list_examples(dataset_id=dataset.id, limit=1)), None) is None:
    client.create_examples(
        dataset_id=dataset.id,
        examples=examples)



In [None]:
## CorrectnessGrade
from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI


# Grade output schema for the correctness grader: a free-text explanation
# followed by a boolean verdict.
# NOTE(review): this repeats the CorrectnessGrade definition from the earlier
# "Correctness" cell; when both cells run, this later definition replaces it.
class CorrectnessGrade(TypedDict):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt: system message for the correctness grader (sent to the model
# as-is, so its wording is part of the evaluator's behavior).
# NOTE(review): same text as the prompt in the earlier "Correctness" cell.
correctness_instructions = """You are a teacher grading a quiz. You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. (2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to CorrectnessGrade.
grader_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    CorrectnessGrade, method="json_schema", strict=True
)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a generated answer for factual accuracy against the reference.

    Builds one user message holding the question, the reference answer and the
    generated answer, asks ``grader_llm`` to grade it, and returns the
    ``correct`` field of the reply.
    """
    question = inputs['question']
    expected = reference_outputs['answer']
    actual = outputs['answer']
    grading_request = (
        f"QUESTION: {question}\n"
        f"GROUND TRUTH ANSWER: {expected}\n"
        f"STUDENT ANSWER: {actual}"
    )
    # Run evaluator
    verdict = grader_llm.invoke(
        [
            {"role": "system", "content": correctness_instructions},
            {"role": "user", "content": grading_request},
        ]
    )
    return verdict["correct"]

In [None]:
### Relevance: does the generated answer address the input question?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): this repeats the RelevanceGrade definition from the earlier
# relevance cell; when both cells run, this later definition replaces it.
class RelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[
        bool, ..., "Provide the score on whether the answer addresses the question"
    ]

# Grade prompt: system message for the answer-relevance grader (sent to the
# model as-is, so its wording is part of the evaluator's behavior).
# NOTE(review): same text as the prompt in the earlier relevance cell.
relevance_instructions = """You are a teacher grading a quiz. You will be given a QUESTION and a STUDENT ANSWER. Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to RelevanceGrade.
relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    RelevanceGrade, method="json_schema", strict=True
)

# Evaluator
def relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer helps answer the question.

    Asks ``relevance_llm`` to grade the question/answer pair and returns the
    ``relevant`` field of the reply.
    """
    question = inputs['question']
    actual = outputs['answer']
    grading_request = f"QUESTION: {question}\n" f"STUDENT ANSWER: {actual}"
    verdict = relevance_llm.invoke(
        [
            {"role": "system", "content": relevance_instructions},
            {"role": "user", "content": grading_request},
        ]
    )
    return verdict["relevant"]

In [None]:
### Groundedness: is the answer based on the retrieved documents?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): this repeats the GroundedGrade definition from the earlier
# groundedness cell; when both cells run, this later definition replaces it.
# The `grounded` description mentions "hallucinates" while the prompt defines
# True as "grounded" — confirm the intended polarity.
class GroundedGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    grounded: Annotated[
        bool, ..., "Provide the score on if the answer hallucinates from the documents"
    ]

# Grade prompt: system message for the groundedness grader (sent to the model
# as-is, so its wording is part of the evaluator's behavior).
# NOTE(review): same text as the prompt in the earlier groundedness cell.
grounded_instructions = """You are a teacher grading a quiz. You will be given FACTS and a STUDENT ANSWER. Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Grounded:
A grounded value of True means that the student's answer meets all of the criteria.
A grounded value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to GroundedGrade.
grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    GroundedGrade, method="json_schema", strict=True
)

# Evaluator
def groundedness(inputs: dict, outputs: dict) -> bool:
    """Grade whether the answer stays within the retrieved documents.

    The ``page_content`` of every item in ``outputs["documents"]`` is joined
    with blank lines and sent, together with the answer, to ``grounded_llm``;
    the ``grounded`` field of the reply is returned. ``inputs`` is not read.
    """
    facts = "\n\n".join([doc.page_content for doc in outputs["documents"]])
    actual = outputs['answer']
    grading_request = f"FACTS: {facts}\n" f"STUDENT ANSWER: {actual}"
    verdict = grounded_llm.invoke(
        [
            {"role": "system", "content": grounded_instructions},
            {"role": "user", "content": grading_request},
        ]
    )
    return verdict["grounded"]

In [None]:
### Retrieval relevance: are the retrieved documents relevant to the query?
# Grade output schema: explanation first, then the boolean verdict.
# NOTE(review): this repeats the RetrievalRelevanceGrade definition from the
# earlier retrieval-relevance cell; when both cells run, this later definition
# replaces it.
class RetrievalRelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[
        bool,
        ...,
        "True if the retrieved documents are relevant to the question, False otherwise",
    ]

# Grade prompt: system message for the retrieval-relevance grader (sent to the
# model as-is, so its wording is part of the evaluator's behavior).
# NOTE(review): same text as the prompt in the earlier retrieval-relevance cell.
retrieval_relevance_instructions = """You are a teacher grading a quiz. You will be given a QUESTION and a set of FACTS provided by the student. Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met

Relevance:
A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
A relevance value of False means that the FACTS are completely unrelated to the QUESTION.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. Avoid simply stating the correct answer at the outset."""

# Grader LLM: gpt-4o at temperature 0, replies constrained to
# RetrievalRelevanceGrade.
retrieval_relevance_llm = ChatOpenAI(
    model="gpt-4o", temperature=0
).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)

def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the retrieved documents have anything to do with the question.

    The ``page_content`` of every item in ``outputs["documents"]`` is joined
    with blank lines and sent, together with the question, to
    ``retrieval_relevance_llm``; the ``relevant`` field of the reply is returned.
    """
    facts = "\n\n".join([doc.page_content for doc in outputs["documents"]])
    question = inputs['question']
    grading_request = f"FACTS: {facts}\n" f"QUESTION: {question}"
    # Run evaluator
    verdict = retrieval_relevance_llm.invoke(
        [
            {"role": "system", "content": retrieval_relevance_instructions},
            {"role": "user", "content": grading_request},
        ]
    )
    return verdict["relevant"]

In [None]:
# Our RAG bot: the project-local ReActAgent, built with its default arguments.
# This is the system the following cells evaluate.
from src.workflow.agent_workflow import ReActAgent
agent= ReActAgent()


In [None]:
def target(inputs: dict) -> dict:
    """Adapt a dataset example to the project agent.

    Pulls the ``"question"`` entry out of ``inputs`` and returns whatever
    ``agent.run`` returns for it.

    NOTE(review): this rebinds ``target`` from the earlier experiment, and the
    evaluators used with it read ``outputs["answer"]`` and
    ``outputs["documents"]`` — confirm ``agent.run`` returns a dict with those
    keys.
    """
    question = inputs["question"]
    return agent.run(question)

# Run every example of the dataset named `dataset_name` through `target` (the
# project agent) and score the outputs with the four LLM-graded evaluators.
# NOTE(review): the experiment prefix and metadata label are the same strings
# as in the earlier experiment and name gpt-4-0125-preview — confirm they
# describe this run.
experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness, groundedness, relevance, retrieval_relevance],
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "LCEL context, gpt-4-0125-preview"},
)

# Explore results locally as a dataframe if you have pandas installed
# experiment_results.to_pandas()

In [None]:
from src.workflow.agent_workflow import ReActAgent
# Ad hoc check: build a fresh agent, ask it one question and display the result.
# NOTE(review): rebinds `agent`, which an earlier cell already created.
agent = ReActAgent()
response = agent.run("How is weather today in hyderabad")
response

In [None]:
import openai
from langsmith import wrappers

# OpenAI client passed through LangSmith's `wrap_openai` (presumably so calls
# made through it are traced — confirm against the LangSmith docs).
openai_client = wrappers.wrap_openai(openai.OpenAI())

# System prompt used by the CORRECT/INCORRECT `correctness` grader defined
# right after this.
eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a predicted answer as correct or incorrect with an OpenAI model.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the question asked.
        outputs: Target outputs; ``outputs["response"]`` is the predicted answer.
        reference_outputs: Example outputs; ``reference_outputs["answer"]`` is
            the reference answer.

    Returns:
        True only when the grader's reply, ignoring surrounding whitespace and
        letter case, is exactly ``CORRECT``.

    NOTE(review): this rebinds the name ``correctness`` defined in earlier
    cells and reads ``outputs["response"]`` rather than ``outputs["answer"]``.
    """
    user_content = f"""You are grading the following question:
{inputs['question']}
Here is the real answer:
{reference_outputs['answer']}
You are grading the following predicted answer:
{outputs['response']}
Respond with CORRECT or INCORRECT:
Grade:"""
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).choices[0].message.content
    # The reply may carry stray whitespace or different casing, and the content
    # can be None. Compare with equality rather than a substring test, because
    # "INCORRECT" contains "CORRECT".
    verdict = (response or "").strip().upper()
    return verdict == "CORRECT"

In [None]:
def concision(outputs: dict, reference_outputs: dict) -> int:
    """Score whether the response is concise relative to the reference answer.

    Args:
        outputs: Target outputs; ``outputs["response"]`` is the predicted answer.
        reference_outputs: Example outputs; ``reference_outputs["answer"]`` is
            the reference answer.

    Returns:
        1 when the response is shorter than twice the length of the reference
        answer (compared with ``len``), otherwise 0.
    """
    # The return annotation is `int` to match the value actually returned.
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

In [None]:
# Our agent: run one question and display the type of what `agent.run` returns
# (the `ls_target` wrapper in the next cell puts that value under "response").
res = agent.run("How is weather today in hyderabad")
type(res)

In [None]:
def ls_target(inputs: dict) -> dict:
    """Adapt a dataset example to the project agent.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the question to ask.
            (Annotated as ``dict`` because it is indexed by key.)

    Returns:
        ``{"response": <value returned by agent.run>}``, the key read by the
        ``concision`` and OpenAI-graded ``correctness`` evaluators.
    """
    return {"response": agent.run(inputs["question"])}

In [None]:
# Evaluate `ls_target` over the dataset named `dataset_name`, scoring each
# output with the `concision` and `correctness` evaluators.
experiment_results = client.evaluate(
    ls_target, # Your AI system
    data=dataset_name, # The data to predict and grade over
    evaluators=[concision, correctness], # The evaluators to score the results
    experiment_prefix="openai-4o-mini", # A prefix for your experiment names to easily identify them
)

## Evaluate ChatBot

In [1]:
from dotenv import load_dotenv
load_dotenv()

# Evaluation examples: each entry has an "inputs" dict holding a "question"
# and an "outputs" dict holding the reference "answer".
examples = [
    # Questions about the five-step creative process.
    {
        "inputs": {
            "question": "What is the main concept discussed in the article 'For a More Creative Brain Follow These 5 Steps'?"
        },
        "outputs": {
            "answer": "The main concept discussed in the article 'For a More Creative Brain Follow These 5 Steps' is the creative process and how it can be systematically approached through five steps. These steps are:\n\nGather New Material: Learn specific material related to your task and general material by exploring a wide range of concepts.\nWork Over the Materials: Examine and experiment with the learned material by looking at it from different angles.\nStep Away from the Problem: Take a break from the problem to do something else that excites and energizes you.\nLet Your Idea Return: Allow the idea to come back to you naturally with renewed insight and energy.\nShape and Develop Your Idea: Release your idea into the world, gather feedback, and adapt it as needed.\nThe article emphasizes that creativity involves connecting ideas and can be cultivated through practice and courage."
        },
    },
    {
        "inputs": {
            "question": "How does the story of Frederic Eugene Ives demonstrate the five stages of the creative process?"
        },
        "outputs": {
            "answer": "The story of Frederic Eugene Ives demonstrates the five stages of the creative process as follows:\n\nGather New Material: Ives would have gathered material by learning about existing printing technologies and the challenges faced by newspapers and printers in the 1870s, particularly the costly and time-consuming process of photoengraving.\n\nWork Over the Materials: He likely examined the problem of printing images from different angles, experimenting with various ideas and technologies to find a more efficient solution.\n\nStep Away from the Problem: Although not explicitly detailed in the story, stepping away from the problem would involve Ives taking breaks or engaging in other activities, allowing his mind to subconsciously process the information.\n\nLet Your Idea Return: Ives' breakthrough likely came as a flash of insight, where he conceived the method of 'halftone printing,' which involved breaking a photograph down into a series of tiny dots that blend together to create an image.\n\nShape and Develop Your Idea: Ives would have tested his halftone printing method, refined it based on feedback, and adapted it for practical use, ultimately revolutionizing the printing process by making it faster and more cost-effective.\n\nThis story illustrates how Ives applied the creative process to solve a significant problem in the printing industry."
        },
    },
    {
        "inputs": {
            "question": "What are the five stages of the creative process and how can they be applied in modern problem-solving?"
        },
        "outputs": {
            "answer": "The five stages of the creative process and their application in modern problem-solving are as follows:\n\nGather New Material:\n\nApplication: Start by researching and collecting information relevant to the problem you're facing. This includes both specific data related to the issue and broader knowledge that might provide new perspectives. In modern settings, this could involve reading articles, attending workshops, or exploring related fields.\n\nWork Over the Materials:\n\nApplication: Analyze and experiment with the information you've gathered. Look at the problem from different angles and try to connect disparate ideas. Use brainstorming sessions or mind mapping to explore various possibilities and combinations.\n\nStep Away from the Problem:\n\nApplication: Take a break from actively thinking about the problem. Engage in activities that relax or energize you, such as exercising, meditating, or pursuing a hobby. This allows your subconscious mind to process the information and can lead to unexpected insights.\n\nLet Your Idea Return:\n\nApplication: After stepping away, allow the solution or idea to come back to you naturally. This often happens when you're not actively trying to solve the problem, such as during a walk or while showering. Be open to these moments of insight and ready to capture them.\n\nShape and Develop Your Idea:\n\nApplication: Once you have an idea, test it in the real world. Gather feedback from others, adapt your approach based on this input, and refine your solution. This iterative process helps in developing a robust and effective solution to the problem.\n\nBy following these stages, individuals and organizations can approach modern problems with a structured yet flexible mindset, fostering innovation and effective problem-solving."
        },
    },


    # Questions about saying 'no' and the cost of saying 'yes'.
     {
        "inputs": {
            "question": "How can saying 'no' actually make you more successful rather than less involved?"
        },
        "outputs": {
            "answer": "Saying 'no' can make you more successful by helping you focus on what truly matters and protecting your most valuable asset: your time. Here’s how it works:\n\nFocus on Priorities: By saying no to distractions and tasks that don't align with your goals, you can concentrate your efforts on activities that drive success. This focused approach allows you to invest your time and energy in what truly matters.\n\nPrevent Overcommitment: Saying no helps prevent overcommitment, which can lead to stress and burnout. By managing your commitments wisely, you maintain the capacity to perform at your best.\n\nEnhance Productivity: Eliminating unnecessary tasks is often more effective than optimizing how you perform them. This means that saying no can lead to greater productivity by ensuring you only engage in meaningful activities.\n\nStrategic Decision-Making: As you progress in your career or personal projects, saying no becomes a strategic tool to refine your focus. It allows you to selectively say yes to opportunities that align with your long-term vision and goals.\n\nMaintain Quality: By not spreading yourself too thin, you can maintain a high standard of quality in your work, which is crucial for achieving success.\n\nIn essence, saying no is about making deliberate choices that align with your priorities and values, rather than being less involved. It’s a proactive strategy for achieving success by focusing on what truly matters."
        },
    },
    {
        "inputs": {
            "question": "What does the article suggest about the hidden cost of saying 'yes'?"
        },
        "outputs": {
            "answer": "The article suggests that the hidden cost of saying 'yes' lies in the potential waste of time and effort on activities that do not truly matter or align with one's goals. Here are some key points:\n\nLoss of Time: Saying yes can lead to time being taken away from more important tasks or goals. Time is a finite resource, and agreeing to too many commitments can dilute your focus and effectiveness.\n\nDistraction from Goals: By saying yes to too many things, you risk being distracted from your primary objectives. This can prevent you from making meaningful progress in areas that are truly important to you.\n\nIncreased Stress and Overcommitment: Agreeing to too many requests can lead to overcommitment, resulting in stress and a decrease in the quality of work due to spreading yourself too thin.\n\nOpportunity Cost: Every time you say yes to something, you are implicitly saying no to something else. This opportunity cost can mean missing out on more valuable or rewarding opportunities that align better with your long-term goals.\n\nInefficiency: More effort can be wasted on doing things that are not necessary or important, rather than optimizing and focusing on what truly needs to be done.\n\nOverall, the article highlights that saying yes too often can lead to inefficiencies and prevent you from focusing on what truly matters, thereby hindering success."
        },
    },
    {
        "inputs": {
            "question": "How can individuals upgrade their ability to say 'no' as they grow in their careers?"
        },
        "outputs": {
            "answer": "As individuals grow in their careers, upgrading their ability to say 'no' becomes crucial for maintaining focus and achieving success. Here are some strategies to enhance this skill:\n\nClarify Your Priorities: Clearly define your goals and priorities. Understanding what is most important to you helps in making informed decisions about what to accept and what to decline.\n\nSet Boundaries: Establish clear boundaries regarding your time and commitments. Communicate these boundaries to colleagues and stakeholders to manage expectations.\n\nEvaluate Opportunities: Before saying yes, evaluate whether the opportunity aligns with your long-term goals and values. Consider the potential impact on your time and resources.\n\nPractice Assertiveness: Develop the confidence to assertively and politely decline requests that do not align with your priorities. Use clear and respectful language to communicate your decision.\n\nOffer Alternatives: When possible, offer alternatives or compromises. For example, suggest a different timeline or delegate the task to someone else who might benefit from the opportunity.\n\nReflect on Past Decisions: Learn from past experiences where saying yes led to negative outcomes. Use these reflections to guide future decisions.\n\nBe Gracious and Direct: When saying no, be gracious and direct. Express appreciation for the opportunity and provide a brief explanation if appropriate, without over-justifying your decision.\n\nRegularly Reassess Commitments: Periodically review your commitments to ensure they still align with your goals. Be willing to renegotiate or exit commitments that no longer serve your interests.\n\nBy developing these strategies, individuals can effectively manage their commitments, focus on what truly matters, and continue to grow successfully in their career."
        },
    },
]

In [3]:
from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "OMNIAI Chat Dataset"

# Creating a dataset under a name that already exists fails, which breaks
# "Restart Kernel -> Run All" after the first run. Create and populate the
# dataset only when it is missing; otherwise reuse the existing one so the
# examples are not uploaded a second time.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(dataset_name)
    created_examples = client.create_examples(
        dataset_id=dataset.id,
        examples=examples,
    )

# Bare last expression: shows the upload summary on the run that created the
# examples and nothing when the dataset was reused.
created_examples

{'example_ids': ['e2bee6d6-0300-450f-baa2-5de1692ca989',
  '508ab914-4b80-46ed-8e71-d43ad93b89fd',
  '748eae5f-2887-4916-b9cb-c664af32cd67',
  '46ccc9d5-ed91-4999-8591-32a701cc4b91',
  '712164f4-7cb0-4db9-b1a9-5baa435d32f4',
  '94da817c-a7bc-48a7-b1f0-eefc04573863'],
 'count': 6}

In [4]:
import openai
from langsmith import wrappers

# OpenAI client passed through LangSmith's `wrap_openai` wrapper
# (presumably so the judge's calls are traced in LangSmith -- confirm).
openai_client = wrappers.wrap_openai(openai.OpenAI())

# System message used by the `correctness` evaluator for every grading request.
eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade a predicted answer against the reference answer with an LLM judge.

    Args:
        inputs: Example inputs; the question is read from ``inputs["question"]``.
        outputs: Target outputs; the prediction is read from ``outputs["response"]``.
        reference_outputs: Reference outputs; the expected answer is read from
            ``reference_outputs["answer"]``.

    Returns:
        True when the judge's verdict is CORRECT; False for INCORRECT, an empty
        reply, or any reply whose first word is not CORRECT.
    """
    user_content = f"""You are grading the following question:
{inputs['question']}
Here is the real answer:
{reference_outputs['answer']}
You are grading the following predicted answer:
{outputs['response']}
Respond with CORRECT or INCORRECT:
Grade:"""
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).choices[0].message.content
    # An exact `== "CORRECT"` comparison marks good answers as wrong whenever the
    # judge adds whitespace, a trailing period, different casing, or echoes the
    # "Grade:" label. `content` may also be None. Compare the first word only,
    # after normalising, so "INCORRECT" can never be mistaken for "CORRECT".
    words = (response or "").upper().replace("GRADE:", " ").split()
    verdict = words[0].strip(".,!:;*`\"'") if words else ""
    return verdict == "CORRECT"

In [5]:
from src.workflow.agent_workflow import ReActAgent
# Instantiate the agent under evaluation; `ls_target` calls its `run` method.
agent = ReActAgent()

{"timestamp": "2025-11-12T16:58:08.939315Z", "level": "info", "event": "OPENAI_API_KEY loaded from environment"}
{"config_keys": ["name", "description", "embedding_model", "retriever", "llm", "multimodal", "qdrant"], "timestamp": "2025-11-12T16:58:08.947494Z", "level": "info", "event": "YAML config loaded"}
{"model": "text-embedding-3-large", "timestamp": "2025-11-12T16:58:08.947494Z", "level": "info", "event": "Loading embedding model"}
{"timestamp": "2025-11-12T16:58:09.467958Z", "level": "info", "event": "Created collection successfully: eval-collection"}
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
{"timestamp": "2025-11-12T16:58:10.823003Z", "level": "info", "event": "Created vector store successfully: eval-collection"}
{"timestamp": "2025-11-12T16:58:10.824697Z", "level": "info", "event": "OPENAI_API_KEY loaded from environment"}
{"config_keys": ["name", "description", "embedding_model", "retriever", "llm", "multimodal", "qdrant"], "timestamp": "2025-

In [6]:
def ls_target(inputs: dict) -> dict:
    """Target function for the evaluation: answer one dataset example.

    Args:
        inputs: Example inputs; the question is read from ``inputs["question"]``.

    Returns:
        A dict with the single key ``"response"`` holding the value returned
        by ``agent.run`` for that question.
    """
    return {"response": agent.run(inputs["question"])}

In [8]:
# Run the experiment: `ls_target` is the system under test and `correctness`
# is the only evaluator scoring its outputs in this run.
experiment_results = client.evaluate(
    ls_target, # Your AI system
    data=dataset_name, # The data to predict and grade over
    evaluators=[correctness], # The evaluators to score the results
    experiment_prefix="openai-4o-mini", # A prefix for your experiment names to easily identify them
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'openai-4o-mini-4bb49010' at:
https://smith.langchain.com/o/3339245f-daf5-4cad-8293-2f8cc08b0816/datasets/0f620bb2-cf2d-482e-98c0-f56d0c548fb7/compare?selectedSessions=159dbda9-d346-49ba-a816-238cc681a8eb




0it [00:00, ?it/s]HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
1it [00:14, 14.23s/it]HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
{"timestamp": "2025-11-12T16:59:52.799443Z", "level": "info", "event": "Retriever invoked successfully"}
{"timestamp": "2025-11-12T16:59:52.799443Z", "level": "info", "event": "Successfully initiated retriever tool"}
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2it [00:25, 12.50s/it]HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST 