## Setup and Import Libraries

In [1]:
import os
import openai
from langsmith import Client
from langsmith import wrappers

In [2]:
from dotenv import load_dotenv

In [3]:
# Populate os.environ from a local .env file, if one exists. Without this call
# the load_dotenv import above has no effect. By default, variables that are
# already set in the environment are not overridden.
load_dotenv()

# Fail fast with a readable message when a required key is missing, instead of
# the TypeError raised by assigning None into os.environ.
for required_var in ("LANGSMITH_API_KEY", "OPENAI_API_KEY"):
    if os.getenv(required_var) is None:
        raise EnvironmentError(
            f"{required_var} is not set; export it or add it to your .env file."
        )

os.environ["LANGSMITH_TRACING"] = "true"

## Create Client and Define Dataset

In [4]:
# LangSmith client used by the cells below to create the dataset, add examples
# and run evaluations. It is built with default arguments; presumably it picks
# up LANGSMITH_API_KEY from the environment -- TODO confirm.
langsmith_client = Client()

In [5]:
dataset_name = "Chatbots Evaluation"

# Get-or-create: create_dataset is rejected when a dataset with this name
# already exists, which would break "Restart Kernel -> Run All" on every run
# after the first. Reuse the existing dataset instead.
# NOTE: when the dataset is reused it may already contain examples, so any
# step that adds examples should avoid inserting them a second time.
if langsmith_client.has_dataset(dataset_name=dataset_name):
    dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
else:
    dataset = langsmith_client.create_dataset(dataset_name)

In [6]:
# Reference question/answer pairs used as ground truth by the evaluators.
seed_examples = [
    {
        "inputs": {"question": "What is LangChain?"},
        "outputs": {"answer": "A framework for building LLM applications"},
    },
    {
        "inputs": {"question": "What is LangSmith?"},
        "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
    },
    {
        "inputs": {"question": "What is OpenAI?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
    },
    {
        "inputs": {"question": "What is Google?"},
        "outputs": {"answer": "A technology company known for search"},
    },
    {
        "inputs": {"question": "What is Mistral?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
    },
]

# create_examples appends to the dataset, so executing this cell twice would
# duplicate every example and skew the evaluation. Seed only when the dataset
# holds no examples yet (fetch at most one example to find out).
first_existing_example = next(
    iter(langsmith_client.list_examples(dataset_id=dataset.id, limit=1)),
    None,
)
if first_existing_example is None:
    seed_result = langsmith_client.create_examples(
        dataset_id=dataset.id,
        examples=seed_examples,
    )
else:
    # Dataset already populated; nothing was added on this run.
    seed_result = None

# Last expression of the cell: shows the creation summary (or None if skipped).
seed_result

{'example_ids': ['697a583c-03e7-40e2-9441-7e6276213e1a',
  '3d36b8a7-0178-43b9-9f8d-69a1a21f1068',
  '5e11ab81-d0b2-4afa-80dd-5a412c684aec',
  '3e46e8b5-981e-47b9-96c5-0342949fa753',
  'c5e8684b-6933-4d7d-9b52-bec98978db57'],
 'count': 5}

## Define Metrics (LLM as a Judge)

In [7]:
# OpenAI client passed through LangSmith's wrap_openai wrapper; this wrapped
# client is used for both the chatbot calls and the LLM-as-judge calls below.
# Presumably the wrapper records the calls as LangSmith traces -- TODO confirm.
openai_client = wrappers.wrap_openai(openai.OpenAI())

In [8]:
# System prompt given to the judge model in the `correctness` evaluator.
evaluation_instructions = (
    "You are an expert professor specialized in "
    "grading students' answers to questions."
)

In [9]:
def _parse_grade(raw_grade) -> bool:
    """Return True only when the judge's reply amounts to the verdict CORRECT."""
    if not raw_grade:
        # The message content can be None or empty; treat that as not correct.
        return False
    verdict = raw_grade.strip().upper()
    # The prompt ends with "Grade:", which the model sometimes echoes back.
    if verdict.startswith("GRADE:"):
        verdict = verdict[len("GRADE:"):]
    # Compare for equality (not substring): "INCORRECT" contains "CORRECT".
    return verdict.strip(" \t\r\n.!\"'*`") == "CORRECT"


def correctness(inputs:dict, outputs:dict, reference_outputs:dict) -> bool:
    """LLM-as-judge evaluator: grade the predicted answer against the reference.

    Args:
        inputs: Example inputs; the ``"question"`` key is read.
        outputs: Target outputs; the ``"response"`` key is read.
        reference_outputs: Reference outputs; the ``"answer"`` key is read.

    Returns:
        True when the judge model's reply is CORRECT (ignoring case,
        surrounding whitespace/punctuation and an echoed "Grade:" prefix),
        False otherwise.
    """
    user_content = f"""You are grading the following question:
    {inputs['question']}
    Here is the real answer:
    {reference_outputs['answer']}
    You are grading the following predicted answer:
    {outputs['response']}
    Respond with CORRECT or INCORRECT:
    Grade:
    """

    # temperature=0 keeps the grading as repeatable as the model allows.
    response=openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
          {"role":"system","content":evaluation_instructions},
          {"role":"user","content":user_content}
        ]
    ).choices[0].message.content

    return _parse_grade(response)

In [10]:
## Concision - checks whether the actual output is less than 2x the length of the expected result.

def concision(outputs: dict, reference_outputs: dict) -> int:
    """Score whether the response is shorter than twice the reference answer.

    Args:
        outputs: Target outputs; the ``"response"`` key is read.
        reference_outputs: Reference outputs; the ``"answer"`` key is read.

    Returns:
        1 when ``len(response) < 2 * len(answer)`` (character counts), else 0.
        The comparison is strict, so a response exactly twice as long scores 0.
    """
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

## Run Evaluations

In [11]:
# Default system prompt for the chatbot under evaluation.
default_instructions = (
    "Respond to the users question in a short, "
    "concise manner (one short sentence)."
)

In [13]:
def chatbot_app(question: str, model: str = "gpt-4o-mini", instructions: str = default_instructions) -> str:
    """Ask the chat model a single question and return its reply text.

    Args:
        question: Sent as the user message.
        model: Model name passed to the chat completions call.
        instructions: Sent as the system message.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question},
    ]
    completion = openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [14]:
## Call chatbot_app for every datapoint
def ls_target(inputs: dict) -> dict:
    """Evaluation target: answer one dataset example with the default model.

    Args:
        inputs: Example inputs; the ``"question"`` key is read.

    Returns:
        A dict with a single ``"response"`` key holding chatbot_app's reply.
    """
    return {"response": chatbot_app(inputs["question"])}

In [16]:
## Run our evaluation
# Passes the ls_target function, the dataset name, and the two evaluators
# (correctness, concision) to the LangSmith client's evaluate call.
# experiment_prefix appears as the start of the experiment name printed below.
experiment_results = langsmith_client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[correctness,concision],
    experiment_prefix="openai-4o-mini-chatbot"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'openai-4o-mini-chatbot-095bdac5' at:
https://smith.langchain.com/o/e70f5ce4-4aca-4a02-b6ba-8f13fc21b177/datasets/1242d07c-f3d3-436f-9bca-879741de88bf/compare?selectedSessions=f0d4939f-4e4d-4968-8e15-a352792dfa63




5it [00:11,  2.32s/it]


### Trying a Different Model

In [17]:
def ls_target(inputs: dict) -> dict:
    """Evaluation target: answer one dataset example with gpt-4-turbo.

    NOTE: this intentionally redefines the earlier ``ls_target``; after this
    cell runs, the name refers to the gpt-4-turbo variant.

    Args:
        inputs: Example inputs; the ``"question"`` key is read.

    Returns:
        A dict with a single ``"response"`` key holding chatbot_app's reply.
    """
    return {"response": chatbot_app(inputs["question"], model="gpt-4-turbo")}

In [18]:
# Same evaluate call as before (same dataset name and evaluators), now using
# whichever ls_target is currently defined, under a different experiment
# prefix. Rebinds experiment_results, replacing the earlier value.
experiment_results = langsmith_client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[correctness,concision],
    experiment_prefix="openai-4-turbo-chatbot"
)

View the evaluation results for experiment: 'openai-4-turbo-chatbot-91b2aef4' at:
https://smith.langchain.com/o/e70f5ce4-4aca-4a02-b6ba-8f13fc21b177/datasets/1242d07c-f3d3-436f-9bca-879741de88bf/compare?selectedSessions=c22cceb1-ac10-4e4e-b700-f2d673f12068




5it [00:09,  1.86s/it]
