## Setup and Import Libraries

In [2]:
import os
import openai
from langsmith import Client
from langsmith import wrappers

In [3]:
from dotenv import load_dotenv

In [4]:
# Load variables from a local .env file (if present) into the process environment.
# load_dotenv was imported above but never invoked, so .env values were not being read.
load_dotenv()

# Fail early with a readable message instead of the opaque TypeError that
# `os.environ[key] = None` raises when a key is absent.
missing_keys = [key for key in ("LANGSMITH_API_KEY", "OPENAI_API_KEY") if not os.getenv(key)]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_keys)
    )

os.environ["LANGSMITH_TRACING"] = "true"

## Create Client and Define Dataset

In [5]:
# LangSmith client used below for dataset creation and for running evaluations.
# NOTE(review): constructed with no arguments, so credentials presumably come from
# the environment variables set in the setup cell — confirm.
langsmith_client = Client()

In [7]:
# Name of the evaluation dataset; reused later as the `data` argument to evaluate().
dataset_name = "Chatbots-Evaluation"

# NOTE(review): create_dataset is called unconditionally, so re-running this cell when a
# dataset with this name already exists presumably fails — confirm, and consider a
# get-or-create guard (together with a guard on example creation) for Restart & Run All.
dataset = langsmith_client.create_dataset(dataset_name)

In [8]:
# (question, reference answer) pairs that make up the evaluation dataset.
qa_pairs = [
    ("What is LangChain?", "A framework for building LLM applications"),
    ("What is LangSmith?", "A platform for observing and evaluating LLM applications"),
    ("What is OpenAI?", "A company that creates Large Language Models"),
    ("What is Google?", "A technology company known for search"),
    ("What is Mistral?", "A company that creates Large Language Models"),
]

# Expand each pair into the inputs/outputs record shape and upload them in one call.
# The call stays the last expression so the cell still displays its return value.
langsmith_client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {"inputs": {"question": question}, "outputs": {"answer": answer}}
        for question, answer in qa_pairs
    ],
)

{'example_ids': ['dbe4f578-5d04-4024-808f-a7a9dc5ad111',
  '4abe01fe-9a00-45c8-82f3-5908667731ed',
  '1e7374d1-8531-4a35-acf6-ab574afd5f66',
  '4b95e509-e0f7-4513-a2eb-ea7e80bd87d8',
  'e85988eb-14b3-4a10-a584-ca046fca32ae'],
 'count': 5}

## Define Metrics (LLM as a Judge)

In [9]:
# OpenAI client passed through LangSmith's wrap_openai; shared by the correctness
# judge and by chatbot_app.
# NOTE(review): presumably the wrapper records each call as a LangSmith trace — confirm.
openai_client = wrappers.wrap_openai(openai.OpenAI())

In [10]:
# System prompt for the LLM judge used by the correctness evaluator.
evaluation_instructions = "You are an expert professor specialized in grading students' answers to questions."

In [11]:
def correctness(inputs:dict, outputs:dict, reference_outputs:dict) -> bool:
    """Use the judge model to decide whether the predicted answer is correct.

    Args:
        inputs: Example inputs; ``inputs['question']`` is the question being graded.
        outputs: Target outputs; ``outputs['response']`` is the predicted answer.
        reference_outputs: Reference data; ``reference_outputs['answer']`` is the real answer.

    Returns:
        True when the judge's reply contains CORRECT and does not contain INCORRECT
        (compared case-insensitively); False otherwise, including an empty or
        missing reply.
    """
    user_content = f"""You are grading the following question:
    {inputs['question']}
    Here is the real answer:
    {reference_outputs['answer']}
    You are grading the following predicted answer:
    {outputs['response']}
    Respond with CORRECT or INCORRECT:
    Grade:
    """

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": evaluation_instructions},
            {"role": "user", "content": user_content},
        ],
    ).choices[0].message.content

    # The model may add whitespace, punctuation, different casing or a "Grade:" prefix,
    # so an exact string comparison would misgrade those replies as incorrect.
    # "INCORRECT" contains "CORRECT", hence the explicit exclusion.
    verdict = (response or "").strip().upper()
    return "CORRECT" in verdict and "INCORRECT" not in verdict

In [12]:
## Concision - checks whether the actual output is less than 2x the length of the expected result.

def concision(outputs: dict, reference_outputs: dict) -> int:
    """Score whether the response is shorter than twice the reference answer.

    Args:
        outputs: Target outputs; ``outputs['response']`` is the text being measured.
        reference_outputs: Reference data; ``reference_outputs['answer']`` sets the length budget.

    Returns:
        1 if ``len(response) < 2 * len(answer)``, otherwise 0. The comparison is
        strict, so a response of exactly twice the length scores 0.
    """
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

## Run Evaluations

In [13]:
# Default system prompt for chatbot_app; asks the model for a one-sentence answer.
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."

In [14]:
def chatbot_app(question: str, model: str = "gpt-4o-mini", instructions: str = default_instructions) -> str:
    """Answer a single question with one chat-completion call.

    Args:
        question: Text sent as the user message.
        model: Model name passed to the completion call.
        instructions: Text sent as the system message.

    Returns:
        The message content of the first choice in the completion.
    """
    system_turn = {"role": "system", "content": instructions}
    user_turn = {"role": "user", "content": question}

    completion = openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[system_turn, user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [15]:
## Call chatbot_app for every data point
def ls_target(inputs: dict) -> dict:
    """Evaluation target: answer one dataset example with chatbot_app's default model.

    Args:
        inputs: Example inputs; ``inputs['question']`` is forwarded to chatbot_app.

    Returns:
        A dict with the single key ``"response"`` holding chatbot_app's answer.
    """
    return {"response": chatbot_app(inputs["question"])}

In [17]:
## Run our evaluation: ls_target is evaluated against the dataset named by dataset_name,
## scored by the correctness and concision evaluators, under the given experiment prefix.
experiment_results = langsmith_client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[correctness,concision],
    experiment_prefix="openai-4o-mini-chatbot"
)

View the evaluation results for experiment: 'openai-4o-mini-chatbot-1af57d37' at:
https://smith.langchain.com/o/e70f5ce4-4aca-4a02-b6ba-8f13fc21b177/datasets/6c0e5c0f-378e-47fd-9758-daf28b3aaf64/compare?selectedSessions=8732dd9a-542e-4b27-8424-227e9577375e




5it [00:07,  1.46s/it]


### Trying a Different Model

In [18]:
def ls_target(inputs: dict) -> dict:
    """Evaluation target: answer one dataset example with chatbot_app on gpt-4-turbo.

    This rebinds the name ``ls_target`` defined earlier in the notebook, so any
    cell run after this one uses the gpt-4-turbo variant.

    Args:
        inputs: Example inputs; ``inputs['question']`` is forwarded to chatbot_app.

    Returns:
        A dict with the single key ``"response"`` holding chatbot_app's answer.
    """
    return {"response": chatbot_app(inputs["question"], model="gpt-4-turbo")}

In [19]:
# Same dataset and evaluators as the previous run, now with the ls_target redefined
# above (gpt-4-turbo). Rebinds experiment_results, replacing the earlier run's value.
experiment_results = langsmith_client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[correctness,concision],
    experiment_prefix="openai-4-turbo-chatbot"
)

View the evaluation results for experiment: 'openai-4-turbo-chatbot-f0610dd8' at:
https://smith.langchain.com/o/e70f5ce4-4aca-4a02-b6ba-8f13fc21b177/datasets/6c0e5c0f-378e-47fd-9758-daf28b3aaf64/compare?selectedSessions=d8b3fd95-d6c7-4702-aeb9-9749c0f13b04




5it [00:10,  2.11s/it]
