In [None]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment before any
# client or model is constructed. With override=True, values from the file
# replace variables that are already set (per python-dotenv's documented
# behaviour).
load_dotenv(override=True)

In [None]:
from langsmith import Client

# LangSmith client built with default arguments; it is reused below for
# dataset management and for running the evaluations.
# NOTE(review): presumably picks up its credentials from the environment
# populated by load_dotenv — confirm.
client = Client()

# Test cases for the dataset: each example pairs an input question with the
# reference answer that evaluators compare against.
examples = [
    {"inputs": {"question": question}, "outputs": {"answer": answer}}
    for question, answer in (
        ("What is LangChain?", "A framework for building LLM applications"),
        ("What is LangSmith?", "A platform for observing and evaluating LLM applications"),
        ("What is OpenAI?", "A company that creates Large Language Models"),
        ("What is Google?", "A technology company known for search"),
        ("What is Mistral?", "A company that creates Large Language Models"),
    )
]

dataset_name = "QA Example Dataset"

# Only create and populate the dataset when no dataset with this name exists
# yet, so re-running the cell does not attempt to create it again.
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(dataset_id=dataset.id, examples=examples)



In [None]:
# Initialize the chat models available to this notebook. Each one is created
# with temperature=0 and timeout=30. In the cells visible here only
# `model_llama_groq` is actually used (as both the application model and the
# grading model); the other two are defined but not referenced.
from langchain.chat_models import init_chat_model
model_gemini_flash = init_chat_model("gemini-2.5-flash", model_provider="google_genai", timeout=30, temperature=0)
model_llama_groq = init_chat_model("llama-3.1-8b-instant", model_provider="groq", timeout=30, temperature=0)
model_gpt_4o_mini = init_chat_model("gpt-4o-mini", model_provider="openai", timeout=30, temperature=0)


In [None]:
from langchain_core.messages import HumanMessage, SystemMessage
from typing_extensions import Literal
from pydantic import BaseModel


# Pydantic schema for a grading verdict. Its single field, `score`, is
# restricted to the two literal strings "CORRECT" and "INCORRECT".
class Grade(BaseModel):
    score:Literal["CORRECT","INCORRECT"]

def correctness(inputs:dict, outputs:dict, reference_outputs:dict) -> bool:
    """LLM-as-judge evaluator: is the predicted answer correct?

    Builds a grading prompt from the question, the reference answer and the
    predicted answer, and asks `model_llama_groq` for a verdict.

    Args:
        inputs: Example inputs; reads ``inputs['question']``.
        outputs: Target-function outputs; reads ``outputs['response']``.
        reference_outputs: Example reference outputs; reads
            ``reference_outputs['answer']``.

    Returns:
        True when the judge's verdict is "CORRECT", False otherwise.
    """
    eval_instructions = "You are an expert professor specialized in grading students' answers to questions."
    user_content = f"""You are grading the following question:
                       {inputs['question']}
                       Here is the real answer:
                       {reference_outputs['answer']}
                       You are grading the following predicted answer:
                       {outputs['response']}
                       Respond with CORRECT or INCORRECT:
                       Grade:"""

    # Bind the judge to the Grade schema instead of comparing raw text: a
    # free-form reply such as "CORRECT.", "Correct" or "Grade: CORRECT" would
    # fail an exact string comparison and be scored as incorrect.
    grader = model_llama_groq.with_structured_output(Grade)
    grade = grader.invoke([SystemMessage(eval_instructions), HumanMessage(user_content)])
    return grade.score == "CORRECT"


In [None]:
def concision(outputs: dict, reference_outputs: dict) -> int:
    """Score whether the response is concise relative to the reference answer.

    Args:
        outputs: Target-function outputs; reads ``outputs["response"]``.
        reference_outputs: Example reference outputs; reads
            ``reference_outputs["answer"]``.

    Returns:
        1 when the response is strictly shorter than twice the length of the
        reference answer, otherwise 0.
    """
    response_length = len(outputs["response"])
    length_limit = 2 * len(reference_outputs["answer"])
    # The result is deliberately an int (0/1), so the return annotation is
    # `int` to match what is actually returned.
    return int(response_length < length_limit)

In [None]:
from langchain.chat_models import BaseChatModel

# System prompt used by `my_app` when the caller does not supply one.
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."


def my_app(question: str, model: BaseChatModel, instructions: str = default_instructions) -> str:
    """Answer a question with the given chat model.

    Args:
        question: The user's question, sent as the human message.
        model: Chat model that is invoked to produce the answer.
        instructions: System prompt sent ahead of the question; defaults to
            `default_instructions`.

    Returns:
        The `content` of the message returned by `model.invoke`.
    """
    messages = [SystemMessage(instructions), HumanMessage(question)]
    reply = model.invoke(messages)
    return reply.content

In [None]:
def ls_target(inputs: dict) -> dict:
    """Evaluation target: answer the example's question with the default prompt.

    Args:
        inputs: Example inputs; reads ``inputs["question"]``.

    Returns:
        A dict with a single key, ``"response"``, holding the answer produced
        by `my_app` using `model_llama_groq`.
    """
    answer = my_app(inputs["question"], model_llama_groq)
    return {"response": answer}

In [None]:
# Baseline experiment: run `ls_target` over the dataset named by
# `dataset_name` and score every output with the `concision` and
# `correctness` evaluators. The experiment name is prefixed with "llama_groq".
experiment_results = client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[concision, correctness],
    experiment_prefix="llama_groq"
)

In [None]:
# Stricter system prompt: same wording as the default plus an explicit
# ten-word cap.
instructions_v2 = "Respond to the users question in a short, concise manner (one short sentence). Do NOT use more than ten words."


def ls_target_v2(inputs: dict) -> dict:
    """Evaluation target: answer the example's question with the stricter prompt.

    Args:
        inputs: Example inputs; reads ``inputs["question"]``.

    Returns:
        A dict with a single key, ``"response"``, holding the answer produced
        by `my_app` using `model_llama_groq` and `instructions_v2`.
    """
    answer = my_app(inputs["question"], model_llama_groq, instructions_v2)
    return {"response": answer}

In [None]:
# Second experiment: same dataset and evaluators as the baseline run, but the
# target is `ls_target_v2`, which uses the stricter ten-word prompt. The
# experiment name is prefixed with "strict_llama_groq" so the two runs can be
# told apart.
experiment_results_v2 = client.evaluate(
    ls_target_v2,
    data=dataset_name,
    evaluators=[concision,correctness],
    experiment_prefix='strict_llama_groq'
)