In [6]:
!pip install -U langsmith openevals langchain langchain-groq python-dotenv




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from dotenv import load_dotenv
# Load settings from a .env file before any client is constructed. Presumably this is
# where the Groq / LangSmith API keys come from — TODO confirm against the local .env.
load_dotenv()


True

In [8]:
from langchain_groq import ChatGroq
from langsmith import Client


In [9]:
# LangSmith client: used below for dataset management and for running the evaluation
client = Client()

# Groq-hosted LLaMA-4 Maverick chat model; this is the model whose answers get evaluated
GROQ_TARGET_SETTINGS = {
    "temperature": 0.7,
    "model_name": "meta-llama/llama-4-maverick-17b-128e-instruct",
}
llm = ChatGroq(**GROQ_TARGET_SETTINGS)


In [19]:
dataset_name = "Final Groq Eval."

# Reuse the dataset when it already exists so this cell can be re-run safely.
# The lookup filters by exact name on the server (dataset_name=...) instead of
# downloading every dataset in the workspace and scanning them client-side.
existing = next(iter(client.list_datasets(dataset_name=dataset_name)), None)
if existing is not None:
    dataset = existing
    print(f"Using existing dataset: {dataset.name}")
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Dataset Evaluation w/ Groq - Final"
    )
    print(f"Created new dataset: {dataset.name}")


Created new dataset: Final Groq Eval.


In [20]:
# Question / reference-answer pairs that make up the evaluation dataset.
examples = [
    {
        "inputs": {"question": "Which country is Mount Kilimanjaro located in?"},
        "outputs": {"answer": "Mount Kilimanjaro is located in Tanzania."},
    },
    {
        "inputs": {"question": "What is Earth's lowest point?"},
        "outputs": {"answer": "Earth's lowest point is the Dead Sea."},
    },
    {
        "inputs": {"question": "Who is the president of USA in 2019?"},
        "outputs": {"answer": "Donald Trump"},
    },
    {
        "inputs": {"question": "What is Langchain?"},
        "outputs": {"answer": "LangChain is an open source framework for building applications based on large language models (LLMs)."},
    },
]

# The dataset may be a reused one (and this cell may be re-run), so only upload the
# questions it does not hold yet; otherwise every run would append duplicates and
# skew the evaluation averages.
existing_questions = {
    (stored.inputs or {}).get("question")
    for stored in client.list_examples(dataset_id=dataset.id)
}
new_examples = [
    example
    for example in examples
    if example["inputs"]["question"] not in existing_questions
]

if new_examples:
    client.create_examples(dataset_id=dataset.id, examples=new_examples)
print(f"Added {len(new_examples)} examples to dataset")


Added 4 examples to dataset


In [21]:
def target(inputs: dict) -> dict:
    """Answer a single dataset question with the module-level ``llm``.

    Args:
        inputs: Example inputs; must contain the key ``"question"``.

    Returns:
        A dict ``{"answer": <text>}`` holding the model reply with surrounding
        whitespace stripped.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` entry.
    """
    system_turn = {"role": "system", "content": "Answer the following question accurately"}
    user_turn = {"role": "user", "content": inputs["question"]}

    reply = llm.invoke([system_turn, user_turn])
    answer_text = reply.content.strip()
    return {"answer": answer_text}


In [22]:
from typing import Dict
from langchain_groq import ChatGroq

# Evaluator (judge) LLM. Temperature is 0 so that grading the same answer twice gives
# the same verdict; a sampling judge makes experiment scores non-reproducible.
judge_llm = ChatGroq(
    temperature=0,
    model_name="meta-llama/llama-4-maverick-17b-128e-instruct"
)

# Prompt to judge correctness. The verdict must be the very first word of the reply,
# because the evaluator derives the score from how the reply begins. The three
# placeholders ({question}, {prediction}, {reference}) are filled via str.format.
JUDGE_PROMPT = """You are a strict evaluator.

Question: {question}
Predicted Answer: {prediction}
Reference Answer: {reference}

Is the predicted answer correct compared to the reference? Start your reply with exactly "Yes" or "No" (no other text before it), then give a brief explanation.
"""

# Function-style evaluator
def groq_correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> Dict:
    """Grade a predicted answer against the reference using the judge LLM.

    Fills ``JUDGE_PROMPT`` with the question, prediction and reference, sends it to
    ``judge_llm`` and converts the leading Yes/No verdict of the reply into a score.

    Args:
        inputs: Example inputs; must contain ``"question"``.
        outputs: Target outputs; must contain ``"answer"``.
        reference_outputs: Reference outputs; must contain ``"answer"``.

    Returns:
        A dict with ``"key"`` (always ``"groq_correctness"``), ``"score"`` (1.0 when
        the judge's verdict is "yes", otherwise 0.0 — including replies with no
        recognisable verdict) and ``"comment"`` (the judge's stripped reply).

    Raises:
        KeyError: If any of the required keys is missing.
    """
    import re

    filled_prompt = JUDGE_PROMPT.format(
        question=inputs["question"],
        prediction=outputs["answer"],
        reference=reference_outputs["answer"]
    )

    response = judge_llm.invoke(filled_prompt).content.strip()

    # Judges frequently decorate the verdict (**Yes**, "Yes", _No_), so skip leading
    # punctuation/markup before matching. The lookahead demands a whole word, so a
    # reply starting with e.g. "Yesterday" is not mistaken for "yes".
    verdict = re.match(r"[\W_]*(yes|no)(?![a-z0-9])", response, flags=re.IGNORECASE)
    is_correct = verdict is not None and verdict.group(1).lower() == "yes"
    score = 1.0 if is_correct else 0.0

    return {
        "key": "groq_correctness",
        "score": score,
        "comment": response
    }


In [23]:
# Run the target over every example in the dataset and grade each answer with the
# Groq judge; at most two examples are processed concurrently.
evaluation_options = {
    "data": dataset.id,
    "evaluators": [groq_correctness_evaluator],
    "experiment_prefix": "groq-only-eval",
    "max_concurrency": 2,
}
results = client.evaluate(target, **evaluation_options)

print("✅ Groq-only evaluation complete.")

View the evaluation results for experiment: 'groq-only-eval-8434abcf' at:
https://smith.langchain.com/o/961102de-fb10-40b4-921c-d523b4df68aa/datasets/6c713137-5984-458d-92e5-a401d689958e/compare?selectedSessions=5ed6a3af-7b3b-4a0e-bb63-48e4087b290a




4it [00:01,  2.70it/s]

✅ Groq-only evaluation complete.



