In [None]:
from dotenv import load_dotenv
load_dotenv()

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langsmith import Client
from langsmith import traceable


In [None]:
# LangSmith handle used further down for dataset management and evaluation.
# Constructed with no arguments — presumably configured from the environment
# populated by load_dotenv(); confirm against your .env.
client = Client()

# Groq-hosted chat model shared by both prompt variants.
llm = ChatGroq(
    temperature=0.7,
    model_name="meta-llama/llama-4-maverick-17b-128e-instruct",
)


In [None]:
# The two variants differ only in their system persona; the user turn is the
# same and carries the `{question}` placeholder filled in at invoke time.
_user_turn = ("user", "{question}")

# Variant 1: general-purpose assistant.
prompt_v1 = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant."), _user_turn]
)

# Variant 2: precise, witty, and capped at 20 words.
prompt_v2 = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a precise and witty assistant. Keep responses under 20 words."),
        _user_turn,
    ]
)

# Pipe each template into the shared model to obtain one runnable per variant.
chain_v1, chain_v2 = (template | llm for template in (prompt_v1, prompt_v2))


In [None]:
@traceable(name="Prompt V1 Run")
def run_v1(inputs: dict) -> dict:
    """Run the V1 chain and return its reply text.

    Args:
        inputs: Mapping passed unchanged to ``chain_v1.invoke``; the prompt
            template expects a ``"question"`` entry.

    Returns:
        A dict with a single ``"answer"`` key holding the model reply with
        surrounding whitespace removed.
    """
    reply = chain_v1.invoke(inputs)
    answer_text = reply.content.strip()
    return {"answer": answer_text}

@traceable(name="Prompt V2 Run")
def run_v2(inputs: dict) -> dict:
    """Run the V2 (concise and witty) chain and return its reply text.

    Args:
        inputs: Mapping passed unchanged to ``chain_v2.invoke``; the prompt
            template expects a ``"question"`` entry.

    Returns:
        A dict with a single ``"answer"`` key holding the model reply with
        surrounding whitespace removed.
    """
    reply = chain_v2.invoke(inputs)
    answer_text = reply.content.strip()
    return {"answer": answer_text}


In [None]:
dataset_name = "Prompt Eval Dataset"

# Reuse the dataset if it already exists. The name filter is applied by the
# server so we do not page through every dataset in the workspace; the local
# equality check is kept as a guard so only an exact name match is accepted.
existing = next(
    (
        ds
        for ds in client.list_datasets(dataset_name=dataset_name)
        if ds.name == dataset_name
    ),
    None,
)
if existing:
    dataset = existing
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Dataset to compare prompt versions using Groq"
    )

examples = [
    {
        "inputs": {"question": "What is the Turing Test?"},
        "outputs": {"answer": "The Turing Test evaluates if a machine exhibits human-like intelligence."},
    },
    {
        "inputs": {"question": "Define quantum computing in simple terms."},
        "outputs": {"answer": "Quantum computing uses quantum bits to perform complex calculations faster than traditional computers."},
    },
]

# Keep this cell idempotent: re-running it (or Restart & Run All) must not
# append the same examples again, otherwise every later evaluation is skewed
# by duplicated rows. Only upload questions the dataset does not hold yet.
known_questions = {
    (stored.inputs or {}).get("question")
    for stored in client.list_examples(dataset_id=dataset.id)
}
new_examples = [
    example
    for example in examples
    if example["inputs"]["question"] not in known_questions
]
if new_examples:
    client.create_examples(dataset_id=dataset.id, examples=new_examples)


In [None]:
# Use a dedicated judge instance rather than aliasing the generator `llm`:
# the generator samples at temperature 0.7, which would make Yes/No verdicts
# (and therefore scores) vary between otherwise identical evaluation runs.
# Same model, but temperature 0 for the most repeatable grading available.
judge_llm = ChatGroq(
    model_name=llm.model_name,
    temperature=0,
)

# Template for the correctness judge; filled with str.format using the
# `question`, `prediction` and `reference` fields.
JUDGE_PROMPT = """You are an evaluator.

Question: {question}
Predicted Answer: {prediction}
Reference Answer: {reference}

Is the predicted answer correct compared to the reference? Reply with only "Yes" or "No", then explain.
"""
def groq_correctness_evaluator(inputs, outputs, reference_outputs):
    """Ask the judge model whether the predicted answer matches the reference.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.
        outputs: Target-function outputs; ``outputs["answer"]`` is read.
        reference_outputs: Reference outputs; ``reference_outputs["answer"]``
            is read.

    Returns:
        A feedback dict with ``key`` ``"groq_correctness"``, a ``score`` of
        1.0 when the judge's reply begins with "yes" (case-insensitive) and
        0.0 otherwise, and the judge's full reply as ``comment``.

    Raises:
        KeyError: If any of the fields listed above is missing.
    """
    prompt = JUDGE_PROMPT.format(
        question=inputs["question"],
        prediction=outputs["answer"],
        reference=reference_outputs["answer"],
    )
    response = judge_llm.invoke(prompt).content.strip()

    # The prompt shows the verdict words in quotes, so the judge may echo
    # them back as "Yes" or wrap them in markdown emphasis (**Yes**). Drop
    # such leading decoration before matching so a positive verdict is not
    # silently scored as 0.
    verdict = response.lstrip("\"'`*_ \t\r\n").lower()
    score = 1.0 if verdict.startswith("yes") else 0.0
    return {
        "key": "groq_correctness",
        "score": score,
        "comment": response,
    }
    
def groq_conciseness_evaluator(inputs, outputs, reference_outputs):
    """Ask the judge model whether the answer is concise and free from fluff.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is read.
        outputs: Target-function outputs; ``outputs["answer"]`` is read.
        reference_outputs: Accepted for signature parity with the other
            evaluator; not used here.

    Returns:
        A feedback dict with ``key`` ``"groq_conciseness"``, a ``score`` of
        1.0 when the judge's reply begins with "yes" (case-insensitive) and
        0.0 otherwise, and the judge's full reply as ``comment``.

    Raises:
        KeyError: If ``"question"`` or ``"answer"`` is missing.
    """
    prompt = f"""You are evaluating response conciseness.

Question: {inputs["question"]}
Answer: {outputs["answer"]}

Is the answer concise and free from fluff? Reply "Yes" or "No" and explain."""

    result = judge_llm.invoke(prompt).content.strip()

    # The prompt shows the verdict words in quotes, so the judge may echo
    # them back as "Yes" or wrap them in markdown emphasis (**Yes**). Drop
    # such leading decoration before matching so a positive verdict is not
    # silently scored as 0.
    verdict = result.lstrip("\"'`*_ \t\r\n").lower()
    score = 1.0 if verdict.startswith("yes") else 0.0
    return {
        "key": "groq_conciseness",
        "score": score,
        "comment": result,
    }



In [None]:
# Score Prompt V1 against the dataset with both LLM-judged evaluators
# (correctness and conciseness), running at most two examples at a time.
results_v1 = client.evaluate(
    run_v1,
    max_concurrency=2,
    experiment_prefix="prompt-v1-correctness",
    evaluators=[
        groq_correctness_evaluator,
        groq_conciseness_evaluator,
    ],
    data=dataset.id,
)

print("✅ Prompt V1 Evaluation Complete")


In [None]:
# Score Prompt V2 against the same dataset with the same two evaluators so
# the resulting experiment is directly comparable with the V1 run.
results_v2 = client.evaluate(
    run_v2,
    max_concurrency=2,
    experiment_prefix="prompt-v2-correctness",
    evaluators=[
        groq_correctness_evaluator,
        groq_conciseness_evaluator,
    ],
    data=dataset.id,
)

print("✅ Prompt V2 Evaluation Complete")


In [None]:
# CELL INDEX: 0
# This cell imports necessary modules:
# - `load_dotenv` is used to load environment variables from a .env file.
# - `ChatGroq` is the Groq model integration for LangChain.
# - `ChatPromptTemplate` is used to define structured prompts for the model.
# - `Client` is used to interact with LangSmith for dataset and experiment management.
# - `traceable` is a LangSmith decorator that records each call of the decorated function as a traced run.

# CELL INDEX: 1
# This cell initializes the LangSmith client and sets up the Groq model (`ChatGroq`) with specific parameters:
# - `model_name` specifies the model to use.
# - `temperature` controls the randomness of the model's responses.

# CELL INDEX: 2
# This cell defines two prompt templates:
# - `prompt_v1` is a general assistant prompt.
# - `prompt_v2` is a concise and witty assistant prompt.
# It also creates chains (`chain_v1` and `chain_v2`) by combining the prompts with the Groq model.

# CELL INDEX: 3
# This cell defines two traceable functions:
# - `run_v1` executes the `chain_v1` and returns the model's response.
# - `run_v2` executes the `chain_v2` and returns the model's response.
# Both functions are decorated with `traceable`, so each call is recorded under its own run name ("Prompt V1 Run" / "Prompt V2 Run").

# CELL INDEX: 4
# This cell manages a dataset in LangSmith:
# - Checks if a dataset named "Prompt Eval Dataset" exists; if not, creates it.
# - Adds examples to the dataset for evaluation purposes.

# CELL INDEX: 5
# This cell defines evaluators for assessing model responses:
# - `groq_correctness_evaluator` evaluates if the predicted answer matches the reference answer.
# - `groq_conciseness_evaluator` evaluates if the response is concise and free from fluff.
# Both evaluators use the Groq model (`judge_llm`) to generate evaluation scores and comments.

# CELL INDEX: 6
# This cell evaluates `run_v1` using the dataset and evaluators:
# - Runs correctness and conciseness evaluations for Prompt V1.
# - Prints a completion message once the evaluation is done.

# CELL INDEX: 7
# This cell evaluates `run_v2` using the dataset and evaluators:
# - Runs correctness and conciseness evaluations for Prompt V2.
# - Prints a completion message once the evaluation is done.