In [None]:
# Step 1: Define Sample Documents
from sentence_transformers import SentenceTransformer

# Every sample record lives in the same section, so only the text varies.
documents = [
    {"section": "Employee Info", "content": text}
    for text in (
        "John's pay is processed on the 1st of every month.",
        "Mark is on a leave of absence until next Monday.",
        "Julie is a software engineer.",
        "Julie's pay is processed on the 1st of every month.",
        "Mark is a product manager.",
        "John is an AI architect and has salary of 500K USD.",
    )
]

# Step 2: Get Content Texts
# Plain list of the text of each record, in the same order as `documents`.
content_corpus = [entry["content"] for entry in documents]
content_corpus

# Encode the whole corpus once; later cells reuse `model` and `doc_vectors`.
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_vectors = model.encode(content_corpus)

doc_vectors
print(doc_vectors.shape)

In [None]:
# Step 3: User Query and Semantic Matching
import numpy as np

query = "Tell me about John's role."
# encode() is given a one-element list, so take the single resulting vector.
query_vec = model.encode([query])[0]

# Score the query against every document vector, then flatten the result
# into a 1D numpy array so it can be sorted and indexed below.
similarities = np.asarray(model.similarity(query_vec, doc_vectors)).squeeze()

# argsort is ascending: reverse it and keep the first three for the top 3.
top_3_indices = np.argsort(similarities)[::-1][:3]
top_scores = similarities[top_3_indices]
top_scores

top_docs = [documents[idx]["content"] for idx in top_3_indices]
top_docs

# The retrieved passages are joined with a visible separator to form the
# context string handed to the LLM in a later cell.
context = "\n---\n".join(top_docs)
top_docs, context

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Read settings from the .env file one directory above the notebook;
# override=True is passed so values from the file replace existing ones.
load_dotenv(dotenv_path="../.env", override=True)

# The key is looked up under the name OPEN_AI_API_KEY.
my_api_key = os.getenv("OPEN_AI_API_KEY")

# Shared client used by ask_question_open_ai below.
my_client = OpenAI(api_key=my_api_key)

def ask_question_open_ai(prompt, context=''):
    """Answer `prompt` with the chat model, using `context` as grounding text.

    The question is taken from the `prompt` argument rather than from any
    notebook-level variable, so the function can be called once per
    evaluation example.

    Args:
        prompt: The user's question.
        context: Text placed in front of the question in the user message;
            defaults to an empty string.

    Returns:
        The message content of the first choice in the completion.
    """
    system_message = {"role": "system", "content": '''
             You are an assistant who answers only based on the given context.
             '''}
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nUser Question: {prompt}",
    }

    # Uses the notebook-level `my_client` created alongside this function.
    completion = my_client.chat.completions.create(
        model="gpt-5-nano",
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content

In [None]:
# Show the question, then answer it with the context retrieved for it.
print(query)
response = ask_question_open_ai(query, context)

# Last expression of the cell, so the answer is rendered as the cell output.
response

#### Create a LangSmith Dataset

In [None]:
from langsmith import Client

client = Client()

dataset_name = "2025Dec-Employee-Info-QA-Dataset-3"

# Question / reference-answer pairs; each reference mirrors one of the
# sample documents so a grounded answer can be judged against it.
examples = [
    {"input": "When is John's pay processed?", "output": "John's pay is processed on the 1st of every month."},
    {"input": "What is Julie's job title?", "output": "Julie is a software engineer."},
    {"input": "What is John's salary?", "output": "John has a salary of 500K USD."},
    {"input": "What is Mark's current work status?", "output": "Mark is on a leave of absence until next Monday."},
]

# Make the cell safe to re-run: creating a dataset under a name that is
# already taken is rejected by LangSmith, and blindly re-adding the examples
# would duplicate them. Reuse the existing dataset instead.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f" Dataset '{dataset_name}' already exists; reusing it without adding examples.")
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    for ex in examples:
        # Keys "question" / "answer" are the ones the target function and
        # the evaluator read back later in the notebook.
        client.create_example(
            inputs={"question": ex["input"]},
            outputs={"answer": ex["output"]},
            dataset_id=dataset.id,
        )
    print(f" Dataset '{dataset_name}' created with {len(examples)} examples.")

#### Run LangSmith Evaluation

In [None]:
# %pip install --upgrade --no-cache-dir langsmith

import langsmith
import langchain

# Print the installed versions, langsmith first, one per line.
for package in (langsmith, langchain):
    print(package.__version__)

In [None]:
# !pip index versions langsmith

In [None]:
import os
import json
from openai import OpenAI
from langsmith.evaluation import RunEvaluator

class SimpleCorrectness(RunEvaluator):
    """LLM-as-a-judge correctness evaluator (version-safe).

    Compares the answer produced by a run with the reference answer stored on
    the dataset example by asking an OpenAI chat model for a score in [0, 1].
    """

    def evaluate_run(self, run, example, **kwargs):
        """Score a single run against its reference example.

        Args:
            run: Run whose ``outputs`` mapping carries the prediction under
                ``"answer"``, ``"output_text"`` or ``"result"`` (first
                non-empty one wins).
            example: Dataset example; the reference is read from
                ``outputs["answer"]`` and the question from
                ``inputs["question"]``.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A dict with ``key`` (always ``"correctness"``), ``score`` (float
            in [0, 1]) and ``comment`` (explanation). A missing reference or
            prediction, or any failure inside the judge, yields a score of
            0.0 with the cause in ``comment``.
        """
        # Any of these mappings may be None (for example when a run produced
        # no outputs), so fall back to an empty dict instead of crashing.
        run_outputs = run.outputs or {}
        example_outputs = example.outputs or {}
        example_inputs = example.inputs or {}

        # str() guards against non-string values before stripping.
        ref = str(example_outputs.get("answer") or "").strip()
        pred = str(
            run_outputs.get("answer")
            or run_outputs.get("output_text")
            or run_outputs.get("result")
            or ""
        ).strip()

        question = example_inputs.get("question", "")

        # The explanation is returned under "comment": that is the field
        # LangSmith stores alongside the score.
        if not ref or not pred:
            return {
                "key": "correctness",
                "score": 0.0,
                "comment": "Missing reference or prediction",
            }

        try:
            score, reason = self.llm_as_a_judge(ref, pred, question)
        except Exception as e:
            # Best effort: a judge failure is reported as a zero score with
            # the error text attached rather than aborting the evaluation.
            return {
                "key": "correctness",
                "score": 0.0,
                "comment": f"Evaluator error: {e}",
            }

        return {
            "key": "correctness",
            "score": score,
            "comment": reason,
        }

    def llm_as_a_judge(self, reference: str, prediction: str, question: str = ""):
        """Ask the judge model how well ``prediction`` matches ``reference``.

        Args:
            reference: Expected answer.
            prediction: Answer produced by the system under test.
            question: Original question, included in the prompt for context.

        Returns:
            Tuple ``(score, reason)`` where ``score`` is clamped to [0, 1].

        Raises:
            ValueError: If the judge returns no content or a NaN score.
            Exception: Anything raised by the OpenAI client or by parsing the
                reply (e.g. invalid JSON, missing ``"score"``) propagates to
                the caller.
        """
        # The notebook's .env loader reads OPEN_AI_API_KEY, so accept that
        # spelling as well as the standard OPENAI_API_KEY.
        api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPEN_AI_API_KEY")
        client = OpenAI(api_key=api_key)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            # JSON mode: keeps the reply parseable by json.loads (no markdown
            # fences or surrounding prose).
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "system",
                    "content": "You are a semantic correctness evaluator."
                },
                {
                    "role": "user",
                    "content": f"""
Question: {question}
Reference answer: {reference}
Model prediction: {prediction}

Return JSON only:
{{"score": <number between 0 and 1>, "reason": "<short explanation>"}}
"""
                }
            ],
        )

        content = response.choices[0].message.content
        if not content:
            raise ValueError("Judge returned an empty response")
        data = json.loads(content)

        score = float(data["score"])
        # NaN compares unequal to itself; without this check the clamp below
        # would turn a NaN score into 1.0.
        if score != score:
            raise ValueError("Judge returned a NaN score")
        reason = str(data.get("reason", ""))

        return max(0.0, min(1.0, score)), reason

In [None]:
# from langchain_openai import ChatOpenAI
from langsmith import traceable # Need to enable tracing on LangSmith

# Define your target function that performs retrieval per-question
@traceable
def ask_question(inputs):
    """Answer one dataset question using retrieval over the sample corpus.

    Embeds ``inputs["question"]``, ranks the pre-computed ``doc_vectors`` by
    cosine similarity, joins the three best-matching texts from
    ``content_corpus`` into a context string and passes question and context
    to ``ask_question_open_ai``.

    Args:
        inputs: Mapping with a ``"question"`` entry.

    Returns:
        ``{"answer": <model reply>}``.
    """
    import numpy as np

    question = inputs["question"]
    question_vec = model.encode([question])[0]

    # Cosine similarity; the small epsilon keeps the division safe if a
    # vector has zero length.
    eps = 1e-8
    question_norm = np.linalg.norm(question_vec) + eps
    corpus_norms = np.linalg.norm(doc_vectors, axis=1) + eps
    scores = np.dot(doc_vectors, question_vec) / (corpus_norms * question_norm)

    # Highest scores first, keep three.
    best_indices = np.argsort(scores)[::-1][:3]
    retrieved_context = "\n---\n".join(content_corpus[i] for i in best_indices)

    return {"answer": ask_question_open_ai(question, retrieved_context)}

In [None]:
import os
import json
from openai import OpenAI
from langsmith.evaluation import RunEvaluator

class SimpleCorrectness(RunEvaluator):
    """LLM-as-a-judge correctness evaluator (version-safe).

    Compares the answer produced by a run with the reference answer stored on
    the dataset example by asking an OpenAI chat model for a score in [0, 1].
    """

    def evaluate_run(self, run, example, **kwargs):
        """Score a single run against its reference example.

        Args:
            run: Run whose ``outputs`` mapping carries the prediction under
                ``"answer"``, ``"output_text"`` or ``"result"`` (first
                non-empty one wins).
            example: Dataset example; the reference is read from
                ``outputs["answer"]`` and the question from
                ``inputs["question"]``.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A dict with ``key`` (always ``"correctness"``), ``score`` (float
            in [0, 1]) and ``comment`` (explanation). A missing reference or
            prediction, or any failure inside the judge, yields a score of
            0.0 with the cause in ``comment``.
        """
        # Any of these mappings may be None (for example when a run produced
        # no outputs), so fall back to an empty dict instead of crashing.
        run_outputs = run.outputs or {}
        example_outputs = example.outputs or {}
        example_inputs = example.inputs or {}

        # str() guards against non-string values before stripping.
        ref = str(example_outputs.get("answer") or "").strip()
        pred = str(
            run_outputs.get("answer")
            or run_outputs.get("output_text")
            or run_outputs.get("result")
            or ""
        ).strip()

        question = example_inputs.get("question", "")

        # The explanation is returned under "comment": that is the field
        # LangSmith stores alongside the score.
        if not ref or not pred:
            return {
                "key": "correctness",
                "score": 0.0,
                "comment": "Missing reference or prediction",
            }

        try:
            score, reason = self.llm_as_a_judge(ref, pred, question)
        except Exception as e:
            # Best effort: a judge failure is reported as a zero score with
            # the error text attached rather than aborting the evaluation.
            return {
                "key": "correctness",
                "score": 0.0,
                "comment": f"Evaluator error: {e}",
            }

        return {
            "key": "correctness",
            "score": score,
            "comment": reason,
        }

    def llm_as_a_judge(self, reference: str, prediction: str, question: str = ""):
        """Ask the judge model how well ``prediction`` matches ``reference``.

        Args:
            reference: Expected answer.
            prediction: Answer produced by the system under test.
            question: Original question, included in the prompt for context.

        Returns:
            Tuple ``(score, reason)`` where ``score`` is clamped to [0, 1].

        Raises:
            ValueError: If the judge returns no content or a NaN score.
            Exception: Anything raised by the OpenAI client or by parsing the
                reply (e.g. invalid JSON, missing ``"score"``) propagates to
                the caller.
        """
        # The notebook's .env loader reads OPEN_AI_API_KEY, so accept that
        # spelling as well as the standard OPENAI_API_KEY.
        api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPEN_AI_API_KEY")
        client = OpenAI(api_key=api_key)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            # JSON mode: keeps the reply parseable by json.loads (no markdown
            # fences or surrounding prose).
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "system",
                    "content": "You are a semantic correctness evaluator."
                },
                {
                    "role": "user",
                    "content": f"""
Question: {question}
Reference answer: {reference}
Model prediction: {prediction}

Return JSON only:
{{"score": <number between 0 and 1>, "reason": "<short explanation>"}}
"""
                }
            ],
        )

        content = response.choices[0].message.content
        if not content:
            raise ValueError("Judge returned an empty response")
        data = json.loads(content)

        score = float(data["score"])
        # NaN compares unequal to itself; without this check the clamp below
        # would turn a NaN score into 1.0.
        if score != score:
            raise ValueError("Judge returned a NaN score")
        reason = str(data.get("reason", ""))

        return max(0.0, min(1.0, score)), reason

In [None]:
from langsmith.evaluation import evaluate

# Single evaluator instance passed to evaluate() below.
simple_correctness = SimpleCorrectness()

# Run the evaluation: `ask_question` is the target, `dataset_name` selects
# the data, and `simple_correctness` is the only evaluator. The experiment
# is tagged with the "langsmith_eval_test" prefix.
# NOTE(review): presumably the target is called once per dataset example with
# that example's inputs - confirm against the LangSmith evaluate() docs.
results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[simple_correctness],
    experiment_prefix="langsmith_eval_test",
)