In [None]:
# %pip install pytest

In [8]:
# Reference sample for the evaluation: one question together with the answer
# the LLM is expected to produce and the context the retriever is expected
# to return.
reference_data = [
    dict(
        question="What is the company's policy on remote work?",
        ground_truth="Remote work is allowed up to 3 days per week.",  # expected LLM-generated answer
        context="Remote work is allowed up to 3 days per week.",  # expected retrieved context
    )
]

# Unpack the first (and only) sample into the names the later cells use.
question, ground_truth, context = (
    reference_data[0][field] for field in ("question", "ground_truth", "context")
)

print(f"question: {question}")
print(f"ground_truth: {ground_truth}")
print(f"context: {context}")

question: What is the company's policy on remote work?
ground_truth: Remote work is allowed up to 3 days per week.
context: Remote work is allowed up to 3 days per week.


In [9]:
# Retrieve context from Milvus DB

from milvus_chatbot_with_rag import retrieve_similiar_contexts, generate_answer

def perform_retrieval(question, collection_name="employee_policies"):
    """Return the ``content`` of the first context retrieved for ``question``.

    Args:
        question: Query text passed to ``retrieve_similiar_contexts``.
        collection_name: Second argument passed to
            ``retrieve_similiar_contexts`` (presumably the Milvus collection
            to search -- confirm against ``milvus_chatbot_with_rag``).
            Defaults to ``"employee_policies"``, the value that was
            previously hard-coded.

    Returns:
        The ``content`` field of the first hit.

    Raises:
        IndexError: If the retriever returns no hits.
    """
    # The third argument (1) is presumably the number of hits requested --
    # confirm against milvus_chatbot_with_rag.
    hits = retrieve_similiar_contexts(question, collection_name, 1)
    try:
        first_hit = hits[0]
    except IndexError as exc:
        # Same exception type as before, but the message now says what was
        # searched instead of a bare "list index out of range".
        raise IndexError(
            f"no context retrieved from {collection_name!r} for question {question!r}"
        ) from exc

    retrieved_context = first_hit['content']
    print (f"perform_retrieval.retrieved_context: {retrieved_context}")
    return retrieved_context

# Generate answer using LLM

# Take the question from the reference sample instead of re-typing the literal,
# so this smoke test cannot drift from the data the evaluation cells use.
question = reference_data[0]['question']
# NOTE: this rebinds `context`; from here on it holds the retrieved context,
# not the reference context from `reference_data`.
context = perform_retrieval(question)
answer = generate_answer(question, context)
# Trailing expression so the notebook displays the generated answer.
answer

perform_retrieval.retrieved_context: Remote work is allowed up to 3 days per week.


'Remote work is allowed up to 3 days per week.'

In [10]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

from dotenv import load_dotenv
from openai import OpenAI
import os

# --- Load API Key ---
# override=True lets values from ../.env replace variables already set in the
# process environment.
load_dotenv(dotenv_path="../.env", override=True)
my_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=my_api_key)

# Pull the user question, the ground-truth answer and the reference context
# (all plain strings) out of the first reference sample.
question, ground_truth, reference_context = (
    reference_data[0][field] for field in ("question", "ground_truth", "context")
)

# Retrieve once and generate from that same context. The retrieved string is
# wrapped in a list because the "contexts" column holds a list of strings per row.
retrieved_context = [perform_retrieval(question)]
llm_answer = generate_answer(question, retrieved_context[0])

# Build the one-row dataset: every column is a list with one entry per row.
dataset_dict = dict(
    question=[question],
    contexts=[retrieved_context],    # list of strings INSIDE another list
    ground_truth=[ground_truth],     # one reference answer per row
    answer=[llm_answer],
)

print(f"dataset_dict: {dataset_dict}")

ragas_dataset = Dataset.from_dict(dataset_dict)

perform_retrieval.retrieved_context: Remote work is allowed up to 3 days per week.
dataset_dict: {'question': ["What is the company's policy on remote work?"], 'contexts': [['Remote work is allowed up to 3 days per week.']], 'ground_truth': ['Remote work is allowed up to 3 days per week.'], 'answer': ['Remote work is allowed up to 3 days per week.']}


In [11]:
# Run the ragas evaluation over the dataset built above, computing only the
# faithfulness metric.
results = evaluate(
    metrics=[faithfulness],  # just one metric
    dataset=ragas_dataset,
)

print("LLM Generation Evaluation Results:")
# Trailing expression so the notebook renders the scores as a table.
results.to_pandas()

Evaluating: 100%|██████████| 1/1 [00:03<00:00,  3.89s/it]


LLM Generation Evaluation Results:


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness
0,What is the company's policy on remote work?,[Remote work is allowed up to 3 days per week.],Remote work is allowed up to 3 days per week.,Remote work is allowed up to 3 days per week.,1.0


In [13]:
from ragas.llms.base import llm_factory
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_similarity

# Build the LLM wrapper that is handed to the evaluation below.
llm = llm_factory("gpt-4o-mini")

# Evaluate the same dataset with answer correctness and answer similarity.
# NOTE: this rebinds `results`, replacing the earlier faithfulness result.
results = evaluate(
    llm=llm,
    metrics=[answer_correctness, answer_similarity],
    dataset=ragas_dataset,
)

print("LLM Generation Evaluation Results:")
# Trailing expression so the notebook renders the scores as a table.
results.to_pandas()

Evaluating: 100%|██████████| 2/2 [00:05<00:00,  2.74s/it]


LLM Generation Evaluation Results:


Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness,answer_similarity
0,What is the company's policy on remote work?,[Remote work is allowed up to 3 days per week.],Remote work is allowed up to 3 days per week.,Remote work is allowed up to 3 days per week.,1.0,1.0
