In [4]:
# %pip install pytest

In [5]:
reference_data = [
  {
    "question": "What is the company's policy on remote work?", 
    "ground_truth": "Remote work is allowed up to 3 days per week.", #Expected llm generated answer
    "context": "Remote work is allowed up to 3 days per week." #Expected retrieved context
  }
]
question = reference_data[0]['question']
ground_truth = reference_data[0]['ground_truth']
context = reference_data[0]['context']
print (f"question: {question}")
print (f"ground_truth: {ground_truth}")
print (f"context: {context}")

question: What is the company's policy on remote work?
ground_truth: Remote work is allowed up to 3 days per week.
context: Remote work is allowed up to 3 days per week.


In [6]:
# Retrieve context from Milvus DB

from milvus_chatbot_with_rag import retrieve_similiar_contexts, generate_answer

def perform_retrieval(question):

    retrieved_context = retrieve_similiar_contexts(question, "employee_policies", 1)[0]['content']
    print (f"perform_retrieval.retrieved_context: {retrieved_context}")
    return retrieved_context

# Generate answer using LLM

question = "What is the company's policy on remote work?"
context = perform_retrieval(question)
answer = generate_answer(question, context)
answer

perform_retrieval.retrieved_context: Remote work is allowed up to 3 days per week.


'Remote work is allowed up to 3 days per week.'

In [7]:
%pip install ragas datasets 

Collecting ragas
  Downloading ragas-0.4.0-py3-none-any.whl.metadata (22 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.12.0-cp313-cp313-win_amd64.whl.metadata (6.9 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting typer (from ragas)
  Downloading typer-0.20.0-py3-none-any.whl.metadata (16 kB)
Collecting rich (from ragas)
  Downloading rich-14.2.0-py3-none-any.whl.metadata (18 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.13.0-py3-none-any.whl.metadata (11 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.5-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting langchain (from ragas)
  Downloading langchain-1.1.2-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-core (fro


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

from dotenv import load_dotenv
from openai import OpenAI
import os

# --- Load API Key ---
load_dotenv(override=True, dotenv_path="../.env")
my_api_key = os.getenv("OPENAI_API_KEY")


client = OpenAI(api_key=my_api_key)

# Question User asked
question = reference_data[0]['question']

# Reference context (should be a string)
reference_context = reference_data[0]['context']

# ground truth answer
ground_truth = reference_data[0]['ground_truth']

# Retrieved context (a string from perform_retrieval)
retrieved_context = [perform_retrieval(question)]
llm_answer = generate_answer(question, retrieved_context[0])

# Build dataset properly
dataset_dict = {
    "question": [question],
    "contexts": [retrieved_context],    # list of strings INSIDE another list
    "ground_truth": [ground_truth],   # single string/ reference answer
    "answer": [llm_answer]
}

print(f"dataset_dict: {dataset_dict}")

ragas_dataset = Dataset.from_dict(dataset_dict)

perform_retrieval.retrieved_context: Remote work is allowed up to 3 days per week.
dataset_dict: {'question': ["What is the company's policy on remote work?"], 'contexts': [['Remote work is allowed up to 3 days per week.']], 'ground_truth': ['Remote work is allowed up to 3 days per week.'], 'answer': ['Remote work is allowed up to 3 days per week.']}


In [9]:
from ragas.llms.base import llm_factory
from ragas import evaluate
from ragas.metrics import answer_correctness

results = evaluate(
    dataset=ragas_dataset,
    metrics=[faithfulness, answer_correctness]  
)


print("LLM Generation Evaluation Results:")
results.to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Exception raised in Job[1]: TypeError(Cannot use aembed_text() with a synchronous client. Use embed_text() instead.)


LLM Generation Evaluation Results:


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_correctness
0,What is the company's policy on remote work?,[Remote work is allowed up to 3 days per week.],Remote work is allowed up to 3 days per week.,Remote work is allowed up to 3 days per week.,1.0,


In [10]:
from ragas.llms.base import llm_factory
from ragas import evaluate
from ragas.metrics import answer_correctness

# Create the modern LLM wrapper
client = OpenAI()
llm = llm_factory("gpt-4o-mini", client=client)

# Run evaluation
results = evaluate(
    dataset=ragas_dataset,
    metrics=[answer_correctness],
    llm=llm
)

print("LLM Generation Evaluation Results:")
results.to_pandas()

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Exception raised in Job[0]: TypeError(Cannot use aembed_text() with a synchronous client. Use embed_text() instead.)


LLM Generation Evaluation Results:


Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness
0,What is the company's policy on remote work?,[Remote work is allowed up to 3 days per week.],Remote work is allowed up to 3 days per week.,Remote work is allowed up to 3 days per week.,
