This notebook performs comprehensive RAG (Retrieval-Augmented Generation) evaluation using:
- RAG setup from `0_rag_setup.ipynb`
- Test data from `2_generate_test_data.ipynb`

The following metrics are computed and summarized:
1. **Context Precision**
2. **Context Recall**
3. **Response Relevance**
4. **Faithfulness**
5. **Factual Correctness**

In [1]:
from helpers import (
    compute_context_precision_score,
    compute_context_recall_score,
    compute_response_relevance_score,
    compute_faithfulness_score,
    compute_factual_correctness_score,
)
from dotenv import load_dotenv
import pandas as pd
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from ragas.llms.base import llm_factory
from IPython.display import Markdown, display
import ast
import os

In [2]:
# Load settings from configs.env into the process environment
# (presumably the API credentials used by the OpenAI clients below — confirm).
load_dotenv(dotenv_path="configs.env")

True

In [3]:
# Model identifiers shared by the cells below.
LLM_MODEL = "gpt-4o-mini"  # passed to llm_factory and ChatOpenAI
EMBEDDING_MODEL = "text-embedding-3-small"  # passed to OpenAIEmbeddings

In [4]:
# Shared evaluation resources.
# embedding_model is handed to compute_response_relevance_score.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)
# llm_as_judge is the first argument of every compute_*_score helper call.
llm_as_judge = llm_factory(LLM_MODEL)

In [5]:
# --- filesystem layout ---
# Inputs: the generated test set and the persisted vector store
# (presumably written by the earlier setup / test-data notebooks — confirm).
TEST_DATA_PATH = "generated_test_data/test_data.csv"
VECTOR_DB_DIR = "vector_db"

# Output: where the RAG responses generated in this notebook are saved.
RESPONSE_DATA_DIR = "generated_responses"
RESPONSE_DATA_PATH = os.path.join(RESPONSE_DATA_DIR, "rag_responses_v1.csv")
os.makedirs(RESPONSE_DATA_DIR, exist_ok=True)  # no error if the directory already exists

Read generated test data

In [6]:
# read the generated test data
test_data_df = pd.read_csv(TEST_DATA_PATH)

# Fail fast if the columns this notebook relies on are absent:
# "user_input" is the question sent to the QA chain and "reference" is the
# reference answer passed to the recall and factual-correctness helpers.
# Without this check a bad file only surfaces after the expensive LLM calls.
missing_columns = {"user_input", "reference"} - set(test_data_df.columns)
assert not missing_columns, (
    f"{TEST_DATA_PATH} is missing columns: {sorted(missing_columns)}"
)

In [7]:
# Sanity check: (rows, columns) of the loaded test set.
test_data_df.shape

(10, 4)

In [8]:
# Preview the first five test cases.
test_data_df.head(n=5)

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What was the author's experience upon arriving...,['3 May. Bistritz._--Left Munich at 8:35 P. M....,The author arrived in Vienna early the next mo...,single_hop_specific_query_synthesizer
1,Wot kind of experiences can a traveler expect ...,['such as we see in old missals; sometimes we ...,"In London, a traveler can expect to encounter ...",single_hop_specific_query_synthesizer
2,What does the Count's letter indicate about th...,['4 May._--I found that my landlord had got a ...,The Count's letter directed the narrator's lan...,single_hop_specific_query_synthesizer
3,"In the context of travel narratives, how is th...",['5 May. The Castle._--The grey of the morning...,The term 'vrolok' is significant as it represe...,single_hop_specific_query_synthesizer
4,What is the significance of the Mittel Land in...,"['the Hospadars would not repair them, lest th...",The Mittel Land is described as a beautiful re...,single_hop_specific_query_synthesizer


Initialize vector database retriever

In [9]:
# === Load persistent vector DB ===
# Reuse the embedding_model created earlier instead of constructing a second
# OpenAIEmbeddings client, so retrieval and response-relevance scoring share
# exactly one embedding configuration.
vectordb = Chroma(
    persist_directory=VECTOR_DB_DIR,
    embedding_function=embedding_model,
)

In [10]:
# Chat model that generates the answers inside the QA chain.
# temperature=0 is requested (presumably to keep answers as repeatable as possible — confirm).
llm = ChatOpenAI(temperature=0, model=LLM_MODEL)

In [11]:
# Build the question-answering chain on top of the vector store.
doc_retriever = vectordb.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    retriever=doc_retriever,
    llm=llm,
    # the retrieved documents are read back later via resp["source_documents"]
    return_source_documents=True,
    chain_type="stuff",
)

Generate responses

In [12]:
# Ask the chain every test question, keeping the raw response objects in
# test-set order.
response_obj_arr = [
    qa_chain.invoke(question) for question in test_data_df["user_input"]
]

In [13]:
# Flatten the raw chain responses into one table row per question.
queries, answers, source_names, source_texts = [], [], [], []
for resp in response_obj_arr:
    retrieved_docs = resp["source_documents"]
    queries.append(resp["query"])
    answers.append(resp["result"])
    # one list per question: the "source" metadata entry of every retrieved document
    source_names.append([doc.metadata["source"] for doc in retrieved_docs])
    # one list per question: the text of every retrieved document
    source_texts.append([doc.page_content for doc in retrieved_docs])

response_df = pd.DataFrame(
    {
        "query": queries,
        "result": answers,
        "source_documents_names_arr": source_names,
        "source_documents_contents_arr": source_texts,
    }
)

In [14]:
# Persist the responses; index=False leaves the row index out of the file.
response_df.to_csv(path_or_buf=RESPONSE_DATA_PATH, index=False)

Read saved responses

In [15]:
# Reload the responses from disk; the list-valued columns come back as strings
# and are parsed with ast.literal_eval where they are needed.
response_df = pd.read_csv(filepath_or_buffer=RESPONSE_DATA_PATH)

In [16]:
# Preview the first five reloaded responses.
response_df.head(n=5)

Unnamed: 0,query,result,source_documents_names_arr,source_documents_contents_arr
0,What was the author's experience upon arriving...,"The author, Jonathan Harker, arrived in Vienna...","['data_files/Dracula.txt', 'data_files/Dracula...",['How these papers have been placed in sequenc...
1,Wot kind of experiences can a traveler expect ...,A traveler arriving in London can expect a ric...,"['data_files/Dracula.txt', 'data_files/Dracula...",['All day long we seemed to dawdle through a c...
2,What does the Count's letter indicate about th...,The Count's letter indicates that the narrator...,"['data_files/Dracula.txt', 'data_files/Dracula...","['The Count halted, putting down my bags, clos..."
3,"In the context of travel narratives, how is th...","In the context of travel narratives, the term ...","['data_files/Dracula.txt', 'data_files/Dracula...",['When I got on the coach the driver had not t...
4,What is the significance of the Mittel Land in...,The Mittel Land is significant in the passage ...,"['data_files/Dracula.txt', 'data_files/Dracula...",['Beyond the green swelling hills of the Mitte...


# Context Precision

In [17]:
# compute context precision scores
precision_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context precision score
    precision_score = await compute_context_precision_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    precision_score_arr.append(precision_score)

In [18]:
# Per-question context precision scores, in response_df row order.
print(precision_score_arr)

[0.9999999999, 0.9999999999, 0.6388888888675925, 0.8055555555287036, 0.99999999995, 0.999999999975, 0.999999999975, 0.999999999975, 0.9999999999666667, 0.9166666666361111]


In [19]:
# Mean context precision over all scored responses (0 when nothing was scored).
if precision_score_arr:
    average_precision = sum(precision_score_arr) / len(precision_score_arr)
else:
    average_precision = 0
print(f"Average Context Precision: {average_precision}")

Average Context Precision: 0.9361111110674074


# Context Recall

In [20]:
# compute context recall scores
recall_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context recall score
    recall_score = await compute_context_recall_score(
        llm_as_judge,
        row["query"],
        test_data_df.loc[i, "reference"],
        retrieved_context_arr,
    )
    recall_score_arr.append(recall_score)

In [21]:
# Per-question context recall scores, in response_df row order.
print(recall_score_arr)

[1.0, 0.8, 0.0, 1.0, 0.5, 0.2857142857142857, 0.5, 0.6666666666666666, 1.0, 0.4]


In [22]:
# Mean context recall over all scored responses (0 when nothing was scored).
if recall_score_arr:
    average_recall = sum(recall_score_arr) / len(recall_score_arr)
else:
    average_recall = 0
print(f"Average Context Recall: {average_recall}")

Average Context Recall: 0.6152380952380953


# Response Relevance

In [23]:
# compute response relevance scores
relevance_score_arr = []
for i, row in response_df.iterrows():

    # compute response relevance score
    relevance_score = await compute_response_relevance_score(
        llm_as_judge,
        embedding_model,
        row["query"],
        row["result"],
    )
    relevance_score_arr.append(relevance_score)

In [24]:
# Per-question response relevance scores, in response_df row order.
print(relevance_score_arr)

[np.float64(0.7258745271466869), np.float64(0.8883869251353049), np.float64(0.9010497172083558), np.float64(0.8846601811379675), np.float64(0.9432960012916567), np.float64(0.9346393395408618), np.float64(0.6710805271893379), np.float64(0.9690563757521421), np.float64(0.6945021232044878), np.float64(0.0)]


In [25]:
# Mean response relevance over all scored responses (0 when nothing was scored).
if relevance_score_arr:
    average_relevance = sum(relevance_score_arr) / len(relevance_score_arr)
else:
    average_relevance = 0
print(f"Average Response Relevance: {average_relevance}")

Average Response Relevance: 0.76125457176068


# Faithfulness

In [26]:
# compute faithfulness scores
faithfulness_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute faithfulness score
    faithfulness_score = await compute_faithfulness_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    faithfulness_score_arr.append(faithfulness_score)

In [27]:
# Per-question faithfulness scores, in response_df row order.
print(faithfulness_score_arr)

[1.0, 0.06666666666666667, 1.0, 0.9090909090909091, 1.0, 0.6111111111111112, 0.5, 0.6363636363636364, 1.0, 1.0]


In [28]:
# Mean faithfulness over all scored responses (0 when nothing was scored).
if faithfulness_score_arr:
    average_faithfulness = sum(faithfulness_score_arr) / len(faithfulness_score_arr)
else:
    average_faithfulness = 0
print(f"Average Response Faithfulness: {average_faithfulness}")

Average Response Faithfulness: 0.7723232323232323


# Factual Correctness

In [29]:
# compute factual correctness scores
factual_correctness_score_arr = []
for i, row in response_df.iterrows():

    # compute factual correctness score
    factual_correctness_score = await compute_factual_correctness_score(
        llm_as_judge,
        row["result"],
        test_data_df.loc[i, "reference"],
    )
    factual_correctness_score_arr.append(factual_correctness_score)

In [30]:
# Per-question factual correctness scores, in response_df row order.
print(factual_correctness_score_arr)

[np.float64(0.67), np.float64(0.33), np.float64(0.36), np.float64(0.8), np.float64(0.25), np.float64(0.24), np.float64(0.57), np.float64(0.64), np.float64(0.64), np.float64(0.08)]


In [31]:
# Mean factual correctness over all scored responses (0 when nothing was scored).
if factual_correctness_score_arr:
    average_factual_correctness = sum(factual_correctness_score_arr) / len(
        factual_correctness_score_arr
    )
else:
    average_factual_correctness = 0
print(f"Average Response Factual Correctness: {average_factual_correctness}")

Average Response Factual Correctness: 0.458


# Summary of results

In [32]:
# Assemble the summary as a Markdown table: one row per metric, each average
# formatted to three decimal places.
summary_rows = [
    ("Context Precision", average_precision),
    ("Context Recall", average_recall),
    ("Response relevance", average_relevance),
    ("Faithfulness", average_faithfulness),
    ("Factual Correctness", average_factual_correctness),
]
table_lines = ["|Evaluation Metric| Average Score|", "|----|----|"]
table_lines += [f"|{label}|{score:.3f}|" for label, score in summary_rows]
# Leading and trailing newlines are kept so the text matches the table layout exactly.
markdown_text = "\n" + "\n".join(table_lines) + "\n"

In [33]:
# Render the summary table as formatted Markdown in the cell output.
display(Markdown(markdown_text))


|Evaluation Metric| Average Score|
|----|----|
|Context Precision|0.936|
|Context Recall|0.615|
|Response relevance|0.761|
|Faithfulness|0.772|
|Factual Correctness|0.458|
