- Run evaluation metrics on generated test data

In [1]:
from helpers import (
    compute_context_precision_score,
    compute_context_recall_score,
    compute_response_relevance_score,
    compute_faithfulness_score,
    compute_factual_correctness_score,
)
from dotenv import load_dotenv
import pandas as pd
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from ragas.llms.base import llm_factory
from IPython.display import Markdown, display
import ast
import os

In [2]:
# load environment variables from configs.env before any client is created
# (presumably this is where the OpenAI API key comes from — confirm)
load_dotenv("configs.env")

True

In [3]:
# initialize LLMs and embedding models
# llm_as_judge is the first argument passed to every compute_*_score helper below
llm_as_judge = llm_factory("gpt-4o-mini")
# embedding_model is handed to the response relevance helper
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
# --- filesystem locations used throughout the notebook ---

# inputs: the persisted vector store and the generated test set
VECTOR_DB_DIR = "vector_db"
TEST_DATA_PATH = "generated_test_data/test_data.csv"

# outputs: RAG responses are written to their own folder, created if missing
RESPONSE_DATA_DIR = "generated_responses"
RESPONSE_DATA_PATH = os.path.join(RESPONSE_DATA_DIR, "rag_responses_v1.csv")
os.makedirs(RESPONSE_DATA_DIR, exist_ok=True)

Read generated test data

In [5]:
# read the generated test data
# the columns used later are "user_input" (the question) and "reference" (the expected answer)
test_data_df = pd.read_csv(TEST_DATA_PATH)

In [6]:
# (rows, columns) of the loaded test set
test_data_df.shape

(10, 4)

In [7]:
# preview the first rows of the test set
test_data_df.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What impression did Jonathan Harker have of Bu...,['JONATHAN HARKER’S JOURNAL (_Kept in shorthan...,Jonathan Harker described Buda-Pesth as a wond...,single_hop_specific_query_synthesizer
1,What is the Golden Krone Hotel like?,['saw little towns or castles on the top of st...,The Golden Krone Hotel is described as thoroug...,single_hop_specific_query_synthesizer
2,What significance does St. George’s Day hold i...,['4 May._--I found that my landlord had got a ...,St. George’s Day is significant as it is descr...,single_hop_specific_query_synthesizer
3,"In the context of the narrative, what signific...",['5 May. The Castle._--The grey of the morning...,'Ordog' translates to 'Satan' in the local lan...,single_hop_specific_query_synthesizer
4,Why the Hospadars no fix things for fear of th...,"['the Hospadars would not repair them, lest th...","The Hospadars would not repair them, lest the ...",single_hop_specific_query_synthesizer


Initialize the vector store, LLM, and retrieval QA chain

In [8]:
# === Load persistent vector DB ===
# Reuse the embedding_model created in the setup cell instead of constructing a
# second, identically configured OpenAIEmbeddings client, so the embedding model
# name is defined in exactly one place.
# NOTE(review): this should be the same model the index in VECTOR_DB_DIR was built
# with — confirm against the ingestion notebook.
vectordb = Chroma(
    persist_directory=VECTOR_DB_DIR,
    embedding_function=embedding_model,
)

In [9]:
# Set up the LLM that answers the questions inside the QA chain
# (separate object from llm_as_judge, which is only passed to the scoring helpers)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [10]:
# Set up the RetrievalQA chain
# NOTE(review): RetrievalQA appears to be a legacy LangChain API — confirm it is
# still supported by the pinned langchain version.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    # needed: the cells below read resp["source_documents"] to recover the retrieved contexts
    return_source_documents=True,
)

Generate responses

In [11]:
# run every test question through the QA chain, keeping the raw response objects
response_obj_arr = [
    qa_chain.invoke(question) for question in test_data_df["user_input"]
]

In [12]:
# flatten the raw chain responses into one row per question
queries, answers, source_names, source_contents = [], [], [], []
for resp in response_obj_arr:
    retrieved_docs = resp["source_documents"]
    queries.append(resp["query"])
    answers.append(resp["result"])
    # one list per question: the source file of each retrieved document ...
    source_names.append([doc.metadata["source"] for doc in retrieved_docs])
    # ... and the text of each retrieved document
    source_contents.append([doc.page_content for doc in retrieved_docs])

response_df = pd.DataFrame(
    {
        "query": queries,
        "result": answers,
        "source_documents_names_arr": source_names,
        "source_documents_contents_arr": source_contents,
    }
)

In [13]:
# persist the responses; index=False keeps the row index out of the file
response_df.to_csv(RESPONSE_DATA_PATH, index=False)

Evaluate responses

In [14]:
# reload the saved responses; the list-valued *_arr columns come back as strings,
# which is why the metric cells parse them with ast.literal_eval
response_df = pd.read_csv(RESPONSE_DATA_PATH)

In [15]:
# preview the first rows of the reloaded responses
response_df.head()

Unnamed: 0,query,result,source_documents_names_arr,source_documents_contents_arr
0,What impression did Jonathan Harker have of Bu...,Jonathan Harker had a positive impression of B...,"['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
1,What is the Golden Krone Hotel like?,The Golden Krone Hotel is described as thoroug...,"['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
2,What significance does St. George’s Day hold i...,"In the context of the narrative, St. George's ...","['data_files/Dracula.txt', 'data_files/Dracula...",['“It is the eve of St. George’s Day. Do you n...
3,"In the context of the narrative, what signific...","In the context of the narrative, the term 'Ord...","['data_files/Dracula.txt', 'data_files/Dracula...",['When I got on the coach the driver had not t...
4,Why the Hospadars no fix things for fear of th...,The Hospadars did not repair the roads for fea...,"['data_files/Dracula.txt', 'data_files/Dracula...","['road is in summertime excellent, but that it..."


# Context Precision

In [16]:
# compute context precision scores
precision_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context precision score
    precision_score = await compute_context_precision_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    precision_score_arr.append(precision_score)

In [17]:
# per-question context precision scores, in response_df row order
precision_score_arr

[0.3333333333,
 0.9999999999,
 0.9999999999,
 0.8333333332916666,
 0.999999999975,
 0.99999999995,
 0.999999999975,
 0.9999999999,
 0.999999999975,
 0.999999999975]

In [18]:
# mean context precision over all scored rows; falls back to 0 when nothing was scored
if precision_score_arr:
    average_precision = sum(precision_score_arr) / len(precision_score_arr)
else:
    average_precision = 0
print(f"Average Context Precision: {average_precision}")

Average Context Precision: 0.9166666666141667


# Context Recall

In [19]:
# compute context recall scores
recall_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context recall score
    recall_score = await compute_context_recall_score(
        llm_as_judge,
        row["query"],
        test_data_df.loc[i, "reference"],
        retrieved_context_arr,
    )
    recall_score_arr.append(recall_score)

In [20]:
# per-question context recall scores, in response_df row order
recall_score_arr

[1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 0.5, 0.75, 1.0, 0.0]

In [21]:
# mean context recall over all scored rows; falls back to 0 when nothing was scored
if recall_score_arr:
    average_recall = sum(recall_score_arr) / len(recall_score_arr)
else:
    average_recall = 0
print(f"Average Context Recall: {average_recall}")

Average Context Recall: 0.8


# Response Relevance

In [22]:
# compute response relevance scores
relevance_score_arr = []
for i, row in response_df.iterrows():

    # compute response relevance score
    relevance_score = await compute_response_relevance_score(
        llm_as_judge,
        embedding_model,
        row["query"],
        row["result"],
    )
    relevance_score_arr.append(relevance_score)

In [23]:
# per-question response relevance scores, in response_df row order
relevance_score_arr

[np.float64(0.9617870340333668),
 np.float64(0.8488536721165306),
 np.float64(0.9474903782993339),
 np.float64(0.9083116486533589),
 np.float64(0.6881438184553886),
 np.float64(0.9655675728668855),
 np.float64(0.8556736335230722),
 np.float64(0.9559870690058007),
 np.float64(0.9175456113544486),
 np.float64(0.9440661760843473)]

In [24]:
# mean response relevance over all scored rows; falls back to 0 when nothing was scored
if relevance_score_arr:
    average_relevance = sum(relevance_score_arr) / len(relevance_score_arr)
else:
    average_relevance = 0
print(f"Average Response Relevance: {average_relevance}")

Average Response Relevance: 0.8993426614392532


# Faithfulness

In [25]:
# compute faithfulness scores
faithfulness_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute faithfulness score
    faithfulness_score = await compute_faithfulness_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    faithfulness_score_arr.append(faithfulness_score)

In [26]:
# per-question faithfulness scores, in response_df row order
faithfulness_score_arr

[1.0, 1.0, 1.0, 0.375, 0.8, 1.0, 0.45, 1.0, 1.0, 1.0]

In [27]:
# mean faithfulness over all scored rows; falls back to 0 when nothing was scored
if faithfulness_score_arr:
    average_faithfulness = sum(faithfulness_score_arr) / len(faithfulness_score_arr)
else:
    average_faithfulness = 0
print(f"Average Response Faithfulness: {average_faithfulness}")

Average Response Faithfulness: 0.8625


# Factual Correctness

In [33]:
# show every response row (not just head()) so the retrieved source file of each
# question is visible next to its answer before scoring factual correctness
# NOTE(review): this cell was executed out of order; re-run the notebook top to bottom
response_df

Unnamed: 0,query,result,source_documents_names_arr,source_documents_contents_arr
0,What impression did Jonathan Harker have of Bu...,Jonathan Harker had a positive impression of B...,"['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
1,What is the Golden Krone Hotel like?,The Golden Krone Hotel is described as thoroug...,"['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
2,What significance does St. George’s Day hold i...,"In the context of the narrative, St. George's ...","['data_files/Dracula.txt', 'data_files/Dracula...",['“It is the eve of St. George’s Day. Do you n...
3,"In the context of the narrative, what signific...","In the context of the narrative, the term 'Ord...","['data_files/Dracula.txt', 'data_files/Dracula...",['When I got on the coach the driver had not t...
4,Why the Hospadars no fix things for fear of th...,The Hospadars did not repair the roads for fea...,"['data_files/Dracula.txt', 'data_files/Dracula...","['road is in summertime excellent, but that it..."
5,What role does the King play in the quest for ...,The King plays a central role in the quest for...,['data_files/The Adventures of Sherlock Holmes...,"['“On the contrary, my dear sir,” cried the Ki..."
6,What connection does the character's journey f...,The character's journey from London to Bistrit...,"['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
7,What was Watson's role in the unexpected marri...,Watson's role in the unexpected marriage cerem...,['data_files/The Adventures of Sherlock Holmes...,"['“‘Thank God,’ he cried. ‘You’ll do. Come! Co..."
8,What experiences did Jonathan Harker have in B...,"In Bistritz, Jonathan Harker's experiences ref...","['data_files/Dracula.txt', 'data_files/Dracula...",['It was on the dark side of twilight when we ...
9,Whaat are the connections between the characte...,The character's journey in London is closely i...,"['data_files/Dracula.txt', 'data_files/Dracula...","['the devil-begotten Hun, the Slav, the Saxon,..."


In [28]:
# compute factual correctness scores
factual_correctness_score_arr = []
for i, row in response_df.iterrows():

    # compute factual correctness score
    factual_correctness_score = await compute_factual_correctness_score(
        llm_as_judge,
        row["result"],
        test_data_df.loc[i, "reference"],
    )
    factual_correctness_score_arr.append(factual_correctness_score)

In [29]:
# per-question factual correctness scores, in response_df row order
factual_correctness_score_arr

[np.float64(0.92),
 np.float64(0.43),
 np.float64(1.0),
 np.float64(0.73),
 np.float64(0.73),
 np.float64(0.18),
 np.float64(0.42),
 np.float64(0.46),
 np.float64(0.36),
 np.float64(0.16)]

In [30]:
# mean factual correctness over all scored rows; falls back to 0 when nothing was scored
if factual_correctness_score_arr:
    average_factual_correctness = sum(factual_correctness_score_arr) / len(
        factual_correctness_score_arr
    )
else:
    average_factual_correctness = 0
print(f"Average Response Factual Correctness: {average_factual_correctness}")

Average Response Factual Correctness: 0.539


# Summary of results

In [31]:
# Assemble the summary as a markdown table: one (label, average) pair per metric,
# each average rendered with three decimals
summary_rows = [
    ("Context Precision", average_precision),
    ("Context Recall", average_recall),
    ("Response relevance", average_relevance),
    ("Faithfulness", average_faithfulness),
    ("Factual Correctness", average_factual_correctness),
]
table_lines = ["|Evaluation Metric| Average Score|", "|----|----|"]
table_lines.extend(f"|{label}|{score:.3f}|" for label, score in summary_rows)
# leading and trailing newline match the layout of a triple-quoted block
markdown_text = "\n" + "\n".join(table_lines) + "\n"

In [32]:
# render the summary table as formatted markdown instead of printing the raw string
display(Markdown(markdown_text))


|Evaluation Metric| Average Score|
|----|----|
|Context Precision|0.917|
|Context Recall|0.800|
|Response relevance|0.899|
|Faithfulness|0.863|
|Factual Correctness|0.539|
