- Run evaluation metrics on generated test data

In [1]:
from helpers import (
    compute_context_precision_score,
    compute_context_recall_score,
    compute_response_relevance_score,
    compute_faithfulness_score,
    compute_factual_correctness_score,
)
from dotenv import load_dotenv
import pandas as pd
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from ragas.llms.base import llm_factory
from IPython.display import Markdown, display
import ast
import os

In [2]:
# Load environment variables from configs.env before any API client is constructed.
# presumably this supplies the OpenAI credentials used by the clients below — TODO confirm
load_dotenv("configs.env")

True

In [3]:
# initialize the judge LLM and the embedding model used by the evaluation metrics
# llm_as_judge is passed to every compute_*_score helper call in this notebook
llm_as_judge = llm_factory("gpt-4o-mini")
# embedding_model is only passed to compute_response_relevance_score.
# NOTE(review): the retriever's Chroma store builds its own "text-embedding-3-small"
# embeddings, so this model is not the one used for retrieval — confirm that is intended
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

In [4]:
# --- file-system locations used throughout the notebook ---

# inputs: generated test set and the persisted Chroma store
TEST_DATA_PATH = "generated_test_data/test_data.csv"
VECTOR_DB_DIR = "vector_db"

# outputs: RAG responses are written to / read from this CSV;
# the directory is created up front so a later save cannot fail on a missing folder
RESPONSE_DATA_DIR = "generated_responses"
RESPONSE_DATA_PATH = os.path.join(RESPONSE_DATA_DIR, "rag_responses_v1.csv")
os.makedirs(RESPONSE_DATA_DIR, exist_ok=True)

Read generated test data

In [5]:
# read the generated test data (questions in "user_input", expected answers in
# "reference") from TEST_DATA_PATH
test_data_df = pd.read_csv(TEST_DATA_PATH)

In [6]:
# sanity-check the size of the test set as (rows, columns)
test_data_df.shape

(10, 4)

In [7]:
# preview the first rows of the test set
test_data_df.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Who were the Atkinson brothers?,['The Adventure of the Noble Bachelor XI. The ...,The Atkinson brothers were involved in a singu...,single_hop_specific_query_synthesizer
1,Who is Mary Jane and what is her significance ...,"['You would certainly have been burned, had yo...","Mary Jane is described as incorrigible, and he...",single_hop_specific_query_synthesizer
2,What role does Watson play in the interaction ...,['‘Remarkable as being the scene of the death ...,"In the interaction with the Count Von Kramm, W...",single_hop_specific_query_synthesizer
3,How does the context of Bohemia relate to the ...,['to be an immense scandal and seriously compr...,"In the narrative, Bohemia is significant as it...",single_hop_specific_query_synthesizer
4,What details can you provide about the locatio...,['the betrothal was publicly proclaimed. That ...,Briony Lodge is described as a bijou villa loc...,single_hop_specific_query_synthesizer


Initialize retriever

In [8]:
# === Load persistent vector DB ===
# Opens the Chroma store persisted under VECTOR_DB_DIR.
# NOTE(review): the embedding function here is "text-embedding-3-small", not the
# `embedding_model` ("text-embedding-3-large") created earlier; presumably it has to
# match the model the DB was indexed with — confirm.
vectordb = Chroma(
    persist_directory=VECTOR_DB_DIR,
    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
)

In [9]:
# Set up the LLM that the RetrievalQA chain uses to generate answers
# (temperature=0 — presumably chosen for repeatable output; confirm)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [10]:
# Set up the RetrievalQA chain: `llm` answers questions using documents fetched
# by the vector DB's default retriever.
# return_source_documents=True makes each response carry "source_documents",
# which the (commented-out) generation cells read to record the retrieved contexts.
# NOTE(review): chain_type="stuff" presumably puts all retrieved documents into a
# single prompt — confirm against the LangChain docs.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

Generate responses

In [11]:
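# TODO: uncomment the generation cells below to regenerate the RAG responses.
# They are currently disabled; the evaluation section loads previously saved
# responses from RESPONSE_DATA_PATH instead.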

In [12]:
# response_obj_arr = []
# for _, row in test_data_df.iterrows():
#     response = qa_chain.invoke(row["user_input"])
#     response_obj_arr.append(response)

In [13]:
# # save responses to new dataframe using response_obj_arr
# response_df = pd.DataFrame(
#     {
#         "query": [resp["query"] for resp in response_obj_arr],
#         "result": [resp["result"] for resp in response_obj_arr],
#         "source_documents_names_arr": [
#             [doc.metadata["source"] for doc in resp["source_documents"]]
#             for resp in response_obj_arr
#         ],
#         "source_documents_contents_arr": [
#             [doc.page_content for doc in resp["source_documents"]]
#             for resp in response_obj_arr
#         ],
#     }
# )

In [14]:
# response_df.to_csv(RESPONSE_DATA_PATH, index=False)

Evaluate responses

In [5]:
# load the RAG responses previously saved to RESPONSE_DATA_PATH
# (the generation cells that write this file are currently commented out)
response_df = pd.read_csv(RESPONSE_DATA_PATH)

In [6]:
# preview the first saved responses with their retrieved source documents
response_df.head()

Unnamed: 0,query,result,source_documents_names_arr,source_documents_contents_arr
0,Who were the Atkinson brothers?,I don't know.,"['data_files/Dracula.txt', 'data_files/The Adv...","['in hand, they made light of the attack, and ..."
1,Who is Mary Jane and what is her significance ...,"Mary Jane is referred to as the ""incorrigible""...",['data_files/The Adventures of Sherlock Holmes...,"['“Then, how do you know?”\n\n“I see it, I ded..."
2,What role does Watson play in the interaction ...,"In the interaction with Count Von Kramm, Dr. W...",['data_files/The Adventures of Sherlock Holmes...,['“You had my note?” he asked with a deep hars...
3,How does the context of Bohemia relate to the ...,The context of Bohemia is significant to the c...,['data_files/The Adventures of Sherlock Holmes...,['“If your Majesty would condescend to state y...
4,What details can you provide about the locatio...,Briony Lodge is described as a bijou villa wit...,['data_files/The Adventures of Sherlock Holmes...,['“I can’t imagine. I suppose that you have be...


# Context Precision

In [19]:
# compute context precision scores
precision_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context precision score
    precision_score = await compute_context_precision_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    precision_score_arr.append(precision_score)

In [20]:
# inspect the per-response context precision scores
precision_score_arr

[0.0,
 0.9999999999,
 0.99999999995,
 0.999999999975,
 0.999999999975,
 0.999999999975,
 0.999999999975,
 0.8055555555287036,
 0.999999999975,
 0.999999999975]

In [21]:
# mean context precision over all responses;
# an empty score list falls back to 0 instead of dividing by zero
if precision_score_arr:
    average_precision = sum(precision_score_arr) / len(precision_score_arr)
else:
    average_precision = 0
print(f"Average Context Precision: {average_precision}")

Average Context Precision: 0.8805555555228703


# Context Recall

In [25]:
# compute context recall scores
recall_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute context recall score
    recall_score = await compute_context_recall_score(
        llm_as_judge,
        row["query"],
        test_data_df.loc[i, "reference"],
        retrieved_context_arr,
    )
    recall_score_arr.append(recall_score)

In [26]:
# inspect the per-response context recall scores
recall_score_arr

[0.0, 0.5, 0.5, 1.0, 1.0, 0.0, 0.3333333333333333, 0.25, 0.75, 0.5]

In [27]:
# mean context recall over all responses;
# an empty score list falls back to 0 instead of dividing by zero
if recall_score_arr:
    average_recall = sum(recall_score_arr) / len(recall_score_arr)
else:
    average_recall = 0
print(f"Average Context Recall: {average_recall}")

Average Context Recall: 0.4833333333333333


# Response Relevance

In [29]:
# compute response relevance scores
relevance_score_arr = []
for i, row in response_df.iterrows():

    # compute response relevance score
    relevance_score = await compute_response_relevance_score(
        llm_as_judge,
        embedding_model,
        row["query"],
        row["result"],
    )
    relevance_score_arr.append(relevance_score)

In [33]:
# inspect the per-response relevance scores
relevance_score_arr

[np.float64(0.0),
 np.float64(0.8802317289007987),
 np.float64(0.9377789431335847),
 np.float64(0.9119061495065955),
 np.float64(0.6919304509628469),
 np.float64(0.6342989958585301),
 np.float64(0.838835571945377),
 np.float64(0.8593732347339079),
 np.float64(0.8937727817588309),
 np.float64(0.8201806401685484)]

In [31]:
# mean response relevance over all responses;
# an empty score list falls back to 0 instead of dividing by zero
if relevance_score_arr:
    average_relevance = sum(relevance_score_arr) / len(relevance_score_arr)
else:
    average_relevance = 0
print(f"Average Response Relevance: {average_relevance}")

Average Response Relevance: 0.746830849696902


# Faithfulness

In [34]:
# compute faithfulness scores
faithfulness_score_arr = []
for i, row in response_df.iterrows():

    # get the relevant contexts for generated response
    retrieved_context_arr = ast.literal_eval(row["source_documents_contents_arr"])

    # compute faithfulness score
    faithfulness_score = await compute_faithfulness_score(
        llm_as_judge,
        row["query"],
        retrieved_context_arr,
        row["result"],
    )
    faithfulness_score_arr.append(faithfulness_score)

In [35]:
# inspect the per-response faithfulness scores
faithfulness_score_arr

[0.0, 0.8, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7777777777777778, 0.875, 1.0]

In [36]:
# mean faithfulness over all responses;
# an empty score list falls back to 0 instead of dividing by zero
if faithfulness_score_arr:
    average_faithfulness = sum(faithfulness_score_arr) / len(faithfulness_score_arr)
else:
    average_faithfulness = 0
print(f"Average Response Faithfulness: {average_faithfulness}")

Average Response Faithfulness: 0.8252777777777778


# Factual Correctness

In [37]:
# compute factual correctness scores
factual_correctness_score_arr = []
for i, row in response_df.iterrows():

    # compute factual correctness score
    factual_correctness_score = await compute_factual_correctness_score(
        llm_as_judge,
        row["result"],
        test_data_df.loc[i, "reference"],
    )
    factual_correctness_score_arr.append(factual_correctness_score)

In [38]:
# inspect the per-response factual correctness scores
factual_correctness_score_arr

[np.float64(0.0),
 np.float64(0.43),
 np.float64(0.63),
 np.float64(0.78),
 np.float64(0.64),
 np.float64(0.32),
 np.float64(0.18),
 np.float64(0.35),
 np.float64(0.64),
 np.float64(0.5)]

In [39]:
# mean factual correctness over all responses;
# an empty score list falls back to 0 instead of dividing by zero
if factual_correctness_score_arr:
    average_factual_correctness = sum(factual_correctness_score_arr) / len(
        factual_correctness_score_arr
    )
else:
    average_factual_correctness = 0
print(f"Average Response Factual Correctness: {average_factual_correctness}")

Average Response Factual Correctness: 0.44700000000000006


# Summary of results

In [45]:
# Build a two-column markdown table of the five average scores computed above,
# each formatted to three decimal places
markdown_text = f"""
|Evaluation Metric| Average Score|
|----|----|
|Context Precision|{average_precision:.3f}|
|Context Recall|{average_recall:.3f}|
|Response relevance|{average_relevance:.3f}|
|Faithfulness|{average_faithfulness:.3f}|
|Factual Correctness|{average_factual_correctness:.3f}|
"""

In [46]:
# render the summary table as formatted markdown in the cell output
display(Markdown(markdown_text))


|Evaluation Metric| Average Score|
|----|----|
|Context Precision|0.881|
|Context Recall|0.483|
|Response relevance|0.747|
|Faithfulness|0.825|
|Factual Correctness|0.447|
