# Evaluation of RAG responses to 'Where', 'When' and 'Who' questions

Evaluation is carried out with the helper functions in `helpers.rag_evaluate`.

In [1]:
import json
import os
import numpy as np
import pandas as pd
from helpers.rag_evaluate import evaluate_category, evaluate_all_data

In [2]:
from pipelines.rag import RAGPipeline

# Name of the FAISS index the pipeline should answer from.
index_name = "football_index_711"  # TODO: set the index name

# Build the RAG pipeline with the chosen OpenAI embedding model, then
# load the FAISS index identified by `index_name` into it.
embedding_model = "text-embedding-3-small"
rag = RAGPipeline(openai_embedding_model=embedding_model)
rag.load_faiss_index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
def generate_responses(data, question_column, response_column, rag, kwargs=None):
    """
    Generates responses for questions in a specified column of a DataFrame
    using a RAG pipeline and populates the responses in another specified column.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the questions. It is modified
      in place (the response column is added or overwritten).
    - question_column (str): Name of the column containing the questions.
    - response_column (str): Name of the column where responses will be stored.
    - rag (object): An instance of the RAG pipeline with an 'answer_query'
      method that returns a sequence whose first element is the answer.
    - kwargs (dict | None): Optional keyword arguments forwarded to
      'rag.answer_query' for every question. With the default (None) the
      method is called with the question only.

    Returns:
    - pd.DataFrame: The same DataFrame object, with answers in the specified
      response column.
    """
    # The previous default was a mutable dict that was never forwarded to the
    # pipeline. Defaulting to None keeps that call behaviour (question only)
    # while letting explicitly supplied options actually reach answer_query.
    # Copy so the caller's dict is never shared or mutated.
    query_options = dict(kwargs) if kwargs else {}

    answers = []
    for question in data[question_column]:
        # Only the first returned element (the answer) is needed here.
        answer, *_ = rag.answer_query(question, **query_options)
        answers.append(answer)

    # Assign positionally in one step: this is safe with duplicate index
    # labels and also creates the column when the DataFrame is empty.
    data[response_column] = answers

    return data

In [32]:
# Directory holding the per-category JSON files (when.json, where.json,
# who.json) that the cells below read from and write their results back to.
save_path = "data/results"
# Create the directory up front; exist_ok avoids an error when it already exists.
os.makedirs(save_path, exist_ok=True)

#### 1. Eval of 'When?' Questions

In [38]:
# Load the 'When?' records from the results directory
when_data = pd.read_json(os.path.join(save_path, 'when.json'))

regenerate_responses = False # TODO: set to True if you want to regenerate responses

if regenerate_responses:
    # generate_responses fills the 'response' column and returns the DataFrame
    when_data = generate_responses(data=when_data,
                                   question_column='question',
                                   response_column='response',
                                   rag=rag)
elif 'response' not in when_data.columns:
    # Previously `Raise(...)`, an undefined name: the cell died with a
    # NameError and this message was never shown.
    raise ValueError("Response column not found in the data. Set regenerate_responses=True to generate responses.")

# Evaluate against the 'ground_truth' column; returns the category metrics
# and the results that are stored per record below.
when_metrics, when_results = evaluate_category(when_data, 'ground_truth')

# Keep the evaluation results next to the records and write everything back
# to the same file that was loaded above.
when_data['eval'] = when_results
when_data.to_json(os.path.join(save_path, 'when.json'), orient='records', indent=4)

# Print the results
print("When Metrics:")
print(json.dumps(when_metrics, indent=4))

When Metrics:
{
    "accuracy": 0.7
}


#### 2. Eval of 'Where?' Questions

In [35]:
# Load in the where data
where_data = pd.read_json(os.path.join(save_path, 'where.json'))

regenerate_responses = False # TODO: set to True if you want to regenerate responses

if regenerate_responses:
    updated_data = generate_responses(data=where_data, 
                            question_column='question', 
                            response_column='response', 
                            rag=rag)
else:
    if 'response' not in where_data.columns:
        Raise("Response column not found in the data. Set regenerate_responses=True to generate responses.")

where_metrics, where_results = evaluate_category(where_data, 'ground_truth')

where_data['eval'] = where_results
where_data.to_json(os.path.join(save_path, 'where.json'), orient='records', indent=4)

# Print the results
print("Where Metrics:")
print(json.dumps(where_metrics, indent=4))

Where Metrics:
{
    "accuracy": 0.7
}


#### 3. Eval of 'Who?' Questions

In [36]:
# Load in the who data
who_data = pd.read_json(os.path.join(save_path, 'who.json'))

regenerate_responses = False # TODO: set to True if you want to regenerate responses

if regenerate_responses:
    updated_data = generate_responses(data=who_data, 
                            question_column='question', 
                            response_column='response', 
                            rag=rag)
else:
    if 'response' not in who_data.columns:
        Raise("Response column not found in the data. Set regenerate_responses=True to generate responses.")

who_metrics, who_results = evaluate_category(who_data, 'ground_truth')

who_data['eval'] = who_results
who_data.to_json(os.path.join(save_path, 'who.json'), orient='records', indent=4)

# Print the results
print("Who Metrics:")
print(json.dumps(who_metrics, indent=4))

Who Metrics:
{
    "accuracy": 0.5
}


#### 4. Eval across categories

In [37]:
# Collect the three evaluated category frames and compute the combined metrics.
data = [when_data, where_data, who_data]
metrics = evaluate_all_data(data)

# Report the combined metrics: header line first, then the indented JSON.
print("Metrics:", json.dumps(metrics, indent=4), sep="\n")

Metrics:
{
    "accuracy": 0.6333333333333333
}
