# Retrieval Augmented Generation evaluation

### LOADING

#### Load Embeddings

In [32]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the two CSV tables used by the evaluation: task prompts and candidate nodes.
task_prompts_df = pd.read_csv('dataset/task_prompts.csv')
nodes_df = pd.read_csv('dataset/nodes.csv')

print("Task prompts:", task_prompts_df.shape)
print("Nodes:", nodes_df.shape)

# Load the precomputed embeddings stored as NumPy arrays.
# NOTE(review): the evaluation helpers index these arrays with row labels taken from
# task_prompts_df / nodes_df, so they presumably need one embedding row per DataFrame
# row, in the same order — compare the shapes printed below with the ones above.
task_prompt_embedings = np.load('dataset/task_prompts_embeddings.npy')
nodes_texts_embedings = np.load('dataset/nodes_texts_embeddings.npy')

print("Task prompts embedings:", task_prompt_embedings.shape)
print("Nodes texts embedings:", nodes_texts_embedings.shape)

# Sentence-transformer model; later cells call its similarity() method on the embeddings.
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

Task prompts: (1009, 2)
Nodes: (4683115, 4)
Task prompts embedings: (1, 768)
Nodes texts embedings: (2076, 768)




### EVALUATE TASKS

In [70]:
def get_nodes_indexes(task_id, nodes_df):
    """Return the index labels of the rows of `nodes_df` whose 'task_id' equals `task_id`."""
    belongs_to_task = nodes_df['task_id'] == task_id
    return nodes_df.loc[belongs_to_task].index


def get_nodes_embeddigs(nodes_indexes, nodes_texts_embedings):
    """
    Select the embedding rows addressed by `nodes_indexes`.

    `nodes_indexes` is used directly as a positional index into
    `nodes_texts_embedings`, and the selected rows are returned.
    """
    selected_embeddings = nodes_texts_embedings[nodes_indexes]
    return selected_embeddings


def get_task_embedings(task_index, task_prompt_embedings):
    """Return the entry of `task_prompt_embedings` located at position `task_index`."""
    return task_prompt_embedings[task_index]

def get_top_nodes(similarities, k=10):
    """
    Return the k highest similarity scores and the positions they were found at.

    The search runs along the last axis of `similarities`. Results are ordered from
    the highest to the lowest score; when scores tie, the lower position comes first.

    Parameters
    ----------
    similarities : array-like or tensor
        Similarity scores. Objects exposing ``detach()`` (e.g. torch tensors) are
        detached, moved to CPU and converted to a NumPy array first.
    k : int, default 10
        Number of entries to keep. If fewer than k scores are available, all of them
        are returned instead of raising.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Flattened top scores and the flattened positions of those scores.

    Raises
    ------
    ValueError
        If k is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    # Convert tensor-like inputs explicitly so NumPy never sees a tensor it cannot read.
    if hasattr(similarities, 'detach'):
        similarities = similarities.detach().cpu().numpy()
    scores = np.atleast_1d(np.asarray(similarities))

    # Clamp k so that tasks with fewer than k candidate nodes do not fail.
    k = min(k, scores.shape[-1])

    # A stable sort on the negated scores yields descending order with ties
    # resolved in favour of the lower position.
    order = np.argsort(-scores, axis=-1, kind='stable')[..., :k]
    top_values = np.take_along_axis(scores, order, axis=-1)
    return top_values.flatten(), order.flatten()

def calculate_precision(relevant_nodes):
    """
    Proportion of positive candidates among the selected nodes.

    Parameters
    ----------
    relevant_nodes : pandas.DataFrame
        The selected nodes; must contain a 'pos_candidate' column. Rows whose value
        equals 1 are counted as positive candidates.

    Returns
    -------
    float
        Number of positive candidates divided by the number of selected nodes, or
        0.0 when no nodes were selected (avoids a division by zero).
    """
    total = len(relevant_nodes)
    if total == 0:
        return 0.0

    # Summing the boolean mask counts the rows flagged as positive candidates.
    pos_candidates_count = int((relevant_nodes['pos_candidate'] == 1).sum())
    return pos_candidates_count / total

def evaluate_task(task, nodes_df, task_prompts_df, task_prompt_embedings, nodes_texts_embedings, model, k=10):
    """
    Retrieve the k nodes most similar to a task prompt and measure their precision.

    Candidate nodes are all rows of `nodes_df` that share the task's 'task_id'.
    Their embeddings are compared with the task prompt embedding through
    `model.similarity`, the k best-scoring candidates are kept, and the share of
    them flagged as positive candidates is computed with `calculate_precision`.

    Returns a tuple `(top_nodes, precision)`, where `top_nodes` holds the selected
    rows of `nodes_df`.
    """
    task_id = task['task_id']

    # Label of the first row in task_prompts_df that carries this task_id; it is
    # then used as the position of the prompt inside task_prompt_embedings.
    is_this_task = task_prompts_df['task_id'] == task_id
    task_index = task_prompts_df[is_this_task].index[0]
    query_embedding = get_task_embedings(task_index, task_prompt_embedings)

    # Labels of the candidate rows and the embeddings found at those positions.
    candidate_labels = get_nodes_indexes(task_id, nodes_df)
    candidate_embeddings = get_nodes_embeddigs(candidate_labels, nodes_texts_embedings)

    # Score every candidate against the prompt and keep the k best positions
    # (the scores themselves are not needed afterwards).
    scores = model.similarity(query_embedding, candidate_embeddings)
    _, best_positions = get_top_nodes(scores, k)

    # best_positions are relative to the candidate subset, so map them back to
    # nodes_df labels before the label-based .loc lookup.
    best_labels = np.array(candidate_labels)[best_positions]
    top_nodes = nodes_df.loc[best_labels]

    return top_nodes, calculate_precision(top_nodes)

In [71]:
# Smoke-test the evaluation on the first task, keeping the 100 best-scoring nodes.
test_task = task_prompts_df.iloc[0]
print("Task:", test_task)

# The precision is stored under its own name so this cell never overwrites the
# `results` list built by the all-tasks evaluation cell.
top_nodes, test_precision = evaluate_task(
    test_task,
    nodes_df,
    task_prompts_df,
    task_prompt_embedings,
    nodes_texts_embedings,
    model,
    k=100,
)
print("Task precision: ", test_precision)

Task: task_id                 7bda9645-0b5f-470a-8dd7-6af0bff4da68
prompt     Check for pickup restaurant available in Bosto...
Name: 0, dtype: object
Task precision:  0.0


In [72]:
# Sanity check: number of rows in top_nodes returned by the test evaluation above.
print(len(top_nodes))

100


In [37]:
import pandas as pd
from tqdm import tqdm

# Evaluate every task with the default k and collect one (task_id, precision) pair per task.
results = []
total_tasks = len(task_prompts_df)
for index, row in tqdm(task_prompts_df.iterrows(), total=total_tasks):
    top_nodes, task_precision = evaluate_task(row, nodes_df, task_prompts_df, task_prompt_embedings, nodes_texts_embedings, model)
    # Build the tuple from two separate values; indexing the row with
    # ('task_id', task_precision) would look up a non-existent label.
    results.append((row['task_id'], task_precision))

print('Tasks evaluated: ', len(results))

100%|██████████| 1009/1009 [00:00<00:00, 8161.35it/s]

Evaluating task:  7bda9645-0b5f-470a-8dd7-6af0bff4da68
Evaluating task:  a6372f23-f462-4706-8455-5b350c46d83c
Evaluating task:  c0eeead1-f8ea-4819-a6da-ef0108b40c89
Evaluating task:  ce34bc61-e3d2-40c8-b02b-b149efc4b115
Evaluating task:  bf469f30-6628-4017-b963-672645d7feab
Evaluating task:  9e035a36-1c77-4014-98ec-4d48ee41d904
Evaluating task:  cf361c84-6414-4b05-a7a1-77383997150a
Evaluating task:  46a3683f-fbe0-40d0-8729-6c7964d994e6
Evaluating task:  4b2030ff-b83c-445f-bf87-9c8fbc68498b
Evaluating task:  4bc70fa1-e817-405f-b113-0919e8e94205
Evaluating task:  b20e1dc4-651b-46e1-8470-16250657f2a8
Evaluating task:  d63b1715-688c-4be2-b196-dde9659bc59d
Evaluating task:  d1942a73-745f-44c5-ba30-0d0c7925f5d2
Evaluating task:  8e849b85-5acc-4d26-ad5e-d24ad24343df
Evaluating task:  9ed8cd2a-b0f8-4ccf-9d48-1ddfba4f5e58
Evaluating task:  a31de393-c6e0-4175-858b-03cdc435d585
Evaluating task:  a5c1095b-bba1-4029-8b8d-fa5848702827
Evaluating task:  b910229f-6133-452c-a640-6a6ec67b668b
Evaluating




### RESULTS EVALUATION