In [74]:
import os
import pandas as pd
import mlflow
from rag_module.rag import RAG

# Select the embedding store for this notebook
# (presumably read by rag_module when a RAG is built — confirm).
os.environ.update({"embedding_path": "./embeddings/guidance_framework_2/"})

# Folder holding the synthetic question/answer CSV files
QA_DATA_DIRECTORY = "eval_data/SyntheticQA/QA_sets/"

# Name under which all runs are grouped in MLflow
MLFLOW_EXPERIMENT_NAME = "RAG Retrieval Accuracy"

# One configuration per retrieval depth to compare
RAG_CONFIGURATIONS = [
    {"retrieve_top_k": top_k, "description": f"RAG with top {top_k} retrieval"}
    for top_k in (5, 10, 15, 20)
]

# --- Data Loading and Preparation ---
def load_and_prepare_qa_data(directory_path: str) -> pd.DataFrame:
    """
    Load every CSV file in a directory and reduce the data to one row per chunk.

    Files are read in sorted filename order so that, when the same 'chunk_id'
    occurs in more than one file, the surviving values do not depend on the
    arbitrary order in which os.listdir() returns the directory entries.

    Args:
        directory_path: Directory containing the QA CSV files. The combined
            data must contain a 'chunk_id' column.

    Returns:
        DataFrame with one row per distinct 'chunk_id' (kept as a regular
        column). Every other column holds the first non-null value found for
        that chunk, as produced by ``groupby('chunk_id').first()``.

    Raises:
        FileNotFoundError: If the directory contains no '.csv' files.
    """
    print(f"Loading QA data from: {directory_path}")
    # sorted(): os.listdir() gives no ordering guarantee, and the order decides
    # which duplicate wins in the groupby below.
    csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {directory_path}")

    dataframes = [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files]

    # Stack all files, then collapse repeated chunk_ids to a single row each
    df_combined = pd.concat(dataframes, ignore_index=True)
    df_unique_qa = df_combined.groupby('chunk_id').first().reset_index()
    print(f'Total unique QA pairs loaded: {len(df_unique_qa)}')
    return df_unique_qa

# --- Accuracy Measurement ---
def calculate_retrieval_accuracy(df_qa: pd.DataFrame, rag_instance, question_col="question") -> float:
    """
    Compute the hit rate of the retrieval step over a QA set.

    A query counts as a hit when its ground-truth 'chunk_id' equals the
    'vector_id' of at least one item returned by ``rag_instance.retrieve``.

    Args:
        df_qa: Evaluation data with a 'chunk_id' column and the column named
            by ``question_col``.
        rag_instance: Object exposing ``retrieve(question)`` that returns an
            iterable of mappings, each with a 'vector_id' key.
        question_col: Name of the column holding the query text.

    Returns:
        Fraction of rows whose ground-truth chunk was retrieved, in [0, 1].
        Returns 0.0 when ``df_qa`` has no rows.
    """
    total_queries = len(df_qa)
    if total_queries == 0:
        # Nothing to evaluate; avoids a ZeroDivisionError below.
        return 0.0

    correct_retrievals = 0
    # Iterate the two needed columns directly instead of building a Series per row.
    for question, ground_truth_chunk_id in zip(df_qa[question_col], df_qa['chunk_id']):
        retrieved_chunks_info = rag_instance.retrieve(question)

        # vector_ids are expected to correspond to chunk_ids
        retrieved_vector_ids = [item['vector_id'] for item in retrieved_chunks_info]

        if ground_truth_chunk_id in retrieved_vector_ids:
            correct_retrievals += 1

    return correct_retrievals / total_queries

## Retrieval performance

In [None]:
# Set up MLflow experiment
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Load and prepare your evaluation data once
try:
    df_gf = load_and_prepare_qa_data(QA_DATA_DIRECTORY)
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure the directory and CSV files exist.")
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), whereas re-raising only stops this cell.
    raise

print("\nStarting RAG evaluation...")

# One MLflow run per configuration so the retrieval depths can be compared
for config in RAG_CONFIGURATIONS:
    retrieve_top_k = config["retrieve_top_k"]
    description = config["description"]

    with mlflow.start_run(run_name=f"RAG_Evaluation_k={retrieve_top_k}"):
        print(f"\n--- Evaluating: {description} (k={retrieve_top_k}) ---")

        # Log parameters for this run
        mlflow.log_param("retrieve_top_k", retrieve_top_k)
        mlflow.log_param("total_evaluation_queries", len(df_gf))

        # Initialize RAG for the current configuration
        try:
            current_rag = RAG(retrieve_top_k=retrieve_top_k)
        except Exception as e:
            print(f"Failed to initialize RAG with k={retrieve_top_k}: {e}")
            # NOTE(review): the run itself presumably still ends with a normal
            # status; only this parameter marks the failure — confirm in the UI.
            mlflow.log_param("initialization_status", "Failed")
            continue # Skip this run if RAG can't be initialized

        # Calculate accuracy
        retrieval_accuracy = calculate_retrieval_accuracy(df_gf, current_rag)
        print(f"Retrieval Accuracy (k={retrieve_top_k}): {retrieval_accuracy:.4f}")

        # Log the accuracy metric
        mlflow.log_metric("retrieval_accuracy", retrieval_accuracy)

        print(f"Finished evaluation for k={retrieve_top_k}. MLflow run logged.")

print("\nAll RAG configurations evaluated. Check MLflow UI for results (mlflow ui).")

2025/05/24 17:38:39 INFO mlflow.tracking.fluent: Experiment with name 'RAG Retrieval Accuracy' does not exist. Creating a new experiment.


Loading QA data from: eval_data/SyntheticQA/QA_sets/
Total unique QA pairs loaded: 114

Starting RAG evaluation...


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2



--- Evaluating: RAG with top 5 retrieval (k=5) ---
Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.57it/s]
Batches: 1

Retrieval Accuracy (k=5): 0.9649
Finished evaluation for k=5. MLflow run logged.

--- Evaluating: RAG with top 10 retrieval (k=10) ---
Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 53.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.99it/s]
Batches: 1

Retrieval Accuracy (k=10): 0.9737
Finished evaluation for k=10. MLflow run logged.

--- Evaluating: RAG with top 15 retrieval (k=15) ---
Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.85it/s]
Batches: 1

Retrieval Accuracy (k=15): 0.9825
Finished evaluation for k=15. MLflow run logged.

--- Evaluating: RAG with top 20 retrieval (k=20) ---
Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 23.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.22it/s]
Batches: 1

Retrieval Accuracy (k=20): 0.9912
Finished evaluation for k=20. MLflow run logged.

All RAG configurations evaluated. Check MLflow UI for results (mlflow ui).


## Cross-lingual performance

### Manual

In [62]:
def call_rags(rag_configs, queries, retrieve_only=True) -> pd.DataFrame:
    """
    Run every query against every RAG configuration and tabulate the results.

    Args:
        rag_configs: Iterable of dicts with a 'name' (label stored in the
            output) and a 'config' (keyword arguments passed to ``RAG``).
        queries: Iterable of query strings; each is sent to every RAG.
        retrieve_only: If True, only ``rag.retrieve(query)`` is called and the
            row holds its result under 'chunks' plus the 'query'. If False,
            the row is the object returned by ``rag.query(query)``
            (presumably a dict that includes a 'chunks' entry — confirm).

    Returns:
        DataFrame with one row per (configuration, query) pair, including the
        columns 'rag_name', 'rag_config' and 'chunk_ids' (the 'vector_id' of
        every entry in 'chunks'). If there are no configurations or no
        queries, an empty DataFrame with the columns 'chunks', 'query',
        'rag_name', 'rag_config' and 'chunk_ids' is returned.
    """
    # Materialise once: queries are iterated again for every configuration,
    # so a generator would be exhausted after the first RAG.
    queries = list(queries)

    results = []
    for rag_config in rag_configs:
        rag = RAG(**rag_config['config'])
        for query in queries:
            if retrieve_only:
                result = {'chunks': rag.retrieve(query), 'query': query}
            else:
                result = rag.query(query)
            result['rag_name'] = rag_config['name']
            result['rag_config'] = rag_config['config']
            results.append(result)

    if not results:
        # An empty frame has no 'chunks' column, so the lookup below would
        # raise KeyError; return a correctly shaped empty result instead.
        return pd.DataFrame(columns=['chunks', 'query', 'rag_name', 'rag_config', 'chunk_ids'])

    df = pd.DataFrame(results)
    df['chunk_ids'] = df['chunks'].apply(lambda chunks: [chunk['vector_id'] for chunk in chunks])

    return df

In [63]:
# Two pipelines with identical retrieval settings, without and with the reranker
rag_configs = [
    {
        "name": rag_name,
        "config": {"retrieve_top_k": 15, "rerank_top_k": 15, "use_reranker": reranker_enabled},
    }
    for rag_name, reranker_enabled in (("RAG-bl", False), ("RAG-flashrank", True))
]

# Probe questions: each English query is directly followed by its Dutch counterpart
lang_queries = [
    query
    for english_dutch_pair in (
        (
            'What is the main reason for avoiding the implementation of complex business logic within Power BI?',
            'Wat is de belangrijkste reden om complexe bedrijfslogica te vermijden in Power BI?',
        ),
        (
            'What is the purpose of the sprint retrospective in the Scrum process?',
            'Wat is het doel van de sprint retrospective in het Scrum-proces?',
        ),
        (
            'What are some potential drawbacks of using Lombok in Java development?',
            'Wat zijn enkele mogelijke nadelen van het gebruik van Lombok in Java-ontwikkeling?',
        ),
        (
            'Which database should I use for graphs?',
            'Welke database moet ik gebruiken voor netwerken?',
        ),
    )
    for query in english_dutch_pair
]

In [64]:
# Send every query in lang_queries to each configuration in rag_configs.
# retrieve_only is left at its default (True), so only retrieval is performed.
df_lang = call_rags(rag_configs, lang_queries)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.98it/s]
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.79it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.26it/s]


### Automated

In [None]:
from deep_translator import GoogleTranslator

# Translate the question column to Dutch
df_gf = load_and_prepare_qa_data(QA_DATA_DIRECTORY)

# Build the translator once and reuse it for every row instead of constructing
# a new GoogleTranslator object per question.
en_to_nl_translator = GoogleTranslator(source='en', target='nl')
df_gf['question_dutch'] = df_gf['question'].apply(lambda x: en_to_nl_translator.translate(x))

Loading QA data from: eval_data/SyntheticQA/QA_sets/
Total unique QA pairs loaded: 114


In [75]:
# Only a single retrieval depth is evaluated for the Dutch questions
RAG_CONFIGURATIONS = [
    {"retrieve_top_k": 15, "description": "RAG with top 15 retrieval"}
]

# Score retrieval on the 'question_dutch' column, one MLflow run per configuration
for config in RAG_CONFIGURATIONS:
    retrieve_top_k = config["retrieve_top_k"]
    description = config["description"]

    with mlflow.start_run(run_name=f"RAG_Evaluation_Dutch_k={retrieve_top_k}"):
        print(f"\n--- Evaluating: {description} (k={retrieve_top_k}) ---")

        # Record what this run was evaluated with
        mlflow.log_param("retrieve_top_k", retrieve_top_k)
        mlflow.log_param("total_evaluation_queries", len(df_gf))
        mlflow.log_param("language", "Dutch")

        try:
            current_rag = RAG(retrieve_top_k=retrieve_top_k)
        except Exception as e:
            # Construction failed: note it on the run and move on to the next configuration
            print(f"Failed to initialize RAG with k={retrieve_top_k}: {e}")
            mlflow.log_param("initialization_status", "Failed")
        else:
            # RAG is available: measure and log the retrieval hit rate
            retrieval_accuracy = calculate_retrieval_accuracy(df_gf, current_rag, question_col="question_dutch")
            print(f"Retrieval Accuracy (k={retrieve_top_k}): {retrieval_accuracy:.4f}")

            mlflow.log_metric("retrieval_accuracy", retrieval_accuracy)

            print(f"Finished evaluation for k={retrieve_top_k}. MLflow run logged.")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2



--- Evaluating: RAG with top 15 retrieval (k=15) ---
Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 65.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.98it/s]
Batches: 1

Retrieval Accuracy (k=15): 0.1754
Finished evaluation for k=15. MLflow run logged.


# Basic

In [3]:
from rag_module.rag import RAG
import os
import pandas as pd

# set path to database
os.environ["embedding_path"] = "./embeddings/guidance_framework_2/"

In [None]:
# Combine synthetic QA datasets

# Get all CSV files in the directory (sorted, because os.listdir() order is
# arbitrary and decides which duplicate chunk_id row is kept below)
directory = "eval_data/SyntheticQA/QA_sets/"
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# Loop through the files and read them into DataFrames
dataframes = []
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into one and keep one row per chunk_id
# (chunk_id becomes the index of df_gf)
df_gf = pd.concat(dataframes, ignore_index=True)
df_gf = df_gf.groupby('chunk_id').first()

print(f'n queries guidance framework: {len(df_gf)}')

retrieve_top_k_values = [10, 15, 20]
rags = [RAG(retrieve_top_k=k) for k in retrieve_top_k_values]

# Evaluate every RAG in `rags`. Previously this cell called `rag.retrieve`
# although only the list `rags` was defined, which fails on a fresh kernel.
# After the loop, `rag`, `retrieved`, `vector_ids`, `chunk_retrieved` and
# `accuracy` hold the values of the last RAG; `accuracies` holds all of them.
accuracies = {}
for top_k, rag in zip(retrieve_top_k_values, rags):
    retrieved = df_gf['question'].apply(lambda q: rag.retrieve(q))

    # reset_index() yields the columns 'chunk_id' (ground truth) and
    # 'question' (here: the list of retrieved vector ids)
    vector_ids = retrieved.apply(lambda x: [item['vector_id'] for item in x]).reset_index()

    chunk_retrieved = vector_ids.apply(lambda x: x['chunk_id'] in x['question'], axis=1)
    accuracy = chunk_retrieved.sum() / len(vector_ids)
    accuracies[top_k] = accuracy

n queries guidance framework: 114


In [None]:
# One RAG instance per retrieval depth under comparison
rags = [RAG(retrieve_top_k=top_k) for top_k in (10, 15, 20)]

Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...
Error loading model: Run 'None' not found
Error loading scope model: Run 'None' not found


In [None]:
# Call rag.retrieve() once per question in df_gf; the result is a Series of retrieval outputs.
# NOTE(review): make sure `rag` refers to the intended RAG instance before running this
# cell — the cell directly above defines the list `rags`, not `rag`.
retrieved = df_gf['question'].apply(lambda q: rag.retrieve(q))

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.70it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.95it/s]
Batches: 1

In [51]:
# Reduce each retrieval result to the list of its 'vector_id' values, then turn the
# Series into a frame with reset_index().
# NOTE(review): the next cell reads the columns 'chunk_id' and 'question', so this
# presumably relies on `retrieved` being indexed by chunk_id and named 'question' — confirm.
vector_ids = retrieved.apply(lambda x: [item['vector_id'] for item in x]).reset_index()

In [58]:
# Row-wise check: is the row's 'chunk_id' contained in its 'question' entry?
# NOTE(review): in vector_ids the 'question' column presumably holds the list of
# retrieved vector ids (not the question text) — confirm against the cell above.
chunk_retrieved = vector_ids.apply(lambda x: x['chunk_id'] in x['question'], axis=1)
# Share of rows for which the check above is True.
accuracy = chunk_retrieved.sum() / len(vector_ids)

## Old:

In [None]:
from rag_module.rag import explainaRAG
import mlflow
import inspect
import pandas as pd
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# set path to benchmark data
os.environ["embedding_path"] = "./embeddings/ELOQ_news/"

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# Two otherwise identical RAGs that differ only in embedding normalisation
rags = [
    {"model": explainaRAG(normalize_embeddings=normalize), "name": label}
    for label, normalize in (("normalized", True), ("regular", False))
]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


In [None]:
# Chunk metadata comes from the path exposed by the first RAG's embedder
chunkdata = pd.read_json(rags[0]['model'].embedder.chunkdata_path)

# Attach the chunk metadata to every evaluation row (doc_id <-> filename),
# then drop the join key and the chunk text
evalset = (
    pd.read_csv("eval_data/ScopeQA/ELOQ_silver.csv")
    .merge(chunkdata, how="left", left_on="doc_id", right_on="filename")
    .drop(columns=["filename", "text"])
)

In [65]:
# Draw a reproducible sample of up to 200 evaluation rows. A fixed random_state
# makes re-runs select the same rows; the min() keeps the cell working when
# evalset has fewer than 200 rows.
sampleset = evalset.sample(n=min(200, len(evalset)), random_state=42)

In [66]:
def retrieve_docs(rag, question):
    """
    Return the 'vector_id' of every item retrieved for a question.

    Args:
        rag: Dict whose 'model' entry exposes ``retrieve(question)``.
        question: Query passed to the model's retrieve method.

    Returns:
        List of 'vector_id' values, in the order the model returned them.
    """
    hits = rag['model'].retrieve(question)
    vector_ids = []
    for hit in hits:
        vector_ids.append(hit['vector_id'])
    return vector_ids

# Add one '<name>_docs' column per RAG holding the retrieved vector ids for each sampled question
for rag in rags:
    # Double quotes inside the f-string: reusing the enclosing quote character
    # is a SyntaxError on Python versions before 3.12.
    sampleset[f'{rag["name"]}_docs'] = sampleset['question'].apply(lambda q: retrieve_docs(rag, q))

Batches: 100%|██████████| 1/1 [00:00<00:00, 24.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.17it/s]
Batches: 1

In [75]:
# Compare the 'normalized_docs' and 'regular_docs' columns row by row and count
# how many rows are equal (True) versus different (False).
(sampleset['normalized_docs'] == sampleset['regular_docs']).value_counts()

True    200
Name: count, dtype: int64