In [1]:
import os
import sys

# Set the root directory of your project.
# The RAG_PROJECT_ROOT environment variable, when set, overrides the author's local
# checkout path so the notebook can run on another machine without editing this cell.
project_root = os.environ.get(
    'RAG_PROJECT_ROOT',
    '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_project',
)
# Make the project root the working directory so relative paths resolve against it.
os.chdir(project_root)
# Add the root directory to sys.path (once) so the project-local imports below resolve.
if project_root not in sys.path:
    sys.path.append(project_root)

from config import (CONFIG_SQL_DB,DB_NAME,
                    SQL_EVAL_QAS_TABLE_SCHEMA,
                    SQL_EVAL_QAS_TABLE, 
                    EMBEDDING_MODEL,EMBEDDING_MODEL_API,
                    EMBEDDING_MODEL_EMB_TASK,
                    TEST_RESULTS_TABLE,SQL_EVAL_CHUNKS_TABLE,
                    SQL_PROMPTS_TABLE,TEST_GEN_ANSWERS_TABLE,
                    RESULTS_DIR)
from utils.MySQLDB_manager import MySQLDB
from testing.modules.evaluating_modules import RAGEvaluator
import json
import pandas as pd
from datetime import datetime

# Database handle built from the project's SQL configuration.
sql_con = MySQLDB(CONFIG_SQL_DB, DB_NAME)

# Evaluator wired to the results, QA, chunk, prompt and generated-answer tables.
ragEval = RAGEvaluator(
    sql_con=sql_con,
    test_table_name=TEST_RESULTS_TABLE,
    qas_table_name=SQL_EVAL_QAS_TABLE,
    chunks_eval_table_name=SQL_EVAL_CHUNKS_TABLE,
    prompts_table_name=SQL_PROMPTS_TABLE,
    eval_answers_table=TEST_GEN_ANSWERS_TABLE,
)

# Frame exposed by the evaluator; later cells build the report from it.
df_results = ragEval.data_df

# Quick sanity check: first five rows, the column index, then the (rows, columns) shape.
print(df_results.head(5), df_results.columns, df_results.shape, sep="\n")

# print(ragEval.generate_report())


   id_question  id_sample type_question  \
0            1          1       Factual   
1            1          1       Factual   
2            1          1       Factual   
3            1          1       Factual   
4            1          1       Factual   

                                            question  \
0  Welche Dokumente müssen bei der Beantragung de...   
1  Welche Dokumente müssen bei der Beantragung de...   
2  Welche Dokumente müssen bei der Beantragung de...   
3  Welche Dokumente müssen bei der Beantragung de...   
4  Welche Dokumente müssen bei der Beantragung de...   

                                     expected_answer  clarity  specificity  \
0  Bei der Beantragung des Klimageschwindigkeits-...        5            5   
1  Bei der Beantragung des Klimageschwindigkeits-...        5            5   
2  Bei der Beantragung des Klimageschwindigkeits-...        5            5   
3  Bei der Beantragung des Klimageschwindigkeits-...        5            5   
4  Bei der Bea

In [None]:
# Report column label -> (source column in data_df, Series aggregation method name).
# Insertion order defines the column order of every sheet in the workbook.
_REPORT_METRICS = {
    'Avg Prompt Tokens': ('prompt_tokens', 'mean'),
    'Std Prompt Tokens': ('prompt_tokens', 'std'),
    'Avg Answer Tokens': ('completion_tokens', 'mean'),
    'Std Answer Tokens': ('completion_tokens', 'std'),
    'Retrieval Accuracy (used)': ('used_context', 'mean'),
    'Retrieval Accuracy (retrieved)': ('retrieved_context', 'mean'),
    'Expanded Retrieval Accuracy (used)': ('used_context_ext', 'mean'),
    'Expanded Retrieval Accuracy (retrieved)': ('retrieved_context_ext', 'mean'),
    'Avg Score': ('score', 'mean'),
    'Binary Score': ('binary_score', 'mean'),
    'ASS Precision': ('precision', 'mean'),
    'ASS Recall': ('recall', 'mean'),
    'ASS F1': ('f1', 'mean'),
}


def _summarize_metrics(frame):
    """Return the row count plus every aggregate in _REPORT_METRICS for one group of rows."""
    stats = {'Total Questions': len(frame)}
    for label, (source, how) in _REPORT_METRICS.items():
        stats[label] = getattr(frame[source], how)()
    return stats


def _as_column_list(columns):
    """Wrap a single column name in a list; copy any other iterable into a new list."""
    return [columns] if isinstance(columns, str) else list(columns)


def generate_consolidated_test_report(
    data_df,
    test_group_cols=['test_name'],
    groupby_cols=None,
    output_dir='.',
):
    """
    Write an Excel workbook that consolidates the metrics of every test.

    The workbook contains a "Summary" sheet with one row per distinct combination
    of ``test_group_cols``, plus one sheet per entry of ``groupby_cols`` in which
    each test group is further broken down by that column. The file is named
    ``consolidated_test_report_<ddmmyyHHMM>.xlsx``; the timestamp has minute
    resolution, so two reports written to the same directory within the same
    minute share a file name.

    Args:
        data_df (pd.DataFrame): Test data with precomputed metrics. Must provide
            the grouping columns and the metric columns ``prompt_tokens``,
            ``completion_tokens``, ``used_context``, ``retrieved_context``,
            ``used_context_ext``, ``retrieved_context_ext``, ``score``,
            ``binary_score``, ``precision``, ``recall`` and ``f1``.
        test_group_cols (str or list of str): Columns that define the main
            grouping, e.g. 'test_name'. A single name may be passed as a string.
        groupby_cols (str or list of str, optional): Additional columns to break
            each test group down by; one sheet is written per column (sheet name
            is the column name truncated to Excel's 31-character limit).
            Duplicates are ignored. Defaults to no breakdown sheets.
        output_dir (str): Directory the workbook is written to. It is created
            if it does not exist.

    Returns:
        None

    Raises:
        KeyError: If a grouping or metric column is missing from ``data_df``.
            All aggregation happens before the file is opened, so no partial
            workbook is left behind in that case.

    Note:
        pandas ``groupby`` drops rows whose grouping key is NaN by default, so
        such rows are not counted in any sheet.
    """
    # Normalise the column arguments. Copying also guarantees the shared default
    # list is never aliased, and a bare string is no longer iterated per character.
    test_group_cols = _as_column_list(test_group_cols)
    groupby_cols = [] if groupby_cols is None else _as_column_list(groupby_cols)
    groupby_cols = list(dict.fromkeys(groupby_cols))  # drop duplicates, keep order

    # Aggregate everything in a single pass over the test groups.
    summary_rows = []
    breakdown_rows = {col: [] for col in groupby_cols}
    for group_key, test_group in data_df.groupby(test_group_cols):
        # Depending on the pandas version, a one-column grouping yields a scalar or a 1-tuple.
        key_values = list(group_key) if isinstance(group_key, tuple) else [group_key]
        identity = dict(zip(test_group_cols, key_values))
        summary_rows.append({**identity, **_summarize_metrics(test_group)})
        for col in groupby_cols:
            for sub_key, sub_group in test_group.groupby(col):
                breakdown_rows[col].append(
                    {**identity, col: sub_key, **_summarize_metrics(sub_group)}
                )

    # Build the timestamped output path and make sure its directory exists.
    formatted_date = datetime.now().strftime("%d%m%y%H%M")
    filename = f"consolidated_test_report_{formatted_date}.xlsx"
    output_file = os.path.join(output_dir, filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with pd.ExcelWriter(output_file) as writer:
        # General Summary Sheet
        pd.DataFrame(summary_rows).to_excel(writer, sheet_name="Summary", index=False)
        # One breakdown sheet per additional grouping column
        for col, rows in breakdown_rows.items():
            sheet_name = str(col)[:31]  # Ensure Excel sheet name limit
            pd.DataFrame(rows).to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Consolidated report saved to {output_file}")

In [4]:
# Columns that together identify one test configuration (one "Summary" row each).
main_test_cols = [
    'test_name',
    'embedding_model',
    'chunk_size',
    'alpha_value',
]
# Columns used for the per-category breakdowns (one sheet each).
test_cols = [
    'type_question',
    'type',
    'doc_type',
    'query_intent',
]

generate_consolidated_test_report(
    df_results,
    test_group_cols=main_test_cols,
    groupby_cols=test_cols,
    output_dir=RESULTS_DIR,
)

Consolidated report saved to results/consolidated_test_report_0712240101.xlsx
