In [1]:
import os
import sys

import pandas as pd

# Add the parent of the current working directory to the import search path.
# NOTE(review): presumably this is where the `metrics` module imported below
# lives — confirm. This depends on the directory the kernel was started from.
sys.path.append(f'{os.getcwd()}/../')
# Lift the column display limit so wide DataFrames are rendered in full.
pd.set_option('display.max_columns', None)

from metrics import (
    exact_match,
    InAcc,
    f1, 
)

In [2]:
# Load each dataset CSV into a DataFrame, keyed by its short dataset label.
# Paths are relative to the directory the notebook kernel runs in.
datasets = {
    label: pd.read_csv(csv_path)
    for label, csv_path in (
        ("NQ", '../data/adaptive_rag_natural_questions.csv'),
        ("2Wiki", '../data/adaptive_rag_2wikimultihopqa.csv'),
        ("HotPot", '../data/adaptive_rag_hotpotqa.csv'),
        ("Musique", '../data/adaptive_rag_musique.csv'),
    )
}

In [3]:
# Build the baseline results table: one row per retrieval strategy and one
# (dataset, metric) column per score.

# Strategy label -> name of the DataFrame column holding the answers produced
# under that strategy. Using a mapping (instead of an if/elif chain) means a
# new strategy cannot be added to the table without also naming its answer
# column, so no row can silently reuse another strategy's answers.
answer_column_by_strategy = {
    "Never RAG": 'our_answer_wo_context',
    "Always RAG": 'our_answer_w_context',
}

# Metric label -> scoring function, each called as metric(answers, references).
# This is the single source of truth for both the column headers and the order
# in which scores are computed, so the two cannot drift apart.
metric_fns = {
    "F1": f1,
    "Exact Match": exact_match,
    "InAcc": InAcc,
}

rows = list(answer_column_by_strategy)
columns = pd.MultiIndex.from_product(
    [list(datasets), list(metric_fns)],
    names=["Dataset", "Metric"]
)

# One list of scores per strategy, ordered dataset-major / metric-minor to
# match the layout produced by MultiIndex.from_product above.
data = [
    [
        metric(dataset[answer_column], dataset['reference'])
        for dataset in datasets.values()
        for metric in metric_fns.values()
    ]
    for answer_column in answer_column_by_strategy.values()
]

metrics_df = pd.DataFrame(data, index=rows, columns=columns)
metrics_df

Dataset,NQ,NQ,NQ,2Wiki,2Wiki,2Wiki,HotPot,HotPot,HotPot,Musique,Musique,Musique
Metric,F1,Exact Match,InAcc,F1,Exact Match,InAcc,F1,Exact Match,InAcc,F1,Exact Match,InAcc
Never RAG,0.233954,0.092,0.194,0.106826,0.004,0.118,0.157464,0.066,0.138,0.043554,0.006,0.028
Always RAG,0.556455,0.338,0.478,0.466314,0.308,0.478,0.681702,0.492,0.584,0.391861,0.256,0.356
