In [1]:
import os
import sys

import pandas as pd

# Put the parent of the current working directory on the import search path
# (presumably so the project-local `metrics` module imported below resolves;
# this depends on the directory the kernel was started from).
sys.path.append(f'{os.getcwd()}/../')
# Remove pandas' limit on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

from metrics import (
    EM_compute,
    has_answer, # InAcc
    F1_compute,
)

In [2]:
# Short dataset label -> CSV file to load for it (paths are relative to the
# kernel's working directory).
dataset_paths = {
    "NQ": '../data/adaptive_rag_natural_questions.csv',
    "2Wiki": '../data/adaptive_rag_2wikimultihopqa.csv',
    "HotPot": '../data/adaptive_rag_hotpotqa.csv',
    "Musique": '../data/adaptive_rag_musique.csv',
}

# One DataFrame per dataset, keyed by the short label, in the order listed above.
datasets = {label: pd.read_csv(csv_path) for label, csv_path in dataset_paths.items()}

In [3]:
def process_dataframe(df, pred_col, gt_col):
    """Score every row of ``df`` with the three answer metrics and average them.

    For each row, the value in ``gt_col`` is wrapped in a one-element list and
    passed, together with the value in ``pred_col``, to ``has_answer``,
    ``EM_compute`` and ``F1_compute`` (in that order).

    Side effect: the per-row scores are written onto ``df`` itself as the
    columns ``'InAcc'``, ``'EM'`` and ``'F1'``, replacing any existing columns
    with those names.

    Args:
        df: DataFrame holding the predictions and references.
        pred_col: Name of the column with the predicted answer.
        gt_col: Name of the column with the reference answer.

    Returns:
        A 4-tuple ``(mean_has_answer, mean_em, mean_f1, df)`` where the means
        are taken over all rows (``0`` for an empty frame) and ``df`` is the
        same object that was passed in, now carrying the score columns.
    """
    n_rows = len(df)
    inacc_scores, em_scores, f1_scores = [], [], []

    for _, record in df.iterrows():
        predicted = record[pred_col]
        reference = record[gt_col]
        # Each metric gets its own fresh single-reference list.
        inacc_scores.append(has_answer([reference], predicted))
        em_scores.append(EM_compute([reference], predicted))
        f1_scores.append(F1_compute([reference], predicted))

    def _average(scores):
        # Guard against division by zero on an empty frame.
        if n_rows > 0:
            return sum(scores) / n_rows
        return 0

    mean_has_answer = _average(inacc_scores)
    mean_em = _average(em_scores)
    mean_f1 = _average(f1_scores)

    # Attach the per-row scores to the caller's frame (in place).
    for column_name, scores in (('InAcc', inacc_scores), ('EM', em_scores), ('F1', f1_scores)):
        df[column_name] = scores

    return mean_has_answer, mean_em, mean_f1, df

In [4]:
# Each evaluation setting maps to (column with the model's answer, column with
# the per-question correctness flag that gptAcc is computed from).
setting_columns = {
    "Never RAG": ('our_answer_wo_context', 'is_correct_wo_context'),
    "Always RAG": ('our_answer_w_context', 'is_correct_w_context'),
}
# Column holding the reference answer in every dataset.
gt_col = 'reference'

rows = list(setting_columns)
columns = pd.MultiIndex.from_product(
    [list(datasets), ["F1", "Exact Match", "InAcc", "gptAcc"]],
    names=["Dataset", "Metric"]
)
data = []
correlation_data = []

for row in rows:
    # Dict lookup raises KeyError for an unmapped label instead of silently
    # reusing the columns left over from the previous iteration.
    pred_col, gpt_col = setting_columns[row]
    row_data = []
    correlation_row_data = []
    for dataset_name, dataset in datasets.items():
        # Sum of the correctness flags divided by the total number of rows.
        gptAcc = dataset[gpt_col].sum() / dataset[gpt_col].shape[0]
        # process_dataframe overwrites the 'InAcc' / 'EM' / 'F1' columns on every
        # call, so the correlations are taken from the frame it returns, right
        # away, before the next setting replaces those columns.
        inacc_score, mean_em, mean_f1, scored = process_dataframe(dataset, pred_col, gt_col)
        row_data.extend([mean_f1, mean_em, inacc_score, gptAcc])

        # Correlation of the correctness flag with each per-row metric.
        corr_matrix = scored[[gpt_col, "F1", "EM", "InAcc"]].corr()
        f1_corr, em_corr, inacc_corr = corr_matrix.loc[gpt_col, ["F1", "EM", "InAcc"]]
        # The 4th slot reuses the "gptAcc" column label, so it carries the
        # accuracy itself rather than a correlation.
        correlation_row_data.extend([f1_corr, em_corr, inacc_corr, gptAcc])

    data.append(row_data)
    correlation_data.append(correlation_row_data)

metrics_df = pd.DataFrame(data, index=rows, columns=columns)
correlation_df = pd.DataFrame(correlation_data, index=rows, columns=columns)

In [5]:
# Mean F1 / EM / InAcc and gptAcc for every dataset (columns) under each
# evaluation setting (rows).
metrics_df

Dataset,NQ,NQ,NQ,NQ,2Wiki,2Wiki,2Wiki,2Wiki,HotPot,HotPot,HotPot,HotPot,Musique,Musique,Musique,Musique
Metric,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc
Never RAG,0.242246,0.096,0.234,0.404,0.106784,0.004,0.138,0.054,0.15959,0.066,0.166,0.218,0.043354,0.006,0.028,0.054
Always RAG,0.587047,0.346,0.612,0.808,0.473839,0.308,0.552,0.64,0.686283,0.494,0.66,0.832,0.397806,0.258,0.376,0.498


In [6]:
# Correlation (DataFrame.corr()) between the correctness flag column and the
# per-row F1 / EM / InAcc scores, per dataset and evaluation setting.
# NOTE: the "gptAcc" columns in this table hold the accuracy value itself, not
# a correlation; "Exact Match" holds the correlation with the 'EM' column.
correlation_df

Dataset,NQ,NQ,NQ,NQ,2Wiki,2Wiki,2Wiki,2Wiki,HotPot,HotPot,HotPot,HotPot,Musique,Musique,Musique,Musique
Metric,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc,F1,Exact Match,InAcc,gptAcc
Never RAG,0.766871,0.395807,0.642433,0.404,0.514487,0.265246,0.391875,0.054,0.758369,0.50347,0.66272,0.218,0.811067,0.325185,0.710386,0.054
Always RAG,0.697197,0.354564,0.580955,0.808,0.767692,0.500361,0.807378,0.64,0.712428,0.433298,0.580902,0.832,0.82893,0.592032,0.746329,0.498
