## Evaluator

In [1]:
from typing import Dict, List, Callable, Tuple, Union, Callable, Literal
import string
import re
import numpy as np
from collections import Counter
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

class Evaluator:
    """Answer-level QA metrics: exact match and token-overlap F1.

    Predictions and references are compared after ``normalize_answer`` has
    lower-cased them, stripped punctuation and English articles, and
    collapsed whitespace.
    """

    @classmethod
    def normalize_answer(cls, s):
        """Return the canonical form of an answer string.

        The steps are applied in this order: lower-case, remove ASCII
        punctuation, remove the standalone words ``a``/``an``/``the``,
        collapse runs of whitespace. Anything that is not a ``str`` yields
        the empty string.
        """
        def remove_articles(text):
            return re.sub(r'\b(a|an|the)\b', ' ', text)
        def white_space_fix(text):
            return ' '.join(text.split())
        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)
        def lower(text):
            return text.lower()
        if not isinstance(s, str):
            return ""
        return white_space_fix(remove_articles(remove_punc(lower(s))))

    @classmethod
    def exact_match_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
    ):
        """Score 1 if the prediction equals any reference after normalization.

        Args:
            prediction: Model answer. An empty or non-string value is always
                scored as incorrect.
            ground_truth: A single reference string or a collection of
                acceptable references. An empty collection scores as incorrect.

        Returns:
            ``{'correct': c, 'incorrect': 1 - c}`` with ``c`` in ``{0, 1}``
            (plain Python ints).
        """
        # normalize_answer maps every non-string to "", so without this guard
        # a NaN/None prediction would "match" any reference that also
        # normalizes to "" (for example "the").
        if not isinstance(prediction, str) or not prediction:
            return {'correct': 0, 'incorrect': 1}
        ground_truths = {ground_truth} if isinstance(ground_truth, str) else set(ground_truth)

        normalized_prediction = cls.normalize_answer(prediction)
        # any() is False for an empty reference set, where np.max([]) used to raise.
        correct = int(any(
            normalized_prediction == cls.normalize_answer(gt) for gt in ground_truths
        ))
        return {'correct': correct, 'incorrect': 1 - correct}

    @classmethod
    def f1_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
    ):
        """Token-overlap F1, precision and recall against the references.

        Args:
            prediction: Model answer; a falsy value returns all-zero metrics.
            ground_truth: A single reference string or a collection of
                acceptable references.

        Returns:
            Dict with keys ``'f1'``, ``'precision'`` and ``'recall'``. Each
            value is the maximum of that metric over all references, taken
            independently per key; it stays ``0`` when no reference shares a
            token with the prediction.
        """
        final_metric = {'f1': 0, 'precision': 0, 'recall': 0}

        if not prediction:
            return final_metric
        ground_truths = {ground_truth} if isinstance(ground_truth, str) else set(ground_truth)

        # The prediction does not change across references: normalize it once.
        normalized_prediction = cls.normalize_answer(prediction)
        prediction_tokens = normalized_prediction.split()
        special_answers = ['yes', 'no', 'noanswer']

        for reference in ground_truths:
            normalized_ground_truth = cls.normalize_answer(reference)
            # yes / no / noanswer only earn credit on an exact match, never
            # through partial token overlap.
            if normalized_prediction in special_answers and normalized_prediction != normalized_ground_truth:
                continue
            if normalized_ground_truth in special_answers and normalized_prediction != normalized_ground_truth:
                continue
            ground_truth_tokens = normalized_ground_truth.split()
            common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
            num_same = sum(common.values())
            if num_same == 0:
                continue

            precision = 1.0 * num_same / len(prediction_tokens)
            recall = 1.0 * num_same / len(ground_truth_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            # Explicit mapping instead of eval() on local variable names.
            current = {'f1': f1, 'precision': precision, 'recall': recall}
            for k, value in current.items():
                final_metric[k] = max(value, final_metric[k])
        return final_metric

    def eval_answer(self, results_df, answer_col="Final Answer"):
        """Print mean EM and F1 over the rows of ``results_df``.

        Intended for datasets without answer ids or aliases.

        Args:
            results_df: DataFrame with a ``'ground_truth'`` column and a
                prediction column named by ``answer_col``.
            answer_col: Name of the column holding the predictions.
        """
        em_list = []
        f1_list = []
        for _, row in results_df.iterrows():
            prediction = row[answer_col]
            ground_truth = row['ground_truth']
            em_list.append(self.exact_match_score(prediction, ground_truth)['correct'])
            f1_list.append(self.f1_score(prediction, ground_truth)['f1'])
        # ":4f" means minimum width 4 with the default six decimals.
        print(f"EM: {sum(em_list)/len(em_list):4f}\t F1: {sum(f1_list)/len(f1_list):4f}")

In [2]:
def my_eval(pred_list, ground_truths):
    """Print the mean exact-match and F1 over paired predictions and references.

    The two inputs are walked in parallel with ``zip``, so surplus items in
    the longer one are ignored. Nothing is returned; the summary line is
    printed.
    """
    scorer = Evaluator()
    # Score each pair once, EM first and then F1, keeping the original order.
    scored = [
        (
            scorer.exact_match_score(pred, gold)['correct'],
            scorer.f1_score(pred, gold)['f1'],
        )
        for pred, gold in zip(pred_list, ground_truths)
    ]
    em_list = [em for em, _ in scored]
    f1_list = [f1 for _, f1 in scored]
    print(f"EM: {sum(em_list)/len(em_list):4f}\t F1: {sum(f1_list)/len(f1_list):4f}")

## Load Results

In [3]:
# Choose the benchmark and read its processed question file; the 'answer'
# column supplies the references used for scoring.
dataset_name: Literal['nq', 'tq', 'sq'] = 'nq'

processed_path = f"./data/singlehop_data/processed_{dataset_name}.json"
raw_data = pd.read_json(processed_path)
ground_truth_list = raw_data['answer']

In [5]:
# Direct and RAG generations stored as JSON Lines.
# Replace both paths with your own output files.
direct_results, rag_results = (
    pd.read_json(results_path, lines=True)
    for results_path in ("./outputs_nq/direct.jsonl", "./outputs_nq/rag.jsonl")
)

In [6]:
# Sanity check: one direct row per question and ten RAG rows per question.
n_questions = len(ground_truth_list)
assert n_questions == len(direct_results)
assert 10 * n_questions == len(rag_results)

## Evaluate Baseline

In [7]:
## Llama 2
# Routing rule: keep the direct answer when its eigen score is below
# THRESHOLD; otherwise use the answer with the lowest eigen score among the
# ten RAG rows at positions 10*i .. 10*i+9.
THRESHOLD = -6.0
answer_list = []
for i, direct_res in direct_results.iterrows():
    if direct_res['eigen_score'] < THRESHOLD:
        answer_list.append(direct_res['answer'])
        continue
    batch_start = 10 * i
    rag_batch = rag_results.iloc[batch_start : batch_start + 10]
    best_label = rag_batch['eigen_score'].idxmin()
    answer_list.append(rag_batch.loc[best_label]['answer'])
my_eval(answer_list, ground_truth_list)

EM: 0.251524	 F1: 0.351987


In [7]:
## Llama 3.1
# Routing rule: keep the direct answer when its eigen score is below
# THRESHOLD; otherwise use the answer with the lowest eigen score among the
# ten RAG rows at positions 10*i .. 10*i+9.
THRESHOLD = -6.0
answer_list = []
for i, direct_res in direct_results.iterrows():
    if direct_res['eigen_score'] < THRESHOLD:
        answer_list.append(direct_res['answer'])
        continue
    batch_start = 10 * i
    rag_batch = rag_results.iloc[batch_start : batch_start + 10]
    best_label = rag_batch['eigen_score'].idxmin()
    answer_list.append(rag_batch.loc[best_label]['answer'])
my_eval(answer_list, ground_truth_list)

EM: 0.337119	 F1: 0.443897


## NQ Evaluation

In [8]:
# Load the NQ references, the direct / RAG generations, and the number of
# retrieved contexts per question (needed to slice rag_results later).
dataset_name: Literal['nq', 'tq', 'sq'] = 'nq'

# replace with your processed dataset file
processed_path = f"./data/singlehop_data/processed_{dataset_name}.json"
raw_data = pd.read_json(processed_path)
ground_truth_list = raw_data['answer']

# replace with your output files
direct_results = pd.read_json("./outputs_nq_ours_3_1/direct.jsonl", lines=True)
rag_results = pd.read_json("./outputs_nq_ours_3_1/rag.jsonl", lines=True)

# Each entry of the 'ctxs' column is measured with len() to get the
# per-question context count.
data_ = pd.read_json('./data/singlehop_data/nq_top10.json')
data_['len_ctxs'] = data_.ctxs.apply(len)
direct_results['len_ctxs'] = data_.len_ctxs

n_questions = len(ground_truth_list)
assert n_questions == len(direct_results)

In [28]:
## 3.1
# Routing rule: keep the direct answer when its eigen score is below
# THRESHOLD; otherwise take the lowest-eigen-score answer from the next
# `len_ctxs` rows of rag_results.
THRESHOLD = -6.0
answer_list = []
current_index = 0  # Pointer to keep track of the current position in rag_results

# NOTE(review): the pointer advances only for questions that take the RAG
# branch, so this lines up only if rag_results holds rows solely for those
# questions at this THRESHOLD. Verify against the script that wrote rag.jsonl.

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for i, direct_res in tqdm(direct_results.iterrows(), total=len(direct_results)):
    direct_eigen_score = direct_res['eigen_score']

    if direct_eigen_score < THRESHOLD:
        answer_list.append(direct_res['answer'])

    else:
        len_ctxs = direct_res['len_ctxs']
        rag_batch = rag_results.iloc[current_index : current_index + len_ctxs]

        # Look up the best row once and read the answer from it.
        best_rag_result = rag_batch.loc[rag_batch['eigen_score'].idxmin()]
        best_answer = best_rag_result['answer']

        answer_list.append(best_answer)
        current_index += len_ctxs

my_eval(answer_list, ground_truth_list)

0it [00:00, ?it/s]

EM: 0.358000	 F1: 0.483508


In [49]:
# Save predictions next to their references as JSON Lines with the columns
# 'predict' and 'answer'.
with open('nq_ours.jsonl', 'w', encoding='utf-8') as f:
    paired = pd.DataFrame([answer_list, ground_truth_list]).T
    paired = paired.rename(columns={0:'predict', 1:'answer'})
    paired.to_json(f, orient='records', lines=True)

In [50]:
# Run the external eval.py script on the saved NQ predictions; the two quoted
# arguments are presumably the prediction and ground-truth column names.
! python eval.py ./nq_ours.jsonl 'predict' 'answer'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 7078.83it/s]
Mean Accuracy: 0.4060
Mean EM: 0.3600
Mean F1: 0.4870


In [1]:
# Run the external bootstrap script on the saved NQ predictions with 1000
# rounds, naming the prediction and ground-truth columns.
! python bootrstrap_CI_upd.py --input_file "nq_ours.jsonl"\
                            --pred_col "predict"\
                            --gt_col "answer"\
                            --n_rounds 1000

bootstrap: 100%|██████████████████████████| 1000/1000 [00:00<00:00, 2860.47it/s]
--------------------------------------------------
 Accuracy (init, mean, median): 0.406, 0.406, 0.406 |Std: 0.023 | 95% CI: (0.36, 0.448)
 EM (init, mean, median): 0.36 , 0.36 , 0.36  | Std: 0.022 | 95% CI: (0.316, 0.404)
 F1 (init, mean, median): 0.487, 0.487, 0.488  | Std: 0.021 |95% CI: (0.445, 0.528)


## SQuAD Evaluation

In [3]:
# Load the SQuAD references, the direct / RAG generations, and the number of
# retrieved contexts per question (needed to slice rag_results later).
dataset_name: Literal['nq', 'tq', 'sq'] = 'sq'

processed_path = f"./data/singlehop_data/processed_{dataset_name}.json"
raw_data = pd.read_json(processed_path)
ground_truth_list = raw_data['answer']

# replace with your output files
direct_results = pd.read_json(f"./outputs_{dataset_name}_ours_3_1/direct.jsonl", lines=True)
rag_results = pd.read_json(f"./outputs_{dataset_name}_ours_3_1/rag.jsonl", lines=True)

# Each 'len_ctxs' entry is indexable; its first element is used as the count.
data_ = pd.read_json('./data/singlehop_data/sq_top10.json')
data_['len_ctxs'] = data_.len_ctxs.apply(lambda ctx_lens: ctx_lens[0])
direct_results['len_ctxs'] = data_.len_ctxs


n_questions = len(ground_truth_list)
assert n_questions == len(direct_results)
# assert 12 * len(ground_truth_list) == len(rag_results)

In [7]:
## 3.1
# Routing rule: keep the direct answer when its eigen score is below
# THRESHOLD; otherwise take the lowest-eigen-score answer from the next
# `len_ctxs` rows of rag_results.
THRESHOLD = -6.0
answer_list = []
current_index = 0  # Pointer to keep track of the current position in rag_results

# NOTE(review): the pointer advances only for questions that take the RAG
# branch, so this lines up only if rag_results holds rows solely for those
# questions at this THRESHOLD. Verify against the script that wrote rag.jsonl.

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for i, direct_res in tqdm(direct_results.iterrows(), total=len(direct_results)):
    direct_eigen_score = direct_res['eigen_score']

    if direct_eigen_score < THRESHOLD:
        answer_list.append(direct_res['answer'])

    else:
        len_ctxs = direct_res['len_ctxs']
        rag_batch = rag_results.iloc[current_index : current_index + len_ctxs]

        # Look up the best row once and read the answer from it.
        best_rag_result = rag_batch.loc[rag_batch['eigen_score'].idxmin()]
        best_answer = best_rag_result['answer']

        answer_list.append(best_answer)
        current_index += len_ctxs

my_eval(answer_list, ground_truth_list)

0it [00:00, ?it/s]

EM: 0.228000	 F1: 0.353668


In [26]:
# Save predictions next to their references as JSON Lines with the columns
# 'predict' and 'answer'.
with open('squad_ours.jsonl', 'w', encoding='utf-8') as f:
    paired = pd.DataFrame([answer_list, ground_truth_list]).T
    paired = paired.rename(columns={0:'predict', 1:'answer'})
    paired.to_json(f, orient='records', lines=True)

In [2]:
# Run the external eval.py script on the saved SQuAD predictions; the two
# quoted arguments are presumably the prediction and ground-truth column names.
! python eval.py ./squad_ours.jsonl 'predict' 'answer'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 6735.57it/s]
Mean Accuracy: 0.2680
Mean EM: 0.2260
Mean F1: 0.3612


In [7]:
# Run the external bootstrap script on the saved SQuAD predictions with 1000
# rounds, naming the prediction and ground-truth columns.
! python bootrstrap_CI_upd.py --input_file "squad_ours.jsonl"\
                            --pred_col "predict"\
                            --gt_col "answer"\
                            --n_rounds 1000

bootstrap: 100%|██████████████████████████| 1000/1000 [00:00<00:00, 2892.41it/s]
--------------------------------------------------
 Accuracy (init, mean, median): 0.268, 0.268, 0.268 |Std: 0.02 | 95% CI: (0.23, 0.306)
 EM (init, mean, median): 0.226, 0.226, 0.226 | Std: 0.019 | 95% CI: (0.19, 0.262)
 F1 (init, mean, median): 0.361, 0.361, 0.361  | Std: 0.019 |95% CI: (0.323, 0.4)


## TQ Evaluation

In [3]:
# Load the TQ references, the direct / RAG generations, and the number of
# retrieved contexts per question (needed to slice rag_results later).
dataset_name: Literal['nq', 'tq', 'sq'] = 'tq'

processed_path = f"./data/singlehop_data/processed_{dataset_name}.json"
raw_data = pd.read_json(processed_path)
ground_truth_list = raw_data['answer']

# replace with your output files
direct_results = pd.read_json(f"./outputs_{dataset_name}_ours_3_1/direct.jsonl", lines=True)
rag_results = pd.read_json(f"./outputs_{dataset_name}_ours_3_1/rag.jsonl", lines=True)

# Each 'len_ctxs' entry is indexable; its first element is used as the count.
data_ = pd.read_json(f'./data/singlehop_data/{dataset_name}_top10.json')
data_['len_ctxs'] = data_.len_ctxs.apply(lambda ctx_lens: ctx_lens[0])
direct_results['len_ctxs'] = data_.len_ctxs

n_questions = len(ground_truth_list)
assert n_questions == len(direct_results)

In [5]:
## 3.1
# Routing rule: keep the direct answer when its eigen score is below
# THRESHOLD; otherwise take the lowest-eigen-score answer from the next
# `len_ctxs` rows of rag_results.
THRESHOLD = -6.0
answer_list = []
current_index = 0  # Pointer to keep track of the current position in rag_results

# NOTE(review): the pointer advances only for questions that take the RAG
# branch, so this lines up only if rag_results holds rows solely for those
# questions at this THRESHOLD. Verify against the script that wrote rag.jsonl.

# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for i, direct_res in tqdm(direct_results.iterrows(), total=len(direct_results)):
    direct_eigen_score = direct_res['eigen_score']

    if direct_eigen_score < THRESHOLD:
        answer_list.append(direct_res['answer'])

    else:
        len_ctxs = direct_res['len_ctxs']
        rag_batch = rag_results.iloc[current_index : current_index + len_ctxs]

        # Look up the best row once and read the answer from it.
        best_rag_result = rag_batch.loc[rag_batch['eigen_score'].idxmin()]
        best_answer = best_rag_result['answer']

        answer_list.append(best_answer)
        current_index += len_ctxs

my_eval(answer_list, ground_truth_list)

0it [00:00, ?it/s]

EM: 0.598000	 F1: 0.692176


In [14]:
# Save predictions next to their references as JSON Lines with the columns
# 'predict' and 'answer'.
with open('tq_ours.jsonl', 'w', encoding='utf-8') as f:
    paired = pd.DataFrame([answer_list, ground_truth_list]).T
    paired = paired.rename(columns={0:'predict', 1:'answer'})
    paired.to_json(f, orient='records', lines=True)

In [15]:
# Run the external eval.py script on the saved TQ predictions; the two quoted
# arguments are presumably the prediction and ground-truth column names.
! python eval.py ./tq_ours.jsonl 'predict' 'answer'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 2183.41it/s]
Mean Accuracy: 0.6560
Mean EM: 0.5980
Mean F1: 0.6955


In [2]:
# Run the external bootstrap script on the saved TQ predictions with 1000
# rounds, naming the prediction and ground-truth columns.
! python bootrstrap_CI_upd.py --input_file "tq_ours.jsonl"\
                            --pred_col "predict"\
                            --gt_col "answer"\
                            --n_rounds 1000

bootstrap: 100%|██████████████████████████| 1000/1000 [00:00<00:00, 2902.51it/s]
--------------------------------------------------
 Accuracy (init, mean, median): 0.656, 0.656, 0.656 |Std: 0.021 | 95% CI: (0.616, 0.698)
 EM (init, mean, median): 0.598, 0.597, 0.596 | Std: 0.022 | 95% CI: (0.556, 0.642)
 F1 (init, mean, median): 0.696, 0.695, 0.695  | Std: 0.019 |95% CI: (0.659, 0.732)
