## Evaluator

In [None]:
# This script is modified from https://github.com/oneal2000/DRAGIN
from typing import Dict, List, Callable, Tuple, Union, Callable
import string
import os
import json
import re
import numpy as np
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd

class MultiHopEvaluator:
    """Exact-match and token-level F1 scoring for multi-hop QA answers.

    Predictions and references are compared after ``normalize_answer``
    (lower-casing, punctuation and article removal, whitespace collapsing).
    Subclasses may override ``get_all_alias`` to accept alternative surface
    forms of a reference answer.
    """

    # Closed-form answers: F1 only credits these when prediction and
    # reference are identical after normalization.
    _CLOSED_ANSWERS = ('yes', 'no', 'noanswer')

    @classmethod
    def get_all_alias(cls, ground_truth_id: str) -> List[str]:
        """Return the aliases known for ``ground_truth_id`` (none in the base class)."""
        return []

    @classmethod
    def normalize_answer(cls, s):
        """Lower-case ``s``, strip punctuation and articles, collapse whitespace.

        Non-string input (e.g. a missing value) normalizes to the empty string.
        """
        if not isinstance(s, str):
            return ""
        punctuation = set(string.punctuation)
        text = ''.join(ch for ch in s.lower() if ch not in punctuation)
        text = re.sub(r'\b(a|an|the)\b', ' ', text)
        return ' '.join(text.split())

    @classmethod
    def _ground_truth_set(cls, ground_truth, ground_truth_id=None):
        """Collect the reference answer(s) plus any aliases into one set."""
        references = {ground_truth} if isinstance(ground_truth, str) else set(ground_truth)
        if ground_truth_id and isinstance(ground_truth_id, str):
            references.update(cls.get_all_alias(ground_truth_id))
        return references

    @classmethod
    def exact_match_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
        ground_truth_id: Union[str, List[str]] = None
    ):
        """Score whether the prediction equals any reference after normalization.

        Args:
            prediction: Model answer; an empty/falsy value counts as incorrect.
            ground_truth: One reference string or a collection of them.
            ground_truth_id: Optional id whose aliases are also accepted.

        Returns:
            ``{'correct': 0 or 1, 'incorrect': 1 - correct}``.
        """
        if not prediction:
            return {'correct': 0, 'incorrect': 1}
        normalized_prediction = cls.normalize_answer(prediction)
        # any() is well defined for an empty reference collection, where
        # taking a maximum over an empty list would raise.
        correct = int(any(
            normalized_prediction == cls.normalize_answer(reference)
            for reference in cls._ground_truth_set(ground_truth, ground_truth_id)
        ))
        return {'correct': correct, 'incorrect': 1 - correct}

    @classmethod
    def f1_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
        ground_truth_id: Union[str, List[str]] = None
    ):
        """Compute token-overlap F1, precision and recall against the references.

        Each of the three values is the maximum reached over all references
        (and aliases); they are maximized independently of one another.

        Returns:
            ``{'f1': ..., 'precision': ..., 'recall': ...}``, all 0 when the
            prediction is empty or shares no token with any reference.
        """
        final_metric = {'f1': 0, 'precision': 0, 'recall': 0}
        if not prediction:
            return final_metric

        # The prediction is the same for every reference, so normalize it once.
        normalized_prediction = cls.normalize_answer(prediction)
        prediction_tokens = normalized_prediction.split()
        prediction_counts = Counter(prediction_tokens)

        for reference in cls._ground_truth_set(ground_truth, ground_truth_id):
            normalized_reference = cls.normalize_answer(reference)
            if normalized_prediction != normalized_reference and (
                normalized_prediction in cls._CLOSED_ANSWERS
                or normalized_reference in cls._CLOSED_ANSWERS
            ):
                continue
            reference_tokens = normalized_reference.split()
            num_same = sum((prediction_counts & Counter(reference_tokens)).values())
            if num_same == 0:
                continue

            precision = num_same / len(prediction_tokens)
            recall = num_same / len(reference_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            for name, value in (('f1', f1), ('precision', precision), ('recall', recall)):
                final_metric[name] = max(value, final_metric[name])
        return final_metric

    def eval_answer(self, results_df, answer_col="Final Answer"):
        """Print mean EM and F1 of ``results_df[answer_col]`` against ``ground_truth``.

        For datasets that have no answer ids or aliases. Raises
        ZeroDivisionError when ``results_df`` has no rows.
        """
        em_list = []
        f1_list = []
        for _, row in results_df.iterrows():
            prediction = row[answer_col]
            ground_truth = row['ground_truth']
            em_list.append(self.exact_match_score(prediction, ground_truth, None)['correct'])
            f1_list.append(self.f1_score(prediction, ground_truth, None)['f1'])
        # NOTE: ':4f' is a minimum width of 4 with the default 6 decimals, not 4 decimals.
        print(f"EM: {sum(em_list)/len(em_list):4f}\t F1: {sum(f1_list)/len(f1_list):4f}")

## TwoWikiHop

In [2]:
class WikiMultiHopEvaluator(MultiHopEvaluator):
    """Evaluator for 2WikiMultiHopQA that also accepts answer aliases.

    Loads ``dev.json`` (questions, answers and answer ids) and
    ``id_aliases.json`` (one JSON object per line with ``Q_id`` and
    ``aliases``) from ``data_path``.
    """

    # Q_id -> aliases, shared by the class. Defaults to empty so that
    # get_all_alias() works before init_id_aliases() has been called.
    id_alias: Dict[str, List[str]] = {}

    def __init__(self, data_path: str = "data/multihop_data/2wikimultihopqa"):
        """Read the dev split and the alias table found under ``data_path``."""
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(os.path.join(data_path, 'dev.json'), 'r', encoding='utf-8') as fin:
            examples = json.load(fin)
        self.dataset = [
            {
                'qid': example['_id'],
                'question': example['question'],
                'answer': example['answer'],
                'answer_id': example['answer_id'],
            }
            for example in tqdm(examples)
        ]
        self.dataset_from_qid = {entry['qid']: entry for entry in self.dataset}
        self.init_id_aliases(data_path)

    @classmethod
    def init_id_aliases(cls, data_path):
        """(Re)load the class-level ``Q_id -> aliases`` table from ``id_aliases.json``."""
        aliases: Dict[str, List[str]] = {}
        with open(os.path.join(data_path, 'id_aliases.json'), 'r', encoding='utf-8') as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
                    continue
                record = json.loads(line)
                aliases[record['Q_id']] = record['aliases']
        cls.id_alias = aliases

    @classmethod
    def get_all_alias(cls, ground_truth_id: str) -> List[str]:
        """Return the aliases stored for ``ground_truth_id``, or ``[]`` if unknown."""
        if not ground_truth_id:
            return []
        return cls.id_alias.get(ground_truth_id, [])

    def get_real_prediction(self, pred):
        """Extract the text after "the answer is", without end-of-sequence markers.

        When the marker phrase is absent ``pred`` is returned unchanged.
        Otherwise a trailing ``</s>``, ``<|endoftext|>`` and final ``.`` are
        stripped, in that order.
        """
        marker = "the answer is"
        if marker not in pred:
            return pred
        # +1 skips the character (normally a space) that follows the marker.
        pred = pred[pred.find(marker) + len(marker) + 1:]
        for suffix in ("</s>", "<|endoftext|>", "."):
            if pred.endswith(suffix):
                pred = pred[:-len(suffix)]
        return pred

    def eval_answer(self, results_df, answer_col="Final Answer"):
        """Print mean EM and F1, accepting the aliases of each row's answer id.

        Rows are matched to the dev split through their ``qid``; a qid that is
        not in the loaded dataset raises KeyError.
        """
        em_list = []
        f1_list = []
        for _, row in results_df.iterrows():
            prediction = row[answer_col]
            ground_truth = row['ground_truth']
            ground_truth_id = self.dataset_from_qid[row['qid']]['answer_id']
            em_list.append(self.exact_match_score(prediction, ground_truth, ground_truth_id)['correct'])
            f1_list.append(self.f1_score(prediction, ground_truth, ground_truth_id)['f1'])
        # NOTE: ':4f' is a minimum width of 4 with the default 6 decimals, not 4 decimals.
        print(f"EM: {sum(em_list)/len(em_list):4f}\t F1: {sum(f1_list)/len(f1_list):4f}")

In [None]:
# Load the per-question results of the twowikihop run and report the row count
results_df = pd.read_json(path_or_buf="outputs/twowikihop_llama3/results.jsonl", lines=True)
print(results_df.shape[0])

In [None]:
# Score every answer variant with the alias-aware evaluator
twowikihop_evaluator = WikiMultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    twowikihop_evaluator.eval_answer(results_df, answer_col=column_name)

## HotpotQA

In [None]:
# Load the HotpotQA run. The assignment must not end with a comma: that would
# bind a 1-tuple instead of the DataFrame, so len() would print 1 and the
# evaluator's .iterrows() call would fail.
results_df = pd.read_json("outputs/hotpotqa_llama3/results.jsonl", lines=True)
print(len(results_df))

In [None]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

In [10]:
### OUR

In [8]:
# Load the hotpot_3_1_1120_1549 run and report how many questions it holds
results_df = pd.read_json(path_or_buf="outputs_new/hotpot_3_1_1120_1549/results.jsonl", lines=True)
print(results_df.shape[0])

500


In [9]:
results_df

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,5a8481945542997175ce1ed3,Are Steve Perry and Leslie West both singers?,1,7,yes,yes,yes,yes
1,5ae479745542996836b02cb1,What is the name of the movie that stars Katri...,1,7,sex drive,sex drive,sex drive,Sex Drive
2,5ac42f42554299076e296d88,Who starred in My Dog Skip and Malcolm in the ...,1,7,frankie muniz,frankie muniz,frankie muniz,Frankie Muniz
3,5a7c17c95542990527d5544d,A guitar player called Noodles is a longtime m...,1,7,the offspring,the offspring,the offspring,The Offspring
4,5a8e0d04554299068b959e3a,What country does Haringey Heartlands and The ...,1,7,england,the united kingdom,england,London
...,...,...,...,...,...,...,...,...
495,5a8a5bd355429970aeb702b9,Falling Stars is a role-playing game published...,3,15,the united states,united states,the united states,Lithuania
496,5ae68fcb5542992ae0d1635b,"""Ew!"" is a song by a television host born where?",2,10,palermo,palermo,palermo,"Bay Ridge, Brooklyn"
497,5a74f3f55542993748c8974b,"What movie was the actor born on May 4, 1946 i...",2,10,the poseidon adventure,the poseidon adventure,the poseidon adventure,Five Summer Stories
498,5ae524415542992663a4f121,In what city did the 23rd overall pick of the ...,2,11,helsinki,grand rapids,helsinki,"Helsinki, Finland"


In [12]:
results_df['Retrieval Times'].mean()

1.758

In [10]:
results_df['Call LLM Times'].mean()

9.85

In [8]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

Final Answer
EM: 0.400000	 F1: 0.522819
Final Step Answer
EM: 0.386000	 F1: 0.510819
Final Read Answer
EM: 0.416000	 F1: 0.531038


In [3]:
# Music: load the music_3_1_1123_2147 run and report how many questions it holds
results_df = pd.read_json(path_or_buf="outputs_new/music_3_1_1123_2147/results.jsonl", lines=True)
print(results_df.shape[0])

500


In [4]:
results_df

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,2hop__249867_557232,Which country is the Desert Forest Golf Club l...,1,7,united states,united states,united states,Maricopa County
1,2hop__606439_127399,What year did the group that performed From Th...,1,7,2011,2011,2011,2005
2,2hop__496961_554601,What's the place of birth of the former member...,1,7,sydney,sydney,sydney,Kingscliff
3,2hop__149710_108549,Who is the show Hawkgirl is from by?,1,7,justice league unlimited,justice league unlimited,justice league unlimited,Alfred Gough
4,2hop__50910_177869,Who married the publisher of abolitionist news...,1,7,frederick douglass,anna murray,frederick douglass,Helen Pitts Douglass
...,...,...,...,...,...,...,...,...
495,3hop1__649930_15840_36002,What were the Genesis' advantages over the pla...,5,23,"1. The Sega Genesis had a 16-bit processor, wh...",16-bit processor,"1. The Sega Genesis had a 16-bit processor, wh...",built on 16-bit architectures and offered impr...
496,2hop__198459_61845,Who has played for both West Ham Ajax and the ...,5,22,michael essien,St. Mirren.,St. Mirren.,Mido
497,4hop2__5206_14670_8987_8529,"When was ""Slavs"" used in the national anthem o...",5,21,1927,1926,1926,1943–1992
498,4hop3__439878_88460_30152_20999,How were people from whom new coins were a pro...,5,19,not mentioned,the king of rattanakosin kingdom,not mentioned,The dynasty regrouped and defeated the Portuguese


In [5]:
results_df['Retrieval Times'].mean()

2.398

In [7]:
results_df['Call LLM Times'].mean()

12.314

In [6]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

Final Answer
EM: 0.112000	 F1: 0.215475
Final Step Answer
EM: 0.110000	 F1: 0.215397
Final Read Answer
EM: 0.106000	 F1: 0.207597


In [4]:
# 2Wiki: load the 2wiki_3_1_1123_2315 run, print its size and preview the first rows
results_df = pd.read_json("outputs_new/2wiki_3_1_1123_2315/results.jsonl", lines=True)
print(len(results_df))
results_df.head(3)

500


Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,5c83d5960bde11eba7f7acde48001122,Where did Edward Hoby's father study?,1,7,cambridge,cambridge university,cambridge,"St. John's College, Cambridge"
1,8ee8259e0bdb11eba7f7acde48001122,Where did Abe Laboriel Jr 's father study?,1,7,berklee college of music,berklee college of music,berklee college of music,Berklee College of Music
2,7adefa720bdc11eba7f7acde48001122,"Where did George Tryon, 1St Baron Tryon's fath...",1,7,london,london,london,Tripoli


In [5]:
results_df['Retrieval Times'].mean(), results_df['Call LLM Times'].mean()

(2.436, 12.278)

In [6]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

Final Answer
EM: 0.382000	 F1: 0.460338
Final Step Answer
EM: 0.382000	 F1: 0.456268
Final Read Answer
EM: 0.366000	 F1: 0.445003


## IIRC

In [None]:
# Load the IIRC run, print its size and preview the first rows
results_df = pd.read_json(path_or_buf="outputs/iirc_llama3/results.jsonl", lines=True)
print(results_df.shape[0])
results_df.head(n=5)

In [None]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

## Metrics OUR SCRIPT

In [1]:
from typing import Dict, List, Callable, Tuple, Union, Callable
import string
import os
import json
import re
import numpy as np
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd

class MultiHopEvaluator:
    """Exact match, token F1 and answer-containment scoring for multi-hop QA.

    Predictions and references are compared after ``normalize_answer``
    (lower-casing, punctuation and article removal, whitespace collapsing).
    Subclasses may override ``get_all_alias`` to accept alternative surface
    forms of a reference answer.
    """

    # Closed-form answers: F1 only credits these when prediction and
    # reference are identical after normalization.
    _CLOSED_ANSWERS = ('yes', 'no', 'noanswer')

    @classmethod
    def get_all_alias(cls, ground_truth_id: str) -> List[str]:
        """Return the aliases known for ``ground_truth_id`` (none in the base class)."""
        return []

    @classmethod
    def normalize_answer(cls, s):
        """Lower-case ``s``, strip punctuation and articles, collapse whitespace.

        Non-string input (e.g. a missing value) normalizes to the empty string.
        """
        if not isinstance(s, str):
            return ""
        punctuation = set(string.punctuation)
        text = ''.join(ch for ch in s.lower() if ch not in punctuation)
        text = re.sub(r'\b(a|an|the)\b', ' ', text)
        return ' '.join(text.split())

    @classmethod
    def _ground_truth_set(cls, ground_truth, ground_truth_id=None):
        """Collect the reference answer(s) plus any aliases into one set."""
        references = {ground_truth} if isinstance(ground_truth, str) else set(ground_truth)
        if ground_truth_id and isinstance(ground_truth_id, str):
            references.update(cls.get_all_alias(ground_truth_id))
        return references

    @classmethod
    def exact_match_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
        ground_truth_id: Union[str, List[str]] = None
    ):
        """Score whether the prediction equals any reference after normalization.

        Args:
            prediction: Model answer; an empty/falsy value counts as incorrect.
            ground_truth: One reference string or a collection of them.
            ground_truth_id: Optional id whose aliases are also accepted.

        Returns:
            ``{'correct': 0 or 1, 'incorrect': 1 - correct}``.
        """
        if not prediction:
            return {'correct': 0, 'incorrect': 1}
        normalized_prediction = cls.normalize_answer(prediction)
        # any() is well defined for an empty reference collection, where
        # taking a maximum over an empty list would raise.
        correct = int(any(
            normalized_prediction == cls.normalize_answer(reference)
            for reference in cls._ground_truth_set(ground_truth, ground_truth_id)
        ))
        return {'correct': correct, 'incorrect': 1 - correct}

    @classmethod
    def f1_score(
        cls,
        prediction: str,
        ground_truth: Union[str, List[str]],
        ground_truth_id: Union[str, List[str]] = None
    ):
        """Compute token-overlap F1, precision and recall against the references.

        Each of the three values is the maximum reached over all references
        (and aliases); they are maximized independently of one another.

        Returns:
            ``{'f1': ..., 'precision': ..., 'recall': ...}``, all 0 when the
            prediction is empty or shares no token with any reference.
        """
        final_metric = {'f1': 0, 'precision': 0, 'recall': 0}
        if not prediction:
            return final_metric

        # The prediction is the same for every reference, so normalize it once.
        normalized_prediction = cls.normalize_answer(prediction)
        prediction_tokens = normalized_prediction.split()
        prediction_counts = Counter(prediction_tokens)

        for reference in cls._ground_truth_set(ground_truth, ground_truth_id):
            normalized_reference = cls.normalize_answer(reference)
            if normalized_prediction != normalized_reference and (
                normalized_prediction in cls._CLOSED_ANSWERS
                or normalized_reference in cls._CLOSED_ANSWERS
            ):
                continue
            reference_tokens = normalized_reference.split()
            num_same = sum((prediction_counts & Counter(reference_tokens)).values())
            if num_same == 0:
                continue

            precision = num_same / len(prediction_tokens)
            recall = num_same / len(reference_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            for name, value in (('f1', f1), ('precision', precision), ('recall', recall)):
                final_metric[name] = max(value, final_metric[name])
        return final_metric

    @classmethod
    def has_answer(cls, answers: Union[str, List[str]], prediction: str, match_type="string"):
        """Return 1 if any answer occurs in the prediction as a contiguous token run.

        Both sides are normalized first. ``match_type`` is accepted for
        interface compatibility and is not used. An answer that normalizes to
        no tokens (e.g. only articles or punctuation) is skipped, since an
        empty token run would otherwise match every prediction.
        """
        if not prediction:
            return 0
        if isinstance(answers, str):
            answers = [answers]
        prediction_tokens = cls.normalize_answer(prediction).split()
        for answer in answers:
            answer_tokens = cls.normalize_answer(answer).split()
            if not answer_tokens:
                continue
            span = len(answer_tokens)
            if any(
                prediction_tokens[start:start + span] == answer_tokens
                for start in range(len(prediction_tokens) - span + 1)
            ):
                return 1
        return 0

    def eval_answer(self, results_df, answer_col="Final Answer"):
        """Print mean EM, F1 and containment accuracy for ``results_df[answer_col]``.

        For datasets that have no answer ids or aliases. ``ground_truth`` may
        hold a single string or a list of acceptable strings per row. Raises
        ZeroDivisionError when ``results_df`` has no rows.
        """
        em_list = []
        f1_list = []
        accuracy_list = []
        for _, row in results_df.iterrows():
            prediction = row[answer_col]
            ground_truth = row['ground_truth']
            # Anything that is not a string is assumed to already be a list of answers.
            ground_truths = [ground_truth] if isinstance(ground_truth, str) else ground_truth
            em_list.append(self.exact_match_score(prediction, ground_truths, None)['correct'])
            f1_list.append(self.f1_score(prediction, ground_truths, None)['f1'])
            accuracy_list.append(self.has_answer(ground_truths, prediction))
        print(f"EM: {sum(em_list)/len(em_list):.4f}\t F1: {sum(f1_list)/len(f1_list):.4f}\t Accuracy: {sum(accuracy_list)/len(accuracy_list):.4f}")


### HotPot

In [8]:
import os
import glob
import re
import pandas as pd

# Define the path to the log directory
log_dir = './outputs_new/hotpot_3_1/logs'  # Change this if your log directory is different

# This pattern only accepts hexadecimal question ids
qid_pattern = re.compile(r'INFO - qid:\s*([a-fA-F0-9]+)')
# Number logged after "Final step answer Better" / "Final read answer Better".
# The optional exponent keeps a score written in scientific notation
# (e.g. 1.5e-05) from being truncated to its mantissa.
final_answer_pattern = re.compile(
    r'INFO - Final (?:step|read) answer Better\s+([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)'
)


def extract_last_eigen_score(content):
    """Return the first qid and the last final-answer score found in one log's text.

    Either value is None when the log does not contain it.
    """
    qid_match = qid_pattern.search(content)
    eigen_scores = final_answer_pattern.findall(content)
    return {
        'qid': qid_match.group(1) if qid_match else None,
        'last_step_eigen_score': float(eigen_scores[-1]) if eigen_scores else None,
    }


# One row per .log file; sorted so the row order does not depend on the filesystem
results = []
for log_file in sorted(glob.glob(os.path.join(log_dir, '*.log'))):
    with open(log_file, 'r', encoding='utf-8') as f:
        results.append(extract_last_eigen_score(f.read()))


df = pd.DataFrame(results)

In [9]:

# Build the score table, keep only logs where both the qid and the eigen
# score were found, and renumber the surviving rows from zero
df = (
    pd.DataFrame(results)
    .dropna(subset=['qid', 'last_step_eigen_score'])
    .reset_index(drop=True)
)

In [10]:
df

Unnamed: 0,qid,last_step_eigen_score
0,5a90937955429933b8a20568,-1.729184
1,5a8b78775542997f31a41d3d,-0.191309
2,5a7db62d5542997cc2c4747c,-5.440386
3,5a84af3e5542991dd0999d7c,-0.232433
4,5a828a7b55429954d2e2eb69,-0.407650
...,...,...
495,5abb03325542992ccd8e7eae,-2.965832
496,5ab74eb85542993667793fb8,-2.185149
497,5a7a5b3a5542996c55b2dd71,-3.272111
498,5adf65555542992d7e9f9334,-1.690654


In [17]:
# Attach each question's eigen score to its result row (inner join on qid)
total_hotpot = pd.merge(results_df, df, on='qid')
total_hotpot

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth,last_step_eigen_score
0,5a8481945542997175ce1ed3,Are Steve Perry and Leslie West both singers?,1,7,yes,yes,yes,yes,-3.188928
1,5ae479745542996836b02cb1,What is the name of the movie that stars Katri...,1,7,sex drive,sex drive,sex drive,Sex Drive,-3.080185
2,5ac42f42554299076e296d88,Who starred in My Dog Skip and Malcolm in the ...,1,7,frankie muniz,frankie muniz,frankie muniz,Frankie Muniz,-4.222337
3,5a7c17c95542990527d5544d,A guitar player called Noodles is a longtime m...,1,7,the offspring,the offspring,the offspring,The Offspring,-1.602943
4,5a8e0d04554299068b959e3a,What country does Haringey Heartlands and The ...,1,7,england,the united kingdom,england,London,-0.931478
...,...,...,...,...,...,...,...,...,...
495,5a8a5bd355429970aeb702b9,Falling Stars is a role-playing game published...,3,15,the united states,united states,the united states,Lithuania,-0.596343
496,5ae68fcb5542992ae0d1635b,"""Ew!"" is a song by a television host born where?",2,10,palermo,palermo,palermo,"Bay Ridge, Brooklyn",-0.697833
497,5a74f3f55542993748c8974b,"What movie was the actor born on May 4, 1946 i...",2,10,the poseidon adventure,the poseidon adventure,the poseidon adventure,Five Summer Stories,-0.300882
498,5ae524415542992663a4f121,In what city did the 23rd overall pick of the ...,2,11,helsinki,grand rapids,helsinki,"Helsinki, Finland",-0.586357


In [23]:
# Persist the merged table as JSON Lines (one record per line)
scores_path = "./outputs_new/hotpot_3_1/scores.json"
with open(scores_path, 'w', encoding='utf-8') as f:
    f.write(total_hotpot.to_json(orient='records', lines=True))

In [11]:
# Load the hotpot_3_1 run, print its size and preview the first rows
results_df = pd.read_json(path_or_buf="outputs_new/hotpot_3_1/results.jsonl", lines=True)
print(results_df.shape[0])
results_df.head(n=3)

500


Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,5a8481945542997175ce1ed3,Are Steve Perry and Leslie West both singers?,1,7,yes,yes,yes,yes
1,5ae479745542996836b02cb1,What is the name of the movie that stars Katri...,1,7,sex drive,sex drive,sex drive,Sex Drive
2,5ac42f42554299076e296d88,Who starred in My Dog Skip and Malcolm in the ...,1,7,frankie muniz,frankie muniz,frankie muniz,Frankie Muniz


In [28]:
results_df['Call LLM Times'].mean()

9.85

In [20]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

Final Answer
EM: 0.4000	 F1: 0.5228	 Accuracy: 0.4240
Final Step Answer
EM: 0.3860	 F1: 0.5108	 Accuracy: 0.4180
Final Read Answer
EM: 0.4160	 F1: 0.5310	 Accuracy: 0.4320


In [10]:
! python bootstrap_multihop.py --input_file "outputs_new/hotpot_3_1/results.jsonl"\
                            --pred_cols "Final Answer" "Final Step Answer" "Final Read Answer"\
                            --gt_col "ground_truth"\
                            --n_rounds 1000

Final Answer
EM: 0.4000	 F1: 0.5228	 Accuracy: 0.4240
Bootstrapping for Final Answer: 100%|█████| 1000/1000 [00:00<00:00, 2750.99it/s]
Final Answer EM - Mean: 0.4009, Median: 0.4000, Std Dev: 0.0221, 95% CI: (0.3540, 0.4440)
Final Answer F1 - Mean: 0.5235, Median: 0.5235, Std Dev: 0.0209, 95% CI: (0.4819, 0.5636)
Final Answer Accuracy - Mean: 0.4246, Median: 0.4260, Std Dev: 0.0224, 95% CI: (0.3800, 0.4680)
--------------------------------------------------
Final Step Answer
EM: 0.3860	 F1: 0.5108	 Accuracy: 0.4180
Bootstrapping for Final Step Answer: 100%|█| 1000/1000 [00:00<00:00, 2506.40it/s
Final Step Answer EM - Mean: 0.3856, Median: 0.3860, Std Dev: 0.0219, 95% CI: (0.3439, 0.4280)
Final Step Answer F1 - Mean: 0.5104, Median: 0.5110, Std Dev: 0.0202, 95% CI: (0.4709, 0.5502)
Final Step Answer Accuracy - Mean: 0.4177, Median: 0.4180, Std Dev: 0.0225, 95% CI: (0.3720, 0.4620)
--------------------------------------------------
Final Read Answer
EM: 0.4160	 F1: 0.5310	 Accuracy: 0.43

### Music

In [29]:
import os
import glob
import re
import pandas as pd

# Define the path to the log directory
log_dir = './outputs_new/music_3_1/logs'  # Change this if your log directory is different

# Accepts any non-whitespace qid, e.g. "03ab64820bdb11eba7f7acde48001122" or "2hop__176_51164"
qid_pattern = re.compile(r'INFO\s*-\s*qid:\s*([^\s]+)', re.IGNORECASE)

# Captures the number logged after "Final step answer Better" or "Final Read Answer Better".
# The optional exponent keeps a score written in scientific notation
# (e.g. 2.5e-05) from being truncated to its mantissa.
final_answer_pattern = re.compile(
    r'INFO\s*-\s*Final\s+(?:step|read)\s+answer\s+Better\s+([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)',
    re.IGNORECASE
)


def pair_qids_with_scores(qid_matches, eigen_scores, log_file=''):
    """Turn the qids and score strings found in one log into result rows.

    Equal counts are paired in order of appearance. Unequal non-empty counts
    print a warning and keep only the last qid with the last score. If either
    list is empty a single row is returned with None for the missing value.
    """
    if qid_matches and eigen_scores:
        if len(qid_matches) != len(eigen_scores):
            print(f"Warning: Number of qids and eigen scores do not match in {log_file}. Pairing last qid with last eigen score.")
            qid_matches, eigen_scores = qid_matches[-1:], eigen_scores[-1:]
        return [
            {'qid': qid, 'last_step_eigen_score': float(score)}
            for qid, score in zip(qid_matches, eigen_scores)
        ]
    return [{
        'qid': qid_matches[-1] if qid_matches else None,
        'last_step_eigen_score': float(eigen_scores[-1]) if eigen_scores else None,
    }]


# Rows collected from every .log file; sorted so the row order does not
# depend on the filesystem
results = []
for log_file in sorted(glob.glob(os.path.join(log_dir, '*.log'))):
    with open(log_file, 'r', encoding='utf-8') as f:
        content = f.read()
    results.extend(pair_qids_with_scores(
        qid_pattern.findall(content),
        final_answer_pattern.findall(content),
        log_file,
    ))

# Create a DataFrame from the results
df = pd.DataFrame(results)
df

Unnamed: 0,qid,last_step_eigen_score
0,2hop__504464_708278,-3.740366
1,2hop__274529_5385,-0.461703
2,2hop__157602_21567,-0.211736
3,2hop__75714_21969,0.400266
4,2hop__731584_700117,-0.677838
...,...,...
495,3hop1__106313_443779_52195,0.433040
496,3hop1__145411_443779_52195,0.587324
497,2hop__819974_129669,0.801444
498,2hop__5190_64006,0.034412


In [30]:
# Attach each question's eigen score to its result row (inner join on qid)
total_music = pd.merge(results_df, df, on='qid')
total_music

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth,last_step_eigen_score
0,2hop__249867_557232,Which country is the Desert Forest Golf Club l...,1,7,united states,united states,united states,Maricopa County,-2.573724
1,2hop__606439_127399,What year did the group that performed From Th...,1,7,2011,2011,2011,2005,-3.947639
2,2hop__496961_554601,What's the place of birth of the former member...,1,7,sydney,sydney,sydney,Kingscliff,-3.226050
3,2hop__149710_108549,Who is the show Hawkgirl is from by?,1,7,justice league unlimited,justice league unlimited,justice league unlimited,Alfred Gough,-1.642056
4,2hop__50910_177869,Who married the publisher of abolitionist news...,1,7,frederick douglass,anna murray,frederick douglass,Helen Pitts Douglass,-0.155800
...,...,...,...,...,...,...,...,...,...
495,3hop1__649930_15840_36002,What were the Genesis' advantages over the pla...,5,23,"1. The Sega Genesis had a 16-bit processor, wh...",16-bit processor,"1. The Sega Genesis had a 16-bit processor, wh...",built on 16-bit architectures and offered impr...,0.777605
496,2hop__198459_61845,Who has played for both West Ham Ajax and the ...,5,22,michael essien,St. Mirren.,St. Mirren.,Mido,0.908945
497,4hop2__5206_14670_8987_8529,"When was ""Slavs"" used in the national anthem o...",5,21,1927,1926,1926,1943–1992,0.747119
498,4hop3__439878_88460_30152_20999,How were people from whom new coins were a pro...,5,19,not mentioned,the king of rattanakosin kingdom,not mentioned,The dynasty regrouped and defeated the Portuguese,0.378867


In [31]:
# Persist the merged table as JSON Lines (one record per line)
scores_path = "./outputs_new/music_3_1/scores.json"
with open(scores_path, 'w', encoding='utf-8') as f:
    f.write(total_music.to_json(orient='records', lines=True))

In [25]:
# Load the music_3_1 run, print its size and preview the first rows
results_df = pd.read_json(path_or_buf="outputs_new/music_3_1/results.jsonl", lines=True)
print(results_df.shape[0])
results_df.head(n=3)

500


Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,2hop__249867_557232,Which country is the Desert Forest Golf Club l...,1,7,united states,united states,united states,Maricopa County
1,2hop__606439_127399,What year did the group that performed From Th...,1,7,2011,2011,2011,2005
2,2hop__496961_554601,What's the place of birth of the former member...,1,7,sydney,sydney,sydney,Kingscliff


In [22]:
# Report the metrics for each of the three answer columns
evaluator = MultiHopEvaluator()
for column_name in ("Final Answer", "Final Step Answer", "Final Read Answer"):
    print(column_name)
    evaluator.eval_answer(results_df, answer_col=column_name)

Final Answer
EM: 0.1120	 F1: 0.2155	 Accuracy: 0.1180
Final Step Answer
EM: 0.1100	 F1: 0.2154	 Accuracy: 0.1180
Final Read Answer
EM: 0.1060	 F1: 0.2076	 Accuracy: 0.1120


In [11]:
! python bootstrap_multihop.py --input_file "outputs_new/music_3_1/results.jsonl"\
                            --pred_cols "Final Answer" "Final Step Answer" "Final Read Answer"\
                            --gt_col "ground_truth"\
                            --n_rounds 1000

Final Answer
EM: 0.1120	 F1: 0.2155	 Accuracy: 0.1180
Bootstrapping for Final Answer: 100%|█████| 1000/1000 [00:00<00:00, 2749.38it/s]
Final Answer EM - Mean: 0.1126, Median: 0.1120, Std Dev: 0.0140, 95% CI: (0.0860, 0.1400)
Final Answer F1 - Mean: 0.2159, Median: 0.2151, Std Dev: 0.0152, 95% CI: (0.1881, 0.2475)
Final Answer Accuracy - Mean: 0.1187, Median: 0.1180, Std Dev: 0.0143, 95% CI: (0.0900, 0.1460)
--------------------------------------------------
Final Step Answer
EM: 0.1100	 F1: 0.2154	 Accuracy: 0.1180
Bootstrapping for Final Step Answer: 100%|█| 1000/1000 [00:00<00:00, 2524.60it/s
Final Step Answer EM - Mean: 0.1105, Median: 0.1100, Std Dev: 0.0142, 95% CI: (0.0840, 0.1380)
Final Step Answer F1 - Mean: 0.2153, Median: 0.2146, Std Dev: 0.0155, 95% CI: (0.1856, 0.2461)
Final Step Answer Accuracy - Mean: 0.1185, Median: 0.1180, Std Dev: 0.0147, 95% CI: (0.0900, 0.1480)
--------------------------------------------------
Final Read Answer
EM: 0.1060	 F1: 0.2076	 Accuracy: 0.11

### 2Wiki

In [36]:
# Define the path to the log directory
log_dir = './outputs_new/2wiki_3_1/logs'  # Change this if your log directory is different

# One row per (qid, eigen score) pair recovered from the logs
results = []

# NOTE: qid_pattern and final_answer_pattern are not defined in this cell;
# they must already exist from a previously executed cell.
for log_file in glob.glob(os.path.join(log_dir, '*.log')):
    with open(log_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Every qid and every final-answer eigen score mentioned in this log
    qid_matches = qid_pattern.findall(content)
    eigen_scores = final_answer_pattern.findall(content)

    if not (qid_matches and eigen_scores):
        # At least one side is missing: keep a single row with None for the gap
        qid = qid_matches[-1] if qid_matches else None
        last_eigen_score = float(eigen_scores[-1]) if eigen_scores else None
        results.append({
            'qid': qid,
            'last_step_eigen_score': last_eigen_score
        })
        continue

    # Equal counts are assumed to correspond in order of appearance;
    # otherwise only the last qid and the last score are kept.
    if len(qid_matches) == len(eigen_scores):
        paired = zip(qid_matches, eigen_scores)
    else:
        print(f"Warning: Number of qids and eigen scores do not match in {log_file}. Pairing last qid with last eigen score.")
        paired = [(qid_matches[-1], eigen_scores[-1])]

    results.extend(
        {'qid': qid, 'last_step_eigen_score': float(score)}
        for qid, score in paired
    )

df = pd.DataFrame(results)
df

Unnamed: 0,qid,last_step_eigen_score
0,1c8e11ac08e311ebbda4ac1f6bf848b6,0.015028
1,064edb0908cd11ebbd93ac1f6bf848b6,-1.623762
2,6181829c0bb011ebab90acde48001122,-4.571876
3,d0995a680bda11eba7f7acde48001122,-2.118671
4,3e0cebde0bd911eba7f7acde48001122,-2.391613
...,...,...
495,519aee780baf11ebab90acde48001122,0.581712
496,883d60f208a611ebbd7fac1f6bf848b6,-0.639635
497,373b656c0bdc11eba7f7acde48001122,-2.789710
498,dbd9551d086b11ebbd60ac1f6bf848b6,-0.410766


In [37]:
# Join the per-question results with the eigen scores on their shared 'qid'
# column (pandas' default inner join: only qids present in both are kept).
total_2wiki = pd.merge(results_df, df, on='qid')
total_2wiki

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth,last_step_eigen_score
0,5c83d5960bde11eba7f7acde48001122,Where did Edward Hoby's father study?,1,7,cambridge,cambridge university,cambridge,"St. John's College, Cambridge",-3.364396
1,8ee8259e0bdb11eba7f7acde48001122,Where did Abe Laboriel Jr 's father study?,1,7,berklee college of music,berklee college of music,berklee college of music,Berklee College of Music,-3.133579
2,7adefa720bdc11eba7f7acde48001122,"Where did George Tryon, 1St Baron Tryon's fath...",1,7,london,london,london,Tripoli,-2.175076
3,36ffddee0bdc11eba7f7acde48001122,Where was the place of death of the director o...,1,7,los angeles,los angeles,los angeles,Hollywood,-1.517935
4,0e18c6be0bde11eba7f7acde48001122,Where did the performer of song Words Of Love ...,1,7,clear lake,clear lake,clear lake,"Clear Lake, Iowa",-4.130205
...,...,...,...,...,...,...,...,...,...
495,b23e2363086211ebbd5dac1f6bf848b6,"Which film has the director who died later, Li...",3,15,lincoln in the white house,lincoln in the white house,lincoln in the white house,Lincoln In The White House,-0.062791
496,3c3204c8084d11ebbd56ac1f6bf848b6,"Which film whose director is younger, I Enjoy ...",5,22,fear on trial,fear on trial,fear on trial,I Enjoy The World With You,-0.222997
497,cc6b0e9e084e11ebbd56ac1f6bf848b6,"Which film has the director born earlier, Spea...",5,21,being there,being there,being there,Being There,-0.579441
498,3f1b7d3b086c11ebbd61ac1f6bf848b6,Are both directors of films Shadow Of The Law ...,5,22,no,no,no,yes,-0.603179


In [39]:
# Persist the merged table as one JSON record per line
scores_path = "./outputs_new/2wiki_3_1/scores.json"
with open(scores_path, mode='w', encoding='utf-8') as f:
    total_2wiki.to_json(f, orient='records', lines=True)

In [33]:
# Load the 2Wiki results file (one JSON record per line), report the row
# count, and preview the first three rows.
results_path = "outputs_new/2wiki_3_1/results.jsonl"
results_df = pd.read_json(results_path, lines=True)
print(results_df.shape[0])
results_df.head(3)

500


Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,5c83d5960bde11eba7f7acde48001122,Where did Edward Hoby's father study?,1,7,cambridge,cambridge university,cambridge,"St. John's College, Cambridge"
1,8ee8259e0bdb11eba7f7acde48001122,Where did Abe Laboriel Jr 's father study?,1,7,berklee college of music,berklee college of music,berklee college of music,Berklee College of Music
2,7adefa720bdc11eba7f7acde48001122,"Where did George Tryon, 1St Baron Tryon's fath...",1,7,london,london,london,Tripoli


In [24]:
# Evaluate each answer column of results_df in turn, printing the column name
# ahead of the metrics that eval_answer reports for it.
evaluator = MultiHopEvaluator()
answer_columns = ("Final Answer", "Final Step Answer", "Final Read Answer")
for column_name in answer_columns:
    print(column_name)
    evaluator.eval_answer(answer_col=column_name, results_df=results_df)

Final Answer
EM: 0.3820	 F1: 0.4603	 Accuracy: 0.3980
Final Step Answer
EM: 0.3820	 F1: 0.4563	 Accuracy: 0.3980
Final Read Answer
EM: 0.3660	 F1: 0.4450	 Accuracy: 0.3820


In [12]:
# Shell escape: run bootstrap_multihop.py on the 2Wiki results file for the
# three answer columns against the "ground_truth" column, with --n_rounds 1000.
! python bootstrap_multihop.py --input_file "outputs_new/2wiki_3_1/results.jsonl"\
                            --pred_cols "Final Answer" "Final Step Answer" "Final Read Answer"\
                            --gt_col "ground_truth"\
                            --n_rounds 1000

Final Answer
EM: 0.3820	 F1: 0.4603	 Accuracy: 0.3980
Bootstrapping for Final Answer: 100%|█████| 1000/1000 [00:00<00:00, 2703.41it/s]
Final Answer EM - Mean: 0.3820, Median: 0.3810, Std Dev: 0.0218, 95% CI: (0.3420, 0.4240)
Final Answer F1 - Mean: 0.4601, Median: 0.4591, Std Dev: 0.0206, 95% CI: (0.4214, 0.5004)
Final Answer Accuracy - Mean: 0.3981, Median: 0.3980, Std Dev: 0.0217, 95% CI: (0.3580, 0.4380)
--------------------------------------------------
Final Step Answer
EM: 0.3820	 F1: 0.4563	 Accuracy: 0.3980
Bootstrapping for Final Step Answer: 100%|█| 1000/1000 [00:00<00:00, 2520.99it/s
Final Step Answer EM - Mean: 0.3805, Median: 0.3800, Std Dev: 0.0217, 95% CI: (0.3380, 0.4220)
Final Step Answer F1 - Mean: 0.4551, Median: 0.4552, Std Dev: 0.0207, 95% CI: (0.4150, 0.4960)
Final Step Answer Accuracy - Mean: 0.3966, Median: 0.3960, Std Dev: 0.0217, 95% CI: (0.3540, 0.4380)
--------------------------------------------------
Final Read Answer
EM: 0.3660	 F1: 0.4450	 Accuracy: 0.38

In [12]:
# Load the music_3_1_1123_2147 results file (one JSON record per line) and
# report how many rows it contains.
results_path = "outputs_new/music_3_1_1123_2147/results.jsonl"
results_df = pd.read_json(results_path, lines=True)
print(results_df.shape[0])

500


In [13]:
# Bare expression: the notebook renders the loaded results table as cell output.
results_df

Unnamed: 0,qid,question,Retrieval Times,Call LLM Times,Final Step Answer,Final Read Answer,Final Answer,ground_truth
0,2hop__249867_557232,Which country is the Desert Forest Golf Club l...,1,7,united states,united states,united states,Maricopa County
1,2hop__606439_127399,What year did the group that performed From Th...,1,7,2011,2011,2011,2005
2,2hop__496961_554601,What's the place of birth of the former member...,1,7,sydney,sydney,sydney,Kingscliff
3,2hop__149710_108549,Who is the show Hawkgirl is from by?,1,7,justice league unlimited,justice league unlimited,justice league unlimited,Alfred Gough
4,2hop__50910_177869,Who married the publisher of abolitionist news...,1,7,frederick douglass,anna murray,frederick douglass,Helen Pitts Douglass
...,...,...,...,...,...,...,...,...
495,3hop1__649930_15840_36002,What were the Genesis' advantages over the pla...,5,23,"1. The Sega Genesis had a 16-bit processor, wh...",16-bit processor,"1. The Sega Genesis had a 16-bit processor, wh...",built on 16-bit architectures and offered impr...
496,2hop__198459_61845,Who has played for both West Ham Ajax and the ...,5,22,michael essien,St. Mirren.,St. Mirren.,Mido
497,4hop2__5206_14670_8987_8529,"When was ""Slavs"" used in the national anthem o...",5,21,1927,1926,1926,1943–1992
498,4hop3__439878_88460_30152_20999,How were people from whom new coins were a pro...,5,19,not mentioned,the king of rattanakosin kingdom,not mentioned,The dynasty regrouped and defeated the Portuguese


In [15]:
# Write the final answers and ground truths to music_ours.jsonl, one JSON
# record per line, under the column names 'predict' and 'answer'.
with open('music_ours.jsonl', 'w', encoding='utf-8') as f:
    answer_pairs = results_df[['Final Answer', 'ground_truth']]
    answer_pairs = answer_pairs.rename(
        columns={'Final Answer': 'predict', 'ground_truth': 'answer'}
    )
    answer_pairs.to_json(f, orient='records', lines=True)

In [16]:
# Shell escape: run eval.py on the exported music_ours.jsonl file.
# NOTE(review): the two trailing arguments are presumably the prediction column
# followed by the ground-truth column — confirm against eval.py.
! python eval.py ./music_ours.jsonl 'predict' 'answer'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 2407.25it/s]
Mean Accuracy: 0.8440
Mean EM: 0.0100
Mean F1: 0.0187


In [8]:
# Shell escape: run eval.py directly on the music_3_1_1123_2147 results file.
# NOTE(review): the two trailing arguments are presumably the prediction column
# followed by the ground-truth column — confirm against eval.py.
! python eval.py ./outputs_new/music_3_1_1123_2147/results.jsonl 'Final Answer'	'ground_truth'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 2245.49it/s]
Mean Accuracy: 0.8440
Mean EM: 0.0100
Mean F1: 0.0187


In [9]:
# Shell escape: run eval.py directly on the hotpot_3_1_1120_1549 results file.
# NOTE(review): the two trailing arguments are presumably the prediction column
# followed by the ground-truth column — confirm against eval.py.
! python eval.py ./outputs_new/hotpot_3_1_1120_1549/results.jsonl 'Final Answer' 'ground_truth'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 2593.49it/s]
Mean Accuracy: 0.7860
Mean EM: 0.0060
Mean F1: 0.0202


In [10]:
# Shell escape: run eval.py directly on the 2wiki_3_1_1123_2315 results file.
# NOTE(review): the two trailing arguments are presumably the prediction column
# followed by the ground-truth column — confirm against eval.py.
! python eval.py ./outputs_new/2wiki_3_1_1123_2315/results.jsonl 'Final Answer'	'ground_truth'

100%|███████████████████████████████████████| 500/500 [00:00<00:00, 2629.35it/s]
Mean Accuracy: 0.8260
Mean EM: 0.0040
Mean F1: 0.0403
