In [1]:
import json
from IPython.display import JSON
import importlib

In [2]:
#dataset_module_path = 'mintaka_dev'
#outfile = 'mintaka_dev_qwen3_3x3_1.out.1'
#
dataset_config_path = '2wikimultihop_sample_dev'
outfile = '2wiki_sample_dev_qwen3_3x3_1.out.0'

In [3]:
if dataset_config_path.endswith('.py'):
    dataset_config_path = dataset_config_path[:-3]
dataset_module = importlib.import_module(dataset_config_path)
dataset = getattr(dataset_module, 'dataset')

Loading /workspace/data/2wikimultihop/data/dev.json


In [4]:
evaluation_raw = []
with open(outfile) as fd:
    line = fd.readline()
    while line:
        evaluation_raw.append(json.loads(line))
        line = fd.readline()

In [5]:
header = evaluation_raw[:1]
evaluation = evaluation_raw[1:]

In [6]:
predictions = set(range(len(evaluation)))

In [18]:
fix_missing_prediction = True
if fix_missing_prediction:
    import re
    answer_pattern = re.compile('Answer: (.*)\.?')
    def get_prediction(full_prediction, remove_dot=True, eos_token='<|im_end|>'):
        prediction = ''
    
        full_prediction = full_prediction.split(eos_token, 1)[0]
        if remove_dot and full_prediction.endswith('.'):
            full_prediction = full_prediction[:-len('.')]
        match = answer_pattern.search(full_prediction)
        if match:
            prediction = match.group(1)
    
        return prediction
    
    for i in range(len(evaluation)):
        if evaluation[i]['prediction'] is None or evaluation[i]['prediction'] == '':
            evaluation[i]['prediction'] = get_prediction(evaluation[i]['full_prediction'])

In [45]:
# only the answered
answered = set([i for i in predictions if evaluation[i].get('prediction','')])
print(len(answered) / len(evaluation))

0.76


In [46]:
dontknow = set([i for i in predictions if evaluation[i].get('prediction','').startswith("I don't know")])
print(len(dontknow) / len(evaluation))

0.3


In [47]:
final_answers = answered - dontknow
print(len(final_answers) / len(evaluation))

0.46


In [48]:
#dataset[0]

In [49]:
for i in final_answers:
    assert evaluation[i]['question'] == dataset[i]['question']

In [50]:
# for mintaka
complexityType = {}
for i in final_answers:
    if not dataset[i]['complexityType'] in complexityType:
        complexityType[dataset[i]['complexityType']] = 0
    complexityType[dataset[i]['complexityType']] += 1 / len(final_answers)
complexityType

KeyError: 'complexityType'

In [51]:
for i in dontknow:
    print(dataset[i]['question'], '--', evaluation[i]['prediction'], '?==', dataset.get_answer(i))
    break

Which film has the director born first, 30 Days In Atlanta or Les Tuche 3? -- I don't know ?== 30 Days In Atlanta


In [52]:
evaluation[1]

{'question': 'Which film has the director born first, 30 Days In Atlanta or Les Tuche 3?',
 'answer_complete': False,
 'prediction': "I don't know",
 'full_prediction': "Reasoning: To determine which film has the director born first, I need to find the birth years of the directors of both films. However, the information provided does not include the birth years of the directors. Without this information, I cannot accurately compare the birth years and thus cannot determine which film's director was born first.\nFact: <30 Days in Atlanta> <genre> <Comedy film> .\nFact: <Les Tuche 3> <genre> <Comedy film> .\nSince the birth years of the directors are not provided, I do not have enough information to determine which director was born first.\n\nAnswer: I don't know.<|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end

# Metrics

In [53]:
import numpy as np

In [54]:
final_answers_idx = list(final_answers)

## Exact match

In [55]:
exact_match = np.zeros((len(evaluation),))
for i in final_answers:
    if evaluation[i]['prediction'] == dataset.get_answer(i):
        exact_match[i] = 1
print(exact_match.mean())
print(exact_match[final_answers_idx].mean())

0.08
0.17391304347826086


In [56]:
inclusion_match = np.zeros((len(evaluation),))
for i in final_answers:
    if dataset.get_answer(i) in evaluation[i]['prediction']:
        inclusion_match[i] = 1
print(inclusion_match.mean())
print(inclusion_match[final_answers_idx].mean())

0.2
0.43478260869565216


## BLEU, METEOR, ROUGE

In [57]:
import nltk
nltk.download('wordnet')

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
bleu_1 = np.zeros((len(evaluation),))
bleu_4 = np.zeros((len(evaluation),))
meteor = np.zeros((len(evaluation),))
rougeL = np.zeros((len(evaluation), 3)) # precision, recall, f1
for i in final_answers:
    reference = dataset.get_answer(i)
    reference_list = reference.split()
    hypothesis = evaluation[i]['prediction']
    hypothesis_list = hypothesis.split()
    bleu_1[i] = nltk.translate.bleu_score.sentence_bleu([reference_list], hypothesis_list, weights=(1,))
    bleu_4[i] = nltk.translate.bleu_score.sentence_bleu([reference_list], hypothesis_list, weights=(0.25, 0.25, 0.25, 0.25))
    meteor[i] = nltk.translate.meteor_score.meteor_score([reference_list], hypothesis_list)
    rouge_scores = scorer.score(reference, hypothesis)
    rougeL[i] = np.array(rouge_scores['rougeL'])
print('BLEU 1', bleu_1.mean())
print('BLEU 4', bleu_4.mean())
print('METEOR', meteor.mean())
print('rougeL', rougeL.mean(axis=0))
print('--final answers--')
print('BLEU 1', bleu_1[final_answers_idx].mean())
print('BLEU 4', bleu_4[final_answers_idx].mean())
print('METEOR', meteor[final_answers_idx].mean())
print('rougeL', rougeL[final_answers_idx].mean(axis=0))

BLEU 1 0.16040579710144925
BLEU 4 1.7633882395129633e-155
METEOR 0.17634327749035067
rougeL [0.22239065 0.34       0.23556652]
--final answers--
BLEU 1 0.34870825456836796
BLEU 4 3.833452694593399e-155
METEOR 0.38335495106597967
rougeL [0.48345793 0.73913043 0.51210114]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## Excel

In [59]:
import pandas as pd
import os

In [65]:
from transformers import AutoTokenizer
def pd_generator(evaluation, dataset, EM, IM, bleu1, bleu4, meteor, rougeL, final_answers, dontknow):
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct')
    for i in range(len(evaluation)):
        prediction = evaluation[i]
        target = dataset[i]

        assert prediction['question'] == target['question']
        
        yield (target['question'],
            prediction['prediction'],
            dataset.get_answer(i),  
            EM[i],
            IM[i],
            bleu1[i],
            bleu4[i],
            meteor[i],
            rougeL[i,2],
            i in final_answers,
            i in dontknow,
            prediction['full_prediction'],
            prediction['full_sample'],
            prediction['triples'],
            target['answer'])

In [66]:
data = pd_generator(evaluation, dataset, exact_match, inclusion_match, bleu_1, bleu_4, meteor, rougeL, final_answers, dontknow)

In [67]:
columns = ['Question', 'Prediction', 'Answer', 'EM', 'IM', 'BLEU1', 'B4', 'METEOR', 'ROUGEL', 'Answered', 'DontKnow', 'FULL prediction', 'FULL sample', 'Triples', 'AnswerBig']
evaldf = pd.DataFrame(data, columns = columns)
print(evaldf.shape)
evaldf.head()

(50, 15)


Unnamed: 0,Question,Prediction,Answer,EM,IM,BLEU1,B4,METEOR,ROUGEL,Answered,DontKnow,FULL prediction,FULL sample,Triples,AnswerBig
0,Where was the father of Murad I born?,"Orhan I, the father of Murad I, was born in Sö...",Söğüt,0.0,1.0,0.0,0.0,0.0,0.285714,True,False,Reasoning: To find out where the father of Mur...,<|im_end|><|im_end|><|im_end|><|im_end|><|im_e...,"[ <Murad I> <father> <Orhan> .\n, <Orhan> <pl...",Söğüt
1,"Which film has the director born first, 30 Day...",I don't know,30 Days In Atlanta,0.0,0.0,0.0,0.0,0.0,0.0,False,True,Reasoning: To determine which film has the dir...,<|im_start|>system\nYou are a helpful question...,[ <30 Days in Atlanta> <genre> <Comedy film> ....,30 Days In Atlanta
2,Who is Henriette Of France (1727–1752)'s mater...,,Stanisław Leszczyński,0.0,0.0,0.0,0.0,0.0,0.0,False,False,"Reasoning: To answer this question, I need to ...",<|im_start|>system\nYou are a helpful question...,[ <Henriette of France> <mother> <Marie Leszcz...,Stanisław Leszczyński
3,Are both Les Coquetiers Railway Station and Ga...,"No, both Les Coquetiers Railway Station and Ga...",yes,0.0,0.0,0.0,0.0,0.0,0.0,True,False,Reasoning: To determine if both Les Coquetiers...,<|im_start|>system\nYou are a helpful question...,[ <Les Coquetiers railway station> <located in...,yes
4,"Which film has the director who died later, Go...",Goya or the Hard Way to Enlightenment.,Sins Of Rome,0.0,0.0,0.0,0.0,0.0,0.0,True,False,Reasoning: To determine which film has the dir...,<|im_end|><|im_end|><|im_start|>system\nYou ar...,[ <Goya or the Hard Way to Enlightenment> <dir...,Sins Of Rome


In [69]:
xlsx_file = f'{dataset_config_path}_{outfile}.xlsx'
assert not os.path.isfile(xlsx_file), f'Error: {xlsx_file} already exists'
print(xlsx_file)
with pd.ExcelWriter(xlsx_file) as writer:
    evaldf.to_excel(writer, sheet_name=f"{dataset_config_path} X {outfile}")

2wikimultihop_sample_dev_2wiki_sample_dev_qwen3_3x3_1.out.0.xlsx




1 correct ~
4 wrong
6 correct -
8 correct +
12 wrong
13 wrong
14 wrong

common problems:
- model answer too quickly --> prompt
- need to generate more triples --> prompt (maybe more than one-shot)
- need to generate different triples!! --> can I calculate number of leaves at index time to avoid the model to fall in the same leaf?

In [44]:
JSON(evaluation[6])

<IPython.core.display.JSON object>

In [45]:
i = 2
print(evaluation[i]['question'], dataset[i-1])
print(evaluation[i]['full_prediction'])

How many games are in the Uncharted series? How many games are in the Uncharted series?
Reasoning: To answer how many games are in the Uncharted series, I need to find information about the total number of games released under the Uncharted franchise. I will look for a list of all Uncharted games and count them.
Fact: <Uncharted 4: A Thief's End> <platform> <PlayStation 4> .
Fact: <Uncharted 3: Drake's Deception> <platform> <PlayStation 3> .
Fact: <Uncharted 2: Among Thieves> <platform> <PlayStation 3> .
Fact: <Uncharted> <platform> <PlayStation 3> .
Fact: <Uncharted 2: Among Thieves> <platform> <PlayStation 4> .
Fact: <Uncharted 4: A Thief's End> <developer> <Naughty Dog> .
Fact: <Uncharted 3: Drake's Deception> <developer> <Naughty Dog> .
Fact: <Uncharted 2: Among Thieves> <developer> <Naughty Dog> .
Fact: <Uncharted> <developer> <Naughty Dog> .
Fact: <Uncharted 4: A Thief's End> <publisher> <Sony Interactive Entertainment> .
Fact: <Uncharted 3: Drake's Deception> <publisher> <Sony I

In [46]:
print(dataset[i-1])

How many games are in the Uncharted series?


In [47]:
for i in range(1, len(evaluation)):
    print(i, evaluation[i]['prediction'])

1 
2 
3 
4 
5 
6  A Game of Thrones.<|im_end|>
7  The Seattle Seahawks have played in Super Bowls XLII, XLIII, XLV, XLIX, LI, LII, LIII, and LV but have not won any of them.<|im_end|>
8  I don't know.<|im_end|>
9  Yes, "Risky Business" came out before "Top Gun".<|im_end|>
10 
11  Morgan Freeman is older.
Note: It's worth mentioning that both actors share the same year of birth, but Morgan Freeman was born on June 1, 1937, while Anthony Hopkins was born on December 31, 1937, which makes Morgan Freeman the older of the two.<|im_end|>
12  Anthony Hopkins is older than George Clooney.<|im_end|>
13  There is not enough information provided to determine which movie had the lowest production cost between "Eraserhead" and "Little Miss Sunshine".<|im_end|>
14  Brad Pitt is older than George Clooney.<|im_end|>
15  No, "Gone with the Wind" did not come out before 1940.<|im_end|>
16  Johnny Depp is older.<|im_end|>
17 
18  I don't know.<|im_end|>
19  Yes, the movie Diamonds Are Forever came out be