In [3]:
import json
from IPython.display import JSON
import importlib

In [4]:
dataset_module_path = 'mintaka_dev'
outfile = 'mintaka_dev_qwen3_3x3_1.out.1'

In [5]:
dataset_module = importlib.import_module(dataset_module_path)
QADataset = getattr(dataset_module, 'QADataset')
dataset = QADataset()

Loading /workspace/data/mintaka_dev.json


In [6]:
evaluation_raw = []
with open(outfile) as fd:
    line = fd.readline()
    while line:
        evaluation_raw.append(json.loads(line))
        line = fd.readline()

In [7]:
header = evaluation_raw[:1]
evaluation = evaluation_raw[1:]

In [8]:
predictions = set(range(len(evaluation)))

In [9]:
# only the answered
answered = set([i for i in predictions if 'Answer:' in evaluation[i].get('full_prediction','')])
print(len(answered) / len(evaluation))

0.6975


In [10]:
dontknow = set([i for i in predictions if "I don't know." in evaluation[i].get('full_prediction','')])
print(len(dontknow) / len(evaluation))

0.1765


In [11]:
final_answers = answered - dontknow
print(len(final_answers) / len(evaluation))

0.523


In [12]:
dataset[0]

{'id': '9ace9041',
 'question': 'What is the fourth book in the Twilight series?',
 'translations': {'ar': 'ما عنوان الكتاب الرابع في سلسلة "الشفق"؟',
  'de': 'Welches ist das vierte Buch in der Twilight-Saga-Serie?',
  'ja': 'トワイライトシリーズの四番目の本はなんですか？',
  'hi': 'Twilight सीरीज की चौथी किताब कौन सी है?',
  'pt': 'Qual é o quarto livro da série Crespúsculo?',
  'es': '¿Cuál es el cuarto libro de la saga Crepúsculo?',
  'it': 'Qual è il quarto libro della serie Twilight?',
  'fr': 'Quel est le quatrième livre de la série Twilight ?'},
 'questionEntity': [{'name': 'Q44523',
   'entityType': 'entity',
   'label': 'Twilight',
   'mention': 'Twilight',
   'span': [31, 39]},
  {'name': 4, 'entityType': 'ordinal', 'mention': 'fourth', 'span': [12, 18]}],
 'answer': {'answerType': 'entity',
  'answer': [{'name': 'Q53945',
    'label': {'en': 'Breaking Dawn',
     'ar': 'بزوغ الفجر',
     'de': 'Bis(s) zum Ende der Nacht',
     'es': 'Amanecer',
     'fr': 'Révélation',
     'hi': None,
     'it':

In [13]:
for i in final_answers:
    assert evaluation[i]['question'] == dataset[i]['question']

In [14]:
complexityType = {}
for i in final_answers:
    if not dataset[i]['complexityType'] in complexityType:
        complexityType[dataset[i]['complexityType']] = 0
    complexityType[dataset[i]['complexityType']] += 1 / len(final_answers)
complexityType

{'intersection': 0.09560229445506703,
 'difference': 0.07361376673040151,
 'comparative': 0.1309751434034419,
 'count': 0.06596558317399612,
 'multihop': 0.11089866156787782,
 'ordinal': 0.10994263862332715,
 'superlative': 0.062141491395793426,
 'yesno': 0.11472275334608052,
 'generic': 0.2361376673040146}

In [15]:
for i in final_answers:
    print(dataset[i]['question'], evaluation[i]['prediction'], dataset[i]['answer']['mention'])
    break

Which book in A Song of Ice and Fire also shares the name of a popular TV series?  A Game of Thrones.<|im_end|> A Game of Thrones


# Metrics

In [16]:
import numpy as np

In [17]:
final_answers_idx = list(final_answers)

## Exact match

In [18]:
exact_match = np.zeros((len(evaluation),))
for i in final_answers:
    if evaluation[i]['prediction'] == dataset[i]['answer']['mention']:
        exact_match[i] = 1
print(exact_match.mean())
print(exact_match[final_answers_idx].mean())

0.0
0.0


In [19]:
inclusion_match = np.zeros((len(evaluation),))
for i in final_answers:
    if dataset[i]['answer']['mention'] in evaluation[i]['prediction']:
        inclusion_match[i] = 1
print(inclusion_match.mean())
print(inclusion_match[final_answers_idx].mean())

0.2875
0.5497131931166348


## BLEU, METEOR, ROUGE

In [20]:
import nltk
nltk.download('wordnet')

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
bleu_1 = np.zeros((len(evaluation),))
bleu_4 = np.zeros((len(evaluation),))
meteor = np.zeros((len(evaluation),))
rougeL = np.zeros((len(evaluation), 3)) # precision, recall, f1
for i in final_answers:
    reference = dataset[i]['answer']['mention']
    reference_list = reference.split()
    hypothesis = evaluation[i]['prediction']
    hypothesis_list = hypothesis.split()
    bleu_1[i] = nltk.translate.bleu_score.sentence_bleu([reference_list], hypothesis_list, weights=(1,))
    bleu_4[i] = nltk.translate.bleu_score.sentence_bleu([reference_list], hypothesis_list, weights=(0.25, 0.25, 0.25, 0.25))
    meteor[i] = nltk.translate.meteor_score.meteor_score([reference_list], hypothesis_list)
    rouge_scores = scorer.score(reference, hypothesis)
    rougeL[i] = np.array(rouge_scores['rougeL'])
print('BLEU 1', bleu_1.mean())
print('BLEU 4', bleu_4.mean())
print('METEOR', meteor.mean())
print('rougeL', rougeL.mean(axis=0))
print('--final answers--')
print('BLEU 1', bleu_1[final_answers_idx].mean())
print('BLEU 4', bleu_4[final_answers_idx].mean())
print('METEOR', meteor[final_answers_idx].mean())
print('rougeL', rougeL[final_answers_idx].mean(axis=0))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU 1 0.04581046048563094
BLEU 4 0.0025929680412004287
METEOR 0.07907652969182206
rougeL [0.05962403 0.33270893 0.08941297]
--final answers--
BLEU 1 0.08759170264938994
BLEU 4 0.004957873883748429
METEOR 0.15119795352164828
rougeL [0.11400388 0.63615474 0.1709617 ]


## Excel

In [22]:
import pandas as pd
import os

In [23]:
from transformers import AutoTokenizer
def pd_generator(evaluation, dataset, EM, IM, bleu1, bleu4, meteor, rougeL, final_answers, dontknow):
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct')
    for i in range(len(evaluation)):
        prediction = evaluation[i]
        target = dataset[i]

        assert prediction['question'] == target['question']
        
        yield (target['question'],
            prediction['prediction'],
            target['answer']['mention'],  
            EM[i],
            IM[i],
            bleu1[i],
            bleu4[i],
            meteor[i],
            rougeL[i,2],
            i in final_answers,
            i in dontknow,
            prediction['full_sample'],
            list(map(tokenizer.decode,prediction['triples'])),
            target['answer'])

In [27]:
data = pd_generator(evaluation, dataset, exact_match, inclusion_match, bleu_1, bleu_4, meteor, rougeL, final_answers, dontknow)
columns = ['Question', 'Prediction', 'Answer', 'EM', 'IM', 'BLEU1', 'B4', 'METEOR', 'ROUGEL', 'Answered', 'DontKnow', 'FULL', 'Triples', 'AnswerBig']
evaldf = pd.DataFrame(data, columns = columns)
print(evaldf.shape)
evaldf.head()

(2000, 14)


Unnamed: 0,Question,Prediction,Answer,EM,IM,BLEU1,B4,METEOR,ROUGEL,Answered,DontKnow,FULL,Triples,AnswerBig
0,What is the fourth book in the Twilight series?,,Breaking Dawn,0.0,0.0,0.0,0.0,0.0,0.0,False,False,<|im_start|>system\nYou are a helpful question...,"[ <Twilight> <author> <Julia Frankau> .\n, <T...","{'answerType': 'entity', 'answer': [{'name': '..."
1,How many games are in the Uncharted series?,,6,0.0,0.0,0.0,0.0,0.0,0.0,False,False,<|im_start|>system\nYou are a helpful question...,[ <Uncharted 4: A Thief's End> <platform> <Pla...,"{'answerType': 'numerical', 'answer': [6], 'me..."
2,"As of 2015, which group held the record for th...",,U2,0.0,0.0,0.0,0.0,0.0,0.0,False,False,<|im_start|>system\nYou are a helpful question...,[ <The Weeknd> <record label> <Republic Record...,"{'answerType': 'entity', 'answer': [{'name': '..."
3,Who is the oldest person to ever win an Academ...,,James Ivory,0.0,0.0,0.0,0.0,0.0,0.0,False,False,<|im_start|>system\nYou are a helpful question...,[ <Noel M. Smith> <date of birth> <1895-05-22T...,"{'answerType': 'entity', 'answer': [{'name': '..."
4,Which Mario Kart games do not feature Link as ...,,"Super Mario Kart, Mario Kart 64, Mario Kart: S...",0.0,0.0,0.0,0.0,0.0,0.0,False,False,<|im_start|>system\nYou are a helpful question...,"[ <Mario Kart 64> <characters> <Yoshi> .\n, <...","{'answerType': 'entity', 'answer': [{'name': '..."


In [26]:
xlsx_file = f'{dataset_module_path}_{outfile}.xlsx'
assert not os.path.isfile(xlsx_file), f'Error: {xlsx_file} already exists'
print(xlsx_file)
with pd.ExcelWriter(xlsx_file) as writer:
    evaldf.to_excel(writer, sheet_name=f"{dataset_module_path} X {outfile}")

mintaka_dev_mintaka_dev_qwen3_3x3_1.out.1.xlsx




1 correct ~
4 wrong
6 correct -
8 correct +
12 wrong
13 wrong
14 wrong

common problems:
- model answer too quickly --> prompt
- need to generate more triples --> prompt (maybe more than one-shot)
- need to generate different triples!! --> can I calculate number of leaves at index time to avoid the model to fall in the same leaf?

In [44]:
JSON(evaluation[6])

<IPython.core.display.JSON object>

In [45]:
i = 2
print(evaluation[i]['question'], dataset[i-1])
print(evaluation[i]['full_prediction'])

How many games are in the Uncharted series? How many games are in the Uncharted series?
Reasoning: To answer how many games are in the Uncharted series, I need to find information about the total number of games released under the Uncharted franchise. I will look for a list of all Uncharted games and count them.
Fact: <Uncharted 4: A Thief's End> <platform> <PlayStation 4> .
Fact: <Uncharted 3: Drake's Deception> <platform> <PlayStation 3> .
Fact: <Uncharted 2: Among Thieves> <platform> <PlayStation 3> .
Fact: <Uncharted> <platform> <PlayStation 3> .
Fact: <Uncharted 2: Among Thieves> <platform> <PlayStation 4> .
Fact: <Uncharted 4: A Thief's End> <developer> <Naughty Dog> .
Fact: <Uncharted 3: Drake's Deception> <developer> <Naughty Dog> .
Fact: <Uncharted 2: Among Thieves> <developer> <Naughty Dog> .
Fact: <Uncharted> <developer> <Naughty Dog> .
Fact: <Uncharted 4: A Thief's End> <publisher> <Sony Interactive Entertainment> .
Fact: <Uncharted 3: Drake's Deception> <publisher> <Sony I

In [46]:
print(dataset[i-1])

How many games are in the Uncharted series?


In [47]:
for i in range(1, len(evaluation)):
    print(i, evaluation[i]['prediction'])

1 
2 
3 
4 
5 
6  A Game of Thrones.<|im_end|>
7  The Seattle Seahawks have played in Super Bowls XLII, XLIII, XLV, XLIX, LI, LII, LIII, and LV but have not won any of them.<|im_end|>
8  I don't know.<|im_end|>
9  Yes, "Risky Business" came out before "Top Gun".<|im_end|>
10 
11  Morgan Freeman is older.
Note: It's worth mentioning that both actors share the same year of birth, but Morgan Freeman was born on June 1, 1937, while Anthony Hopkins was born on December 31, 1937, which makes Morgan Freeman the older of the two.<|im_end|>
12  Anthony Hopkins is older than George Clooney.<|im_end|>
13  There is not enough information provided to determine which movie had the lowest production cost between "Eraserhead" and "Little Miss Sunshine".<|im_end|>
14  Brad Pitt is older than George Clooney.<|im_end|>
15  No, "Gone with the Wind" did not come out before 1940.<|im_end|>
16  Johnny Depp is older.<|im_end|>
17 
18  I don't know.<|im_end|>
19  Yes, the movie Diamonds Are Forever came out be