Working on the following tasks:
* https://www.notion.so/msalnikov/b0b68b3db11b4c40a4bada127bfde310?v=635216a0f3d646d58fde31f60cc9e4c9&p=82caba2f68c94f4ea320134e855e7bb4&pm=c
* https://www.notion.so/msalnikov/b0b68b3db11b4c40a4bada127bfde310?v=635216a0f3d646d58fde31f60cc9e4c9&p=0cb24ac3b2804a0798e08a3f3e0f0526&pm=c
* https://www.notion.so/msalnikov/b0b68b3db11b4c40a4bada127bfde310?v=635216a0f3d646d58fde31f60cc9e4c9&p=4b768d2e96aa4a9a8cc6c8d12267976a&pm=c

In [1]:
import transformers
import datasets
import requests
import torch
import random
import numpy as np
from pywikidata import Entity
from joblib import Memory
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
import ujson
import os

from kbqa.seq2seq.utils import convert_to_features

from typing import Union
from collections import defaultdict, OrderedDict


from evaluateqa.mintaka import evaluate as evaluate_mintaka
from evaluateqa.mintaka import calculate_metrics_for_prediction
from evaluateqa.mintaka.evaluate import normalize_and_tokenize_text

tqdm.pandas()

In [2]:
# Expose only physical GPU 1 to this process (must be set before CUDA is first initialised).
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
# Seed the torch, stdlib and NumPy random number generators.
# NOTE(review): NumPy is seeded with 0 while the other two use 8, and all three
# are seeded again with 13 in the generation cell — confirm this is intentional.
torch.manual_seed(8)
random.seed(8)
np.random.seed(0)

In [4]:
# On-disk joblib cache used to memoise Wikidata search calls across runs.
memory = Memory('/tmp/cache', verbose=0)

@memory.cache
def get_wd_search_results(
    search_string: str,
    max_results: int = 500,
    language: str = 'en',
    mediawiki_api_url: str = "https://www.wikidata.org/w/api.php",
    user_agent: str = None,
) -> list:
    """Search Wikidata entities by text and return their ids.

    Calls the ``wbsearchentities`` action of the MediaWiki API page by page
    (50 hits per request) and collects the ``id`` field of every hit. The
    result is memoised on disk through ``memory.cache``.

    Args:
        search_string: Text to search for.
        max_results: Upper bound on the number of ids returned. Paging stops
            as soon as this many ids were collected; values <= 0 yield ``[]``.
        language: Language code passed to the API.
        mediawiki_api_url: Endpoint of the MediaWiki API.
        user_agent: ``User-Agent`` header value; ``"pywikidata"`` when ``None``.

    Returns:
        List of entity ids in the order returned by the API, at most
        ``max_results`` long.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
        Exception: If the reply does not report ``success == 1``.
    """
    params = {
        'action': 'wbsearchentities',
        'language': language,
        'search': search_string,
        'format': 'json',
        'limit': 50
    }
    headers = {
        'User-Agent': "pywikidata" if user_agent is None else user_agent
    }

    results = []
    offset = 0  # explicit paging offset; the first request always starts at 0
    while len(results) < max_results:
        params['continue'] = offset

        # A timeout keeps a stalled connection from hanging the notebook forever.
        reply = requests.get(mediawiki_api_url, params=params, headers=headers, timeout=30)
        reply.raise_for_status()
        search_results = reply.json()

        if search_results.get('success') != 1:
            raise Exception('WD search failed')
        results.extend(item['id'] for item in search_results['search'])

        # The API omits 'search-continue' once there are no further pages.
        if 'search-continue' not in search_results:
            break
        offset = search_results['search-continue']
        if offset <= 0:
            break

    # The last page may overshoot, so cut down to the requested amount.
    return results[:max_results]

### Dataset: MINTAKA

In [5]:
# Load the Mintaka dataset from the Hugging Face hub and display its splits.
dataset = datasets.load_dataset('AmazonScience/mintaka')
dataset

No config specified, defaulting to: mintaka/en
Found cached dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 14000
    })
    validation: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 4000
    })
})

In [6]:
# Peek at the first validation rows to see the available columns.
dataset['validation'].to_pandas().head()

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity
0,9ace9041,en,What is the fourth book in the Twilight series?,Breaking Dawn,books,ordinal,"[{'name': 'Q44523', 'entityType': 'entity', 'l...","[{'name': 'Q53945', 'label': 'Breaking Dawn'}]"
1,88bdb808,en,How many games are in the Uncharted series?,6,videogames,count,"[{'name': 'Q1064135', 'entityType': 'entity', ...","[{'name': 'Q17150', 'label': 'Uncharted: Drake..."
2,ecfd471d,en,"As of 2015, which group held the record for th...",U2,music,generic,"[{'name': 'Q41254', 'entityType': 'entity', 'l...","[{'name': 'Q396', 'label': 'U2'}]"
3,5d8dc3ff,en,Who is the oldest person to ever win an Academ...,James Ivory,movies,superlative,"[{'name': 'Q19020', 'entityType': 'entity', 'l...","[{'name': 'Q51577', 'label': 'James Ivory'}]"
4,118daa85,en,Which Mario Kart games do not feature Link as ...,"Super Mario Kart, Mario Kart 64, Mario Kart: S...",videogames,difference,"[{'name': 'Q188196', 'entityType': 'entity', '...","[{'name': 'Q1061560', 'label': 'Super Mario Ka..."


In [7]:
# List the distinct question complexity types present in the training split.
dataset['train'].to_pandas()['complexityType'].unique()

array(['ordinal', 'intersection', 'generic', 'superlative', 'yesno',
       'comparative', 'multihop', 'difference', 'count'], dtype=object)

In [8]:
# Alternative checkpoints kept for reference:
# model_checkpoint = 'google/t5-3b-ssm'
# model_checkpoint = 'google/t5-large-ssm'
# NOTE(review): absolute, machine-specific path — the notebook is not portable as is.
model_checkpoint = '/mnt/storage/QA_System_Project/seq2seq_runs/mintaka_only_experiments_mintaka_tunned/model_t5_large_ssm_nq/models/checkpoint-7000/'
device = torch.device('cuda')
# NOTE(review): batch_size is not referenced in the visible cells (generation runs row by row).
batch_size = 8
# Split that print_eval and the metric calls below evaluate against.
dataset_split = 'test'

# Load the seq2seq model onto the GPU together with its tokenizer.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Run the project's convert_to_features over every split in batches, using
# `answerText` as the label field.
# NOTE(review): this rebinds `dataset`, so re-running the cell maps the already
# processed dataset again.
dataset = dataset.map(
    lambda batch: convert_to_features(
        batch, tokenizer, label_feature_name="answerText"
    ),
    batched=True,
)

# Only these columns are returned (as torch tensors) when indexing the dataset.
columns = [
    "input_ids",
    "labels",
    "attention_mask",
]
dataset.set_format(type="torch", columns=columns)

Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-d82ee0eb58e1802d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-c8955b1d3baff169.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-66fb802b880b6e11.arrow


In [10]:
class AnswerItem():
    """A single answer candidate together with its lazily resolved type.

    The wrapped ``answer`` is a bool (yes/no), a number, a string (expected to
    be a Wikidata entity id) or ``None``. Raw strings passed to the
    constructor are first normalised through :meth:`text_to_answer`.
    """

    def __init__(self, answer: Union[Entity, str, int, float, bool, None]):
        # Strings are treated as raw text and mapped to a bool, a number or
        # the id of the best Wikidata search hit.
        if isinstance(answer, str):
            self.answer = AnswerItem.text_to_answer(answer)
        else:
            self.answer = answer

        self._type = None

    @property
    def type(self):
        """Type of the answer, computed on first access.

        A ``None`` result is not cached, so it is recomputed on every access.
        """
        if self._type is None:
            self._type = AnswerItem.extract_answer_type(self.answer)
        return self._type

    @classmethod
    def extract_answer_type(cls, answer):
        """Map an answer value to its type.

        Returns:
            ``'yesno'`` for booleans, ``'Number'`` for ints and floats, the
            concatenation of ``instance_of`` and ``subclass_of`` of the entity
            for strings accepted by ``Entity._validate_entity_id``, and
            ``None`` for anything else.
        """
        # bool must be tested before int/float because bool is a subclass of int.
        if isinstance(answer, bool):
            return 'yesno'
        elif isinstance(answer, str) and Entity._validate_entity_id(answer):
            answer_entity = Entity(answer)
            return answer_entity.instance_of + answer_entity.subclass_of
        elif isinstance(answer, (int, float)):
            return 'Number'
        else:
            return None

    @classmethod
    def text_to_answer(cls, text):
        """Convert generated text to a bool, a number or a Wikidata entity id.

        Resolution order: yes/no words (dots ignored, case-insensitive), then
        a numeric parse (whole floats become ints), then the first hit of a
        Wikidata search for the text with dots removed.

        Returns:
            ``True``/``False``, an ``int``/``float``, an entity id string, or
            ``None`` when ``text`` is ``None`` or the search finds nothing.
        """
        if text is None:
            return None

        if isinstance(text, str):
            normalized = text.replace('.', '').lower()
            if normalized in ['yes', 'true']:
                return True
            elif normalized in ['no', 'false']:
                return False

        try:
            entity = float(text)
            if int(entity) == entity:
                entity = int(entity)
        except (TypeError, ValueError, OverflowError):
            # Not a finite number ('nan' and 'inf' fail in int()): fall back to
            # an entity search. Only conversion errors are caught so that
            # KeyboardInterrupt and friends still propagate.
            results = get_wd_search_results(text.replace('.', ''), max_results=1)
            if len(results) > 0:
                entity = results[0]
            else:
                entity = None
        return entity

In [None]:
# Re-seed all RNGs right before generation.
seed = 13
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Metadata frame without the tensor columns.
# NOTE(review): the split is hard-coded to 'test' here instead of using `dataset_split`.
test_df = dataset['test'].to_pandas().drop(['input_ids', 'attention_mask', 'labels'], axis=1)

generated_text = []
generated_entities = []
for row in tqdm(dataset['test']):
    # One question at a time: request 200 sequences from 200 beams split into 20 groups.
    # NOTE(review): diversity_penalty is negative; the transformers docs describe it
    # as a penalty for repeating another group's token, and depending on the library
    # version a non-positive value may be ignored or rejected — confirm intent.
    generated_ids = model.generate(
        row["input_ids"].view(1, -1).to(device),
        num_beams=200,
        num_return_sequences=200,
        num_beam_groups=20,
        diversity_penalty=-0.1,
    )
    generated_decoded_batch = tokenizer.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # De-duplicate the decoded strings while keeping their original order.
    generated_text.append(list(dict.fromkeys(generated_decoded_batch).keys()))
    # Map every distinct string to a bool / number / entity id (may hit the Wikidata API).
    generated_entities.append([
        AnswerItem.text_to_answer(answer)
        for answer in generated_text[-1]
    ])

In [59]:
def generated_entities_to_type(generated_entities):
    """Return the most frequent answer type among the generated candidates.

    Every candidate is wrapped in ``AnswerItem``. List-valued types contribute
    one vote per element (strings as is, other objects through ``.idx``);
    scalar types (``'yesno'``, ``'Number'``, ``None``) contribute one vote.

    Args:
        generated_entities: Iterable of candidate answers.

    Returns:
        The type label with the highest count (first seen wins ties), or
        ``None`` when there are no candidates.
    """
    types_freq = defaultdict(int)
    for answer in generated_entities:
        answer_type = AnswerItem(answer).type
        if isinstance(answer_type, list):
            for single_type in answer_type:
                label = single_type if isinstance(single_type, str) else single_type.idx
                types_freq[label] += 1
        else:
            types_freq[answer_type] += 1

    # Empty input used to raise IndexError; report "no type" instead.
    if not types_freq:
        return None

    # max() keeps the first maximal item, matching a stable descending sort.
    return max(types_freq.items(), key=lambda item: item[1])[0]

In [35]:
# Load previously saved beam-search predictions (with retrieved types) from disk.
test_df = pd.read_json('test_beam_search_preds_mintaka_with_types.json')

# Baseline prediction: the first generated entity of every row.
# NOTE(review): lst[0] raises IndexError for a row with an empty candidate list.
generated_kg_answers = test_df['generated_entities'].apply(lambda lst: lst[0]).values

In [13]:
def print_eval(generated_answers=None, mode='kg', df=None, groupbycols=None):
    """Evaluate predictions with ``evaluate_mintaka`` and print a Hits@1 report.

    Args:
        generated_answers: Either a dict mapping question id to prediction, or
            a sequence of predictions ordered like ``dataset[dataset_split]``.
            Only used when ``df`` is ``None``.
        mode: Evaluation mode forwarded to ``evaluate_mintaka``.
        df: Optional frame with precomputed per-question metrics; when given
            it is evaluated instead of ``generated_answers``.
        groupbycols: Columns to group the report by. Defaults to
            ``['complexityType']`` and is only forwarded when ``df`` is given.

    Returns:
        The results dict from ``evaluate_mintaka``; if it has an
        ``'answerRetrievedType'`` group, that group is cut down to the ten
        types with the most questions.
    """
    # None sentinel instead of a mutable list default shared between calls.
    if groupbycols is None:
        groupbycols = ['complexityType']

    if df is None:
        if isinstance(generated_answers, dict):
            answers = generated_answers
        else:
            answers = dict(zip(dataset[dataset_split]['id'], generated_answers))

        results_kg = evaluate_mintaka(
            predictions=answers,
            split=dataset_split,
            mode=mode,
        )
    else:
        results_kg = evaluate_mintaka(
            df_with_predictions=df,
            split=dataset_split,
            mode=mode,
            groupbycols=groupbycols,
        )

    counts_key = 'hits1 Number Correct Answer Of'

    if 'answerRetrievedType' in results_kg:
        # Keep only the ten types with the largest number of questions.
        top_types = sorted(
            results_kg['answerRetrievedType'].items(),
            key=lambda item: -item[1][counts_key][1]
        )[:10]
        results_kg['answerRetrievedType'] = dict(top_types)

    print(f"{'Group':13s}  {'Hits@1':6s} (Correct Of Total)")
    overall = results_kg['All']
    print(f"{'All':13s}= {overall['hits1']:2.4f} ({overall[counts_key][0]:4d} Of {overall[counts_key][1]:4d})", end='\n\n')
    for group_name, group_results in results_kg.items():
        if group_name == 'All':
            continue

        for key, val in group_results.items():
            # str() keeps the width format working for non-string group keys.
            print(f"{str(key):13s}= {val['hits1']:2.4f} ({val[counts_key][0]:4d} Of {val[counts_key][1]:4d})")
        print('')
    return results_kg

In [14]:
# Per-question metrics for the baseline (first generated candidate).
# NOTE(review): ids come from `dataset` and predictions from `test_df`; zip pairs
# them by position, which assumes both are in the same row order.
df = calculate_metrics_for_prediction(
    dict(zip(dataset[dataset_split]['id'], generated_kg_answers)),
    dataset_split,
    'kg',
)
# Column assignment aligns on the index — presumably both frames share it; verify.
df['answerRetrievedType'] = test_df['answerRetrievedType']
results_kg = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

Group          Hits@1 (Correct Of Total)
All          = 0.2740 (1096 Of 4000)

comparative  = 0.4575 ( 183 Of  400)
count        = 0.2700 ( 108 Of  400)
difference   = 0.1725 (  69 Of  400)
generic      = 0.2075 ( 166 Of  800)
intersection = 0.2900 ( 116 Of  400)
multihop     = 0.1025 (  41 Of  400)
ordinal      = 0.1600 (  64 Of  400)
superlative  = 0.3925 ( 157 Of  400)
yesno        = 0.4800 ( 192 Of  400)

Q5           = 0.3214 ( 324 Of 1008)
Number       = 0.1549 ( 127 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1475 (  36 Of  244)
Q7889        = 0.2703 (  50 Of  185)
Q3624078     = 0.3793 (  33 Of   87)
Q35657       = 0.3671 (  29 Of   79)
Q482994      = 0.0725 (   5 Of   69)
Q7725634     = 0.1692 (  11 Of   65)
Q1093829     = 0.3462 (  18 Of   52)



In [15]:
def get_answers_type(answers: list) -> list:
    """Collect the types of all given answers into one flat list.

    Each answer is wrapped in ``AnswerItem``. List-valued types are flattened
    into the result, scalar types (including ``None``) are appended as is.
    Duplicates are kept; callers that need a set must convert themselves.

    Args:
        answers: Answers to resolve.

    Returns:
        Flat list of types in input order.
    """
    collected = []
    for item in answers:
        item_type = AnswerItem(item).type
        if isinstance(item_type, list):
            collected += item_type
        else:
            collected.append(item_type)
    return collected

def is_type_matched(row):
    """Tell whether gold answers and predictions share at least one type.

    ``row['answer']`` and ``row['pred']`` only contribute types when they are
    lists; any other value counts as having no types. A ``None`` type that
    occurs on both sides counts as a shared type.

    Returns:
        ``True`` if the two type sets overlap, ``False`` otherwise.
    """
    gold = row['answer']
    answer_types = set(get_answers_type(gold)) if isinstance(gold, list) else set()

    predicted = row['pred']
    pred_types = set(get_answers_type(predicted)) if isinstance(predicted, list) else set()

    return bool(answer_types & pred_types)

# Flag rows whose prediction shares a type with the gold answer, then show the share of such rows.
df['type_match'] = df.progress_apply(is_type_matched, axis=1)
df['type_match'].mean()

  0%|          | 0/4000 [00:00<?, ?it/s]

0.77575

In [16]:
# Boolean view of the hits1 column so it can be negated with `~`.
df['is_hit'] = df['hits1'].astype(bool)

# Among the misses, the fraction whose predicted type also differs from the gold type.
# NOTE(review): each ratio divides by the number of misses and raises
# ZeroDivisionError if a group has none.
print(
    'Proportion of errors with incorrect type                =',
    df[(~df['type_match']) & (~df['is_hit'])].index.size / df[~df['is_hit']].index.size,
    end='\n\n'
)

# Same ratio per question complexity type.
for complexity_type, group in df.groupby('complexityType'):
    print(
        f'Proportion of errors with incorrect type ({complexity_type:12s}) =',
        group[(~group['type_match']) & (~group['is_hit'])].index.size / group[~group['is_hit']].index.size
    )

print('')
# Same ratio per question category.
for category, group in df.groupby('category'):
    print(
        f'Proportion of errors with incorrect type ({category:12s}) =',
        group[(~group['type_match']) & (~group['is_hit'])].index.size / group[~group['is_hit']].index.size
    )

Proportion of errors with incorrect type                = 0.3033746556473829

Proportion of errors with incorrect type (comparative ) = 0.2304147465437788
Proportion of errors with incorrect type (count       ) = 0.010273972602739725
Proportion of errors with incorrect type (difference  ) = 0.4984894259818731
Proportion of errors with incorrect type (generic     ) = 0.3911671924290221
Proportion of errors with incorrect type (intersection) = 0.23943661971830985
Proportion of errors with incorrect type (multihop    ) = 0.4011142061281337
Proportion of errors with incorrect type (ordinal     ) = 0.3125
Proportion of errors with incorrect type (superlative ) = 0.3991769547325103
Proportion of errors with incorrect type (yesno       ) = 0.004807692307692308

Proportion of errors with incorrect type (books       ) = 0.39655172413793105
Proportion of errors with incorrect type (geography   ) = 0.3911671924290221
Proportion of errors with incorrect type (history     ) = 0.26851851851851855
Pr

In [17]:
def retrieve_answer_type_from_candidates(answers, return_types_count=False):
    """Pick the dominant answer type among a list of candidates.

    Every candidate is wrapped in ``AnswerItem``. List-valued types contribute
    one vote per element, scalar types (including ``None``) one vote. The most
    voted type wins; if the winner is ``None`` and another type exists, the
    runner-up is used instead. ``Entity`` winners are reduced to their ``idx``.

    Args:
        answers: Iterable of candidate answers.
        return_types_count: When true, also return the vote counts.

    Returns:
        The winning type (``None`` when there are no candidates), or a tuple
        ``(winning_type, OrderedDict of counts sorted by descending votes)``
        when ``return_types_count`` is true. The tuple shape is returned for
        empty input as well.
    """
    types_count = defaultdict(int)
    for answer in answers:
        answer_type = AnswerItem(answer).type
        if isinstance(answer_type, list):
            for entity in answer_type:
                types_count[entity] += 1
        else:
            types_count[answer_type] += 1

    if len(types_count) == 0:
        # Keep the return shape consistent so callers can always unpack.
        return (None, OrderedDict()) if return_types_count else None

    sorted_types_count = sorted(types_count.items(), key=lambda x: -x[1])
    final_type = sorted_types_count[0][0]
    # Prefer a known type over "unknown" whenever one is available.
    if final_type is None and len(sorted_types_count) > 1:
        final_type = sorted_types_count[1][0]

    if isinstance(final_type, Entity):
        final_type = final_type.idx

    if return_types_count:
        return final_type, OrderedDict(sorted_types_count)
    return final_type

In [18]:
# Reload the saved predictions; this resets any columns added to test_df above.
test_df = pd.read_json('test_beam_search_preds_mintaka_with_types.json')

# One-off step that produced the `answerRetrievedType` column and wrote the file read above:
# test_df['answerRetrievedType'] = test_df['generated_entities'].progress_apply(retrieve_answer_type_from_candidates)
# test_df.to_json('test_beam_search_preds_mintaka_with_types.json')

test_df.head()

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,generated_text,sequences_scores,generated_entities,answerRetrievedType,filtered_by_type_preds
0,fae46b21,en,What man was a famous American author and also...,Mark Twain,history,intersection,"[{'name': 'Q1497', 'entityType': 'entity', 'la...","[{'name': 'Q7245', 'label': 'Mark Twain'}]","[Edgar Allan Poe, Ernest Hemingway, Charles Di...","[-0.2734780908, -0.3756849766, -0.418252229700...","[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,"[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616..."
1,bc8713cc,en,How many Academy Awards has Jake Gyllenhaal be...,1,movies,count,"[{'name': 'Q133313', 'entityType': 'entity', '...","[{'name': 'Q106291', 'label': 'Academy Award f...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...","[-0.6568749547, -0.7941160798, -0.851152122, -...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,"[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,..."
2,d2a03f72,en,"Who is older, The Weeknd or Drake?",Drake,music,comparative,"[{'name': 'Q2121062', 'entityType': 'entity', ...","[{'name': 'Q33240', 'label': 'Drake'}]","[Drake, The Weeknd, Cody Jarrett, Dwight D. Ei...","[-0.0174380932, -0.8993775845, -1.415274024, -...","[Q7559, Q2121062, Q5140439, Q9916, Q713099, Q5...",Q5,"[Q2121062, Q5140439, Q9916, Q713099, Q513019, ..."
3,9a296167,en,How many children did Donald Trump have?,5,history,count,"[{'name': 'Q22686', 'entityType': 'entity', 'l...","[{'name': 'Q3713655', 'label': 'Donald Trump J...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 6 child...","[-0.49233829980000005, -1.0202715397, -1.06337...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, Q348559...",Number,"[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 13]"
4,e343ad26,en,Is the main hero in Final Fantasy IX named Kuja?,No,videogames,yesno,"[{'name': 'Q474573', 'entityType': 'entity', '...",[],"[Yes, No, Yuna, Yuna, Yuna, Yuna and Kuja are ...","[-0.3390540481, -0.3550684452, -1.4538880587, ...","[True, False, None, None, None, None, None, No...",yesno,[]


### ACT Selection (from prev paper)


In [19]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell.
from kbqa.candidate_selection.question_to_rank_by_instance_of import QuestionToRankInstanceOf


reranked_preds = []
for _, row in tqdm(test_df.iterrows(), total=test_df.index.size):
    # Only string candidates starting with 'Q' (entity ids) are reranked.
    answers_candidates = [e for e in row['generated_entities'] if e is not None and isinstance(e, str) and 'Q' == e[0]]
    if len(answers_candidates) == 0:
        # No entity candidates: fall back to the first generated answer.
        reranked_answer = row['generated_entities'][0]
    else:
        # Take the `.idx` of the top entry returned by final_answers().
        # NOTE(review): [0] raises IndexError if final_answers() is empty.
        reranked_answer = QuestionToRankInstanceOf(
            question=row['question'],
            question_entities=[e['name'] for e in row['questionEntity'] if e['entityType'] == 'entity' and 'Q' == e['name'][0]],
            answers_candidates=answers_candidates,
            only_forward_one_hop=True,
        ).final_answers()[0][1].idx

    reranked_preds.append(reranked_answer)


# Score the reranked predictions; ids are paired with predictions by position.
df = calculate_metrics_for_prediction(
    dict(zip(dataset[dataset_split]['id'], reranked_preds)),
    dataset_split,
    'kg',
)
df['answerRetrievedType'] = test_df['answerRetrievedType']
results_kg = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

  0%|          | 0/4000 [00:00<?, ?it/s]

Group          Hits@1 (Correct Of Total)
All          = 0.1615 ( 646 Of 4000)

comparative  = 0.1500 (  60 Of  400)
count        = 0.1750 (  70 Of  400)
difference   = 0.1325 (  53 Of  400)
generic      = 0.1787 ( 143 Of  800)
intersection = 0.2450 (  98 Of  400)
multihop     = 0.0300 (  12 Of  400)
ordinal      = 0.0850 (  34 Of  400)
superlative  = 0.2025 (  81 Of  400)
yesno        = 0.2375 (  95 Of  400)

Q5           = 0.1518 ( 153 Of 1008)
Number       = 0.1024 (  84 Of  820)
yesno        = 0.2408 ( 131 Of  544)
Q11424       = 0.1311 (  32 Of  244)
Q7889        = 0.1892 (  35 Of  185)
Q3624078     = 0.2644 (  23 Of   87)
Q35657       = 0.2532 (  20 Of   79)
Q482994      = 0.0000 (   0 Of   69)
Q7725634     = 0.1692 (  11 Of   65)
Q1093829     = 0.1923 (  10 Of   52)



### Filtered by type

In [20]:
def filter_generated_answers_by_type(generated_entities, answer_retrieved_type):
    """Yield the candidates whose type agrees with the retrieved answer type.

    A candidate is kept when its ``AnswerItem`` type is a list containing an
    element whose ``idx`` equals ``answer_retrieved_type``, or when its type
    compares equal to ``answer_retrieved_type`` directly.

    Args:
        generated_entities: Iterable of candidate answers.
        answer_retrieved_type: Type every yielded candidate has to match.

    Yields:
        The matching candidates, unchanged and in input order.
    """
    for candidate in generated_entities:
        candidate_type = AnswerItem(candidate).type
        if isinstance(candidate_type, list) and answer_retrieved_type in [t.idx for t in candidate_type]:
            matched = True
        else:
            matched = candidate_type == answer_retrieved_type
        if matched:
            yield candidate

# Keep, per question, only the candidates whose type equals the retrieved answer type.
test_df['filtered_by_type_preds'] = test_df.progress_apply(
     lambda row: list(filter_generated_answers_by_type(row['generated_entities'], row['answerRetrievedType'])),
     axis=1
)

# Score the first surviving candidate of every question.
# NOTE(review): lst[0] raises IndexError if filtering leaves a row with no candidates.
df = calculate_metrics_for_prediction(
    dict(zip(dataset[dataset_split]['id'], test_df['filtered_by_type_preds'].apply(lambda lst: lst[0]).values)),
    dataset_split,
    'kg',
)
df['answerRetrievedType'] = test_df['answerRetrievedType']
results_kg = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

  0%|          | 0/4000 [00:00<?, ?it/s]

Group          Hits@1 (Correct Of Total)
All          = 0.2675 (1070 Of 4000)

comparative  = 0.4550 ( 182 Of  400)
count        = 0.2725 ( 109 Of  400)
difference   = 0.1975 (  79 Of  400)
generic      = 0.1925 ( 154 Of  800)
intersection = 0.2825 ( 113 Of  400)
multihop     = 0.1050 (  42 Of  400)
ordinal      = 0.1525 (  61 Of  400)
superlative  = 0.3800 ( 152 Of  400)
yesno        = 0.4450 ( 178 Of  400)

Q5           = 0.3145 ( 317 Of 1008)
Number       = 0.1573 ( 129 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1066 (  26 Of  244)
Q7889        = 0.2486 (  46 Of  185)
Q3624078     = 0.4368 (  38 Of   87)
Q35657       = 0.3924 (  31 Of   79)
Q482994      = 0.1304 (   9 Of   69)
Q7725634     = 0.1692 (  11 Of   65)
Q1093829     = 0.2692 (  14 Of   52)



In [21]:
# Replace the prediction column by a one-element list holding the top filtered candidate,
# then recompute the type match against the gold answers.
df['pred'] = test_df['filtered_by_type_preds'].apply(lambda preds: [preds[0]] if isinstance(preds, list) else preds)
df['type_match'] = df.progress_apply(is_type_matched, axis=1)

df['is_hit'] = df['hits1'].astype(bool)

# Same report as for the unfiltered predictions: among the misses, the fraction
# whose predicted type differs from the gold type.
# NOTE(review): this block duplicates the earlier report cell and could be a shared function.
print(
    'Proportion of errors with incorrect type                =',
    df[(~df['type_match']) & (~df['is_hit'])].index.size / df[~df['is_hit']].index.size,
    end='\n\n'
)

for complexity_type, group in df.groupby('complexityType'):
    print(
        f'Proportion of errors with incorrect type ({complexity_type:12s}) =',
        group[(~group['type_match']) & (~group['is_hit'])].index.size / group[~group['is_hit']].index.size
    )

print('')
for category, group in df.groupby('category'):
    print(
        f'Proportion of errors with incorrect type ({category:12s}) =',
        group[(~group['type_match']) & (~group['is_hit'])].index.size / group[~group['is_hit']].index.size
    )

  0%|          | 0/4000 [00:00<?, ?it/s]

Proportion of errors with incorrect type                = 0.22252559726962456

Proportion of errors with incorrect type (comparative ) = 0.24770642201834864
Proportion of errors with incorrect type (count       ) = 0.010309278350515464
Proportion of errors with incorrect type (difference  ) = 0.2803738317757009
Proportion of errors with incorrect type (generic     ) = 0.29876160990712075
Proportion of errors with incorrect type (intersection) = 0.18466898954703834
Proportion of errors with incorrect type (multihop    ) = 0.2430167597765363
Proportion of errors with incorrect type (ordinal     ) = 0.2182890855457227
Proportion of errors with incorrect type (superlative ) = 0.3024193548387097
Proportion of errors with incorrect type (yesno       ) = 0.1036036036036036

Proportion of errors with incorrect type (books       ) = 0.37799043062200954
Proportion of errors with incorrect type (geography   ) = 0.2946708463949843
Proportion of errors with incorrect type (history     ) = 0.2090909

In [22]:
# Share of questions whose filtered top prediction shares a type with the gold answer.
df['type_match'].mean()

0.837

In [23]:
# Apply the same dominant-type retrieval to the gold answers for comparison with the predicted type.
test_df['goldAnswerRetrievedType'] = df['answer'].progress_apply(
    lambda answer: retrieve_answer_type_from_candidates(answer) if answer is not None else None
)
test_df.head()

  0%|          | 0/4000 [00:00<?, ?it/s]

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,generated_text,sequences_scores,generated_entities,answerRetrievedType,filtered_by_type_preds,goldAnswerRetrievedType
0,fae46b21,en,What man was a famous American author and also...,Mark Twain,history,intersection,"[{'name': 'Q1497', 'entityType': 'entity', 'la...","[{'name': 'Q7245', 'label': 'Mark Twain'}]","[Edgar Allan Poe, Ernest Hemingway, Charles Di...","[-0.2734780908, -0.3756849766, -0.418252229700...","[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,"[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5
1,bc8713cc,en,How many Academy Awards has Jake Gyllenhaal be...,1,movies,count,"[{'name': 'Q133313', 'entityType': 'entity', '...","[{'name': 'Q106291', 'label': 'Academy Award f...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...","[-0.6568749547, -0.7941160798, -0.851152122, -...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,"[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number
2,d2a03f72,en,"Who is older, The Weeknd or Drake?",Drake,music,comparative,"[{'name': 'Q2121062', 'entityType': 'entity', ...","[{'name': 'Q33240', 'label': 'Drake'}]","[Drake, The Weeknd, Cody Jarrett, Dwight D. Ei...","[-0.0174380932, -0.8993775845, -1.415274024, -...","[Q7559, Q2121062, Q5140439, Q9916, Q713099, Q5...",Q5,"[Q2121062, Q5140439, Q9916, Q713099, Q513019, ...",Q5
3,9a296167,en,How many children did Donald Trump have?,5,history,count,"[{'name': 'Q22686', 'entityType': 'entity', 'l...","[{'name': 'Q3713655', 'label': 'Donald Trump J...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 6 child...","[-0.49233829980000005, -1.0202715397, -1.06337...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, Q348559...",Number,"[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 13]",Number
4,e343ad26,en,Is the main hero in Final Fantasy IX named Kuja?,No,videogames,yesno,"[{'name': 'Q474573', 'entityType': 'entity', '...",[],"[Yes, No, Yuna, Yuna, Yuna, Yuna and Kuja are ...","[-0.3390540481, -0.3550684452, -1.4538880587, ...","[True, False, None, None, None, None, None, No...",yesno,"[True, False]",yesno


In [24]:
# Rows where the type retrieved from the candidates differs from the gold answer type.
test_df[test_df['goldAnswerRetrievedType'] != test_df['answerRetrievedType']]

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,generated_text,sequences_scores,generated_entities,answerRetrievedType,filtered_by_type_preds,goldAnswerRetrievedType
5,b41ae115,en,Who performed at the Super Bowl XXIII halftime...,Elvis Presto,sports,generic,"[{'name': 'Q1307150', 'entityType': 'entity', ...",[],"[Coldplay, Justin Timberlake, Lady Gaga, The B...","[-0.1100655422, -0.6572936773, -0.6655324101, ...","[Q45188, Q43432, Q19848, Q134541, Q29564107, Q...",Q5,"[Q43432, Q19848, Q29564107, Q73437, Q121507, Q...",
9,7ed1858c,en,"Which movie, starring Al Jolson, is generally ...",The Jazz Singer,movies,ordinal,"[{'name': 'Q128532', 'entityType': 'entity', '...","[{'name': 'Q465087', 'label': 'The Jazz Singer'}]","[The Jazz Singer, On Stranger Tides, It's a Wo...","[-0.14734335240000002, -0.7283437252, -0.73821...","[Q465087, Q1660753, Q204191, Q778696, Q669749,...",Q11424,"[Q204191, Q778696, Q669749, Q777776, Q3110003,...",Q24869
11,dbc6f3e8,en,"Which movie came out first, Monsters, Inc. or ...",Monsters Inc,movies,comparative,"[{'name': 'Q36092', 'entityType': 'entity', 'l...","[{'name': 'Q187726', 'label': 'Monsters, Inc.'}]","[Lilo and Stitch, Monsters, Inc., Lilo & Stitc...","[-0.1103270054, -0.1441309899, -0.2825240791, ...","[Q590166, Q187726, Q590166, None, None, Q59016...",Q261636,"[Q590166, Q590166, Q590166, Q590166]",Q229390
15,f5ef3179,en,"Which movie had a bigger budget, Avatar or Tra...",Avatar,movies,comparative,"[{'name': 'Q24871', 'entityType': 'entity', 'l...","[{'name': 'Q24871', 'label': 'Avatar'}]","[Avatar, Training Day, Voyage of the Dawn Trea...","[-0.032006066300000004, -0.7113320231, -1.1629...","[Q11572, Q308929, Q331656, None, None, Q123760...",Q11424,"[Q308929, Q19590955, Q174284, Q51882895, Q284917]",Q229390
16,44b1cebb,en,Which movie sold more tickets: Titanic or Avatar?,Avatar,movies,comparative,"[{'name': 'Q24871', 'entityType': 'entity', 'l...","[{'name': 'Q24871', 'label': 'Avatar'}]","[Avatar, Titanic, Titanic, Avatar, Avatar, Tit...","[-0.1260389686, -0.30949947240000003, -1.29978...","[Q11572, Q25173, None, None, None, Q331656, No...",Q5398426,"[Q11572, Q11572, Q123760]",Q229390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3981,e94b4483,en,What's one of Yoshi's unique superpowers?,He can turn the enemies that he eats into eggs...,videogames,generic,"[{'name': 'Q214174', 'entityType': 'entity', '...",[],"[Transcendence, Magician's Touch, able to see ...","[-0.9820313454, -1.0229785442, -1.0529767275, ...","[Q193359, None, None, None, None, Q105854041, ...",Q1751513,"[Q2107094, Q2107094]",
3982,9b9a14d0,en,What was the name of Final Fantasy 4's main pr...,Cecil,videogames,generic,"[{'name': 'Q911226', 'entityType': 'entity', '...",[],"[Talion, Squall Leonhart, Sora, Kiritsugu, Clo...","[-0.6161286235, -0.7360379696, -0.7670309544, ...","[Q15730974, Q2607464, Q117382, Q4925676, Q1798...",Q15632617,"[Q2607464, Q4925676, Q1798592, Q12902673, Q226...",
3984,d0336c26,en,Who is the protagonist of Fallout 4?,The Sole Survivor,videogames,generic,"[{'name': 'Q10493813', 'entityType': 'entity',...",[],"[the ""Sole Survivor"", Bethesda, Geralt of Rivi...","[-0.7761734128000001, -0.8043124676000001, -0....","[None, Q584451, Q2492923, Q192469, Q12400, Non...",Q5,"[Q58444, Q438820, Q460572, Q460240, Q44024, Q9...",
3990,af4462a9,en,What game was released by Impressions Games an...,Cleopatra: Queen of the Nile,videogames,intersection,"[{'name': 'Q37110', 'entityType': 'entity', 'l...","[{'name': 'Q2980768', 'label': 'Cleopatra: Que...","[Pharaoh II, Pharaoh 2, Pharaoh: New Horizons,...","[-0.3500819206, -0.4563730657, -0.4766887128, ...","[None, None, None, None, None, None, None, Non...",Q11424,"[Q181795, Q15845551]",Q209163


In [25]:
# Average number of candidates removed per question by the type filter.
test_df.apply(
    lambda row: len(row['generated_entities']) - len(row['filtered_by_type_preds']),
    axis=1
).mean()

38.226

In [26]:
# Flag questions that have at least one gold answer entity.
test_df['isAnswerEntity'] = test_df['answerEntity'].apply(lambda entities: len(entities) > 0)
print(test_df['isAnswerEntity'].value_counts())
test_df.head()

isAnswerEntity
True     2816
False    1184
Name: count, dtype: int64


Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,generated_text,sequences_scores,generated_entities,answerRetrievedType,filtered_by_type_preds,goldAnswerRetrievedType,isAnswerEntity
0,fae46b21,en,What man was a famous American author and also...,Mark Twain,history,intersection,"[{'name': 'Q1497', 'entityType': 'entity', 'la...","[{'name': 'Q7245', 'label': 'Mark Twain'}]","[Edgar Allan Poe, Ernest Hemingway, Charles Di...","[-0.2734780908, -0.3756849766, -0.418252229700...","[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,"[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,True
1,bc8713cc,en,How many Academy Awards has Jake Gyllenhaal be...,1,movies,count,"[{'name': 'Q133313', 'entityType': 'entity', '...","[{'name': 'Q106291', 'label': 'Academy Award f...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...","[-0.6568749547, -0.7941160798, -0.851152122, -...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,"[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,True
2,d2a03f72,en,"Who is older, The Weeknd or Drake?",Drake,music,comparative,"[{'name': 'Q2121062', 'entityType': 'entity', ...","[{'name': 'Q33240', 'label': 'Drake'}]","[Drake, The Weeknd, Cody Jarrett, Dwight D. Ei...","[-0.0174380932, -0.8993775845, -1.415274024, -...","[Q7559, Q2121062, Q5140439, Q9916, Q713099, Q5...",Q5,"[Q2121062, Q5140439, Q9916, Q713099, Q513019, ...",Q5,True
3,9a296167,en,How many children did Donald Trump have?,5,history,count,"[{'name': 'Q22686', 'entityType': 'entity', 'l...","[{'name': 'Q3713655', 'label': 'Donald Trump J...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 6 child...","[-0.49233829980000005, -1.0202715397, -1.06337...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, Q348559...",Number,"[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 13]",Number,True
4,e343ad26,en,Is the main hero in Final Fantasy IX named Kuja?,No,videogames,yesno,"[{'name': 'Q474573', 'entityType': 'entity', '...",[],"[Yes, No, Yuna, Yuna, Yuna, Yuna and Kuja are ...","[-0.3390540481, -0.3550684452, -1.4538880587, ...","[True, False, None, None, None, None, None, No...",yesno,"[True, False]",yesno,False


### Filtered by type, with an oracle for whether the answer is an entity

In [29]:
# Oracle setting: use the type-filtered top candidate when the gold answer is an
# entity, otherwise the unfiltered top candidate.
# NOTE(review): the two branches differ in shape (one-element list vs bare value),
# and `is True` is an identity check that fails for non-Python bools such as
# numpy.bool_ — confirm both are intended.
preds = test_df.apply(
    lambda row: [row['filtered_by_type_preds'][0]] if row['isAnswerEntity'] is True else row['generated_entities'][0],
    axis=1
)
df = calculate_metrics_for_prediction(
    dict(zip(dataset[dataset_split]['id'], preds)),
    dataset_split,
    'kg',
)
df['answerRetrievedType'] = test_df['answerRetrievedType']
_ = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

Group          Hits@1 (Correct Of Total)
All          = 0.2750 (1100 Of 4000)

comparative  = 0.4600 ( 184 Of  400)
count        = 0.2725 ( 109 Of  400)
difference   = 0.2050 (  82 Of  400)
generic      = 0.1988 ( 159 Of  800)
intersection = 0.2850 ( 114 Of  400)
multihop     = 0.1050 (  42 Of  400)
ordinal      = 0.1575 (  63 Of  400)
superlative  = 0.3875 ( 155 Of  400)
yesno        = 0.4800 ( 192 Of  400)

Q5           = 0.3264 ( 329 Of 1008)
Number       = 0.1561 ( 128 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1270 (  31 Of  244)
Q7889        = 0.2649 (  49 Of  185)
Q3624078     = 0.4368 (  38 Of   87)
Q35657       = 0.3924 (  31 Of   79)
Q482994      = 0.1304 (   9 Of   69)
Q7725634     = 0.2154 (  14 Of   65)
Q1093829     = 0.2692 (  14 Of   52)



### Evaluate only the questions whose gold answer appears among the generated entities

In [30]:
def _is_answer_in_generate(row):
    """Tell whether at least one gold answer entity occurs among the generated candidates.

    Args:
        row: mapping / DataFrame row with an 'answerEntity' list of dicts (each carrying a
            'name') and a 'generated_entities' list of candidates.

    Returns:
        True when some gold name accepted by `Entity._validate_entity_id` is also present
        in 'generated_entities', False otherwise.
    """
    gold_ids = {
        answer['name']
        for answer in row['answerEntity']
        if Entity._validate_entity_id(answer['name'])
    }
    shared = gold_ids.intersection(row['generated_entities'])
    return bool(shared)

# Flag, per question, whether a gold entity shows up among the generated candidates.
_answer_in_generated_flags = test_df.progress_apply(_is_answer_in_generate, axis=1)
test_df['is_answer_in_generated'] = _answer_in_generated_flags
# Share of entity-answer questions for which the gold entity was generated at all.
test_df.loc[test_df['isAnswerEntity'], 'is_answer_in_generated'].mean()

  0%|          | 0/4000 [00:00<?, ?it/s]

0.5014204545454546

In [33]:
# Score the current `preds` (produced by an earlier cell) for every test question.
_scored_ids = test_df['id'].values
_test_df = calculate_metrics_for_prediction(dict(zip(_scored_ids, preds)), split='test', mode='kg')

# Report only entity-answer questions whose gold entity is among the generated candidates.
_entity_answer_generated = test_df['isAnswerEntity'] & test_df['is_answer_in_generated']
print('ONLY ANSWER IN GENERATED DATA. FILTERED BY TYPE')
_ = print_eval(df=_test_df[_entity_answer_generated], mode='kg')

ONLY ANSWER IN GENERATED DATA. FILTERED BY TYPE
Group          Hits@1 (Correct Of Total)
All          = 0.4873 ( 688 Of 1412)

comparative  = 0.5619 ( 109 Of  194)
count        = 0.0000 (   0 Of    3)
difference   = 0.4091 (  72 Of  176)
generic      = 0.4689 ( 143 Of  305)
intersection = 0.4669 ( 113 Of  242)
multihop     = 0.4043 (  38 Of   94)
ordinal      = 0.3812 (  61 Of  160)
superlative  = 0.6387 ( 152 Of  238)



In [34]:
# Baseline: the raw top-1 generated candidate of every question, with no type filtering.
_top1_generated = test_df['generated_entities'].apply(lambda candidates: candidates[0])
_test_df = calculate_metrics_for_prediction(
    dict(zip(test_df['id'].values, _top1_generated)),
    split='test',
    mode='kg',
)

# Report only entity-answer questions whose gold entity is among the generated candidates.
_entity_answer_generated = test_df['isAnswerEntity'] & test_df['is_answer_in_generated']
print('ONLY ANSWER IN GENERATED DATA. WITHOUT FILTRATION BY TYPE')
_ = print_eval(df=_test_df[_entity_answer_generated], mode='kg')

ONLY ANSWER IN GENERATED DATA. WITHOUT FILTRATION BY TYPE
Group          Hits@1 (Correct Of Total)
All          = 0.4851 ( 685 Of 1412)

comparative  = 0.5567 ( 108 Of  194)
count        = 0.0000 (   0 Of    3)
difference   = 0.3352 (  59 Of  176)
generic      = 0.4918 ( 150 Of  305)
intersection = 0.4752 ( 115 Of  242)
multihop     = 0.3936 (  37 Of   94)
ordinal      = 0.3875 (  62 Of  160)
superlative  = 0.6471 ( 154 Of  238)



In [35]:
def _position_of_answer_in_generated(row):
    """Return the rank of the first generated candidate that is a gold answer entity.

    Args:
        row: mapping / DataFrame row with an 'answerEntity' list of dicts (each carrying a
            'name') and a 'generated_entities' list of candidates.

    Returns:
        Zero-based index of the first candidate found among the gold names accepted by
        `Entity._validate_entity_id`, or None when no candidate matches.
    """
    gold_ids = [
        answer['name']
        for answer in row['answerEntity']
        if Entity._validate_entity_id(answer['name'])
    ]
    matching_positions = (
        position
        for position, candidate in enumerate(row['generated_entities'])
        if candidate in gold_ids
    )
    return next(matching_positions, None)

# Restrict to entity-answer questions whose gold entity was generated, then rank the hit.
_rows_with_generated_answer = test_df.loc[test_df['isAnswerEntity'] & test_df['is_answer_in_generated']]
position_of_answer_in_generated = _rows_with_generated_answer.apply(_position_of_answer_in_generated, axis=1)
position_of_answer_in_generated.describe()

count    1412.00000
mean        4.50779
std         9.77895
min         0.00000
25%         0.00000
50%         1.00000
75%         4.00000
max       101.00000
dtype: float64

In [36]:
# Ten most frequent ranks at which the gold answer appears among the candidates.
position_of_answer_in_generated.value_counts().head(10)

0     685
1     217
2      78
3      61
4      42
5      32
8      29
7      28
6      24
10     15
Name: count, dtype: int64

In [37]:
# for _, row in tqdm(dataset[dataset_split].to_pandas().iterrows(), total=len(dataset[dataset_split])):
#     with open(f'mintaka_{dataset_split}_question_entities_two_hop_neighbours.jsonl', 'a+') as f:
#         try:
#             question_entities = [
#                 Entity(e['name'])
#                 for e in row['questionEntity']
#                 if e['entityType'] == 'entity' and 'Q' == e['name'][0]
#             ]

#             question_entities_neighbours = []
#             for q_entity in question_entities:
#                 for one_hop_neighbour_prop, one_hop_neighbour_entity in q_entity.forward_one_hop_neighbours:
#                     question_entities_neighbours.append((
#                         q_entity.idx,
#                         one_hop_neighbour_prop.idx,
#                         one_hop_neighbour_entity.idx,
#                     ))

#                     for two_hop_neighbour_prop, two_hop_neighbour_entity in one_hop_neighbour_entity.forward_one_hop_neighbours:
#                         question_entities_neighbours.append((
#                             one_hop_neighbour_entity.idx,
#                             two_hop_neighbour_prop.idx,
#                             two_hop_neighbour_entity.idx,
#                         ))

#             f.write(
#                 ujson.dumps({
#                     'id': row['id'],
#                     'two_hop_neighbours': question_entities_neighbours
#                 }) + '\n'
#             )
#         except Exception as e:
#             print(f"Error with id={row['id']}: {e}")

# Load the JSONL file of per-question neighbour triples (the commented-out loop above
# is the code that writes one {'id', 'two_hop_neighbours'} record per line).
_two_hop_neighbours_path = f'mintaka_{dataset_split}_question_entities_two_hop_neighbours.jsonl'
question_entities_neighbours_df = pd.read_json(_two_hop_neighbours_path, lines=True)
question_entities_neighbours_df.head()

Unnamed: 0,id,two_hop_neighbours
0,fae46b21,"[[Q1497, P17, Q30], [Q30, P37, Q1860], [Q30, P..."
1,bc8713cc,"[[Q133313, P8839, Q116790562], [Q116790562, P2..."
2,d2a03f72,"[[Q2121062, P6886, Q1860], [Q1860, P17, Q27], ..."
3,9a296167,"[[Q22686, P27, Q30], [Q30, P37, Q1860], [Q30, ..."
4,e343ad26,"[[Q474573, P1434, Q99415917], [Q99415917, P170..."


In [38]:
# Keep only the last element of every triple (the entity the edge points to).
question_entities_neighbours_df['neighbours_entities'] = question_entities_neighbours_df['two_hop_neighbours'].apply(
    lambda triplet_list: [triplet[-1] for triplet in triplet_list]
)

# Attach the neighbours to the test questions.
#  * drop a previously merged column first, so re-running this cell does not create
#    `neighbours_entities_x` / `neighbours_entities_y`;
#  * de-duplicate ids on the right side (keeping the last record) and validate the join,
#    so a repeated id in the JSONL file cannot multiply test rows;
#  * left-join so that a question without a neighbours record is kept instead of being
#    silently dropped, which would shift every later positional pairing of ids and predictions.
test_df = test_df.drop(columns=['neighbours_entities'], errors='ignore').merge(
    question_entities_neighbours_df[['id', 'neighbours_entities']].drop_duplicates(subset='id', keep='last'),
    on='id',
    how='left',
    validate='many_to_one',
)
# Questions without a neighbours record get an empty list so downstream `in` checks still work.
test_df['neighbours_entities'] = test_df['neighbours_entities'].apply(
    lambda entities: entities if isinstance(entities, list) else []
)
test_df.head()

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,generated_text,sequences_scores,generated_entities,answerRetrievedType,filtered_by_type_preds,goldAnswerRetrievedType,isAnswerEntity,is_answer_in_generated,neighbours_entities
0,fae46b21,en,What man was a famous American author and also...,Mark Twain,history,intersection,"[{'name': 'Q1497', 'entityType': 'entity', 'la...","[{'name': 'Q7245', 'label': 'Mark Twain'}]","[Edgar Allan Poe, Ernest Hemingway, Charles Di...","[-0.2734780908, -0.3756849766, -0.418252229700...","[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,"[Q16867, Q23434, Q5686, Q131149, Q34597, Q3616...",Q5,True,False,"[Q30, Q1860, Q61, Q30, Q49, Q6256, Q4917, Q627..."
1,bc8713cc,en,How many Academy Awards has Jake Gyllenhaal be...,1,movies,count,"[{'name': 'Q133313', 'entityType': 'entity', '...","[{'name': 'Q106291', 'label': 'Academy Award f...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...","[-0.6568749547, -0.7941160798, -0.851152122, -...","[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,"[1, 2, 3, 4, 5, 11, 6, 0, 8, 7, 9, 10, 13, 12,...",Number,True,False,"[Q116790562, Q111143758, Q17126303, Q1860, Q27..."
2,d2a03f72,en,"Who is older, The Weeknd or Drake?",Drake,music,comparative,"[{'name': 'Q2121062', 'entityType': 'entity', ...","[{'name': 'Q33240', 'label': 'Drake'}]","[Drake, The Weeknd, Cody Jarrett, Dwight D. Ei...","[-0.0174380932, -0.8993775845, -1.415274024, -...","[Q7559, Q2121062, Q5140439, Q9916, Q713099, Q5...",Q5,"[Q2121062, Q5140439, Q9916, Q713099, Q513019, ...",Q5,True,False,"[Q1860, Q27, Q145, Q115, Q424, Q30, Q668, Q334..."
3,9a296167,en,How many children did Donald Trump have?,5,history,count,"[{'name': 'Q22686', 'entityType': 'entity', 'l...","[{'name': 'Q3713655', 'label': 'Donald Trump J...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 6 child...","[-0.49233829980000005, -1.0202715397, -1.06337...","[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, Q348559...",Number,"[2, 3, 4, 5, 6, 1, 8, 9, 0, 7, 11, 10, 13]",Number,True,False,"[Q30, Q1860, Q61, Q30, Q49, Q6256, Q4917, Q627..."
4,e343ad26,en,Is the main hero in Final Fantasy IX named Kuja?,No,videogames,yesno,"[{'name': 'Q474573', 'entityType': 'entity', '...",[],"[Yes, No, Yuna, Yuna, Yuna, Yuna and Kuja are ...","[-0.3390540481, -0.3550684452, -1.4538880587, ...","[True, False, None, None, None, None, None, No...",yesno,"[True, False]",yesno,False,False,"[Q99415917, Q312525, Q99397792, Q559618, Q2472..."


### Filtered by availability in the question entities' two-hop forward neighbours, with oracle for the question: is the answer an entity or not

In [39]:
def take_answer_filtered_by_two_hop_neighbours(row):
    """Pick the best generated candidate that is also a neighbour of the question entities.

    Args:
        row: mapping / DataFrame row with 'isAnswerEntity', 'generated_entities' (ranked
            candidates) and 'neighbours_entities' (neighbour ids; entries must be hashable).

    Returns:
        For rows whose 'isAnswerEntity' is exactly True: the first candidate of
        'generated_entities' that occurs in 'neighbours_entities'. In every other case
        (not an entity answer, or no candidate among the neighbours): the first generated
        candidate.
    """
    if row['isAnswerEntity'] is True:
        # The neighbour list can be very long; a set gives constant-time membership checks
        # instead of rescanning the whole list for every candidate.
        neighbours = set(row['neighbours_entities'])
        for candidate in row['generated_entities']:
            if candidate in neighbours:
                return candidate
    return row['generated_entities'][0]

preds = test_df.apply(
    take_answer_filtered_by_two_hop_neighbours,
    axis=1
)

# `preds` is row-aligned with `test_df` (which has been through a merge), so the ids must
# come from `test_df` itself; pairing them positionally with ids from the raw dataset
# would mis-assign predictions as soon as the two differ in length or order.
df = calculate_metrics_for_prediction(
    dict(zip(test_df['id'].values, preds)),
    dataset_split,
    'kg',
)
# NOTE(review): this assignment aligns on the row index; it assumes `df` and `test_df`
# share the same index/order -- confirm.
df['answerRetrievedType'] = test_df['answerRetrievedType']
_ = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

Group          Hits@1 (Correct Of Total)
All          = 0.2795 (1118 Of 4000)

comparative  = 0.4725 ( 189 Of  400)
count        = 0.2700 ( 108 Of  400)
difference   = 0.1925 (  77 Of  400)
generic      = 0.2325 ( 186 Of  800)
intersection = 0.3025 ( 121 Of  400)
multihop     = 0.1075 (  43 Of  400)
ordinal      = 0.1500 (  60 Of  400)
superlative  = 0.3550 ( 142 Of  400)
yesno        = 0.4800 ( 192 Of  400)

Q5           = 0.3115 ( 314 Of 1008)
Number       = 0.1549 ( 127 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1721 (  42 Of  244)
Q7889        = 0.2649 (  49 Of  185)
Q3624078     = 0.4253 (  37 Of   87)
Q35657       = 0.3924 (  31 Of   79)
Q482994      = 0.0870 (   6 Of   69)
Q7725634     = 0.2000 (  13 Of   65)
Q1093829     = 0.3462 (  18 Of   52)



### Filtered by type and by availability in the question entities' two-hop forward neighbours, with oracle for the question: is the answer an entity or not

In [40]:
def take_answer_filtered_by_types_and_two_hop_neighbours(row):
    """Pick the best type-filtered candidate that is also a neighbour of the question entities.

    Args:
        row: mapping / DataFrame row with 'isAnswerEntity', 'filtered_by_type_preds'
            (ranked candidates that passed the type filter), 'generated_entities' (all
            ranked candidates) and 'neighbours_entities' (neighbour ids; entries must be
            hashable).

    Returns:
        For rows whose 'isAnswerEntity' is exactly True: the first entry of
        'filtered_by_type_preds' that occurs in 'neighbours_entities'. In every other case
        the first entry of 'generated_entities'.
    """
    if row['isAnswerEntity'] is True:
        # The neighbour list can be very long; a set gives constant-time membership checks
        # instead of rescanning the whole list for every candidate.
        neighbours = set(row['neighbours_entities'])
        for candidate in row['filtered_by_type_preds']:
            if candidate in neighbours:
                return candidate
    # NOTE(review): the fallback is the unfiltered top-1 candidate, not the top-1 of
    # 'filtered_by_type_preds' -- confirm this is intended.
    return row['generated_entities'][0]

preds = test_df.apply(
    take_answer_filtered_by_types_and_two_hop_neighbours,
    axis=1
)
# `preds` is row-aligned with `test_df` (which has been through a merge), so the ids must
# come from `test_df` itself; pairing them positionally with ids from the raw dataset
# would mis-assign predictions as soon as the two differ in length or order.
df = calculate_metrics_for_prediction(
    dict(zip(test_df['id'].values, preds)),
    dataset_split,
    'kg',
)
# NOTE(review): this assignment aligns on the row index; it assumes `df` and `test_df`
# share the same index/order -- confirm.
df['answerRetrievedType'] = test_df['answerRetrievedType']
_ = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

Group          Hits@1 (Correct Of Total)
All          = 0.2757 (1103 Of 4000)

comparative  = 0.4625 ( 185 Of  400)
count        = 0.2700 ( 108 Of  400)
difference   = 0.1775 (  71 Of  400)
generic      = 0.2263 ( 181 Of  800)
intersection = 0.2950 ( 118 Of  400)
multihop     = 0.1100 (  44 Of  400)
ordinal      = 0.1550 (  62 Of  400)
superlative  = 0.3550 ( 142 Of  400)
yesno        = 0.4800 ( 192 Of  400)

Q5           = 0.3115 ( 314 Of 1008)
Number       = 0.1549 ( 127 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1393 (  34 Of  244)
Q7889        = 0.2595 (  48 Of  185)
Q3624078     = 0.4483 (  39 Of   87)
Q35657       = 0.3924 (  31 Of   79)
Q482994      = 0.1159 (   8 Of   69)
Q7725634     = 0.2308 (  15 Of   65)
Q1093829     = 0.2885 (  15 Of   52)



### Filter question entities two hop forward neighbours (Only TOP-100) by type extracted from seq2seq

In [41]:
def take_answer_filtered_by_types_from_two_hop_neighbours(row, top_k=100):
    """Pick the first neighbour of the question entities whose type matches the retrieved type.

    Args:
        row: mapping / DataFrame row with 'isAnswerEntity', 'neighbours_entities',
            'answerRetrievedType' and 'generated_entities'.
        top_k: number of leading entries of 'neighbours_entities' to inspect. Defaults to
            100 (the previously hard-coded limit); None inspects every neighbour.

    Returns:
        For rows whose 'isAnswerEntity' is exactly True: the first inspected neighbour id
        whose `AnswerItem(...).type` equals 'answerRetrievedType' (or, when the type is a
        list, contains an element whose `.idx` equals it). In every other case the first
        entry of 'generated_entities'.
    """
    if row['isAnswerEntity'] is True:
        expected_type = row['answerRetrievedType']
        for entity_id in row['neighbours_entities'][:top_k]:
            answer_type = AnswerItem(entity_id).type
            if isinstance(answer_type, list):
                # Several types: match against the id of any of them.
                if expected_type in [e.idx for e in answer_type]:
                    return entity_id
            elif answer_type == expected_type:
                return entity_id

    return row['generated_entities'][0]

preds = test_df.progress_apply(
    take_answer_filtered_by_types_from_two_hop_neighbours,
    axis=1
)
# `preds` is row-aligned with `test_df` (which has been through a merge), so the ids must
# come from `test_df` itself; pairing them positionally with ids from the raw dataset
# would mis-assign predictions as soon as the two differ in length or order.
df = calculate_metrics_for_prediction(
    dict(zip(test_df['id'].values, preds)),
    dataset_split,
    'kg',
)
# NOTE(review): this assignment aligns on the row index; it assumes `df` and `test_df`
# share the same index/order -- confirm.
df['answerRetrievedType'] = test_df['answerRetrievedType']
_ = print_eval(df=df, groupbycols=['complexityType', 'answerRetrievedType'])

  0%|          | 0/4000 [00:00<?, ?it/s]

Group          Hits@1 (Correct Of Total)
All          = 0.1978 ( 791 Of 4000)

comparative  = 0.3350 ( 134 Of  400)
count        = 0.2700 ( 108 Of  400)
difference   = 0.1100 (  44 Of  400)
generic      = 0.1288 ( 103 Of  800)
intersection = 0.1500 (  60 Of  400)
multihop     = 0.0775 (  31 Of  400)
ordinal      = 0.0925 (  37 Of  400)
superlative  = 0.2050 (  82 Of  400)
yesno        = 0.4800 ( 192 Of  400)

Q5           = 0.1359 ( 137 Of 1008)
Number       = 0.1549 ( 127 Of  820)
yesno        = 0.4651 ( 253 Of  544)
Q11424       = 0.1230 (  30 Of  244)
Q7889        = 0.1514 (  28 Of  185)
Q3624078     = 0.1494 (  13 Of   87)
Q35657       = 0.2658 (  21 Of   79)
Q482994      = 0.0725 (   5 Of   69)
Q7725634     = 0.1077 (   7 Of   65)
Q1093829     = 0.1346 (   7 Of   52)



### Request two hop question entities neighbours forward and backward filtered by retrieved type

In [42]:
# from pywikidata.utils import request_to_wikidata

# def get_neighbours_with_type_filter(
#     entity_id: str,
#     type_entity_id: str, # InstanceOf or SubclassOf
# ):
#     query = """
#     PREFIX wd: <http://www.wikidata.org/entity/>
#     PREFIX wdt: <http://www.wikidata.org/prop/direct/>

#     SELECT DISTINCT ?property ?object WHERE {
#     VALUES ?p { wdt:P31 wdt:P279 } 
#     {
#         ?object ?property wd:<ENTITY> .
#         ?object ?p wd:<TYPE>
#     } UNION {
#         wd:<ENTITY> ?property ?object .
#         ?object ?p wd:<TYPE> .
#     }
#     }
#     """.replace('<ENTITY>', entity_id).replace('<TYPE>', type_entity_id)
#     for responce in request_to_wikidata(query):
#         yield Entity(responce['property']['value']), Entity(responce['object']['value'])


# for _, row in tqdm(test_df.iterrows(), total=test_df.index.size, desc='two_hop_neighbours_type_filtered'):
#     with open(f'mintaka_test_question_entities_two_hop_neighbours_filtered_by_type.jsonl', 'a+') as f:
#         try:
#             if Entity._validate_entity_id(row['answerRetrievedType']):
#                 question_entities = [
#                     Entity(e['name'])
#                     for e in row['questionEntity']
#                     if e['entityType'] == 'entity' and 'Q' == e['name'][0]
#                 ]

#                 question_entities_neighbours = []
#                 for q_entity in question_entities:
#                     for one_hop_neighbour_prop, one_hop_neighbour_entity in get_neighbours_with_type_filter(q_entity.idx, row['answerRetrievedType']):
#                         question_entities_neighbours.append((
#                             q_entity.idx,
#                             one_hop_neighbour_prop.idx,
#                             one_hop_neighbour_entity.idx,
#                         ))

#                         for two_hop_neighbour_prop, two_hop_neighbour_entity in get_neighbours_with_type_filter(one_hop_neighbour_entity.idx, row['answerRetrievedType']):
#                             question_entities_neighbours.append((
#                                 one_hop_neighbour_entity.idx,
#                                 two_hop_neighbour_prop.idx,
#                                 two_hop_neighbour_entity.idx,
#                             ))

#                 f.write(
#                     ujson.dumps({
#                         'id': row['id'],
#                         'two_hop_neighbours': question_entities_neighbours,
#                         'type': row['answerRetrievedType'],
#                     }) + '\n'
#                 )
#         except Exception as e:
#             print(f"Error with id={row['id']}: {e}")
#             break


# Load the JSONL file of type-filtered neighbour triples (the commented-out loop above is
# the code that writes one {'id', 'two_hop_neighbours', 'type'} record per line).
_filtered_neighbours_path = 'mintaka_test_question_entities_two_hop_neighbours_filtered_by_type.jsonl'
question_entities_filtered_neighbours_df = pd.read_json(_filtered_neighbours_path, lines=True)
question_entities_filtered_neighbours_df.head()

Unnamed: 0,id,two_hop_neighbours,type
0,fae46b21,"[[Q1497, P19, Q5075459], [Q5075459, P22, Q2020...",Q5
1,d2a03f72,"[[Q2121062, P451, Q21286612], [Q21286612, P451...",Q5
2,b41ae115,[],Q5
3,7ed1858c,"[[Q128532, P161, Q20949964], [Q20949964, P136,...",Q11424
4,bfc9807b,[],Q35657


In [43]:
# Distribution of the number of type-filtered neighbour triples collected per question.
_triples_per_question = question_entities_filtered_neighbours_df['two_hop_neighbours'].map(len)
_triples_per_question.describe()

count       96.000000
mean      4842.677083
std      12761.744159
min          0.000000
25%         95.000000
50%        358.500000
75%       2492.500000
max      85216.000000
Name: two_hop_neighbours, dtype: float64

### Evaluate ChatGPT

In [44]:
# Free-text ChatGPT answers for the test questions, one JSON record per line.
_gpt_answers_path = 'mintaka_test_chatgpt_gpt_3_5_turbo_0301_answers.jsonl'
gpt_preds = pd.read_json(_gpt_answers_path, lines=True)
gpt_preds.head()

Unnamed: 0,id,predText
0,fae46b21,Mark Twain.
1,bc8713cc,Zero.
2,d2a03f72,The Weeknd.
3,9a296167,5.
4,e343ad26,No.


In [47]:
# Select the id/text columns explicitly: `dict(gpt_preds.values)` only works while the frame
# has exactly two columns and raises once another column (e.g. `predKG`) is added to it.
_ = print_eval(dict(gpt_preds[['id', 'predText']].values), mode='text')

Group          Hits@1 (Correct Of Total)
All          = 0.3425 (1370 Of 4000)

comparative  = 0.3500 ( 140 Of  400)
count        = 0.2950 ( 118 Of  400)
difference   = 0.1900 (  76 Of  400)
generic      = 0.5250 ( 420 Of  800)
intersection = 0.5275 ( 211 Of  400)
multihop     = 0.3075 ( 123 Of  400)
ordinal      = 0.4100 ( 164 Of  400)
superlative  = 0.2950 ( 118 Of  400)
yesno        = 0.0000 (   0 Of  400)

books        = 0.3240 ( 162 Of  500)
geography    = 0.3720 ( 186 Of  500)
history      = 0.3480 ( 174 Of  500)
movies       = 0.2900 ( 145 Of  500)
music        = 0.3260 ( 163 Of  500)
politics     = 0.3920 ( 196 Of  500)
sports       = 0.3880 ( 194 Of  500)
videogames   = 0.3000 ( 150 Of  500)



In [48]:
# Convert every free-text ChatGPT answer with `AnswerItem.text_to_answer` so that it can be
# scored in 'kg' mode.
_gpt_kg_answers = gpt_preds['predText'].progress_apply(AnswerItem.text_to_answer)
gpt_preds['predKG'] = _gpt_kg_answers

_gpt_kg_predictions = dict(gpt_preds[['id', 'predKG']].values)
_ = print_eval(_gpt_kg_predictions, mode='kg')

  0%|          | 0/4000 [00:00<?, ?it/s]

Group          Hits@1 (Correct Of Total)
All          = 0.3595 (1438 Of 4000)

comparative  = 0.6750 ( 270 Of  400)
count        = 0.2175 (  87 Of  400)
difference   = 0.1325 (  53 Of  400)
generic      = 0.3563 ( 285 Of  800)
intersection = 0.3650 ( 146 Of  400)
multihop     = 0.1975 (  79 Of  400)
ordinal      = 0.2225 (  89 Of  400)
superlative  = 0.2025 (  81 Of  400)
yesno        = 0.8700 ( 348 Of  400)

books        = 0.3080 ( 154 Of  500)
geography    = 0.3820 ( 191 Of  500)
history      = 0.3440 ( 172 Of  500)
movies       = 0.3480 ( 174 Of  500)
music        = 0.3120 ( 156 Of  500)
politics     = 0.4380 ( 219 Of  500)
sports       = 0.3960 ( 198 Of  500)
videogames   = 0.3480 ( 174 Of  500)



In [49]:
# Per-question text-mode metrics for the ChatGPT answers, plus the converted KG answer
# of each scored prediction for side-by-side inspection.
_gpt_text_predictions = dict(zip(gpt_preds['id'], gpt_preds['predText']))
gpt_df = calculate_metrics_for_prediction(_gpt_text_predictions, split='test', mode='text')
gpt_df['predKG'] = gpt_df['pred'].progress_apply(AnswerItem.text_to_answer)
gpt_df.head()

  0%|          | 0/4000 [00:00<?, ?it/s]

Unnamed: 0,id,question,translations,questionEntity,answer,category,complexityType,pred,exact_match,f1,hits1,predKG
0,fae46b21,What man was a famous American author and also...,{'ar': 'مَن الرجل الذي كان مؤلفًا أمريكيًا مشه...,"[{'name': 'Q1497', 'entityType': 'entity', 'la...",Mark Twain,history,intersection,Mark Twain.,True,0.5,True,Q7245
1,bc8713cc,How many Academy Awards has Jake Gyllenhaal be...,{'ar': 'كم عدد جوائز الأوسكار التي ترشح لها ال...,"[{'name': 'Q133313', 'entityType': 'entity', '...",1,movies,count,Zero.,False,0.0,False,Q204
2,d2a03f72,"Who is older, The Weeknd or Drake?",{'ar': 'أيهما أكبر سنًا، The Weeknd أم Drake؟'...,"[{'name': 'Q2121062', 'entityType': 'entity', ...",Drake,music,comparative,The Weeknd.,False,0.0,False,Q2121062
3,9a296167,How many children did Donald Trump have?,"{'ar': 'كم عدد أطفال دونالد ترامب؟', 'de': 'Wi...","[{'name': 'Q22686', 'entityType': 'entity', 'l...",5,history,count,5.,True,0.0,True,5
4,e343ad26,Is the main hero in Final Fantasy IX named Kuja?,{'ar': 'هل كان البطل الرئيسي في لعبة الفيديو F...,"[{'name': 'Q474573', 'entityType': 'entity', '...",False,videogames,yesno,No.,False,0.0,False,False
