# Task: Evaluate ability to use MINTAKA as a basic dataset for answer type prediction 

Task description (Notion): https://www.notion.so/msalnikov/Evaluate-ability-to-use-MINTAKA-as-a-basic-dataset-for-answer-type-prediction-cdbaa44b9f904d5882a28840ae04bcfd?pvs=4

In [1]:
import datasets
import pandas as pd
from transformers import T5ForConditionalGeneration, AutoTokenizer
from tqdm.auto import tqdm
from joblib import Parallel, delayed

from kbqa.seq2seq.eval import predict_answers as seq2seq_predict_answers
from kbqa.candidate_selection.question_to_rank_by_instance_of import QuestionToRankInstanceOfSimple

import nltk

In [2]:
from typing import Optional

import requests
from joblib import Memory
from pywikidata import Entity

# On-disk joblib cache rooted at /tmp/cache; verbose=0 keeps joblib's cache logging quiet.
# It backs the @memory.cache decorator on get_wd_search_results, so repeated searches with
# the same arguments are served from disk instead of hitting the Wikidata API again.
# NOTE(review): /tmp is usually wiped on reboot - confirm that losing the cache is acceptable.
memory = Memory('/tmp/cache', verbose=0)


@memory.cache
def get_wd_search_results(
    search_string: str,
    max_results: int = 500,
    language: str = 'en',
    mediawiki_api_url: str = "https://www.wikidata.org/w/api.php",
    user_agent: Optional[str] = None,
    timeout: float = 30.0,
) -> list:
    """Search Wikidata entities by label and return the matching entity ids.

    Pages through the ``wbsearchentities`` API action until either the API
    reports no further results or ``max_results`` ids have been collected.
    Results are memoized through ``@memory.cache``, so a repeated call with the
    same arguments does not issue any HTTP request.

    Args:
        search_string: Text to search for.
        max_results: Upper bound on the number of ids returned. Values <= 0
            yield an empty list without contacting the API.
        language: Language code passed to the API as ``language``.
        mediawiki_api_url: Endpoint of the MediaWiki API to query.
        user_agent: Value of the ``User-Agent`` header; ``"pywikidata"`` is
            used when ``None``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Entity ids (the ``id`` field of every search hit) in the order the API
        returned them, at most ``max_results`` long.

    Raises:
        requests.HTTPError: The API answered with an HTTP error status.
        RuntimeError: The API payload does not report ``success == 1``.
    """
    # Page size used by the original implementation; the API documentation lists 50 as
    # the per-request maximum for regular clients.
    page_size = 50

    headers = {
        'User-Agent': "pywikidata" if user_agent is None else user_agent
    }
    params = {
        'action': 'wbsearchentities',
        'language': language,
        'search': search_string,
        'format': 'json',
    }

    results = []
    offset = 0
    while len(results) < max_results:
        # Never ask for more than is still needed, so small max_results values
        # (e.g. 1) do not download a full page only to throw it away.
        params['limit'] = min(page_size, max_results - len(results))
        params['continue'] = offset

        reply = requests.get(
            mediawiki_api_url,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        reply.raise_for_status()
        search_results = reply.json()

        # Error payloads carry no 'success' key at all, so use .get() to reach the
        # intended failure path instead of dying with a KeyError.
        if search_results.get('success') != 1:
            details = search_results.get('error', search_results)
            raise RuntimeError(f'WD search failed: {details}')

        results.extend(hit['id'] for hit in search_results['search'])

        # 'search-continue' holds the offset of the next page; it is absent on the last one.
        if 'search-continue' not in search_results:
            break
        offset = search_results['search-continue']

    return results[:max_results]

In [3]:
# Answer-type annotations: a headerless, tab-separated file, so the column names
# are supplied here rather than read from the first line.
answer_type_columns = ['id', 'answer_id', 'type_id']
types_df = pd.read_csv(
    './mintaka_train_simpleq_answer_types.txt',
    sep='\t',
    names=answer_type_columns,
)
types_df.head()

Unnamed: 0,id,answer_id,type_id
0,cf871296,Q185107,Q5
1,8239c2cd,Q669749,Q11424
2,058d3478,Q976022,Q5
3,e347e0ab,Q192724,Q11424
4,d6ef9a28,Q312098,Q5


In [4]:
# Fetch the Mintaka training split, then convert it to a pandas frame.
mintaka_train = datasets.load_dataset('AmazonScience/mintaka', split='train')
df: pd.DataFrame = mintaka_train.to_pandas()
df.head()

No config specified, defaulting to: mintaka/en
Found cached dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)


Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity
0,a9011ddf,en,What is the seventh tallest mountain in North ...,Mount Lucania,geography,ordinal,"[{'name': 'Q49', 'entityType': 'entity', 'labe...","[{'name': 'Q1153188', 'label': 'Mount Lucania'}]"
1,2723bb1b,en,Which actor was the star of Titanic and was bo...,Leonardo DiCaprio,movies,intersection,"[{'name': 'Q44578', 'entityType': 'entity', 'l...","[{'name': 'Q38111', 'label': 'Leonardo DiCapri..."
2,88349c89,en,Which actor starred in Vanilla Sky and was mar...,Tom Cruise,movies,intersection,"[{'name': 'Q174346', 'entityType': 'entity', '...","[{'name': 'Q37079', 'label': 'Tom Cruise'}]"
3,bff78c91,en,What year was the first book of the A Song of ...,1996,books,generic,"[{'name': 'Q45875', 'entityType': 'entity', 'l...",[]
4,982450cf,en,Who is the youngest current US governor?,Ron DeSantis,politics,superlative,"[{'name': 'Q889821', 'entityType': 'entity', '...","[{'name': 'Q3105215', 'label': 'Ron DeSantis'}]"


In [5]:
# Inner join on the question id: only questions that have an answer-type
# annotation survive, each gaining the answer_id / type_id columns.
# NOTE: this rebinds `df`, so re-running the cell merges the already merged frame again
# (pandas then suffixes the clashing columns); re-run the load cell first.
df = df.merge(right=types_df, how='inner', on='id')
df.head()

Unnamed: 0,id,lang,question,answerText,category,complexityType,questionEntity,answerEntity,answer_id,type_id
0,cf871296,en,Who was the president of Argentina from 1989 t...,Carlos Menem,politics,generic,"[{'name': 'Q414', 'entityType': 'entity', 'lab...","[{'name': 'Q185107', 'label': 'Carlos Menem'}]",Q185107,Q5
1,8239c2cd,en,What movie won the first Golden Globe award fo...,The Song of Bernadette,movies,generic,"[{'name': 'Q102427', 'entityType': 'entity', '...","[{'name': 'Q669749', 'label': 'The Song of Ber...",Q669749,Q11424
2,058d3478,en,Who played Christian Grey in Fifty Shades of G...,Jamie Dornan,movies,generic,"[{'name': 'Q110110619', 'entityType': 'entity'...","[{'name': 'Q976022', 'label': 'Jamie Dornan'}]",Q976022,Q5
3,e347e0ab,en,What was the first movie in the Marvel Cinemat...,Iron Man,movies,generic,"[{'name': 'Q642878', 'entityType': 'entity', '...","[{'name': 'Q192724', 'label': 'Iron Man'}]",Q192724,Q11424
4,d6ef9a28,en,Who won the Academy Award for Best Actor in 1950?,Jose Ferrer,movies,generic,"[{'name': 'Q103916', 'entityType': 'entity', '...","[{'name': 'Q312098', 'label': 'José Ferrer'}]",Q312098,Q5


In [6]:
# The model weights and the tokenizer are both read from the same checkpoint directory.
checkpoint_dir = '/mnt/storage/QA_System_Project/seq2seq_runs/wdsq_tunned/google_t5-large-ssm-nq/models/checkpoint-4000/'
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

In [7]:
def tokenize_questions(batch):
    """Tokenize a batch of questions, padding or truncating each one to 64 tokens."""
    return tokenizer(
        batch['question'],
        padding='max_length',
        truncation=True,
        max_length=64,
    )


# Only the tensors named here are exposed (as torch tensors) when the dataset is indexed.
columns = [
    "input_ids",
    "attention_mask",
]

dataset = datasets.Dataset.from_pandas(df)
dataset = dataset.map(tokenize_questions, batched=True)
dataset.set_format(type="torch", columns=columns)

Map:   0%|          | 0/1252 [00:00<?, ? examples/s]

In [10]:
device = 'cuda:2'
model_on_device = model.to(device)

# Diverse beam search settings: 200 beams split into 5 groups, with all 200 sequences
# returned per question.
# NOTE(review): presumably forwarded to `model.generate` - confirm in kbqa.seq2seq.eval.
decoding_kwargs = dict(
    num_beams=200,
    num_return_sequences=200,
    num_beam_groups=5,
    diversity_penalty=0.1,
)

generated_answers = seq2seq_predict_answers(
    model=model_on_device,
    tokenizer=tokenizer,
    dataset=dataset,
    batch_size=2,
    device=device,
    **decoding_kwargs,
)

evaluate model:   0%|          | 0/626 [00:00<?, ?it/s]



In [11]:
# Tabulate the generated candidates; the displayed frame has one row per question
# and one answer_<k> column per returned sequence.
generated_answer_df = pd.DataFrame(data=generated_answers)
generated_answer_df.head()

Unnamed: 0,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,answer_7,answer_8,answer_9,...,answer_190,answer_191,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199
0,Leopoldo Fortunato Galtieri Castelli,Luis Guillermo Sols Rivera,Leopoldo Fortunato Galtieri Castelli,Augusto José Ramón Pinochet Ugarte,lvaro Ugarte,Lázaro Cárdenas,Enrique Pea Nieto,Luis Guillermo Sols Rivera,Luis Guillermo Sols Vidal,lvaro Obrador,...,Leopoldo Evoluti,Luis Guillermo Stábile Carrillo,Leopoldo Fortunato Galtieri Campanella,Ricardo Rosselló,Buenos Aires,Juan Manuel Santos,Luis Guillermo Bartolomeu Dias,Carlos Menem,Luis Guillermo Sols Trevio,Luis Guillermo Sols Muoz Barrios
1,The Grasshopper Lies Heavy,Underworld: The Curse of the Blood Rubies,The Grasshopper Lies Heavy,Bridget Moynahan's Diary,I'll See You in My Dreams,Les Misérables,Underworld: The Curse of the Blood Rubies,Strangers on a Train,Bridget Jones's Diary,Sleeping with the Enemy,...,L'Italiano by Luchino Visi,In the Name of the Father,Underworld: The World's Most Dangerous,Underworld: The Curse of the Nerds,Dead Man's Chest: Resurrection,Erin Brockovich and His Friends,Strangers on a Strange Planet,Underworld: The Curse of the Nerd,Underworld: The Curse of the Blood Rubies,Dead Man's Chest: The Story of the Dead
2,"James ""Jamie"" Dornan","James ""Jamie"" Dornan",James Dornan,Jamie Dornan,Idris Elba,"James ""Jamie"" Dornan",Bill Skarsgrd,Adrian McLoughlin,Matthew Quincy Daddario,James Dornan,...,Freddie Highmore,"James ""Jamie"" Stevenson","James ""Jamie"" Lepley","James ""Jamie"" Garner","James ""Jamie"" Duczmal",Eric Andre,"James ""Jamie"" Gleeson","James ""Jamie"" Bluher",D. W. Weissman,Eric Bana
3,Marvel's Daredevil,Marvel's The Avengers,Marvel's Daredevil,Marvel's The Avengers,Spider-Man: Homecoming,Marvel's Daredevil,Captain America: Civil War,Avengers: Age of Ultron,Marvel's The Avengers: Age of Ultron,Spider-Man: Homecoming,...,Marvel's The Marvelous Land of Ymir,Marvel's The Avengers: The Avengers (2012),Marvel's The Avengers (2012),X-Men Origins: Woden,Marvel's The Marvelous Land of Piranha!,Marvel's Dracula,Captain America: Apocalypse (2003),Marvel’s The Avengers,Captain America: The Civil War,Marvel's Superheroes
4,Sidney Poitier,Daniel Day-Lewis,Sidney Poitier,Harry Belafonte,Daniel Day-Lewis,Sidney Poitier,Harry Belafonte,Daniel Day-Lewis,Robert Donat,Ernest Borgnine,...,Claude Berris,Donald Woods,Victor Jory,William Theodore Wyler,William Theodore Wadsworth,Frank Sinatra,Oscar Levant,William Theodore Henlein,Louis Blinn,William Theodore Struthers


In [12]:
import os

# Turn off tokenizer-internal parallelism for this process and any children it spawns.
# NOTE(review): presumably set to avoid the tokenizers fork warning under joblib - confirm.
os.environ.update(TOKENIZERS_PARALLELISM='false')


def get_type_by_act(row):
    """Predict answer types for one question from its generated answer labels.

    Each distinct label in ``row`` is looked up on Wikidata and only its top
    search hit is kept; labels with no hit are dropped. The resulting entity
    ids are handed to ``QuestionToRankInstanceOfSimple`` (with an empty
    question and no question entities).

    Args:
        row: One row of generated answers (a pandas Series of label strings).

    Returns:
        The ``idx`` of every item in the selector's ``answer_instance_of``, in
        the selector's order. NOTE(review): presumably Wikidata type ids ranked
        best-first - confirm against the kbqa implementation.
    """
    candidates_entities = []
    for label in row.unique().tolist():
        hits = get_wd_search_results(label, 1)
        if len(hits) > 0:
            # Keep only the best-ranked entity for this label.
            candidates_entities.append(hits[0])

    type_selector = QuestionToRankInstanceOfSimple(
        '',
        [],
        candidates_entities,
    )
    return [entity.idx for entity in type_selector.answer_instance_of]


# Run the type prediction for every question on two joblib workers.
# (A plain list comprehension calling get_type_by_act(row) over the same rows is a
# drop-in sequential alternative that is easier to debug.)
row_iterator = tqdm(generated_answer_df.iterrows(), total=len(generated_answer_df))
typing_jobs = (delayed(get_type_by_act)(row) for _, row in row_iterator)
act_typing_results = Parallel(n_jobs=2)(typing_jobs)

# The list is assigned positionally, so generated_answer_df must still be in the
# same row order as df.
df['predicted_types_id'] = act_typing_results

  0%|          | 0/1252 [00:00<?, ?it/s]

In [16]:
# Share of questions whose gold type appears anywhere in the predicted type list.
gold_in_predictions = df.apply(
    lambda row: row['type_id'] in row['predicted_types_id'],
    axis=1,
)
gold_in_predictions.sum() / len(df)

0.6837060702875399

In [17]:
# Share of questions whose gold type is the first predicted type
# (the [:1] slice keeps rows with an empty prediction list from raising).
gold_is_top_prediction = df.apply(
    lambda row: row['type_id'] in row['predicted_types_id'][:1],
    axis=1,
)
gold_is_top_prediction.sum() / len(df)

0.5838658146964856