`results_*.csv` — outputs of the `seq2seq.py` eval script, one file per dataset split (train / validation / test).

In [2]:
import pandas as pd
from pywikidata import Entity
from tqdm.auto import tqdm
import json
import itertools
import datasets 

from kbqa.entity_linkink import build_mgenre_pipeline, EntitiesSelection
from kbqa.caches.ner_to_sentence_insertion import NerToSentenceInsertion

# Register tqdm's pandas integration; `prepare_data` relies on the
# `progress_apply` method that this call adds to DataFrames.
tqdm.pandas()

In [2]:
import requests
from joblib import Memory

# joblib cache rooted at /tmp/cache; used below as `@memory.cache` to memoize
# Wikidata search calls.
memory = Memory('/tmp/cache', verbose=0)


@memory.cache
def get_wd_search_results(
    search_string: str,
    max_results: int = 500,
    language: str = 'en',
    mediawiki_api_url: str = "https://www.wikidata.org/w/api.php",
    user_agent: str = None,
) -> list:
    """Search Wikidata entities by label and return their ids.

    Calls the MediaWiki ``wbsearchentities`` action, following the
    ``search-continue`` offset page by page until either the API reports no
    further results or ``max_results`` ids have been collected.

    Args:
        search_string: Text to search for.
        max_results: Upper bound on the number of ids returned.
        language: Language code sent to the API.
        mediawiki_api_url: Endpoint of the MediaWiki API.
        user_agent: Value of the ``User-Agent`` header; ``"pywikidata"`` is
            used when ``None``.

    Returns:
        A list with at most ``max_results`` entity ids, in the order the API
        returned them.

    Raises:
        requests.HTTPError: If the HTTP response status signals an error.
        Exception: If the API payload does not report ``success == 1``.
    """
    if max_results <= 0:
        return []

    params = {
        'action': 'wbsearchentities',
        'language': language,
        'search': search_string,
        'format': 'json',
        # Never request a bigger page than the caller asked for.
        'limit': min(50, max_results),
    }
    headers = {
        'User-Agent': "pywikidata" if user_agent is None else user_agent
    }

    results = []
    offset = 0
    while True:
        params['continue'] = offset

        # A timeout keeps a stalled connection from blocking the notebook forever.
        reply = requests.get(mediawiki_api_url, params=params, headers=headers, timeout=60)
        reply.raise_for_status()
        search_results = reply.json()

        # Error payloads carry no 'success' key, so use .get() to surface them
        # as the intended failure instead of a KeyError.
        if search_results.get('success') != 1:
            raise Exception('WD search failed')

        results.extend(hit['id'] for hit in search_results['search'])

        if len(results) >= max_results or 'search-continue' not in search_results:
            break
        offset = search_results['search-continue']

    # The last page may overshoot the requested bound.
    return results[:max_results]

In [3]:
def prepare_data(data_df, results_df, wd_search_results_top_k: int = 1, mgenre=None, ner=None, entities_selection=None):
    """Yield one record per (question, candidate answer entity) pair.

    For every row of ``results_df`` the predicted answer labels (columns whose
    name contains ``'answer_'``) are resolved to Wikidata ids through
    ``get_wd_search_results``. The frame is then inner-joined with ``data_df``
    on ``'question'``; a question that matches several ``data_df`` rows
    therefore produces one merged row per match.

    Question entities are obtained in one of two ways:

    * if any of ``mgenre``, ``ner`` or ``entities_selection`` is ``None``, the
      subject from column ``'S'`` is used, and up to five of its forward
      one-hop neighbours (other than the gold answer) are added as extra
      candidates;
    * otherwise the entity-linking pipeline (``ner`` -> ``mgenre`` ->
      ``entities_selection``) is run on the question text, each selected
      label is resolved to its top Wikidata search hit, and no extra
      candidates are added.

    The gold answer from column ``'O'`` is always among the candidates.

    Args:
        data_df: Frame with at least the columns ``'S'``, ``'O'`` and
            ``'question'``.
        results_df: Frame with a ``'question'`` column and ``answer_*``
            columns. It is not modified.
        wd_search_results_top_k: Number of search hits kept per answer label.
        mgenre: Callable applied to the NER-annotated question, or ``None``.
        ner: Object exposing ``entity_labeling(question, True)``, or ``None``.
        entities_selection: Callable combining the NER entities with the
            mGENRE output, or ``None``.

    Yields:
        Dicts with the keys ``'question'``, ``'answerEntity'``,
        ``'questionEntity'`` and ``'groundTruthAnswerEntity'``.
    """
    answers_cols = [c for c in results_df.columns if 'answer_' in c]
    # Keep the historical lookup size of 5, but never ask for fewer hits than
    # will be kept per label.
    search_limit = max(5, wd_search_results_top_k)

    def lookup_answer_ids(row):
        # Skip missing / blank predictions: searching for them would query the
        # API with "nan" or an empty string.
        labels = [label for label in row.dropna().unique() if str(label).strip()][:5]
        hits_per_label = (
            get_wd_search_results(label, search_limit, language='en')[:wd_search_results_top_k]
            for label in labels
        )
        return list(itertools.chain.from_iterable(hits_per_label))

    answers_ids = results_df[answers_cols].progress_apply(lookup_answer_ids, axis=1)

    # assign() builds a new frame, so the caller's results_df stays untouched.
    df = results_df.assign(answers_ids=answers_ids).merge(data_df, on='question')

    for _, row in tqdm(df.iterrows(), total=df.index.size):
        golden_true_entity = Entity(row['O'])

        if mgenre is None or ner is None or entities_selection is None:
            question_entity = Entity(row['S'])
            # Compare ids so the filter does not depend on how Entity
            # implements equality; islice stops after five matches.
            additional_candidates = list(itertools.islice(
                (
                    neighbour.idx
                    for _, neighbour in question_entity.forward_one_hop_neighbours
                    if neighbour.idx != golden_true_entity.idx
                ),
                5,
            ))
            question_entity_ids = [question_entity.idx]
        else:
            question_with_ner, entities_list = ner.entity_labeling(row['question'], True)
            mgenre_results = mgenre(question_with_ner)
            selected_entities = entities_selection(entities_list, mgenre_results)

            question_entity_ids = list(itertools.chain.from_iterable(
                get_wd_search_results(label, 1, language='en')[:1]
                for label in selected_entities
            ))
            additional_candidates = []

        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # emitted records come out in the same order on every run.
        candidates_ids = dict.fromkeys(
            [*additional_candidates, *row['answers_ids'], golden_true_entity.idx]
        )

        for candidate_id in candidates_ids:
            candidate_entity = Entity(candidate_id)

            yield {
                'question': row['question'],
                'answerEntity': [candidate_entity.idx],
                'questionEntity': question_entity_ids,
                'groundTruthAnswerEntity': [golden_true_entity.idx],
            }


In [4]:
# Download the answerable train / valid / test files from the
# askplatypus/wikidata-simplequestions repository.
# wget's -nc flag skips files that already exist locally.
!wget -nc https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_train_answerable.txt
!wget -nc https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_valid_answerable.txt
!wget -nc https://raw.githubusercontent.com/askplatypus/wikidata-simplequestions/master/annotated_wd_data_test_answerable.txt

File ‘annotated_wd_data_train_answerable.txt’ already there; not retrieving.

File ‘annotated_wd_data_valid_answerable.txt’ already there; not retrieving.

File ‘annotated_wd_data_test_answerable.txt’ already there; not retrieving.



In [5]:
!mkdir to_subgraphs

mkdir: cannot create directory ‘to_subgraphs’: File exists


In [6]:
import os

import torch

# Machine-specific settings. The defaults match the original environment;
# set the environment variables to run elsewhere without editing the notebook.
NER_MODEL_PATH = os.environ.get(
    'KBQA_NER_MODEL_PATH',
    '/home/salnikov/data_kbqa/ner/spacy_models/wdsq_tuned/model-best/',
)
MGENRE_DEVICE = os.environ.get('KBQA_MGENRE_DEVICE', 'cuda:4')

# Entity-linking components handed to prepare_data() in the cells below.
ner = NerToSentenceInsertion(NER_MODEL_PATH)
mgenre = build_mgenre_pipeline(torch.device(MGENRE_DEVICE))
entities_selection = EntitiesSelection(ner.model)



In [7]:
# Train split: gold triples plus the seq2seq predictions for the same questions.
data_train_df = pd.read_csv(
    'annotated_wd_data_train_answerable.txt',
    sep='\t',
    names=['S', 'P', 'O', 'question'],
)

results_train_df: pd.DataFrame = pd.read_csv('./results_train.csv')
# Strip newline characters from the question text before it is used as the merge key.
results_train_df['question'] = results_train_df['question'].map(
    lambda text: text.replace('\n', '')
)

# Candidates built from the gold question entity.
with open('to_subgraphs/sqwd_train.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(data_train_df, results_train_df)
    )

# Candidates built with the entity-linking pipeline.
with open('to_subgraphs/sqwd_train_with_entity_linker.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(
            data_train_df,
            results_train_df,
            wd_search_results_top_k=1,
            mgenre=mgenre,
            ner=ner,
            entities_selection=entities_selection,
        )
    )

  results_train_df: pd.DataFrame = pd.read_csv('./results_train.csv') # Train


  0%|          | 0/11012 [00:00<?, ?it/s]

  0%|          | 0/11244 [00:00<?, ?it/s]

  0%|          | 0/11012 [00:00<?, ?it/s]

  0%|          | 0/11244 [00:00<?, ?it/s]



In [8]:
# Validation split: gold triples plus the seq2seq predictions for the same questions.
data_valid_df = pd.read_csv(
    'annotated_wd_data_valid_answerable.txt',
    sep='\t',
    names=['S', 'P', 'O', 'question'],
)

results_valid_df: pd.DataFrame = pd.read_csv('./results_validation.csv')
# Strip newline characters from the question text before it is used as the merge key.
results_valid_df['question'] = results_valid_df['question'].map(
    lambda text: text.replace('\n', '')
)

# Candidates built from the gold question entity.
with open('to_subgraphs/sqwd_validation.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(data_valid_df, results_valid_df)
    )

# Candidates built with the entity-linking pipeline.
with open('to_subgraphs/sqwd_validation_with_entity_linker.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(
            data_valid_df,
            results_valid_df,
            wd_search_results_top_k=1,
            mgenre=mgenre,
            ner=ner,
            entities_selection=entities_selection,
        )
    )

  0%|          | 0/2561 [00:00<?, ?it/s]

  0%|          | 0/2568 [00:00<?, ?it/s]

  0%|          | 0/2561 [00:00<?, ?it/s]

  0%|          | 0/2568 [00:00<?, ?it/s]



In [11]:
# Test split: gold triples plus the seq2seq predictions for the same questions.
data_test_df = pd.read_csv(
    'annotated_wd_data_test_answerable.txt',
    sep='\t',
    names=['S', 'P', 'O', 'question'],
)

results_test_df: pd.DataFrame = pd.read_csv('./results_test.csv')
# Strip newline characters from the question text before it is used as the merge key.
results_test_df['question'] = results_test_df['question'].map(
    lambda text: text.replace('\n', '')
)

# Candidates built from the gold question entity.
with open('to_subgraphs/sqwd_test.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(data_test_df, results_test_df)
    )

# Candidates built with the entity-linking pipeline.
with open('to_subgraphs/sqwd_test_with_entity_linker.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n'
        for record in prepare_data(
            data_test_df,
            results_test_df,
            wd_search_results_top_k=1,
            mgenre=mgenre,
            ner=ner,
            entities_selection=entities_selection,
        )
    )

  0%|          | 0/5136 [00:00<?, ?it/s]

  0%|          | 0/5186 [00:00<?, ?it/s]

  0%|          | 0/5136 [00:00<?, ?it/s]

  0%|          | 0/5186 [00:00<?, ?it/s]

