# Imports and setup

In [154]:
### imports
from transformers import pipeline
import wikipediaapi
import pandas as pd
import sparql
import spacy_dbpedia_spotlight
import spacy



In [None]:
### SETUP
# the model is pretrained on SQuAD by default, but you can either specify it
# model_name = "deepset/roberta-base-squad2"
qa = pipeline("question-answering")

# model for dbpedia-spotlight
nlp = spacy.load('en_core_web_lg')
# TODO: check if it could be done without all the pipeline
nlp.add_pipe('dbpedia_spotlight', config={'language_code':'en', 'overwrite_ents':True})

# wikipedia api
wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

# Load data

In [97]:
# load LC-QALD
data = pd.read_json('../data/train-data.json')
data.head(5)

Unnamed: 0,_id,corrected_question,intermediary_question,sparql_query,sparql_template_id
0,1501,How many movies did Stanley Kubrick direct?,How many <movies> are there whose <director> i...,SELECT DISTINCT COUNT(?uri) WHERE {?uri <http:...,101
1,2586,Which city's foundeer is John Forbes?,What <city>'s <founded by> is <John Forbes (Br...,SELECT DISTINCT ?uri WHERE {?uri <http://dbped...,301
2,2653,What is the river whose mouth is in deadsea?,What is the <river> whose <river mouth> is <De...,SELECT DISTINCT ?uri WHERE {?uri <http://dbped...,301
3,1055,What is the allegiance of John Kotelawala ?,What is the <allegiance> of John Kotelawala ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,2
4,705,How many races have the horses bred by Jacques...,What is the total number of <race> of the <rac...,SELECT DISTINCT COUNT(?uri) WHERE { ?x <http:/...,105


In [98]:
# filter simple questions and keep only questions and sparql queries
questions = data[data.sparql_template_id == 2][['corrected_question', 'sparql_query']]
# REMOVE NEXT LINE
#questions = questions.head(20)

# Prepare data

## Extract answers

In [99]:
# extract answer from dbpedia endpoint
def get_answers(query, endpoint = 'http://dbpedia.org/sparql'):
    answers = []
    results = sparql.query(endpoint, query)
    for row in results:
        answers.append(sparql.unpack_row(row)[0])        
    return answers

In [100]:
questions['answer'] = questions['sparql_query'].apply(get_answers)

In [101]:
# TODO: be sure to have fixed get_answers to avoid this operation
# questions['answers'] = questions['answers'].apply(lambda x: [item for sublist in x for item in sublist])

In [None]:
# questions['answer'] = questions['answer'].apply(lambda x : [' '.join(item.split('/')[-1].split('_')) for item in x])

In [102]:
# save questions
questions.to_csv('../data/questions.csv')

## Extract dbpedia entities

In [121]:
def get_entities(question):
    # entity extraction and dbpedia linking
    try:
        # TODO: fix error
        tmp = nlp(question)
    except:
        ent = 'SpacyError'
        return ent

    # check results
    if len(tmp.ents) == 0:
        ent = 'NotFound'
    else:
        # TODO: i'm not sure on what happens if there are more than one ents :/
        # maybe in an english question the subject is always the last entity... 
        ent = tmp.ents[-1].text # change [0] to [-1]? seems better...
    return ent


# test
get_entities(questions.loc[3966]['corrected_question'])

'Purnima Banerjee'

In [122]:
# store entities
questions['entity'] = questions['corrected_question'].apply(get_entities)
questions

Unnamed: 0,corrected_question,sparql_query,answer,entity
3,What is the allegiance of John Kotelawala ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/Sri_Lanka],John Kotelawala
6,What is the region of Tom Perriello ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],Tom Perriello
9,Name the mascot of Austin College ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/Kangaroo],Austin College
13,Name the origin of Henry Cluney ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,"[http://dbpedia.org/resource/Northern_Ireland,...",Henry Cluney
19,Name the resting place of Charles Edward Magoon ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,"[http://dbpedia.org/resource/Lincoln,_Nebraska...",Charles Edward Magoon
...,...,...,...,...
3966,What is the political party to which Purnima B...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/Indian_National_C...,Purnima Banerjee
3972,What is the currency of Republic of Montenegro...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],Montenegro
3974,What is the club of Perianne Jones ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[Nakkertok Ski Club],Perianne Jones
3994,List all the mmebers of Mekong River Commission?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,"[http://dbpedia.org/resource/Cambodia, http://...",Mekong River Commission


In [117]:
# entities = []
# for _, row in questions.iterrows():
#     # entity extraction and dbpedia linking
#     print(row['corrected_question'])
#     # TODO: fix the error
#     tmp = nlp(row['corrected_question'])
#     # check results
#     if len(tmp.ents) == 0:
#         ent = 'NotFound'
#     else:
#         # TODO: i'm not sure on what happens if there are more than one ents :/
#         ent = tmp.ents[0].text

# entities.append(ent)

In [123]:
# save questions
questions.to_csv('../data/questions.csv')

In [133]:
# check inconsistent records
inconsistent = questions[(questions['entity'].isin(['NotFound', 'SpacyError'])) | (len(questions['answer']) == 0)]
print('Invalid records: ', len(inconsistent))
inconsistent

Invalid records:  30


Unnamed: 0,corrected_question,sparql_query,answer,entity
99,Where is the swedish covenant hospital?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],NotFound
143,Which driver came first in the 1993 European G...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/Ayrton_Senna],SpacyError
220,Who built the Ford Y-block engine ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/Ford_Motor_Company],SpacyError
270,What is Bob Adams (American football) known for?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[http://dbpedia.org/resource/National_Football...,SpacyError
366,which politician is currently ruling over Rish...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],NotFound
380,Name the monarch of L Buwei ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],NotFound
519,Who is the writer of He's a Pirate ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,"[http://dbpedia.org/resource/Hans_Zimmer, Klau...",NotFound
596,Who created the world series of Poker?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,"[http://dbpedia.org/resource/United_States, ht...",NotFound
784,which person created the women in the garden?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],NotFound
790,List the main ingredient of Pizza-ghetti ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,[],NotFound


# QA on wikipedia

In [None]:
# load data to avoid executing the blocks above
questions = pd.read_csv('../data/questions.csv', index_col = 0)
questions.head(5)

In [164]:
# TODO: fix return values
def search_answer(question, entity, debug = False):
    # get wikipedia page
    p = wiki.page(entity)
    
    # iterate on sections and find answer in each span
    max_score = 0
    best_answer = None
    for section in p.sections:

        # TODO: how to identify a smaller span? (check if it's necessary...)
        # if subject in section.text: # look only in relevant sections???
        
        # exlude last section
        if section.title != 'External links':
            if debug: print(section.title)
            context = section.text
            # exlude empty sections (i.e. macro section intestation)
            if context != '':
                answer = qa(question=question, context=context)
                if answer['score'] > max_score:
                    max_score = answer['score']
                    best_answer = answer
                if debug: print(f"Answer: '{answer['answer']}' with score {answer['score']}", '\n', 10*'#')
                

    # print best answer
    if best_answer is not None:  
        if debug: print(f"BEST ANSWER: '{best_answer['answer']}' with score {best_answer['score']}")
    else:
        best_answer = 'Sorry, answer not found!'
        if debug: print(best_answer)
    return best_answer

In [165]:
# test a question
example = questions.iloc[110]

print('# QUESTION:\n', example['corrected_question'])
print('# SPARQL:\n', example['sparql_query'])
print('# POSSIBLE ANSWERS:\n', example['answer'])
print('_'*30)

search_answer(example['corrected_question'], example['entity'], debug = True)

# QUESTION:
 What are the houses of Parliament Security Services ?
# SPARQL:
  SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Parliament_Security_Services> <http://dbpedia.org/property/houses> ?uri } 
# POSSIBLE ANSWERS:
 ['http://dbpedia.org/resource/Rajya_Sabha', 'http://dbpedia.org/resource/Lok_Sabha', '']
______________________________


'Sorry, answer not found!'

In [166]:
answers = []
for index, row in questions.iterrows():
    answers.append(search_answer(row['corrected_question'], row['entity']))


In [169]:
answers

[{'score': 0.9133368134498596,
  'start': 53,
  'end': 70,
  'answer': 'political leaders'},
 {'score': 0.9734827876091003,
  'start': 71,
  'end': 83,
  'answer': 'Sierra Leone'},
 {'score': 0.9678401350975037,
  'start': 1438,
  'end': 1451,
  'answer': 'David Peeples'},
 'Sorry, answer not found!',
 {'score': 0.80088871717453, 'start': 36, 'end': 40, 'answer': 'Cuba'},
 {'score': 0.9982991218566895, 'start': 693, 'end': 701, 'answer': 'He Kexin'},
 {'score': 0.614020049571991,
  'start': 1726,
  'end': 1753,
  'answer': 'Chief Justice John Marshall'},
 {'score': 0.5060786604881287,
  'start': 14,
  'end': 55,
  'answer': 'Recipient of the Grand Order of Mugunghwa'},
 'Sorry, answer not found!',
 {'score': 0.8656526803970337, 'start': 607, 'end': 614, 'answer': 'Chinese'},
 {'score': 0.1327703446149826,
  'start': 192,
  'end': 225,
  'answer': 'Everything\nTelevision Supervision'},
 {'score': 0.9831961393356323,
  'start': 479,
  'end': 491,
  'answer': 'Selena Gomez'},
 {'score': 0

In [184]:
answers_text = [item['answer'] if type(item) != str else item for item in answers]

In [188]:
# store answers
questions['wiki_answer'] =  answers_text

In [190]:
questions.to_csv('../data/questions_answered.csv')