In [21]:
from transformers import pipeline
import wikipediaapi
import pandas as pd
import sparql
import spacy_dbpedia_spotlight
import spacy


# Load model

In [2]:

# Build the question-answering pipeline: given a question and a context
# passage, it extracts an answer from that context.
# No model is named here, so the library's default QA checkpoint is used
# (pretrained on SQuAD, per the original note); pass `model=...` to pin one.
qa = pipeline("question-answering")


# Model test

In [3]:
# Sample question and context passage used to smoke-test the QA pipeline
question = "What is the capital of the Netherlands?"
context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore."

In [4]:
# Run the QA pipeline on the sample question/context; the result is read
# below through its 'answer' and 'score' keys
answer = qa(question=question, context=context)

In [5]:
# Show the inputs next to the extracted answer and the score the model gave it
print(f"Context: {context}")
print(f"Question: {question}")
print(f"Answer: '{answer['answer']}' with score {answer['score']}")

Context: The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore.
Question: What is the capital of the Netherlands?
Answer: 'Amsterdam' with score 0.3774993121623993


# Wikipedia test

In [6]:
# Client for the English Wikipedia (language='en') using the WIKI extract format
wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

# Page handle for the article titled "John Kotelawala".
# NOTE: `p` is reassigned later, in the "Search on wikipedia" section.
p = wiki.page("John Kotelawala")


In [7]:
# Fetch the page's text and print it for inspection
text = p.text
print(text)

General Sir John Lionel Kotelawala  (Sinhala: ශ්‍රිමත් ජෝන් ලයනල් කොතලාවල; 4 April 1895 – 2 October 1980) was a Sri Lankan statesman, most notable as the 3rd Prime Minister of Ceylon (Sri Lanka) from 1953 to 1956.
Born to a wealthy landholding and mining family, Kotelawala had a difficult childhood with the suicide of his father and financial difficulties that followed. He was educated at Royal College, Colombo and Christ's College, Cambridge before returning to become a planter and run the family estates and mines. Being from a politically active family, he entered active politics at the age of 35 years having been elected to the State Council of Ceylon. He later served as Minister of Communications and Works in the Second Board of Ministers of Ceylon. With Ceylon gaining independence in 1945, he was elected to Parliament and became a member of the first Cabinet as Minister of Transport and Works. Overlooked for the post of Prime Minister when the first Prime Minister of Ceylon, D. S.

In [8]:
# print the page text split into sections
def print_sections(sections, level=0):
    """Recursively print the title and text of every section.

    Each heading line starts with ``level + 1`` asterisks; the subsections of
    a section are printed right after it, one level deeper.
    """
    stars = "*" * (level + 1)
    for section in sections:
        print(f"{stars}: {section.title} - \n{section.text}")
        print_sections(section.sections, level + 1)

         
# Dump every section of the page currently held in `p`, starting at the top level
print_sections(p.sections)

*: Early life and education - 
Kotelawala was born on 4 April 1895 to John Kotelawala Snr, a police inspector, who later turned businessman and Alice Elisabeth Kotalawala (née Attygalle), daughter of Mudaliyar Don Charles Gemoris Attygalle, a wealthy land and mine owner. He had a younger brother Justin Kotalawela and a sister Freda, who later married C. V. S. Corea.

The Kotelawalas lived in considerable comfort owing to the considerable land and mine holdings of his grandfather Mudaliyar Attygalle, which his father managed following the death of his grandfather. After he was forced out of the management of the Attygalle estates by the family, Kotelawala Snr started his own business ventures including the Ceylon-Japan Trading Company. In 1907, he was arrested and found guilty of conspiring to murder his brother-in-law, Francis Attygalle. While the murder trail was underway, Kotelawala Snr committed suicide by poisioning himself.Kotelawala was eleven years old when his father died and w

# SPARQL test

In [9]:
# Query the DBpedia SPARQL endpoint for the objects of
# <Doug_Acomb> <playedFor> ?uri
q = 'SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Doug_Acomb> <http://dbpedia.org/property/playedFor> ?uri } '
result = sparql.query('http://dbpedia.org/sparql', q)
# Print each result row after unpacking it with sparql.unpack_row
for row in result:
    print(sparql.unpack_row(row))

['http://dbpedia.org/resource/Toronto_Maple_Leafs']


In [45]:
def get_answers(query, endpoint='http://dbpedia.org/sparql'):
    """Run a SPARQL query and return the first column of every result row.

    Args:
        query: SPARQL query string.
        endpoint: URL of the SPARQL endpoint to query. Defaults to the DBpedia
            endpoint that used to be hard-coded here, so existing calls that
            pass only `query` behave as before.

    Returns:
        list: the first unpacked value of each result row, in result order.
        Only the first column is kept, so this is meant for queries that
        select a single variable.
    """
    results = sparql.query(endpoint, query)
    return [sparql.unpack_row(row)[0] for row in results]
# Quick check with the Doug Acomb query string `q` defined in the previous cell
get_answers(q)

['http://dbpedia.org/resource/Toronto_Maple_Leafs']

# LC-QuAD

In [11]:
# Load the question/SPARQL dataset (train file) into a DataFrame; the path is
# relative, so it depends on the notebook's working directory
data = pd.read_json('../data/train-data.json')

In [12]:
# Preview the first five rows
data.head(5)

Unnamed: 0,_id,corrected_question,intermediary_question,sparql_query,sparql_template_id
0,1501,How many movies did Stanley Kubrick direct?,How many <movies> are there whose <director> i...,SELECT DISTINCT COUNT(?uri) WHERE {?uri <http:...,101
1,2586,Which city's foundeer is John Forbes?,What <city>'s <founded by> is <John Forbes (Br...,SELECT DISTINCT ?uri WHERE {?uri <http://dbped...,301
2,2653,What is the river whose mouth is in deadsea?,What is the <river> whose <river mouth> is <De...,SELECT DISTINCT ?uri WHERE {?uri <http://dbped...,301
3,1055,What is the allegiance of John Kotelawala ?,What is the <allegiance> of John Kotelawala ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,2
4,705,How many races have the horses bred by Jacques...,What is the total number of <race> of the <rac...,SELECT DISTINCT COUNT(?uri) WHERE { ?x <http:/...,105


In [13]:
# Keep only the rows with sparql_template_id == 2 and the two columns used
# below (presumably single-triple lookups, judging by the queries shown — TODO confirm)
questions = data[data.sparql_template_id == 2][['corrected_question', 'sparql_query']]
questions

Unnamed: 0,corrected_question,sparql_query
3,What is the allegiance of John Kotelawala ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
6,What is the region of Tom Perriello ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
9,Name the mascot of Austin College ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
13,Name the origin of Henry Cluney ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
19,Name the resting place of Charles Edward Magoon ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
...,...,...
3966,What is the political party to which Purnima B...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
3972,What is the currency of Republic of Montenegro...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
3974,What is the club of Perianne Jones ?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....
3994,List all the mmebers of Mekong River Commission?,SELECT DISTINCT ?uri WHERE { <http://dbpedia....


In [14]:
# Row label (DataFrame index value) of the example question to work through.
# The original note listed "3, 9" here — presumably other labels tried earlier.
# Named `question_id` rather than `id` so the built-in id() is not shadowed
# for the rest of the kernel session.
question_id = 3999
# Single .loc[row, column] lookups instead of chained indexing
q_ex = questions.loc[question_id, 'corrected_question']
sparql_ex = questions.loc[question_id, 'sparql_query']
print('QUESTION:', q_ex)
print('SPARQL:', sparql_ex)
# Reference answers obtained by running the dataset's own SPARQL query
print('POSSIBLE ANSWERS: ', get_answers(sparql_ex))

QUESTION: List the team for which Doug Acomb played ?
SPARQL:  SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Doug_Acomb> <http://dbpedia.org/property/playedFor> ?uri } 
POSSIBLE ANSWERS:  [['http://dbpedia.org/resource/Toronto_Maple_Leafs']]


## Named entity linking (NEL)

In [22]:
# Load an existing spaCy pipeline (any installed model would do)
nlp = spacy.load('en_core_web_lg')
# Add the DBpedia Spotlight entity-linking stage to the pipeline
nlp.add_pipe('dbpedia_spotlight')
# List the pipeline stages; the added stage appears at the end
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'dbpedia_spotlight']


In [32]:
# Run the pipeline on the example question and list (text, label, KB id)
# for every entity it found
doc = nlp(q_ex)
print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])

Entities [('Doug Acomb', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/Doug_Acomb')]


In [36]:
# Surface text of the first entity found in the question.
# NOTE: this indexing fails if the question produced no entities.
ent = doc.ents[0].text

## Search on Wikipedia

In [38]:
# Alternative example kept for reference — row label 9 in `questions`
# ("Name the mascot of Austin College ?"):
# resource = 'Austin College'
# subject = 'mascot'

# Current example — row label 3999
# Use the linked entity's text as the Wikipedia page title
resource = ent
# Keyword for the relation being asked about; not read by any active code
subject = 'team'
# NOTE: rebinds `p`, which earlier held the "John Kotelawala" page
p = wiki.page(resource)

In [39]:
def _iter_sections(sections, skip_titles=('External links',)):
    """Yield sections depth-first, pruning any subtree whose title is in skip_titles."""
    for sec in sections:
        if sec.title in skip_titles:
            continue
        yield sec
        yield from _iter_sections(sec.sections, skip_titles)


max_score = 0
best_answer = None
# Ask the question against every section of the page, subsections included.
# A parent section can have no text of its own (it is only the heading of its
# subsections); looping over `p.sections` alone would never reach those
# subsections, so the whole section tree is walked instead.
# TODO: optionally restrict the search to sections that mention `subject`.
for section in _iter_sections(p.sections):
    print(section.title)
    context = section.text
    # Nothing to search in a section that has no text of its own
    if context == '':
        continue
    answer = qa(question=q_ex, context=context)
    # Strict '>' keeps the earliest section when scores tie
    if answer['score'] > max_score:
        max_score = answer['score']
        best_answer = answer
    print(f"Answer: '{answer['answer']}' with score {answer['score']}")
    print('#'*10)

# Report the highest-scoring answer, if any section produced one
if best_answer is not None:
    print(f"BEST ANSWER: '{best_answer['answer']}' with score {best_answer['score']}")
else:
    print('Sorry, answer not found!')

Playing career
Answer: 'Toronto Maple Leafs' with score 0.7962615489959717
##########
Post-retirement
Answer: 'hockey' with score 0.14017722010612488
##########
Career statistics
Answer: 'Port Huron' with score 0.2455081343650818
##########
BEST ANSWER: 'Toronto Maple Leafs' with score 0.7962615489959717
