In [None]:
from transformers import pipeline
import wikipediaapi
import pandas as pd
import sparql

# Load model

In [None]:

# Build the question-answering pipeline used throughout the notebook: it is
# called with a question and a context and yields an answer with a score.
# No model is named here, so the pipeline's default is used (per the original
# note, one pretrained on SQuAD); a specific model can be specified instead.
qa = pipeline("question-answering")


# Model test

In [None]:
# Sample question plus an inline context passage used to try out the pipeline
# (the passage still carries [17]-style citation markers from its source text).
question = "What is the capital of the Netherlands?"
context = r"The four largest cities in the Netherlands are Amsterdam, Rotterdam, The Hague and Utrecht.[17] Amsterdam is the country's most populous city and nominal capital,[18] while The Hague holds the seat of the States General, Cabinet and Supreme Court.[19] The Port of Rotterdam is the busiest seaport in Europe, and the busiest in any country outside East Asia and Southeast Asia, behind only China and Singapore."

In [None]:
# Run the QA pipeline on the sample question/context; the result is read
# afterwards through its 'answer' and 'score' keys.
answer = qa(question=question, context=context)

In [None]:
# Show the inputs next to the predicted answer and its score, one per line
print(
    f"Context: {context}",
    f"Question: {question}",
    f"Answer: '{answer['answer']}' with score {answer['score']}",
    sep="\n",
)

# Wikipedia test

In [None]:
# Wikipedia client for English-language pages, using the WIKI extract format.
# NOTE(review): recent wikipedia-api releases appear to require a `user_agent`
# argument in this constructor — confirm against the installed version.
wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

# Page object for the article used in the Wikipedia test cells
p = wiki.page("John Kotelawala")


In [None]:
# Read the page's text and print it in full (output can be long)
text = p.text
print(text)

In [None]:
# Print the page text split into sections
def print_sections(sections, level=0):
    """Recursively print each section's title and text, depth-first.

    Every heading is prefixed with ``level + 1`` asterisks, so nested
    sections can be told apart from their parents in the output.

    Args:
        sections: iterable of section objects exposing ``title``, ``text``
            and ``sections`` (their nested sub-sections).
        level: nesting depth of ``sections``; 0 for top-level sections.
    """
    for section in sections:
        heading_marker = "*" * (level + 1)
        print("{}: {} - \n{}".format(heading_marker, section.title, section.text))
        # Descend into the children before moving on to the next sibling
        print_sections(section.sections, level + 1)

         
# Dump every section (and nested sub-section) of the current page `p`
print_sections(p.sections)

# SPARQL test

In [None]:
# Build the query: every ?uri linked to Doug_Acomb through dbp:playedFor
q = (
    'SELECT DISTINCT ?uri WHERE { '
    '<http://dbpedia.org/resource/Doug_Acomb> '
    '<http://dbpedia.org/property/playedFor> '
    '?uri } '
)
# Send it to the DBpedia endpoint
result = sparql.query('http://dbpedia.org/sparql', q)
# Print each result row after unpacking it
for row in result:
    print(sparql.unpack_row(row))

In [None]:
def get_answers(query, endpoint='http://dbpedia.org/sparql'):
    """Run a SPARQL query and return its result rows as unpacked values.

    Args:
        query: SPARQL query string to execute.
        endpoint: URL of the SPARQL endpoint to send the query to. Defaults
            to the DBpedia endpoint that was previously hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        A list with one entry per result row, each produced by
        ``sparql.unpack_row``; empty when the query yields no rows.
    """
    results = sparql.query(endpoint, query)
    return [sparql.unpack_row(row) for row in results]


# LC-QuAD

In [None]:
# Load the question dataset from the local JSON file into a DataFrame
# (path is relative to the notebook's working directory)
data = pd.read_json('./data/train-data.json')

In [None]:
# Preview the first five rows to inspect the available columns
data.head(5)

In [None]:
# Keep only the rows whose sparql_template_id is 2, and only the
# natural-language question and SPARQL query columns
questions = data.loc[
    data.sparql_template_id == 2,
    ['corrected_question', 'sparql_query'],
]
questions

In [None]:
# Example to inspect, selected by index label
# (author's note lists 3 and 9, presumably other ids of interest)
id = 3999  # NOTE: this name shadows the builtin `id` in the notebook namespace
q_ex = questions.loc[id, 'corrected_question']
sparql_ex = questions.loc[id, 'sparql_query']

# Show the question, its SPARQL query, and the answers the query returns
print('QUESTION:', q_ex)
print('SPARQL:', sparql_ex)
print('POSSIBLE ANSWERS: ', get_answers(sparql_ex))

## Search on Wikipedia

In [None]:
# Wikipedia article to search for the example question, plus a keyword for it.
# Alternative example (disabled):
# id: 3
# resource = 'Austin College'
# subject = 'mascot'

# Currently selected example:
# id: 3999
resource = 'Doug Acomb'
subject = 'team'  # in the visible code, only referenced by a commented-out filter in the search loop
p = wiki.page(resource)  # rebinds `p`, replacing the page used in the Wikipedia test

In [None]:
def _iter_sections(sections, skip_titles=('External links',)):
    """Yield every section depth-first, descending into nested sub-sections.

    A section whose title is in ``skip_titles`` is dropped together with
    everything nested under it.
    """
    for section in sections:
        if section.title in skip_titles:
            continue
        yield section
        yield from _iter_sections(section.sections, skip_titles)


max_score = 0
best_answer = None
# Ask the question against each section's own text and keep the best-scoring answer.
# Nested sub-sections are visited too: a section that only acts as a heading has
# no text of its own, so stopping at the top level would never look at the text
# held by its children.
# TODO: optionally restrict the search to sections whose text mentions `subject`.
for section in _iter_sections(p.sections):
    print(section.title)
    context = section.text
    # Skip sections with no usable text (heading-only or whitespace-only)
    if not context.strip():
        continue
    answer = qa(question=q_ex, context=context)
    if answer['score'] > max_score:
        max_score = answer['score']
        best_answer = answer
    print(f"Answer: '{answer['answer']}' with score {answer['score']}")
    print('#'*10)

# Report the best answer, if any section produced one
if best_answer is not None:
    print(f"BEST ANSWER: '{best_answer['answer']}' with score {best_answer['score']}")
else:
    print('Sorry, answer not found!')