In [2]:
import pandas as pd
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

#### Displaying progress
tqdm is a progress bar. It will come usefeful when we extract the answers for all the articles.

In [4]:
from tqdm import tqdm

In [5]:
import time

for i in tqdm(range(10)):
    time.sleep(0.5)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.95it/s]


In [6]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

In [8]:
train = pd.read_json('../data/train-v1.1.json', orient='column')
dev = pd.read_json('../data/dev-v1.1.json', orient='column')

df = pd.concat([train, dev], ignore_index=True)

In [9]:
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [10]:
currText = df['data'][0]['paragraphs'][0]['context']
currQas = df['data'][0]['paragraphs'][0]['qas']

In [11]:
currDoc = nlp(currText)

In [12]:
#Extract answers and the sentence they are in
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers

In [13]:
currAnswers = extractAnswers(currQas, currDoc)
currAnswers

[{'sentenceId': 1, 'text': 'a golden statue of the Virgin Mary'},
 {'sentenceId': 2, 'text': 'a copper statue of Christ'},
 {'sentenceId': 3, 'text': 'the Main Building'},
 {'sentenceId': 4, 'text': 'a Marian place of prayer and reflection'},
 {'sentenceId': 5, 'text': 'Saint Bernadette Soubirous'}]

In [14]:
#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False

In [15]:
tokenIsAnswer('the Main Building', 4, currAnswers)

False

In [16]:
#Save named entities start points

def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

In [17]:
currNeStarts = getNEStartIndexs(currDoc)

if 6 in currNeStarts:
    print(currNeStarts[6].label_)

NORP


In [18]:
def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1

In [19]:
senStarts = getSentenceStartIndexes(currDoc)
senStarts

[0, 9, 25, 55, 68, 84, 108]

In [20]:
getSentenceForWordPosition(108, senStarts)

In [21]:
#Creating the dataframe
wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
wordDf = pd.DataFrame(columns=wordColums)

newWord = ['koala', True, 0, 0, 4, 1, None, None, None, None, 'xxxxx']
newWords = []

newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape


In [22]:
def addWordsForParagrapgh(newWords, titleId, paragraphId):
    text = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

    doc = nlp(text)

    answers = extractAnswers(qas, doc)
    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            tokenIsAnswer(word.text, currentSentence, answers),
                            titleId,
                            paragraphId,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                tokenIsAnswer(word.text, currentSentence, answers),
                                titleId,
                                paragraphId,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1


In [23]:
newWords

[]

In [24]:
addWordsForParagrapgh(newWords, 0, 0)

In [25]:
newWords[0]

['Architecturally', False, 0, 0, 0, 1, None, 'ADV', 'RB', 'advmod', 'Xxxxx']

In [26]:
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [27]:
newWordsDf[newWordsDf['isAnswer'] == True].head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
21,the Main Building,True,0,0,3.0,3,FAC,,,,xxx Xxxx Xxxxx
37,Saint Bernadette Soubirous,True,0,0,5.0,3,WORK_OF_ART,,,,Xxxxx Xxxxx Xxxxx


Generating a words for 2 titles

In [28]:
words = []

#titlesCount = len(df['data'])
titlesCount = 2

for titleId in tqdm(range(titlesCount)):
    paragraphsCount = len(df['data'][titleId]['paragraphs'])
        
    for paragraphId in range(paragraphsCount):
        addWordsForParagrapgh(words, titleId, paragraphId)
        

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.91s/it]


In [29]:
wordsDf = pd.DataFrame(words, columns=wordColums)
wordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape
0,Architecturally,False,0,0,0.0,1,,ADV,RB,advmod,Xxxxx
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx


In [30]:
print("Total words for 2 articles:", len(wordsDf))

Total words for 2 articles: 8611


In [33]:
wordPickleName = '../data/wordsDf.pkl'

#If the dataframe is already generated, load it.
if (pickleExists(wordPickleName)):
    print("Pickle found. Saved some time.")
    wordsDf = loadPickle(wordPickleName)
else:
    #Extracting words
    words = []

#     titlesCount = len(df['data'])   
    titlesCount = 2   

    for titleId in tqdm(range(titlesCount)):
        paragraphsCount = len(df['data'][titleId]['paragraphs'])

#         printProgress(titleId, titlesCount - 1)

        for paragraphId in range(paragraphsCount):
            addWordsForParagrapgh(words, titleId, paragraphId)
    
    #Create the dataframe
    wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    wordsDf = pd.DataFrame(words, columns=wordColums)
    
    #Pickle the result
    dumpPickle(wordPickleName, wordsDf)
    print("Result was not pickled. You had to wait.")

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.59s/it]

Result was not pickled. You had to wait.





Total extracted words:

In [34]:
print("Total words for all articles:", len(wordsDf))

Total words for all articles: 8611


Check what percentage of the extracted words are answers in the dataframe. They should be pretty low

In [35]:
totalAnswers = len(wordsDf[wordsDf['isAnswer'] == True])
print(totalAnswers, 'total answers', '{:.2f}%'.format((totalAnswers / len(wordsDf)) * 100), 'of all words are answers.')

389 total answers 4.52% of all words are answers.
