In [4]:
import json
import os
import pandas
import pickle
import spacy
import importlib
import re
from collections import defaultdict
from nltk import sent_tokenize

import data_utils
importlib.reload(data_utils)

# Show full cell contents when previewing the (long) paragraph/sentence columns.
# `None` means "no truncation"; the older sentinel -1 is deprecated since pandas 1.0
# and rejected by recent pandas versions.
pandas.set_option('display.max_colwidth', None)
pandas.set_option('display.max_rows', 500)


## Create SQuAD dataset with annotated answers in paragraphs and answer sents

In [24]:
def make_squad_qg_dataset(squad_data):
    """Return SQuAD text-question items with answer annotations in paragraphs/answer sentences.

    For every question, the longest of its reference answers (the first one listed
    on ties) is located in the paragraph, and "<ANSWER> " / " </ANSWER>" markers are
    inserted around that span, both in the sentence containing it and in the
    paragraph as a whole.

    Args:
        squad_data: Parsed SQuAD JSON: a dict whose 'data' entry is a list of
            articles. Each article has 'paragraphs'; each paragraph has a 'context'
            string and a list of 'qas'; each qa has a 'question' string and a list
            of 'answers' dicts with 'text' and 'answer_start' (character offset
            into the context).

    Returns:
        A dict of equal-length lists, one entry per annotated question:
            'article_id':  index of the article within squad_data['data']
            'paragraph':   paragraph text with the answer span marked
            'answer_sent': the sentence containing the answer, span marked
            'question':    question text
            'answer':      the distinct reference answers joined by tabs, in
                           order of first occurrence
        "\\n" characters are removed from every text field.
        Questions without any reference answer are skipped.

    Raises:
        ValueError: if the sentences returned by data_utils.segment_sents do not
            end exactly at the end of the context, or if an answer span cannot be
            located inside the segmented context.
    """
    answer_open = "<ANSWER> "
    answer_close = " </ANSWER>"
    wrangled_squad_data = {'article_id': [],
                           'paragraph': [],
                           'answer_sent': [],
                           'question': [],
                           'answer': [],
                          }

    for article_idx, article in enumerate(squad_data['data']):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            paragraph_sents = data_utils.segment_sents(context)
            # Both lists are mutated below when sentences get merged; the merge is
            # kept for the remaining questions of this paragraph.
            sents = paragraph_sents['sents']
            sent_starts = paragraph_sents['sent_char_start_idxs']

            for qa in paragraph['qas']:
                all_answers = [answer['text'] for answer in qa['answers']]
                if not all_answers:
                    # Nothing to annotate for a question without reference answers
                    continue
                # In case of multiple answers for a given question, just annotate longest answer span
                longest_answer_idx = max(range(len(all_answers)),
                                         key=lambda idx: len(all_answers[idx]))
                answer = qa['answers'][longest_answer_idx]
                answer_char_start = answer['answer_start']
                answer_char_end = answer_char_start + len(answer['text'])

                for sent_idx in range(len(sents)):
                    sent_char_start = sent_starts[sent_idx]
                    if sent_idx + 1 == len(sents):
                        sent_char_end = sent_char_start + len(sents[sent_idx])
                        if sent_char_end != len(context):
                            raise ValueError(
                                "Sentence segmentation does not cover the paragraph: last "
                                "sentence ends at {} but the context has {} characters".format(
                                    sent_char_end, len(context)))
                    else:
                        sent_char_end = sent_starts[sent_idx + 1]
                    if not sent_char_start <= answer_char_start < sent_char_end:
                        continue
                    while answer_char_end > sent_char_end:
                        # Answer spans multiple sentences, which is probably a segmentation failure;
                        # just append next sentence to this one as answer sentence
                        if sent_idx + 1 >= len(sents):
                            raise ValueError(
                                "Answer span [{}, {}) runs past the end of the context "
                                "({} characters)".format(
                                    answer_char_start, answer_char_end, len(context)))
                        sents[sent_idx] = sents[sent_idx] + sents[sent_idx + 1]
                        del sents[sent_idx + 1]
                        del sent_starts[sent_idx + 1]
                        if sent_idx + 1 == len(sents):
                            sent_char_end = len(context)
                        else:
                            sent_char_end = sent_starts[sent_idx + 1]
                    break
                else:
                    raise ValueError(
                        "Answer start offset {} is not inside any sentence of the "
                        "context".format(answer_char_start))

                # Insert answer markers into the sentence (offsets relative to sentence start)
                plain_sent = sents[sent_idx]
                start_in_sent = answer_char_start - sent_char_start
                end_in_sent = answer_char_end - sent_char_start
                answer_sent_text = (plain_sent[:start_in_sent] + answer_open
                                    + plain_sent[start_in_sent:end_in_sent]
                                    + answer_close + plain_sent[end_in_sent:])
                paragraph_text = "".join(sents[:sent_idx] + [answer_sent_text]
                                         + sents[sent_idx + 1:])

                wrangled_squad_data['article_id'].append(article_idx)
                # For some reason, there are line breaks in a few of the texts in the test set, be sure to replace
                wrangled_squad_data['paragraph'].append(paragraph_text.replace("\n", ""))
                wrangled_squad_data['answer_sent'].append(answer_sent_text.replace("\n", ""))
                wrangled_squad_data['question'].append(qa['question'].replace("\n", ""))
                # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
                # output is identical from run to run (set ordering of strings is not)
                wrangled_squad_data['answer'].append(
                    "\t".join(text.replace("\n", "") for text in dict.fromkeys(all_answers)))
        # Lightweight progress indicator: every 10th article
        if article_idx and article_idx % 10 == 0:
            print(article_idx)
    return wrangled_squad_data

In [25]:
'''Load original SQuAD data, selecting partition (train or test)'''

partition = 'test'
assert partition in ('train', 'test')
# The held-out 'test' partition is read from SQuAD's "dev" file.
squad_split = 'dev' if partition == 'test' else 'train'
# The texts contain non-ASCII characters (e.g. en dashes), so decode explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open("/home/mroemmele/CoreNLP/others/reading_comprehension/v1/{}-v1.1.json".format(squad_split),
          encoding="utf-8") as f:
    squad_data = json.load(f)

In [31]:
#pandas.DataFrame(qg_data)[:10]

In [26]:
'''Make the dataset'''

qg_data = make_squad_qg_dataset(squad_data)
# Preview only the last 100 items rather than dumping the whole frame
pandas.DataFrame(qg_data).tail(100)

10
20
30
40


Unnamed: 0,article_id,paragraph,answer_sent,question,answer
10470,47,"The development of fundamental theories for forces proceeded along the lines of unification of disparate ideas. For example, Isaac Newton unified the force responsible for objects falling at the surface of the Earth with the force responsible for the orbits of celestial mechanics in his universal theory of gravitation. Michael Faraday and James Clerk Maxwell demonstrated that electric and magnetic forces were unified through one consistent theory of electromagnetism. In the 20th century, the development of quantum mechanics led to a modern understanding that the first three fundamental forces (all except gravity) are manifestations of matter (fermions) interacting by exchanging virtual particles called gauge bosons. This standard model of particle physics posits a similarity between the forces and led scientists to predict the unification of the weak and electromagnetic forces in electroweak theory subsequently confirmed by observation. The complete formulation of the standard model predicts an as yet unobserved Higgs mechanism, but observations such as neutrino oscillations indicate that the standard model is incomplete. A Grand Unified Theory allowing for the combination of the electroweak interaction with the strong force is held out as a possibility with candidate theories such as supersymmetry proposed to accommodate some of the outstanding unsolved problems in physics. Physicists are still attempting to develop <ANSWER> self-consistent unification models that would combine all four fundamental interactions </ANSWER> into a theory of everything. 
Einstein tried and failed at this endeavor, but currently the most popular approach to answering this question is string theory.:212–219",Physicists are still attempting to develop <ANSWER> self-consistent unification models that would combine all four fundamental interactions </ANSWER> into a theory of everything.,What type of physics model did Einstein fail to make?,self-consistent unification models that would combine all four fundamental interactions\tself-consistent unification\tself-consistent unification models
10471,47,"What we now call gravity was not identified as a universal force until the work of <ANSWER> Isaac Newton </ANSWER>. Before Newton, the tendency for objects to fall towards the Earth was not understood to be related to the motions of celestial objects. Galileo was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object. Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of about 9.81 meters per second squared (this measurement is taken from sea level and may vary depending on location), and points toward the center of the Earth. This observation means that the force of gravity on an object at the Earth's surface is directly proportional to the object's mass. Thus an object that has a mass of will experience a force:",What we now call gravity was not identified as a universal force until the work of <ANSWER> Isaac Newton </ANSWER>.,Who identified gravity as a force?,Isaac Newton
10472,47,"What we now call gravity was not identified as a universal force until the work of Isaac Newton. Before Newton, the tendency for objects to fall towards the Earth was not understood to be related to the motions of celestial objects. <ANSWER> Galileo </ANSWER> was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object. Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of about 9.81 meters per second squared (this measurement is taken from sea level and may vary depending on location), and points toward the center of the Earth. This observation means that the force of gravity on an object at the Earth's surface is directly proportional to the object's mass. Thus an object that has a mass of will experience a force:",<ANSWER> Galileo </ANSWER> was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object.,Who came up with the concept that falling objects fell at the same speed regardless of weight?,Galileo
10473,47,"What we now call gravity was not identified as a universal force until the work of Isaac Newton. Before Newton, the tendency for objects to fall towards the Earth was not understood to be related to the motions of celestial objects. Galileo was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object. Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of <ANSWER> about 9.81 meters per second squared </ANSWER> (this measurement is taken from sea level and may vary depending on location), and points toward the center of the Earth. This observation means that the force of gravity on an object at the Earth's surface is directly proportional to the object's mass. Thus an object that has a mass of will experience a force:","Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of <ANSWER> about 9.81 meters per second squared </ANSWER> (this measurement is taken from sea level and may vary depending on location), and points toward the center of the Earth.",How fast do objects fall on Earth?,about 9.81 meters per second\tabout 9.81 meters per second squared\t9.81 meters per second
10474,47,"What we now call gravity was not identified as a universal force until the work of Isaac Newton. Before Newton, the tendency for objects to fall towards the Earth was not understood to be related to the motions of celestial objects. Galileo was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object. Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of about 9.81 meters per second squared (this measurement is taken <ANSWER> from sea level </ANSWER> and may vary depending on location), and points toward the center of the Earth. This observation means that the force of gravity on an object at the Earth's surface is directly proportional to the object's mass. Thus an object that has a mass of will experience a force:","Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of about 9.81 meters per second squared (this measurement is taken <ANSWER> from sea level </ANSWER> and may vary depending on location), and points toward the center of the Earth.",Where was the measurment for the standard gravity on Earth taken?,from sea level\tsea level
10475,47,"What we now call gravity was not identified as a universal force until the work of Isaac Newton. Before Newton, the tendency for objects to fall towards the Earth was not understood to be related to the motions of celestial objects. Galileo was instrumental in describing the characteristics of falling objects by determining that the acceleration of every object in free-fall was constant and independent of the mass of the object. Today, this acceleration due to gravity towards the surface of the Earth is usually designated as and has a magnitude of about 9.81 meters per second squared (this measurement is taken from sea level and may vary depending on location), and points toward the center of the Earth. This observation means that <ANSWER> the force of gravity on an object </ANSWER> at the Earth's surface is directly proportional to the object's mass. Thus an object that has a mass of will experience a force:",This observation means that <ANSWER> the force of gravity on an object </ANSWER> at the Earth's surface is directly proportional to the object's mass.,What is an object's mass proportional to at the surface of the Earth?,the force of gravity on an object\tforce of gravity
10476,47,"Newton came to realize that the effects of gravity might be observed in different ways <ANSWER> at larger distances. </ANSWER> In particular, Newton determined that the acceleration of the Moon around the Earth could be ascribed to the same force of gravity if the acceleration due to gravity decreased as an inverse square law. Further, Newton realized that the acceleration due to gravity is proportional to the mass of the attracting body. Combining these ideas gives a formula that relates the mass () and the radius () of the Earth to the gravitational acceleration:",Newton came to realize that the effects of gravity might be observed in different ways <ANSWER> at larger distances. </ANSWER>,How might gravity effects be observed differently according to Newton?,at larger distances\tat larger distances.
10477,47,"Newton came to realize that the effects of gravity might be observed in different ways at larger distances. In particular, Newton determined that the acceleration of the Moon around the Earth could be ascribed to <ANSWER> the same force of gravity if the acceleration due to gravity decreased as an inverse square law. </ANSWER> Further, Newton realized that the acceleration due to gravity is proportional to the mass of the attracting body. Combining these ideas gives a formula that relates the mass () and the radius () of the Earth to the gravitational acceleration:","In particular, Newton determined that the acceleration of the Moon around the Earth could be ascribed to <ANSWER> the same force of gravity if the acceleration due to gravity decreased as an inverse square law. </ANSWER>",What could be attributed to gravity acceleration around the Earth?,the mass () and the radius () of the Earth\tthe same force of gravity if the acceleration due to gravity decreased as an inverse square law.\tthe Moon\tforce of gravity
10478,47,"Newton came to realize that the effects of gravity might be observed in different ways at larger distances. In particular, Newton determined that the acceleration of the Moon around the Earth could be ascribed to the same force of gravity if the acceleration due to gravity decreased as an inverse square law. Further, Newton realized that the acceleration due to gravity is proportional to <ANSWER> the mass of the attracting body </ANSWER>. Combining these ideas gives a formula that relates the mass () and the radius () of the Earth to the gravitational acceleration:","Further, Newton realized that the acceleration due to gravity is proportional to <ANSWER> the mass of the attracting body </ANSWER>.",What is gravitational acceleration proportional to?,the mass of the attracting body\tmass
10479,47,"Newton came to realize that the effects of gravity might be observed in different ways at larger distances. In particular, Newton determined that the acceleration of the Moon around the Earth could be ascribed to the same force of gravity if the acceleration due to gravity decreased as an inverse square law. Further, Newton realized that the acceleration due to gravity is proportional to the mass of the attracting body. Combining these ideas gives a formula that relates the mass () and <ANSWER> the radius () of the Earth </ANSWER> to the gravitational acceleration:",Combining these ideas gives a formula that relates the mass () and <ANSWER> the radius () of the Earth </ANSWER> to the gravitational acceleration:,"What is included along with gravitational acceration, and mass of the Earth in a formula about rotation about the Earth?",radius\tthe radius () of the Earth\tradius () of the Earth


In [27]:
'''Save the dataset'''
data_dir = "/home/mroemmele/question_generation/squad_untok_data/"

# makedirs also creates data_dir itself when it is missing and is a no-op when the
# partition directory already exists.
partition_dir = os.path.join(data_dir, partition)
os.makedirs(partition_dir, exist_ok=True)

# One item per line in every file; line i of each file belongs to the same item.
# Written as UTF-8 explicitly because the texts contain non-ASCII characters.
for filename, column in (('paragraphs.txt', 'paragraph'),
                         ('answer_sents.txt', 'answer_sent'),
                         ('questions.txt', 'question'),
                         ('answers_only.txt', 'answer')):
    with open(os.path.join(partition_dir, filename), 'w', encoding='utf-8') as f:
        f.write("\n".join(qg_data[column]))
    
# with open(os.path.join(data_dir, partition, 'all_answers_only.txt'), 'w') as f:
#     f.write("\n".join(qg_data['all_answers']))
    

## Create entity-masked dataset

In [3]:
def mask_entities_in_data(input_texts, questions, ignore_tokens=None):
    """Replace entity mentions in paired input texts and questions with entity tags.

    For each (input_text, question) pair, data_utils.get_tokens_to_tags is called on
    the text and question joined by a space, and its result is passed together with
    the pair to data_utils.mask_entity_tokens, which returns the masked text, the
    masked question and the mapping for that item.

    Args:
        input_texts: Iterable of input texts (e.g. answer sentences).
        questions: Iterable of questions, aligned with input_texts. Pairs are
            formed with zip, so extra items in the longer iterable are ignored.
        ignore_tokens: Tokens forwarded to data_utils.get_tokens_to_tags as its
            second argument. Defaults to ["<ANSWER>", "</ANSWER>"].

    Returns:
        dict with three aligned lists: 'input_text' (masked texts), 'question'
        (masked questions) and 'tokens_to_tags' (per-item mapping returned by
        data_utils.mask_entity_tokens).
    """
    if ignore_tokens is None:
        # Build the default per call: a list literal in the signature would be a
        # single object shared by every call and by the helper it is passed to.
        ignore_tokens = ["<ANSWER>", "</ANSWER>"]

    masked_input_texts = []
    masked_questions = []
    tokens_to_tags = []
    for input_text, question in zip(input_texts, questions):
        # Tag text and question jointly so they are covered by one mapping
        item_tokens_to_tags = data_utils.get_tokens_to_tags(input_text + " " + question,
                                                            ignore_tokens)
        (masked_input_text, masked_question,
         item_tokens_to_tags) = data_utils.mask_entity_tokens(input_text,
                                                              question,
                                                              item_tokens_to_tags)
        masked_input_texts.append(masked_input_text)
        masked_questions.append(masked_question)
        tokens_to_tags.append(item_tokens_to_tags)
    return {'input_text': masked_input_texts,
            'question': masked_questions,
            'tokens_to_tags': tokens_to_tags}


In [44]:
'''Load SQuAD data'''

data_dir = "/home/mroemmele/question_generation/squad_untok_data/"
partition = 'test'

# One item per line; line i of both files belongs to the same item.
# Files are opened in `with` blocks so the handles are closed, and decoded as UTF-8
# explicitly rather than with the platform's default encoding.
qg_data = {}
for column, filename in (('input_text', "answer_sents.txt"),
                         ('question', "questions.txt")):
    with open(os.path.join(data_dir, partition, filename), encoding='utf-8') as f:
        qg_data[column] = [line.strip() for line in f]

In [46]:
'''Apply masking'''

# NOTE(review): only the first 10 items are masked here, and masked_qg_data is what
# the save cell below writes to disk -- confirm whether this slice is a leftover
# preview limit that should be removed before producing the real dataset.
preview_input_texts = qg_data['input_text'][:10]
preview_questions = qg_data['question'][:10]
masked_qg_data = mask_entities_in_data(preview_input_texts, preview_questions)
pandas.DataFrame(masked_qg_data).tail(100)

Unnamed: 0,input_text,question,tokens_to_tags
0,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ANSWER> <ORGANIZATION_3> </ANSWER> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ORGANIZATION_6> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_2> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
1,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ANSWER> <ORGANIZATION_3> </ANSWER> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ORGANIZATION_6> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_2> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
2,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ANSWER> <ORGANIZATION_3> </ANSWER> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ORGANIZATION_6> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_2> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
3,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ORGANIZATION_3> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ANSWER> <ORGANIZATION_6> </ANSWER> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_5> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
4,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ORGANIZATION_3> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ANSWER> <ORGANIZATION_6> </ANSWER> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_5> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
5,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ORGANIZATION_3> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ANSWER> <ORGANIZATION_6> </ANSWER> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team represented the <ORGANIZATION_5> at <ORGANIZATION_7> 50?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
6,"The game was played on <DATE_1>, at <LOCATION_1> in the <LOCATION_2> at <ANSWER> <LOCATION_3>, <LOCATION_4> </ANSWER>.",Where did <EVENT_1> take place?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
7,"The game was played on <DATE_1>, at <ANSWER> <LOCATION_1> </ANSWER> in the <LOCATION_2> at <LOCATION_3>, <LOCATION_4>.",Where did <EVENT_1> take place?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
8,"The game was played on <DATE_1>, at <ANSWER> <LOCATION_1> in the <LOCATION_2> at <LOCATION_3>, <LOCATION_4>. </ANSWER>",Where did <EVENT_1> take place?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
9,The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ANSWER> <ORGANIZATION_3> </ANSWER> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ORGANIZATION_6> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.,Which <ORGANIZATION_8> team won <EVENT_1>?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>', 'Super Bowl 50': '<EVENT_1>'}"


In [47]:
'''Save the masked dataset'''

data_dir = "/home/mroemmele/question_generation/squad_entity_masked_data"

# makedirs also creates data_dir itself when it is missing and is a no-op when the
# partition directory already exists.
partition_dir = os.path.join(data_dir, partition)
os.makedirs(partition_dir, exist_ok=True)

# Text files hold one item per line; written as UTF-8 explicitly because the texts
# contain non-ASCII characters.
for filename, column in (('answer_sents.txt', 'input_text'),
                         ('questions.txt', 'question')):
    with open(os.path.join(partition_dir, filename), 'w', encoding='utf-8') as f:
        f.write("\n".join(masked_qg_data[column]))

# The per-item mappings are kept as a pickled list, aligned with the lines of the
# text files above.
with open(os.path.join(partition_dir, 'tokens_to_tags.pkl'), 'wb') as f:
    pickle.dump(masked_qg_data['tokens_to_tags'], f)

## Create tokenized dataset that can be used for training

In [14]:

def tokenize_data(input_texts, questions, tokenizer, tokenize_fn):
    """Tokenize paired input texts and questions into space-joined token strings.

    Args:
        input_texts: Iterable of input texts.
        questions: Iterable of questions, aligned with input_texts. Pairs are
            formed with zip, so extra items in the longer iterable are ignored.
        tokenizer: Tokenizer object; only ever passed through to tokenize_fn.
        tokenize_fn: Callable (tokenizer, text) -> sequence of string tokens. Any
            special handling of annotation markers is up to this callable.

    Returns:
        dict with two aligned lists, 'input_text' and 'question', in which every
        entry is the item's tokens joined by single spaces.
    """
    def as_token_string(text):
        return " ".join(tokenize_fn(tokenizer, text))

    tokenized = {'input_text': [], 'question': []}
    for source_text, source_question in zip(input_texts, questions):
        # Question is tokenized before its text for every pair
        tokenized['question'].append(as_token_string(source_question))
        tokenized['input_text'].append(as_token_string(source_text))
    return tokenized

In [15]:
'''Load SQuAD data'''

data_dir = "/home/mroemmele/question_generation/squad_rule_generated_data/untok_data/"
partition = 'train'

# One item per line; line i of both text files belongs to the same item.
# Files are opened in `with` blocks so the handles are closed, and decoded as UTF-8
# explicitly rather than with the platform's default encoding.
qg_data = {}
for column, filename in (('input_text', "answer_sents.txt"),
                         ('question', "questions.txt")):
    with open(os.path.join(data_dir, partition, filename), encoding='utf-8') as f:
        qg_data[column] = [line.strip() for line in f]

# NOTE: unpickling executes code embedded in the file -- only load pickles that
# were produced by this project.
with open(os.path.join(data_dir, partition, "tokens_to_tags.pkl"), 'rb') as f:
    qg_data['tokens_to_tags'] = pickle.load(f)

In [13]:
'''Load tokenizer and define tokenization function'''

from transformers import GPT2Tokenizer, BertTokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Extra vocabulary entries, in this order:
#   1. the markers designating answers in squad texts
#   2. the opening half of every entity tag, with and without the "Ġ" prefix
#   3. the closing half carrying the tag's index (each tag indexed up to 100)
special_tokens = (
    ["<ANSWER>", "</ANSWER>"]
    + ["Ġ<{}".format(tag) for tag in data_utils.ENTITY_TAGS]
    + ["<{}".format(tag) for tag in data_utils.ENTITY_TAGS]
    + ["_{}>".format(num) for num in range(1, 101)]
)
tokenizer.add_tokens(special_tokens)

# Save tokenizer vocab with special tokens added
tokenizer.save_pretrained("/home/mroemmele/question_generation/gpt2_tokenizer_vocab/")

I1113 19:27:10.912777 140299533100864 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/mroemmele/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I1113 19:27:10.914906 140299533100864 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/mroemmele/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I1113 19:27:11.531922 140299533100864 tokenization_utils.py:517] Adding <ANSWER> to the vocabulary
I1113 19:27:11.534209 140299533100864 tokenization_utils.py:517] Adding </ANSWER> to the vocabulary
I1113 19:27:11.536138 140299533100864 tokenization_utils.py:517] Adding Ġ<PRODUCT to the vocabulary
I1113 19:27:11.537971 140299533100864

I1113 19:27:11.673314 140299533100864 tokenization_utils.py:517] Adding _45> to the vocabulary
I1113 19:27:11.675102 140299533100864 tokenization_utils.py:517] Adding _46> to the vocabulary
I1113 19:27:11.676871 140299533100864 tokenization_utils.py:517] Adding _47> to the vocabulary
I1113 19:27:11.678644 140299533100864 tokenization_utils.py:517] Adding _48> to the vocabulary
I1113 19:27:11.680408 140299533100864 tokenization_utils.py:517] Adding _49> to the vocabulary
I1113 19:27:11.682241 140299533100864 tokenization_utils.py:517] Adding _50> to the vocabulary
I1113 19:27:11.684034 140299533100864 tokenization_utils.py:517] Adding _51> to the vocabulary
I1113 19:27:11.685983 140299533100864 tokenization_utils.py:517] Adding _52> to the vocabulary
I1113 19:27:11.687686 140299533100864 tokenization_utils.py:517] Adding _53> to the vocabulary
I1113 19:27:11.689454 140299533100864 tokenization_utils.py:517] Adding _54> to the vocabulary
I1113 19:27:11.691282 140299533100864 tokenization

('/home/mroemmele/question_generation/gpt2_tokenizer_vocab/vocab.json',
 '/home/mroemmele/question_generation/gpt2_tokenizer_vocab/merges.txt',
 '/home/mroemmele/question_generation/gpt2_tokenizer_vocab/special_tokens_map.json',
 '/home/mroemmele/question_generation/gpt2_tokenizer_vocab/added_tokens.json')

In [22]:
def tokenize_fn(tokenizer, text):
    """Tokenize text that contains <ANSWER> markers and entity tags such as <TAG_1>.

    The text is rewritten before tokenization in three steps:
      1. a "|" marker is appended to every <...> / </...> annotation so that the
         token following it gets the correct whitespace indicator;
      2. every entity tag "<TAG_n>" is split into "<TAG" and "_n>";
      3. a whitespace character directly before "<TAG" is replaced by " Ġ", giving
         "Ġ<TAG" (presumably the tokenizer's leading-space form of the tag token --
         confirm against the tokens added to the vocabulary).
    After tokenization, every token that is exactly "|" is dropped. Note that this
    also drops "|" tokens that were part of the original text.

    Args:
        tokenizer: Object with a tokenize(text) method returning a list of tokens.
        text: Text to tokenize.

    Returns:
        List of tokens with the "|" markers removed.
    """
    entity_tags = "|".join(data_utils.ENTITY_TAGS)
    # Add designated marker ("|") after special tokens so subsequent token will have correct whitespace indicator
    text = re.sub(r"</?[A-Z]+_?[0-9]*>",
                  lambda match: match.group() + "|",
                  text)
    # The alternation is wrapped in its own group so that "<" applies to every tag
    # name, not only to the first alternative.
    text = re.sub(r"(<(?:{}))(_[0-9]+>)".format(entity_tags),
                  lambda match: " ".join(match.groups()),
                  text)
    text = re.sub(r"\s<({})".format(entity_tags),
                  lambda match: " Ġ" + match.group().strip(),
                  text)
    tokens = tokenizer.tokenize(text)
    return [token for token in tokens if token != "|"]

def detokenize_fn(tokenizer, tokens):
    """Turn a token list back into text and tidy up the special markers.

    Args:
        tokenizer: object exposing `convert_tokens_to_string(tokens) -> str`.
        tokens: the sequence of tokens to join.

    Returns:
        The string from `tokenizer.convert_tokens_to_string`, with every "| "
        occurrence removed and a space inserted in front of each "<ANSWER>" and
        "</ANSWER>" tag.
    """
    # Applied strictly in this order; each pair is (substring to find, replacement).
    substitutions = (
        ("| ", ""),
        ("<ANSWER>", " <ANSWER>"),
        ("</ANSWER>", " </ANSWER>"),
    )
    detokenized = tokenizer.convert_tokens_to_string(tokens)
    for found, replacement in substitutions:
        detokenized = detokenized.replace(found, replacement)
    return detokenized

In [50]:
# Run the input texts and questions through tokenize_data (defined in another cell)
# using the tokenizer together with the custom tokenize_fn.
tok_qg_data = tokenize_data(
    qg_data['input_text'],
    qg_data['question'],
    tokenizer,
    tokenize_fn,
)
# Preview: tokenized columns next to the first 10 tokens_to_tags entries, capped at 100 rows.
pandas.DataFrame(
    {**tok_qg_data, 'tokens_to_tags': qg_data['tokens_to_tags'][:10]}
)[:100]

Unnamed: 0,input_text,question,tokens_to_tags
0,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _3> </ANSWER> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion Ġ<ORGANIZATION _6> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _2> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
1,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _3> </ANSWER> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion Ġ<ORGANIZATION _6> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _2> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
2,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _3> </ANSWER> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion Ġ<ORGANIZATION _6> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _2> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
3,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion Ġ<ORGANIZATION _3> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _6> </ANSWER> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _5> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
4,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion Ġ<ORGANIZATION _3> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _6> </ANSWER> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _5> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
5,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion Ġ<ORGANIZATION _3> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _6> </ANSWER> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġrepresented Ġthe Ġ<ORGANIZATION _5> Ġat Ġ<ORGANIZATION _7> Ġ50 ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>'}"
6,"The Ġgame Ġwas Ġplayed Ġon Ġ<DATE _1> , Ġat Ġ<LOCATION _1> Ġin Ġthe Ġ<LOCATION _2> Ġat <ANSWER> Ġ<LOCATION _3> , Ġ<LOCATION _4> </ANSWER> .",Where Ġdid Ġ<EVENT _1> Ġtake Ġplace ?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
7,"The Ġgame Ġwas Ġplayed Ġon Ġ<DATE _1> , Ġat <ANSWER> Ġ<LOCATION _1> </ANSWER> Ġin Ġthe Ġ<LOCATION _2> Ġat Ġ<LOCATION _3> , Ġ<LOCATION _4> .",Where Ġdid Ġ<EVENT _1> Ġtake Ġplace ?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
8,"The Ġgame Ġwas Ġplayed Ġon Ġ<DATE _1> , Ġat <ANSWER> Ġ<LOCATION _1> Ġin Ġthe Ġ<LOCATION _2> Ġat Ġ<LOCATION _3> , Ġ<LOCATION _4> . </ANSWER>",Where Ġdid Ġ<EVENT _1> Ġtake Ġplace ?,"{'February 7, 2016': '<DATE_1>', 'Levi's Stadium': '<LOCATION_1>', 'San Francisco Bay Area': '<LOCATION_2>', 'Santa Clara': '<LOCATION_3>', 'California': '<LOCATION_4>', 'Super Bowl 50': '<EVENT_1>'}"
9,The Ġ<ORGANIZATION _1> Ġ( <ORGANIZATION _2> ) Ġchampion <ANSWER> Ġ<ORGANIZATION _3> </ANSWER> Ġdefeated Ġthe Ġ<ORGANIZATION _4> Ġ( <ORGANIZATION _5> ) Ġchampion Ġ<ORGANIZATION _6> Ġ<CARDINAL _1> Ġto Ġearn Ġtheir Ġ<ORDINAL _1> Ġ<ORGANIZATION _7> Ġtitle .,Which Ġ<ORGANIZATION _8> Ġteam Ġwon Ġ<EVENT _1> ?,"{'American Football Conference': '<ORGANIZATION_1>', 'AFC': '<ORGANIZATION_2>', 'Denver Broncos': '<ORGANIZATION_3>', 'National Football Conference': '<ORGANIZATION_4>', 'NFC': '<ORGANIZATION_5>', 'Carolina Panthers': '<ORGANIZATION_6>', '24–10': '<CARDINAL_1>', 'third': '<ORDINAL_1>', 'Super Bowl': '<ORGANIZATION_7>', 'NFL': '<ORGANIZATION_8>', 'Super Bowl 50': '<EVENT_1>'}"


In [51]:
# Save the tokenized dataset: one file for the tokenized input texts and one for the
# tokenized questions, each written as newline-joined items (no trailing newline),
# inside a sub-directory named after the current `partition`.

data_dir = "/home/mroemmele/question_generation/squad_gpt2_tok_entity_masked_data"

# makedirs also creates data_dir itself when it is missing, and exist_ok makes the
# cell safe to re-run.
os.makedirs(os.path.join(data_dir, partition), exist_ok=True)

# The tokenized text contains non-ASCII characters (e.g. "Ġ"), so the encoding is set
# explicitly instead of relying on the locale default.
with open(os.path.join(data_dir, partition, 'answer_sents.txt'), 'w', encoding='utf-8') as f:
    f.write("\n".join(tok_qg_data['input_text']))

with open(os.path.join(data_dir, partition, 'questions.txt'), 'w', encoding='utf-8') as f:
    f.write("\n".join(tok_qg_data['question']))
    

In [52]:
# Sanity check: split the first tokenized input text on single spaces to recover its
# token list, then detokenize it to inspect the reconstructed text and its tags.
detokenize_fn(tokenizer, tok_qg_data['input_text'][0].split(" "))

'The <ORGANIZATION_1> (<ORGANIZATION_2>) champion <ANSWER> <ORGANIZATION_3> </ANSWER> defeated the <ORGANIZATION_4> (<ORGANIZATION_5>) champion <ORGANIZATION_6> <CARDINAL_1> to earn their <ORDINAL_1> <ORGANIZATION_7> title.'