In [3]:
import os
import pandas
import pickle
import importlib
import random
from IPython.display import display

import data_utils
importlib.reload(data_utils)

# Show full cell contents when displaying DataFrames. `None` disables truncation;
# the legacy sentinel -1 is deprecated since pandas 1.0 and rejected by newer releases.
pandas.set_option('display.max_colwidth', None)
# Allow up to 500 rows to be rendered before pandas truncates the display.
pandas.set_option('display.max_rows', 500)

# Seed Python's RNG so the shuffles performed in this notebook are reproducible.
random.seed(0)

## Create a validation set from existing training dataset

Use this set for validation when training the question generation model, so that the provided validation data can instead be used as a held-out test set.

In [105]:
# Root directory of the dataset; the cells below read from its `train` subdirectory.
data_dir = '''/home/mroemmele/question_generation/newsqa_rule_generated_data/untok_data/'''
# Fraction of the training items to move into the validation set.
percent_heldout = 0.02

In [109]:
'''Load training data'''

# The four files are treated as parallel: they are zipped together by line index when
# the data is partitioned. Each file is read inside a `with` block so its handle is
# closed as soon as the lines have been loaded.
with open(os.path.join(data_dir, 'train', 'answer_sents.txt')) as f:
    answer_sents = [sent.strip() for sent in f]
with open(os.path.join(data_dir, 'train', 'questions.txt')) as f:
    questions = [question.strip() for question in f]
with open(os.path.join(data_dir, 'train', 'answers_only.txt')) as f:
    answers = [answer.strip() for answer in f]
with open(os.path.join(data_dir, 'train', 'paragraphs.txt')) as f:
    # Fixed: the original comprehension referenced a misspelled loop variable and
    # raised NameError on the first line of the file.
    paragraphs = [paragraph.strip() for paragraph in f]
# Alternative kept from the original notebook: empty placeholder paragraphs
# (presumably for datasets without a paragraphs file -- confirm before using).
#paragraphs = [""] * len(answer_sents)

# Fail early if the files are misaligned, matching the checks used when loading
# the datasets that are concatenated later in this notebook.
assert len(paragraphs) == len(answer_sents) == len(questions) == len(answers)

In [107]:
'''Randomly select indices of heldout data items or load them from file'''

# Shuffle every item position, then take the first `n_heldout` positions as the
# heldout (validation) indices. The shuffle uses the module-level RNG seeded above.
rand_idxs = list(range(len(answer_sents)))
random.shuffle(rand_idxs)
# int() truncates, so the heldout count is rounded down.
n_heldout = int(percent_heldout * len(answer_sents))
# Stored as a set because the partitioning loop tests membership for every item.
heldout_idxs = set(rand_idxs[:n_heldout])

# Alternative: reuse a previously pickled index set instead of drawing new indices
# (uncomment to use).
# with open("/home/mroemmele/question_generation/squad_rule_mimic_heldout_train_idxs_2_percent.pkl", 'rb') as f:
#     heldout_idxs = pickle.load(f)

# Preview ten of the selected indices (set iteration order, not sorted).
list(heldout_idxs)[:10]

[131072, 786435, 393231, 917525, 21, 262167, 262166, 786457, 131097, 917533]

In [108]:
'''Save the random indices so they can be used for other datasets (only run if new indices generated)'''

# Pickle the heldout index set into the dataset's root directory.
heldout_idxs_path = os.path.join(data_dir, "heldout_train_idxs_2_percent.pkl")
with open(heldout_idxs_path, 'wb') as idxs_file:
    pickle.dump(heldout_idxs, idxs_file)

In [110]:
'''Partition out validation data from training data by randomly selecting items'''

# zip() stops at the shortest input, so misaligned columns would silently drop items;
# fail loudly before partitioning instead.
assert len(paragraphs) == len(answer_sents) == len(questions) == len(answers)

train_paragraphs, train_answer_sents, train_questions, train_answers = [], [], [], []
valid_paragraphs, valid_answer_sents, valid_questions, valid_answers = [], [], [], []

# Route each example (all four fields together) to the validation lists when its
# position is one of the heldout indices, otherwise to the training lists.
for idx, (paragraph, answer_sent,
          question, answer) in enumerate(zip(paragraphs, answer_sents, questions, answers)):
    if idx in heldout_idxs:
        valid_paragraphs.append(paragraph)
        valid_answer_sents.append(answer_sent)
        valid_questions.append(question)
        valid_answers.append(answer)
    else:
        train_paragraphs.append(paragraph)
        train_answer_sents.append(answer_sent)
        train_questions.append(question)
        train_answers.append(answer)

assert len(train_paragraphs) == len(train_answer_sents) == len(train_questions) == len(train_answers)
assert len(valid_paragraphs) == len(valid_answer_sents) == len(valid_questions) == len(valid_answers)

In [114]:
'''Save partitioned data'''

# WARNING: the train files written here are the same paths the training data was
# loaded from, so the original files are overwritten with the reduced training set.
# Re-running the load and partition cells afterwards would hold out a further slice.

# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(os.path.join(data_dir, 'valid'), exist_ok=True)

# (subdirectory, filename, lines) for every file to write; train first, then valid.
partitioned_files = [
    ('train', 'paragraphs.txt', train_paragraphs),
    ('train', 'answer_sents.txt', train_answer_sents),
    ('train', 'questions.txt', train_questions),
    ('train', 'answers_only.txt', train_answers),
    ('valid', 'paragraphs.txt', valid_paragraphs),
    ('valid', 'answer_sents.txt', valid_answer_sents),
    ('valid', 'questions.txt', valid_questions),
    ('valid', 'answers_only.txt', valid_answers),
]

for split_name, filename, lines in partitioned_files:
    # One item per line, with no trailing newline after the last item.
    with open(os.path.join(data_dir, split_name, filename), 'w') as f:
        f.write("\n".join(lines))
    

## Concatenate two datasets into one

In [1]:
# Root directories of the two datasets to concatenate (first dataset's items come first).
data_dir1 = "/home/mroemmele/question_generation/squad_untok_data/"
data_dir2 = "/home/mroemmele/question_generation/newsqa_untok_data/"
# Name of the partition subdirectory to read from both datasets and to write to.
partition = 'test'

In [4]:
'''Load datasets for given partition'''

def _load_partition(dataset_dir, partition_name):
    """Read the four parallel text files of one dataset partition.

    Args:
        dataset_dir: root directory of the dataset.
        partition_name: subdirectory of `dataset_dir` holding the files.

    Returns:
        Tuple `(paragraphs, answer_sents, questions, answers)`; each element is a
        list of the whitespace-stripped lines of the corresponding file.

    Raises:
        AssertionError: if the four files do not have the same number of lines.
    """
    columns = []
    for filename in ('paragraphs.txt', 'answer_sents.txt',
                     'questions.txt', 'answers_only.txt'):
        # `with` closes each handle as soon as its lines are read.
        with open(os.path.join(dataset_dir, partition_name, filename)) as f:
            columns.append([line.strip() for line in f])
    loaded_paragraphs, loaded_answer_sents, loaded_questions, loaded_answers = columns
    assert (len(loaded_paragraphs) == len(loaded_answer_sents)
            == len(loaded_questions) == len(loaded_answers))
    return loaded_paragraphs, loaded_answer_sents, loaded_questions, loaded_answers

paragraphs1, answer_sents1, questions1, answers1 = _load_partition(data_dir1, partition)
paragraphs2, answer_sents2, questions2, answers2 = _load_partition(data_dir2, partition)

In [5]:
'''Concatenate'''

# Append the second dataset's items after the first dataset's, column by column.
# Note that these names rebind the lists of the same name used earlier in the notebook.
paragraphs = [*paragraphs1, *paragraphs2]
answer_sents = [*answer_sents1, *answer_sents2]
questions = [*questions1, *questions2]
answers = [*answers1, *answers2]
# All four columns must end up with one shared length.
assert len({len(paragraphs), len(answer_sents), len(questions), len(answers)}) == 1

In [7]:
# Display the number of examples in the concatenated dataset.
len(answers)

14911

In [8]:
# Output directory for the concatenated dataset.
concat_data_dir = "/home/mroemmele/question_generation/squad_newsqa_untok_data/"

# exist_ok makes this safe to re-run and avoids the race between a separate
# existence check and the directory creation.
os.makedirs(concat_data_dir, exist_ok=True)

In [124]:
'''Shuffle the new dataset, if partition is training'''

if partition == 'train':
    # Draw a single random permutation of positions; applying the same permutation
    # to every column keeps the examples aligned.
    rand_idxs = list(range(len(answer_sents)))
    random.shuffle(rand_idxs)

    # Persist the permutation so the shuffled order can be reproduced or undone later.
    shuffle_order_path = os.path.join(concat_data_dir, "shuffle_order_train_indices.pkl")
    with open(shuffle_order_path, 'wb') as order_file:
        pickle.dump(rand_idxs, order_file)

    # Re-running this cell shuffles the already-shuffled lists again.
    paragraphs, answer_sents, questions, answers = [
        [column[idx] for idx in rand_idxs]
        for column in (paragraphs, answer_sents, questions, answers)
    ]

In [9]:
# Preview the answer sentences next to their questions.
pandas.DataFrame({'answer_sent': answer_sents, 'question': questions})

Unnamed: 0,answer_sent,question
0,The American Football Conference (AFC) champion <ANSWER> Denver Broncos </ANSWER> defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title.,Which NFL team represented the AFC at Super Bowl 50?
1,The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion <ANSWER> Carolina Panthers </ANSWER> 24–10 to earn their third Super Bowl title.,Which NFL team represented the NFC at Super Bowl 50?
2,"The game was played on February 7, 2016, at <ANSWER> Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. </ANSWER>",Where did Super Bowl 50 take place?
3,The American Football Conference (AFC) champion <ANSWER> Denver Broncos </ANSWER> defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title.,Which NFL team won Super Bowl 50?
4,"As this was the 50th Super Bowl, the league emphasized the ""<ANSWER> gold </ANSWER>en anniversary"" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as ""Super Bowl L""), so that the logo could prominently feature the Arabic numerals 50.",What color was used to emphasize the 50th anniversary of the Super Bowl?
...,...,...
14906,"""A total of <ANSWER> seven </ANSWER> died on our property,"" O'Connor told CNN.",How many horses died?
14907,Teams are trying to figure out what happened at the <ANSWER> International Polo Club Palm Beach in Florida. </ANSWER>,Where were the races held?
14908,"Fourteen thoroughbred horses dropped dead in a mysterious scene Sunday before a polo match near <ANSWER> West Palm Beach, Florida, </ANSWER> officials said.",Where was the match set?
14909,"The cause of the deaths <ANSWER> has not been determined, </ANSWER> and necropsies and blood tests were underway, he said.",What was the reason that horses die?


In [14]:
'''Save new dataset'''

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(os.path.join(concat_data_dir, partition), exist_ok=True)

# (filename, lines) for every column of the concatenated dataset.
concat_files = [
    ('paragraphs.txt', paragraphs),
    ('answer_sents.txt', answer_sents),
    ('questions.txt', questions),
    ('answers_only.txt', answers),
]

for filename, lines in concat_files:
    # One item per line, with no trailing newline after the last item.
    with open(os.path.join(concat_data_dir, partition, filename), 'w') as f:
        f.write("\n".join(lines))

## Create entity-masked dataset

In [None]:
def mask_entities_in_data(input_texts, questions, ignore_tokens=None):
    """Apply entity masking to paired input texts and questions.

    For every (input_text, question) pair, `data_utils.get_tokens_to_tags` is called
    on the text and question joined by a space, and its result is passed to
    `data_utils.mask_entity_tokens` together with the text and the question.

    Args:
        input_texts: iterable of input text strings.
        questions: iterable of question strings, paired with `input_texts` by
            position (pairing stops at the shorter of the two).
        ignore_tokens: tokens forwarded to `data_utils.get_tokens_to_tags`
            (presumably tokens exempt from tagging -- confirm in data_utils).
            Defaults to ["<ANSWER>", "</ANSWER>"].

    Returns:
        Dict with keys 'input_text', 'question' and 'tokens_to_tags', each a list
        with one entry per processed pair.
    """
    # Build the default per call: a list literal in the signature would be one shared
    # object, so any mutation by the helper would leak into every later call.
    if ignore_tokens is None:
        ignore_tokens = ["<ANSWER>", "</ANSWER>"]

    masked_input_texts = []
    masked_questions = []
    tokens_to_tags = []
    for input_text, question in zip(input_texts, questions):
        item_tokens_to_tags = data_utils.get_tokens_to_tags(input_text + " " + question,
                                                            ignore_tokens)
        (masked_input_text, masked_question,
         item_tokens_to_tags) = data_utils.mask_entity_tokens(input_text,
                                                              question,
                                                              item_tokens_to_tags)
        masked_input_texts.append(masked_input_text)
        masked_questions.append(masked_question)
        tokens_to_tags.append(item_tokens_to_tags)
    return {'input_text': masked_input_texts,
            'question': masked_questions,
            'tokens_to_tags': tokens_to_tags}

In [None]:
'''Load QG data'''

# Note: rebinds `data_dir` and `partition` used by earlier sections of the notebook.
data_dir = "/home/mroemmele/question_generation/squad_untok_data/"
partition = 'test'

# Read each file inside a `with` block so the handle is closed once its lines are loaded.
qg_data = {}
with open(os.path.join(data_dir, partition, "answer_sents.txt")) as f:
    qg_data['input_text'] = [text.strip() for text in f]
with open(os.path.join(data_dir, partition, "questions.txt")) as f:
    qg_data['question'] = [question.strip() for question in f]

In [None]:
'''Apply masking'''

# Mask the whole partition. `masked_qg_data` is what the save cell writes to disk as
# the masked dataset, so masking only the first few items (as a debugging slice would)
# silently produces a truncated dataset.
masked_qg_data = mask_entities_in_data(qg_data['input_text'], qg_data['question'])
# Display only the last 100 rows to keep the output manageable.
pandas.DataFrame(masked_qg_data)[-100:]

In [None]:
'''Save the masked dataset'''

# Note: rebinds `data_dir` from the input directory to the output directory.
data_dir = "/home/mroemmele/question_generation/squad_entity_masked_data"

# makedirs also creates `data_dir` itself when it is missing; a plain mkdir of the
# partition subdirectory would fail in that case.
os.makedirs(os.path.join(data_dir, partition), exist_ok=True)

with open(os.path.join(data_dir, partition, 'answer_sents.txt'), 'w') as f:
    f.write("\n".join(masked_qg_data['input_text']))

with open(os.path.join(data_dir, partition, 'questions.txt'), 'w') as f:
    f.write("\n".join(masked_qg_data['question']))

# The per-item tokens_to_tags objects are pickled rather than written as text.
with open(os.path.join(data_dir, partition, 'tokens_to_tags.pkl'), 'wb') as f:
    pickle.dump(masked_qg_data['tokens_to_tags'], f)

## Create tokenized dataset as input to QG system

In [15]:
def tokenize_data(input_texts, questions, tokenizer, tokenize_fn, entity_masking=False):
    """Tokenize paired input texts and questions into space-joined token strings.

    Args:
        input_texts: iterable of input text strings.
        questions: iterable of question strings, paired with `input_texts` by
            position (pairing stops at the shorter of the two).
        tokenizer: object handed to `tokenize_fn` as its first argument.
        tokenize_fn: callable `(tokenizer, text[, handle_entities=...])` returning
            an iterable of string tokens.
        entity_masking: forwarded as `handle_entities` when tokenizing input texts;
            questions are tokenized without that keyword.

    Returns:
        Dict with keys 'input_text' and 'question', each a list of strings in which
        the tokens of one item are joined by single spaces.
    """
    tokenized = {'input_text': [], 'question': []}
    for position, (text, query) in enumerate(zip(input_texts, questions)):
        # The question is tokenized before its input text.
        query_tokens = tokenize_fn(tokenizer, query)
        tokenized['question'].append(" ".join(query_tokens))
        text_tokens = tokenize_fn(tokenizer, text, handle_entities=entity_masking)
        tokenized['input_text'].append(" ".join(text_tokens))
        # Progress report every 10000 items, skipping position 0.
        if position != 0 and position % 10000 == 0:
            print(position)
    return tokenized

In [11]:
'''Load tokenizer and define special tokens to respect, or load tokenizer if already saved'''

from transformers import GPT2Tokenizer

tokenizer_dirpath = "/home/mroemmele/question_generation/gpt2_tokenizer_vocab/"

if os.path.isdir(tokenizer_dirpath):
    # A tokenizer directory already exists: load it instead of rebuilding.
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dirpath)
else:
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Tokens are listed in a fixed order: the answer markers, then entity-tag prefixes
    # with and without the leading "Ġ" marker, then the numeric suffixes "_1>" to "_100>".
    special_tokens = (
        ["<ANSWER>", "</ANSWER>"]
        + ["Ġ<{}".format(tag) for tag in data_utils.ENTITY_TAGS]
        + ["<{}".format(tag) for tag in data_utils.ENTITY_TAGS]
        + ["_{}>".format(num) for num in range(1, 101)]
    )
    tokenizer.add_tokens(special_tokens)
    # Persist the extended vocabulary so later runs take the load branch above.
    tokenizer.save_pretrained(tokenizer_dirpath)

In [16]:
'''Load data'''

# Note: rebinds `data_dir`, `partition` and `qg_data` from earlier sections.
data_dir = "/home/mroemmele/question_generation/squad_newsqa_untok_data/"
partition = 'test'
entity_masking = False

# Read each file inside a `with` block so the handle is closed once its content is loaded.
qg_data = {}
with open(os.path.join(data_dir, partition, "answer_sents.txt")) as f:
    qg_data['input_text'] = [text.strip() for text in f]
with open(os.path.join(data_dir, partition, "questions.txt")) as f:
    qg_data['question'] = [question.strip() for question in f]
if entity_masking:
    # NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
    with open(os.path.join(data_dir, partition, "tokens_to_tags.pkl"), 'rb') as f:
        qg_data['tokens_to_tags'] = pickle.load(f)


In [18]:
# Inspect rows 1000-1499 of the loaded data.
pandas.DataFrame(qg_data)[1000:1500]

Unnamed: 0,input_text,question
1000,"They have extensive botanical collection of rare domestic and foreign plants, while a palm house in the <ANSWER> New Orangery </ANSWER> displays plants of subtropics from all over the world.",Where is a palm house with subtropic plants from all over the world on display?
1001,"Besides, within the city borders, there are also: <ANSWER> Pole Mokotowskie </ANSWER> (a big park in the northern Mokotów, where was the first horse racetrack and then the airport), Park Ujazdowski (close to the Sejm and John Lennon street), Park of Culture and Rest in Powsin, by the southern city border, Park Skaryszewski by the right Vistula bank, in Praga.",Where was the first horse racetrack located?
1002,"Besides, within the city borders, there are also: Pole Mokotowskie (a big park in the northern Mokotów, where was the first horse racetrack and then the airport), <ANSWER> Park Ujazdowski </ANSWER> (close to the Sejm and John Lennon street), Park of Culture and Rest in Powsin, by the southern city border, Park Skaryszewski by the right Vistula bank, in Praga.",What park is close to John Lennon street?
1003,"In <ANSWER> 1927 </ANSWER> a zoological garden (Ogród Zoologiczny) was established on the park grounds, and in 1952 a bear run, still open today.",When was a zoological garden established in the Praga Park?
1004,"The species richness is mainly due to the <ANSWER> location of Warsaw </ANSWER> within the border region of several big floral regions comprising substantial proportions of close-to-wilderness areas (natural forests, wetlands along the Vistula) as well as arable land, meadows and forests.",Why is Warsaw's flora very rich in species?
1005,"Bielany Forest, located <ANSWER> within the borders of Warsaw </ANSWER>, is the remaining part of the Masovian Primeval Forest.",Where is Bielany Forest located?
1006,"Bielany Forest, located within the borders of Warsaw, is the remaining part of the <ANSWER> Masovian Primeval Forest </ANSWER>.",What is the Bielany Forest the last remnant of?
1007,Other big forest area is <ANSWER> Kabaty Forest </ANSWER> by the southern city border.,What forest is by Warsaw's southern border?
1008,Warsaw has also <ANSWER> two </ANSWER> botanic gardens: by the Łazienki park (a didactic-research unit of the University of Warsaw) as well as by the Park of Culture and Rest in Powsin (a unit of the Polish Academy of Science).,How many botanical gardens does Warsaw have?
1009,"In 1939, c. <ANSWER> 1,300,000 </ANSWER> people lived in Warsaw, but in 1945 – only 420,000.",How many people lived in Warsaw in 1939?


In [19]:
'''Create tokenized dataset'''

tok_qg_data = tokenize_data(qg_data['input_text'], qg_data['question'],
                            tokenizer, tokenize_fn=data_utils.tokenize_fn, entity_masking=entity_masking)
if entity_masking:
    # Use the full tokens_to_tags column: combining a 10-item slice with the
    # full-length tokenized columns makes the DataFrame constructor fail on
    # mismatched lengths. Only the first 100 rows are displayed.
    display(pandas.DataFrame({**tok_qg_data, 'tokens_to_tags': qg_data['tokens_to_tags']})[:100])
else:
    display(pandas.DataFrame(tok_qg_data))
    

10000


Unnamed: 0,input_text,question
0,The ĠAmerican ĠFootball ĠConference Ġ( A FC ) Ġchampion <ANSWER> ĠDenver ĠBroncos </ANSWER> Ġdefeated Ġthe ĠNational ĠFootball ĠConference Ġ( N FC ) Ġchampion ĠCarolina ĠPanthers Ġ24 âĢĵ 10 Ġto Ġearn Ġtheir Ġthird ĠSuper ĠBowl Ġtitle .,Which ĠNFL Ġteam Ġrepresented Ġthe ĠAFC Ġat ĠSuper ĠBowl Ġ50 ?
1,The ĠAmerican ĠFootball ĠConference Ġ( A FC ) Ġchampion ĠDenver ĠBroncos Ġdefeated Ġthe ĠNational ĠFootball ĠConference Ġ( N FC ) Ġchampion <ANSWER> ĠCarolina ĠPanthers </ANSWER> Ġ24 âĢĵ 10 Ġto Ġearn Ġtheir Ġthird ĠSuper ĠBowl Ġtitle .,Which ĠNFL Ġteam Ġrepresented Ġthe ĠNFC Ġat ĠSuper ĠBowl Ġ50 ?
2,"The Ġgame Ġwas Ġplayed Ġon ĠFebruary Ġ7 , Ġ2016 , Ġat <ANSWER> ĠLevi 's ĠStadium Ġin Ġthe ĠSan ĠFrancisco ĠBay ĠArea Ġat ĠSanta ĠClara , ĠCalifornia . </ANSWER>",Where Ġdid ĠSuper ĠBowl Ġ50 Ġtake Ġplace ?
3,The ĠAmerican ĠFootball ĠConference Ġ( A FC ) Ġchampion <ANSWER> ĠDenver ĠBroncos </ANSWER> Ġdefeated Ġthe ĠNational ĠFootball ĠConference Ġ( N FC ) Ġchampion ĠCarolina ĠPanthers Ġ24 âĢĵ 10 Ġto Ġearn Ġtheir Ġthird ĠSuper ĠBowl Ġtitle .,Which ĠNFL Ġteam Ġwon ĠSuper ĠBowl Ġ50 ?
4,"As Ġthis Ġwas Ġthe Ġ50 th ĠSuper ĠBowl , Ġthe Ġleague Ġemphasized Ġthe Ġ"" <ANSWER> Ġgold </ANSWER> en Ġanniversary "" Ġwith Ġvarious Ġgold - themed Ġinitiatives , Ġas Ġwell Ġas Ġtemporarily Ġsuspending Ġthe Ġtradition Ġof Ġnaming Ġeach ĠSuper ĠBowl Ġgame Ġwith ĠRoman Ġnumer als Ġ( under Ġwhich Ġthe Ġgame Ġwould Ġhave Ġbeen Ġknown Ġas Ġ"" Super ĠBowl ĠL ""), Ġso Ġthat Ġthe Ġlogo Ġcould Ġprominently Ġfeature Ġthe ĠArabic Ġnumer als Ġ50 .",What Ġcolor Ġwas Ġused Ġto Ġemphasize Ġthe Ġ50 th Ġanniversary Ġof Ġthe ĠSuper ĠBowl ?
...,...,...
14906,""" A Ġtotal Ġof <ANSWER> Ġseven </ANSWER> Ġdied Ġon Ġour Ġproperty ,"" ĠO ' Connor Ġtold ĠCNN .",How Ġmany Ġhorses Ġdied ?
14907,Te ams Ġare Ġtrying Ġto Ġfigure Ġout Ġwhat Ġhappened Ġat Ġthe <ANSWER> ĠInternational ĠPolo ĠClub ĠPalm ĠBeach Ġin ĠFlorida . </ANSWER>,Where Ġwere Ġthe Ġraces Ġheld ?
14908,"Four teen Ġthorough bred Ġhorses Ġdropped Ġdead Ġin Ġa Ġmysterious Ġscene ĠSunday Ġbefore Ġa Ġpol o Ġmatch Ġnear <ANSWER> ĠWest ĠPalm ĠBeach , ĠFlorida , </ANSWER> Ġofficials Ġsaid .",Where Ġwas Ġthe Ġmatch Ġset ?
14909,"The Ġcause Ġof Ġthe Ġdeaths <ANSWER> Ġhas Ġnot Ġbeen Ġdetermined , </ANSWER> Ġand Ġne cro ps ies Ġand Ġblood Ġtests Ġwere Ġunderway , Ġhe Ġsaid .",What Ġwas Ġthe Ġreason Ġthat Ġhorses Ġdie ?


In [20]:
# Sanity check to ensure tokenization works: split one tokenized input text (the
# 11th from the end) back into tokens and detokenize it, to compare by eye with the
# untokenized sentence.

data_utils.detokenize_fn(tokenizer, tok_qg_data['input_text'][-11].split(" "))

"NASA launches a rocket from California's Vandenberg Air Force Base on <ANSWER> Tuesday. </ANSWER>"

In [22]:
'''Save the tokenized dataset'''

# Note: rebinds `data_dir` from the input directory to the output directory.
data_dir = "/home/mroemmele/question_generation/squad_newsqa_gpt2_tok_data/"

# One call creates both the dataset directory and the partition subdirectory, and is
# safe to re-run when they already exist.
os.makedirs(os.path.join(data_dir, partition), exist_ok=True)

# One item per line, with no trailing newline after the last item.
with open(os.path.join(data_dir, partition, 'answer_sents.txt'), 'w') as f:
    f.write("\n".join(tok_qg_data['input_text']))

with open(os.path.join(data_dir, partition, 'questions.txt'), 'w') as f:
    f.write("\n".join(tok_qg_data['question']))

In [21]:
# Display which partition the cells above were run for.
partition

'test'