In [1]:
import os
import pandas
import pickle
import importlib
import random
import re

import data_utils
importlib.reload(data_utils)

# Show full cell text in DataFrame previews. None means "do not truncate";
# the legacy -1 sentinel was deprecated in pandas 1.0 and is rejected by later releases.
pandas.set_option('display.max_colwidth', None)
pandas.set_option('display.max_rows', 500)

# Seed Python's global RNG so any random sampling in this notebook is repeatable.
random.seed(0)

## Create the NewsQA dataset with answers annotated in paragraphs and answer sentences

In [4]:
'''Build the NewsQA question-generation dataset with answers annotated in paragraphs/answer sentences'''

# Markers wrapped around the answer span. The exact spacing (trailing space on both
# tags, no space before the closing tag) is kept as-is because previously generated
# data files use this format.
_ANSWER_OPEN_TAG = "<ANSWER> "
_ANSWER_CLOSE_TAG = "</ANSWER> "


def _flatten_newlines(text):
    """Strip `text` and replace every run of newlines with a single space."""
    return re.sub("\n+", " ", text.strip())


def _normalize_question(question_text):
    """Return the question on one line, capitalized and ending in '?' ('' if blank)."""
    question_text = _flatten_newlines(question_text)
    if not question_text:
        return ""
    if question_text[0].islower():
        question_text = question_text[0].upper() + question_text[1:]
    if question_text[-1] != "?":
        question_text += "?"
    return question_text


def _locate_answer_sentence(paragraph, sents, sent_starts, answer_start, answer_end):
    """Find the sentence holding `answer_start`, merging later sentences until the answer fits.

    Works on private copies of `sents` / `sent_starts`, so the caller's segmentation
    is never modified and each question is processed independently of the others.

    Returns:
        (sent_idx, sent_char_start, sents_copy) where `sents_copy[sent_idx]` covers
        the whole answer span, or None if no sentence contains `answer_start`.
    """
    sents = list(sents)
    sent_starts = list(sent_starts)
    for sent_idx in range(len(sents)):
        sent_start = sent_starts[sent_idx]
        # A sentence extends up to the start of the next one; the last sentence
        # extends to the end of the paragraph.
        if sent_idx + 1 == len(sents):
            sent_end = len(paragraph)
        else:
            sent_end = sent_starts[sent_idx + 1]
        if not sent_start <= answer_start < sent_end:
            continue
        # Answer spans multiple sentences, which is probably a segmentation failure;
        # append the following sentence(s) to this one to form the answer sentence.
        while answer_end > sent_end and sent_idx + 1 < len(sents):
            next_sent = sents.pop(sent_idx + 1)
            sent_starts.pop(sent_idx + 1)
            sents[sent_idx] = sents[sent_idx] + next_sent
            if sent_idx + 1 == len(sents):
                sent_end = len(paragraph)
            else:
                sent_end = sent_starts[sent_idx + 1]
        return sent_idx, sent_start, sents
    return None


def _mark_answer(sentence, start, end):
    """Wrap `sentence[start:end]` in the answer tags and return the new string."""
    return (sentence[:start] + _ANSWER_OPEN_TAG + sentence[start:end]
            + _ANSWER_CLOSE_TAG + sentence[end:])


def make_newsqa_qg_dataset(data):
    """Turn NewsQA articles into parallel lists of question-generation examples.

    Args:
        data: iterable of article dicts. Each article provides 'text' (the article
            string) and 'questions' (a list of dicts with the question string under
            'q' and the answer character span under 'consensus' -> 's' / 'e').

    Returns:
        dict of equal-length lists keyed by 'article_id', 'paragraph', 'answer_sent',
        'question' and 'answer'. 'paragraph' is the whole article with the answer
        span wrapped in <ANSWER> ... </ANSWER>; 'answer_sent' is only the sentence
        (or merged sentences) containing the answer, marked the same way. Newlines
        are collapsed to spaces in every text field so each value fits on one line.

    Questions are skipped when they have no consensus span, when the question text
    is blank, or when the span cannot be located in any sentence. An answer end that
    runs past the article is clamped to the article length.

    Relies on `data_utils.segment_sents(text)` returning a dict with 'sents' and
    'sent_char_start_idxs'. Assumes each sentence string keeps its trailing
    whitespace, so that consecutive sentences concatenate back into the source
    text -- TODO confirm against data_utils.
    """
    wrangled_data = {'article_id': [],
                     'paragraph': [],
                     'answer_sent': [],
                     'question': [],
                     'answer': [],
                     }
    for article_idx, article in enumerate(data):
        paragraph = article['text']
        paragraph_sents = data_utils.segment_sents(paragraph)
        for qa in article['questions']:
            try:
                answer_char_start = qa['consensus']['s']
                answer_char_end = qa['consensus']['e']
            except (KeyError, TypeError):
                # No consensus answer span for this question
                continue
            question_text = _normalize_question(qa['q'])
            if not question_text:
                continue
            # Guard against spans that overrun the article or are inverted.
            answer_char_end = min(answer_char_end, len(paragraph))
            if answer_char_end < answer_char_start:
                continue
            located = _locate_answer_sentence(paragraph,
                                              paragraph_sents['sents'],
                                              paragraph_sents['sent_char_start_idxs'],
                                              answer_char_start,
                                              answer_char_end)
            if located is None:
                # Never emit a row built from a previous question's sentence.
                continue
            sent_idx, sent_char_start, sents_with_answer = located
            answer_text = _flatten_newlines(paragraph[answer_char_start:answer_char_end])
            answer_sent_text = _flatten_newlines(
                _mark_answer(sents_with_answer[sent_idx],
                             answer_char_start - sent_char_start,
                             answer_char_end - sent_char_start))
            sents_with_answer[sent_idx] = answer_sent_text
            paragraph_text = " ".join(_flatten_newlines(sent) for sent in sents_with_answer)

            wrangled_data['article_id'].append(article_idx)
            wrangled_data['paragraph'].append(paragraph_text)
            wrangled_data['answer_sent'].append(answer_sent_text)
            wrangled_data['question'].append(question_text)
            wrangled_data['answer'].append(answer_text)
        # Lightweight progress report every 50 articles
        if article_idx and article_idx % 50 == 0:
            print(article_idx)
    return wrangled_data

In [5]:
'''Load pre-processed NewsQA dataset'''

newsqa_data_dir = "/home/mroemmele/news_QA/"
partition = 'test'
assert partition in ('train', 'test')

# The 'test' partition is read from the file named 'dev_data.pkl'.
data_filename = '{}_data.pkl'.format('dev' if partition == 'test' else partition)

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(os.path.join(newsqa_data_dir, data_filename), 'rb') as data_file:
    data = pickle.load(data_file)

In [None]:
'''Make the dataset'''

# Run the wrangling step over every loaded article, then preview the final
# 100 examples as a table (the bare last expression is rendered by the notebook).
wrangled_data = make_newsqa_qg_dataset(data)
pandas.DataFrame(wrangled_data).tail(100)

> <ipython-input-4-5723da3fbaa7>(21)make_newsqa_qg_dataset()
-> answer_char_start = qa['consensus']['s']
(Pdb) qa
{'isQuestionBad': 0.0, 'q': 'Iran criticizes who?', 'validatedAnswers': [{'count': 2, 's': 63, 'e': 97}], 'answers': [{'sourcerAnswers': [{'s': 68, 'e': 97}]}, {'sourcerAnswers': [{'s': 63, 'e': 97}]}, {'sourcerAnswers': [{'noAnswer': True}]}], 'consensus': {'s': 63, 'e': 97}, 'isAnswerAbsent': 0.33333333333299997}
(Pdb) c
> <ipython-input-4-5723da3fbaa7>(20)make_newsqa_qg_dataset()
-> import pdb;pdb.set_trace()
(Pdb) n
> <ipython-input-4-5723da3fbaa7>(21)make_newsqa_qg_dataset()
-> answer_char_start = qa['consensus']['s']
(Pdb) qa
{'isQuestionBad': 0.0, 'q': 'What are US and Iran relations tensioned about?', 'validatedAnswers': [{'count': 2, 's': 2558, 'e': 2575}], 'answers': [{'sourcerAnswers': [{'noAnswer': True}]}, {'sourcerAnswers': [{'s': 2558, 'e': 2575}]}, {'sourcerAnswers': [{'s': 120, 'e': 169}]}], 'consensus': {'s': 2558, 'e': 2575}, 'isAnswerAbsent': 0.333333333

In [127]:
'''Save the dataset'''

data_dir = "/home/mroemmele/question_generation/newsqa_untok_data"

# One sub-directory per partition; makedirs creates any missing parents and is a
# no-op when the directory already exists.
partition_dir = os.path.join(data_dir, partition)
os.makedirs(partition_dir, exist_ok=True)

# Output file name -> column of wrangled_data written to it. The files are
# line-aligned: line i of every file belongs to the same example.
output_columns = {
    'paragraphs.txt': 'paragraph',
    'answer_sents.txt': 'answer_sent',
    'questions.txt': 'question',
    'answers_only.txt': 'answer',
}

for filename, column in output_columns.items():
    # Explicit UTF-8 so non-ASCII article text is written identically on every platform.
    with open(os.path.join(partition_dir, filename), 'w', encoding='utf-8') as f:
        f.write("\n".join(wrangled_data[column]))