In [1]:
import json
import re

In [64]:
class NQAExample(object):
    """
    A single training/test example for the NQA dataset.

    For examples without an answer, the start and end position are -1
    (the constructor itself leaves every optional field at None unless the
    caller supplies a value).

    Attributes:
        qas_id: Identifier of the example.
        question_text: The question string.
        doc_tokens: The document as a sequence of tokens.
        orig_answer_text: Answer text, or None if not supplied.
        start_position: Index into ``doc_tokens`` where the answer starts.
        end_position: Index into ``doc_tokens`` where the answer ends; used
            as an exclusive slice bound elsewhere in this file.
        is_impossible: True when the example has no short answer.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __repr__(self):
        """Return a compact, single-line description for debugging."""
        # Documents can be very long, so show the token count rather than
        # dumping every token; fall back to repr() for unsized values so
        # that __repr__ itself never raises.
        try:
            tokens_summary = "<{} tokens>".format(len(self.doc_tokens))
        except TypeError:
            tokens_summary = repr(self.doc_tokens)
        return (
            "{}(qas_id={!r}, question_text={!r}, doc_tokens={}, "
            "orig_answer_text={!r}, start_position={!r}, "
            "end_position={!r}, is_impossible={!r})"
        ).format(
            type(self).__name__,
            self.qas_id,
            self.question_text,
            tokens_summary,
            self.orig_answer_text,
            self.start_position,
            self.end_position,
            self.is_impossible,
        )
        
def read_NQA_examples(input_file):
    """Read an NQA JSON-lines file into a list of NQAExample.

    Every non-blank line of ``input_file`` is parsed as one JSON object.
    Only the first annotation of each entry is consulted:

    * entries whose long answer has ``candidate_index == -1`` are skipped;
    * entries with no short answers are kept with ``is_impossible=True``,
      start/end positions of -1 and an empty answer text;
    * otherwise the first short answer supplies the start/end token
      positions, and the answer text is the space-joined slice
      ``doc_tokens[start:end]``.

    Args:
        input_file: Path of the JSON-lines file. It is opened as UTF-8 text.

    Returns:
        A list of NQAExample objects, in file order.

    Raises:
        AssertionError: carrying the example id, if the first short answer
            does not lie within the long answer's token span.
    """
    examples = []

    # The context manager guarantees the handle is closed, and streaming the
    # lines avoids holding every raw entry in memory at once.
    with open(input_file, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            entry = json.loads(line)

            qas_id = entry['example_id']
            question_text = entry['question_text']
            doc_text = entry['document_text']

            annotation = entry['annotations'][0]
            long_answer = annotation['long_answer']
            if long_answer['candidate_index'] == -1:
                continue

            short_answers = annotation['short_answers']
            is_impossible = not short_answers

            # Split on single spaces only; the annotation token offsets are
            # used directly as indices into this list.
            doc_tokens = doc_text.split(' ')

            if is_impossible:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
            else:
                first_short = short_answers[0]
                start_position = first_short['start_token']
                end_position = first_short['end_token']
                # Raised explicitly (rather than via `assert`) so the check
                # still runs when Python is started with -O.
                if not (start_position >= long_answer['start_token']
                        and end_position <= long_answer['end_token']):
                    raise AssertionError(qas_id)
                orig_answer_text = ' '.join(
                    doc_tokens[start_position:end_position])

            examples.append(NQAExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible))

    return examples

def print_example(example):
    """Print the main fields of an example, one labelled line per field.

    The "Constructed answer" line shows the slice of ``doc_tokens`` between
    the example's start and end positions, for comparison with the stored
    original answer text.
    """
    # Each getter is evaluated only when its line is printed, so the fields
    # are read in the same order in which they appear in the output.
    rows = (
        ("Question text ", lambda ex: ex.question_text),
        ("Doc tokens ", lambda ex: ex.doc_tokens),
        ("Orig answer ", lambda ex: ex.orig_answer_text),
        ("Constructed answer ",
         lambda ex: ex.doc_tokens[ex.start_position:ex.end_position]),
        ("Start pos ", lambda ex: ex.start_position),
        ("End pos ", lambda ex: ex.end_position),
    )
    for label, read_field in rows:
        print(label, read_field(example))

In [71]:
# Parse the JSON-lines file into NQAExample objects. The path is relative,
# so it is resolved against the notebook's current working directory.
examples = read_NQA_examples('datasets/train_10k.json')

In [72]:
# print_example(examples[30])

In [73]:
# Number of examples kept; entries without a long answer were skipped by
# read_NQA_examples, so this can be smaller than the number of input lines.
len(examples)

4940

In [77]:
# print_example(examples[0])