In [7]:
from __future__ import absolute_import, division, print_function

import argparse
import collections
import json
import logging
import math
import os
import random
import time
import re
import string
import sys
from io import open

import numpy as np


In [8]:
# Configure the root logger once for the whole notebook (INFO and above, with
# a timestamped prefix), then create the logger used by the cells below.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)


## SQuAD Dataset: Example and Feature Extraction

In [17]:
def whitespace_tokenize(text):
    """Trim `text` and split it on runs of whitespace.

    Returns the list of resulting tokens; an empty or all-whitespace input
    yields an empty list.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []


In [21]:
class SquadObject(object):
    """One SQuAD question together with the tokenized paragraph it refers to.

    Attributes:
        q_id: identifier of the question.
        ques_txt: text of the question.
        doc_tokens: paragraph split into whitespace-delimited tokens.
        original_ans: answer text, or None when no answer was supplied.
        start_position: index into `doc_tokens` of the first answer token,
            or None when not supplied.
        end_position: index into `doc_tokens` of the last answer token
            (inclusive), or None when not supplied.
        is_impossible: flag marking the question as unanswerable, or None
            when not supplied.
    """

    def __init__(self, q_id, ques_txt, doc_tokens, original_ans=None, start_position=None, end_position=None, is_impossible=None):
        self.q_id = q_id
        self.ques_txt = ques_txt
        self.doc_tokens = doc_tokens
        self.original_ans = original_ans
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """Return a comma-separated, human-readable summary of the example.

        Optional fields are included whenever they are not None. A plain
        truthiness test would hide legitimate values such as a start/end
        position of 0 or `is_impossible=False`.
        """
        # Values are wrapped in one-element tuples so that a tuple-valued
        # field is formatted as a single value instead of being unpacked by
        # the % operator.
        parts = [
            "Question ID: %s" % (self.q_id,),
            "Question Text: %s" % (self.ques_txt,),
            "Document Tokens: %s" % (self.doc_tokens,),
        ]
        if self.original_ans is not None:
            parts.append("Original Answer: %s" % (self.original_ans,))
        if self.start_position is not None:
            parts.append("Start Position: %d" % (self.start_position,))
        if self.end_position is not None:
            parts.append("End Position: %d" % (self.end_position,))
        if self.is_impossible is not None:
            parts.append("Is Impossible: %s" % (self.is_impossible,))
        return ", ".join(parts)
    
def whitepace_tokenize(text):
    """Strip `text` and split it on whitespace; blank input gives an empty list.

    NOTE(review): this has the same logic as `whitespace_tokenize` defined
    earlier in the notebook, and the name looks like a misspelling of it.
    Nothing in the visible cells calls this variant -- confirm before removing.
    """
    trimmed = text.strip()
    if trimmed:
        return trimmed.split()
    return []


def read_squad_examples(input_file, version_2_with_negative, method='training'):
    """Load a SQuAD-format JSON file and build one SquadObject per question.

    Args:
        input_file: path to a JSON file whose top-level "data" entry holds
            the articles.
        version_2_with_negative: when true (and `method` is 'training'), the
            "is_impossible" flag is read from every question.
        method: 'training' extracts the answer text and its token span; any
            other value leaves answer text and positions as None.

    Returns:
        A list of SquadObject. Every question of a paragraph references the
        same `doc_tokens` list object.

    Raises:
        ValueError: in training mode, when an answerable question does not
            have exactly one answer.
    """
    with open(input_file, "r", encoding='utf-8') as handle:
        articles = json.load(handle)["data"]

    def is_gap(ch):
        # Space, tab, CR, LF, or U+202F (narrow no-break space).
        return ch in (" ", "\t", "\r", "\n") or ord(ch) == 0x202F

    def split_words(text):
        # Split `text` into whitespace-delimited words and record, for every
        # character, the index of the most recently started word (-1 for
        # whitespace seen before the first word).
        words = []
        char_to_word = []
        in_gap = True
        for ch in text:
            if is_gap(ch):
                in_gap = True
            elif in_gap:
                words.append(ch)
                in_gap = False
            else:
                words[-1] += ch
            char_to_word.append(len(words) - 1)
        return words, char_to_word

    examples = []
    for article in articles:
        for paragraph in article["paragraphs"]:
            words, char_to_word = split_words(paragraph["context"])

            for qa in paragraph["qas"]:
                question_id = qa["id"]
                question = qa["question"]
                answer_text = None
                first_word = None
                last_word = None
                impossible = False

                if method == 'training':
                    if version_2_with_negative:
                        impossible = qa["is_impossible"]
                    answer_count = len(qa["answers"])
                    if answer_count != 1 and not impossible:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")

                    if impossible:
                        # Unanswerable question: sentinel span and empty answer.
                        first_word = -1
                        last_word = -1
                        answer_text = ""
                    else:
                        answer = qa["answers"][0]
                        answer_text = answer["text"]
                        char_start = answer["answer_start"]
                        char_count = len(answer_text)
                        # Map the character span of the answer to word indices.
                        first_word = char_to_word[char_start]
                        last_word = char_to_word[char_start + char_count - 1]
                        span_text = " ".join(words[first_word:last_word + 1])
                        wanted_text = " ".join(whitespace_tokenize(answer_text))
                        if wanted_text not in span_text:
                            # The recorded offset does not line up with the
                            # answer text; skip this question.
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           span_text, wanted_text)
                            continue

                examples.append(SquadObject(
                    q_id=question_id,
                    ques_txt=question,
                    doc_tokens=words,
                    original_ans=answer_text,
                    start_position=first_word,
                    end_position=last_word,
                    is_impossible=impossible))
    return examples


In [22]:
# Parse the training file into SquadObject examples, reading the
# "is_impossible" flag for every question, and report how many were loaded.
training_file = '../Data/train-v2.0.json'
version_2_negative = True
training_entries = read_squad_examples(training_file, version_2_negative)
print("Number of training entries: %d" % len(training_entries))


Number of training entries: 130319


In [23]:
# Show the first parsed example as a quick sanity check of the loader output.
print(training_entries[0])

Question ID: 56be85543aeaaa14008c9063, Question Text: When did Beyonce start becoming popular?, Document Tokens: ['Beyoncé', 'Giselle', 'Knowles-Carter', '(/biːˈjɒnseɪ/', 'bee-YON-say)', '(born', 'September', '4,', '1981)', 'is', 'an', 'American', 'singer,', 'songwriter,', 'record', 'producer', 'and', 'actress.', 'Born', 'and', 'raised', 'in', 'Houston,', 'Texas,', 'she', 'performed', 'in', 'various', 'singing', 'and', 'dancing', 'competitions', 'as', 'a', 'child,', 'and', 'rose', 'to', 'fame', 'in', 'the', 'late', '1990s', 'as', 'lead', 'singer', 'of', 'R&B', 'girl-group', "Destiny's", 'Child.', 'Managed', 'by', 'her', 'father,', 'Mathew', 'Knowles,', 'the', 'group', 'became', 'one', 'of', 'the', "world's", 'best-selling', 'girl', 'groups', 'of', 'all', 'time.', 'Their', 'hiatus', 'saw', 'the', 'release', 'of', "Beyoncé's", 'debut', 'album,', 'Dangerously', 'in', 'Love', '(2003),', 'which', 'established', 'her', 'as', 'a', 'solo', 'artist', 'worldwide,', 'earned', 'five', 'Grammy', 'A