In [1]:
# This example covers the basics of augmenting an existing SQuAD-like dataset with various types of noise.

In [2]:
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features

In [3]:
# Load the SQuAD v1.1 dev set that ships inside the kitanaqa package
import pkg_resources
import json
data_file = pkg_resources.resource_filename(
    'kitanaqa', 'support/dev-v1.1.json')
# Decode explicitly as UTF-8 (the encoding required for JSON interchange) so the
# load does not depend on the platform's default locale encoding.
with open(data_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Define augmentation parameters
hparams = {
    "num_replacements": 2,  # Replace up to 2 terms in the original question for each example
    "sample_ratio":     0.0001,  # Generate augmentations for 0.01% of original dataset size (0.0001 as a fraction)
    "sampling_strategy":'random',  # Randomly sample replacement tokens (excluding entities)
    "sampling_k":       5,  # Only consider the 5 highest importance scores across tokens
    "is_training":      False,  # Using dev dataset
    "out_prefix":       'dev',  # presumably the prefix for generated output files -- confirm against SQuADDataset
    "save_freq":        1000,  # presumably how often progress is checkpointed -- confirm units against SQuADDataset
    "p_misspelling":    1,  # Unnormalized sampling weight for misspelling perturbation
    "p_replace":        0.5,  # Unnormalized sampling weight for w2v synonym perturbation
    "p_dropword":       1,  # Unnormalized sampling weight for drop term perturbation
    "from_checkpoint":  False,  # Do not load progress from local checkpoint
}

In [6]:
# Instantiate the Dataset object from the loaded `data` and the augmentation settings in `hparams`.
from kitanaqa.augment.augment_squad import SQuADDataset
generator = SQuADDataset(data, **hparams)

# Generate augmented examples according to the perturbation distribution.
# Normalizing the weights p_replace=0.5, p_misspelling=1, p_dropword=1 gives
# 20% synonyms, 40% misspellings, 40% drop terms.
# The perturbed examples will be generated for each original example until the
# total number defined by the sample_ratio is reached.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# examples may differ between runs -- confirm whether SQuADDataset exposes a seed.
generator.generate()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/asisto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
11/12/2020 18:28:38 - kitanaqa - INFO - Running SQuADDataset with hparams {'num_replacements': 2, 'sample_ratio': 0.0001, 'p_replace': 0.5, 'p_dropword': 1, 'p_misspelling': 1, 'sampling_strategy': 'random', 'sampling_k': 5}
11/12/2020 18:28:38 - kitanaqa - DEBUG - generators.py: loading pkg data /Users/asisto/dev/Searchable/Research/KitanaQA/src/kitanaqa/support/counter-fitted-vectors.txt
11/12/2020 18:28:45 - kitanaqa - DEBUG - generators.py: loading pkg data /Users/asisto/dev/Searchable/Research/KitanaQA/src/kitanaqa/support/missp.json
11/12/2020 18:28:45 - kitanaqa - INFO - Generating 2 aug examples from 10570 orig examples


In [7]:
# Calling the generator object displays the augmented dataset; in the recorded
# output it is a dict with 'version' and 'data' keys.
# NOTE(review): this echoes the whole structure into the notebook output --
# consider inspecting a small slice instead if the sample size grows.
generator()

{'version': '1.1',
 'data': [{'title': 'Oxygen',
   'paragraphs': [{'context': 'Highly concentrated sources of oxygen promote rapid combustion. Fire and explosion hazards exist when concentrated oxidants and fuels are brought into close proximity; an ignition event, such as heat or a spark, is needed to trigger combustion. Oxygen is the oxidant, not the fuel, but nevertheless the source of most of the chemical energy released in combustion. Combustion hazards also apply to compounds of oxygen with a high oxidative potential, such as peroxides, chlorates, nitrates, perchlorates, and dichromates because they can donate oxygen to a fire.',
     'qas': [{'answers': [{'answer_start': 46, 'text': 'rapid combustion'},
        {'answer_start': 46, 'text': 'rapid combustion'},
        {'answer_start': 46, 'text': 'rapid combustion'},
        {'answer_start': 46, 'text': 'rapid combustion'},
        {'answer_start': 46, 'text': 'rapid combustion'}],
       'question': 'What can focused oxygen pr