In [1]:
# This example covers the basics of augmenting an existing SQuAD-like dataset with various types of noise.

In [2]:
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features

In [4]:
# Load the SQuAD v1.1 dev set that is bundled with the package.
# NOTE(review): the resource is resolved from the 'katanaqa' package, while a later
# cell imports SQuADDataset from 'doggmentator' -- confirm both names resolve in
# the target environment.
import pkg_resources
import json
data_file = pkg_resources.resource_filename(
            'katanaqa', 'support/dev-v1.1.json')
# JSON text is UTF-8 by specification; pass the encoding explicitly so the load
# does not depend on the platform's default locale encoding (e.g. cp1252 on Windows).
with open(data_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [5]:
# Augmentation settings, forwarded as keyword arguments to the dataset generator.
hparams = dict(
    # Replace up to 2 terms in the original question for each example.
    num_replacements=2,
    # Fraction of the original dataset size to generate (0.0001 == 0.01%).
    sample_ratio=0.0001,
    # Randomly sample replacement tokens (excluding entities).
    sampling_strategy='random',
    # Only consider the 5 highest importance scores across tokens.
    sampling_k=5,
    # Using the dev dataset.
    is_training=False,
    out_prefix='dev',
    save_freq=1000,
    # Unnormalized sampling weights for each perturbation type:
    # misspelling, w2v synonym replacement, and term dropping.
    p_misspelling=1,
    p_replace=0.5,
    p_dropword=1,
    # Do not load progress from a local checkpoint.
    from_checkpoint=False,
)

In [8]:
# Instantiate the Dataset object from the loaded SQuAD dict and the
# augmentation settings (each hparams entry is passed as a keyword argument).
from doggmentator.augment.augment_squad import SQuADDataset
generator = SQuADDataset(data, **hparams)

# Generate augmented examples according to the perturbation distribution.
# The unnormalized weights p_replace=0.5, p_misspelling=1, p_dropword=1
# normalize to 20% synonyms, 40% misspellings, 40% drop terms.
# The perturbed examples will be generated for each original example until
# the total number defined by the sample_ratio is reached.
generator.generate()

10/14/2020 21:07:47 - doggmentator - INFO - Running SQuADDataset with hparams {'num_replacements': 2, 'sample_ratio': 0.0001, 'p_replace': 0.5, 'p_dropword': 1, 'p_misspelling': 1, 'sampling_strategy': 'random', 'sampling_k': 5}
10/14/2020 21:07:47 - doggmentator - DEBUG - generators.py: loading pkg data /Users/asisto/dev/Searchable/Research/Doggmentator/src/doggmentator/support/counter-fitted-vectors.txt


recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]

10/14/2020 21:07:58 - doggmentator - DEBUG - generators.py: loading pkg data /Users/asisto/dev/Searchable/Research/Doggmentator/src/doggmentator/support/missp.json



recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]

10/14/2020 21:08:01 - doggmentator - INFO - Generating 2 aug examples from 10570 orig examples





10/14/2020 21:08:01 - doggmentator - INFO - Generated 0 examples


In [9]:
# Calling the generator object displays the resulting dataset; per the output
# below it is a SQuAD-style dict with 'version' and 'data' keys, and each
# question entry carries an 'orig_id' field.
# NOTE(review): this renders the full structure in the cell output -- consider
# inspecting a small slice instead if the output becomes large.
generator()

{'version': '1.1',
 'data': [{'title': 'Black_Death',
   'paragraphs': [{'context': 'Plague was reportedly first introduced to Europe via Genoese traders at the port city of Kaffa in the Crimea in 1347. After a protracted siege, during which the Mongol army under Jani Beg was suffering from the disease, the army catapulted the infected corpses over the city walls of Kaffa to infect the inhabitants. The Genoese traders fled, taking the plague by ship into Sicily and the south of Europe, whence it spread north. Whether or not this hypothesis is accurate, it is clear that several existing conditions such as war, famine, and weather contributed to the severity of the Black Death.',
     'qas': [{'answers': [{'answer_start': 244, 'text': 'infected corpses'},
        {'answer_start': 244, 'text': 'infected corpses'},
        {'answer_start': 244, 'text': 'infected corpses'}],
       'question': 'What did the Mongol army thow in their catapults?',
       'orig_id': '57264845f1498d1400e8db0c',