# Augmentation: Upsampling via Duplication of Infrequent Patterns


Use infrequent patterns for data augmentation:

* __OBJ_DIR__ is usually a verb (maximize, minimize). But there are a couple of examples where OBJ_DIR is an adjective (__I want the cost to be minimal__). Duplicating such examples might help.

* __LIMIT__ and __PARAM__ are some form of number, and there is skew in their types. Also, __ordinals__ have a higher chance of being mis-tagged by a gold NER system. So, duplicating ordinal examples might also help.

* __VAR__ mostly consists of conjoined noun chunks, but correctly identifying the span of a chunk is difficult. Conjoined prepositional phrases are also an infrequent pattern (__He does commercials with famous actors and commercials with regular actors__). Duplicating such examples might help.

* __OBJ_NAME__: OBJ_DIR followed by a prepositional phrase, or OBJ_DIR followed by a prepositional phrase that is itself followed by another prepositional phrase, is a rare pattern (__maximize the number of action figures__; __minimize the number of batches of cookies__). Duplicating such examples might help.

The __suffix__ number at the end of each filename represents __how many times the pattern has been duplicated__.


In [1]:
## Dependencies

# !pip install spacy==3.3.0  # version lower because of allennlp
# !pip install pandas

In [None]:
import os
import re
import json
import random
from collections import Counter

import pandas as pd

import spacy
from spacy.tokenizer import Tokenizer

from pathlib import Path

from tqdm import tqdm

import datetime
# Timestamp taken once at start-up; it is later embedded in the name of the
# augmented output file.
datetime_object = datetime.datetime.now()

nlp = spacy.load("en_core_web_trf")
# Replace the default tokenizer with one that has no prefix/suffix/infix
# rules and accepts any run of non-whitespace characters as a token.
# Presumably this keeps spaCy's tokens aligned with the whitespace-joined
# `orth` tokens of the training data -- TODO confirm.
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)


# Two directory levels above the current working directory.
root_directory = Path(os.path.abspath('')).parents[1]

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(
        root_directory / 'spacy_format_data/train.json',
        encoding='utf-8'
) as fname:
    data = json.load(fname)

# Entity labels for which pattern statistics are collected.
entities_list = ["CONST_DIR", "LIMIT", "VAR", "PARAM", "OBJ_NAME", "OBJ_DIR"]

# Looks for Patterns and their Counts

In [2]:
# label -> list of (entity text, "POS_dep POS_dep ..." pattern) pairs, one per
# annotated entity span in the training data.
pattern_map_train = {}
train_data = data[0]['paragraphs']
# Cache of parsed sentences keyed by their raw text, so that later cells can
# reuse the parse instead of running the pipeline again.
spacy_sentence_map = {}

for item in tqdm(train_data):

    entities = item['entities']
    raw_string = " ".join(
        [token['orth'] for token in item['sentences'][0]['tokens']])
    spacy_sentence = nlp(raw_string)

    spacy_sentence_map[raw_string] = spacy_sentence

    pos_tags = [tok.pos_ for tok in spacy_sentence]
    dep_tags = [tok.dep_ for tok in spacy_sentence]
    # One "POS_dep" string per token, e.g. "NOUN_pobj".
    patterns = [pos_ + "_" + dep_ for pos_, dep_ in zip(pos_tags, dep_tags)]

    for entity in entities:
        # entity[0] / entity[1] are character offsets into raw_string and
        # entity[-1] is the label.
        entity_span = spacy_sentence.char_span(entity[0], entity[1])
        if entity_span is None:
            # char_span returns None when the offsets do not fall on token
            # boundaries; fail with a message that identifies the sentence.
            raise ValueError(
                f"Entity offsets {entity[0]}-{entity[1]} do not align with "
                f"token boundaries in: {raw_string!r}"
            )
        span_pattern = ' '.join(
            patterns[entity_span[0].i:entity_span[-1].i + 1])
        # Append in place; a single code path covers both a new and an
        # already-seen label.
        pattern_map_train.setdefault(entity[-1], []).append(
            (raw_string[entity[0]:entity[1]], span_pattern))

# label -> {pattern: number of entity spans with that pattern}
final_pattern_count_dict = {}

for key in entities_list:
    # A label without any annotated span yields an empty table instead of a
    # KeyError.
    tagged_spans = pattern_map_train.get(key, [])
    key_phrases = [val[0] for val in tagged_spans]
    patterns_pos_dep = [val[1] for val in tagged_spans]

    pattern_counter = Counter(patterns_pos_dep)

    # Group the distinct example phrases by pattern in one pass. The inner
    # dict is used as an insertion-ordered set, so the example order is
    # deterministic.
    examples_by_pattern = {}
    for phrase, phrase_pattern in tagged_spans:
        examples_by_pattern.setdefault(phrase_pattern, {})[phrase] = None
    phrase_values = [
        list(examples_by_pattern[element]) for element in pattern_counter]

    # Inspection table: one row per pattern with its count and examples.
    df = pd.DataFrame({
        'Pattern': list(pattern_counter),
        'count': list(pattern_counter.values()),
    })
    df['examples'] = phrase_values
    final_df = df.sort_values(by=['count'], ascending=True)

    final_pattern_count_dict[key] = dict(pattern_counter)

print("Patterns in VAR Class: ", final_pattern_count_dict['VAR'])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 714/714 [03:10<00:00,  3.74it/s]

Patterns in VAR Class:  {'NOUN_compound NOUN_compound NOUN_pobj': 57, 'ADJ_amod NOUN_pobj': 279, 'NOUN_compound NOUN_compound NOUN_appos': 3, 'ADJ_amod NOUN_conj': 100, 'NOUN_compound NOUN_compound': 47, 'ADJ_amod NOUN_compound NOUN_pobj': 63, 'ADJ_amod NOUN_compound NOUN_nsubj': 30, 'PROPN_attr': 1, 'PROPN_compound PROPN_conj': 14, 'PROPN_compound': 3, 'PROPN_compound PROPN_compound': 5, 'NOUN_compound NOUN_appos': 46, 'NOUN_compound NOUN_conj': 161, 'NOUN_compound NOUN_nsubjpass': 20, 'NOUN_compound NOUN_dobj': 124, 'NOUN_conj': 294, 'NOUN_compound NOUN_pobj': 531, 'NOUN_pobj': 654, 'NOUN_dobj': 235, 'PROPN_appos': 21, 'PROPN_conj': 25, 'PROPN_pobj': 35, 'ADJ_compound NOUN_dobj': 2, 'NOUN_compound NOUN_attr': 15, 'NOUN_compound NOUN_nsubj': 203, 'NOUN_dobj PROPN_appos': 4, 'VERB_amod NOUN_appos': 2, 'VERB_amod NOUN_pobj': 17, 'VERB_amod NOUN_conj': 8, 'VERB_amod NOUN_dobj': 10, 'PROPN_compound PROPN_appos': 10, 'PROPN_compound PROPN_nsubj': 43, 'NOUN_nsubj': 293, 'NOUN_compound NOUN_




# Duplicate infrequent patterns according to number_of_duplications

In [3]:
# Frequency threshold: an entity whose (label, pattern) pair was counted fewer
# times than this marks its sentence for duplication.
number_of_duplications = 5

augmented_train = []

for item in tqdm(train_data):
    sentence_text = " ".join(
        tok['orth'] for tok in item['sentences'][0]['tokens'])
    # Reuse the parse cached while the pattern counts were collected.
    doc = spacy_sentence_map[sentence_text]

    # One "POS_dep" string per token.
    token_patterns = [f"{tok.pos_}_{tok.dep_}" for tok in doc]

    # Largest shortfall (threshold minus observed count) over the sentence's
    # entities. Starting at 0 means patterns at or above the threshold never
    # contribute.
    copies = 0
    for entity in item['entities']:
        span = doc.char_span(entity[0], entity[1])
        span_pattern = ' '.join(token_patterns[span[0].i:span[-1].i + 1])
        seen = final_pattern_count_dict[entity[-1]][span_pattern]
        copies = max(copies, number_of_duplications - seen)

    # The sentence is emitted `copies` times in total, and at least once.
    # NOTE(review): a pattern seen exactly number_of_duplications - 1 times
    # therefore gets a single copy, i.e. no extra duplicate -- confirm this is
    # intended.
    augmented_train.extend([item] * max(copies, 1))

# Drop the first entry. Presumably it is the -DOCSTART- record, since a
# DOCSTART line is added back before the file is written -- TODO confirm.
augmented_train = augmented_train[1:]

# Output lines: one tab-separated "word _ _ label" row per token, with a
# '\n' entry after every sentence.
augmented_train_data = []

# Shuffle in place so that duplicated sentences are not adjacent.
# No seed is set, so the order differs between runs.
random.shuffle(augmented_train)

# spacy to regular iob
for example in tqdm(augmented_train):
    for token in example['sentences'][0]['tokens']:
        word = token['orth']
        label = token['ner']
        # Rewrite only the two-character scheme prefix: L- (last token)
        # becomes I- and U- (single-token entity) becomes B-. Anchoring on the
        # prefix keeps an entity name that happens to contain "L-" or "U-"
        # from being rewritten.
        if label.startswith("L-"):
            label = "I-" + label[2:]
        elif label.startswith("U-"):
            label = "B-" + label[2:]
        word_list = [word, '_', '_', label]
        augmented_train_data.append('\t'.join(word_list))
    # Sentence separator; the writer emits this entry as a blank line.
    augmented_train_data.append('\n')

# add back the docstart before writing the file
augmented_train_data = ['-DOCSTART-\t_\t_\tO', '\n'] + augmented_train_data

###################################################################################

# Output file name: 'augmented_train' + duplication threshold + '__' + run
# timestamp.
# NOTE(review): str(datetime_object) contains a space and colons, which are
# not valid in Windows file names -- left unchanged here in case other
# tooling relies on the current naming.
fname = root_directory / 'augmented_train_data' / ('augmented_train' + str(number_of_duplications) + "__" + str(datetime_object) + '.txt')

# Create the output directory on first use instead of failing with
# FileNotFoundError.
fname.parent.mkdir(parents=True, exist_ok=True)

# Write with an explicit encoding so that non-ASCII tokens do not depend on
# the platform's default locale encoding.
with open(fname, 'w', encoding='utf-8') as f:
    for line in augmented_train_data:
        # '\n' entries are sentence separators and are written as-is (a blank
        # line); every other entry is a token row that needs a line ending.
        if line == '\n':
            f.write(f"{line}")
        else:
            f.write(f"{line}\n")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 714/714 [00:00<00:00, 7790.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1159/1159 [00:00<00:00, 16278.55it/s]
