## Imports

In [None]:
import random
from random import randrange

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc
import pandas as pd
from nlpaug.util import Action
from nltk import tokenize

# Note: tqdm <= v4.8
from tqdm import tqdm, tqdm_pandas

tqdm_pandas(tqdm())

# Note: tqdm > v4.8
# from tqdm.auto import tqdm
# tqdm.pandas()

from utils import merge_title_perex_body

## Constants

In [None]:
# Input files
SENSITIVE_DATA_FILEPATH = "data/processed-sensitive-data.csv"
SENSITIVE_DATA_AUGMENTED_FILEPATH = "data/processed-sensitive-data-augmented.csv"

# Multiple splits can be created. The id of each augmented text is built by prefixing the id of the original text
# with the SPLIT value (f"{SPLIT}{id}"), so the original id that was used for augmentation can still be tracked.
SPLIT = 4
RANDOM_SEED = 11

# Seed Python's global RNG so that the per-sentence choice of augmenter (drawn with `randrange`) is reproducible
# after Restart & Run All.
# NOTE(review): randomness used inside the nlpaug augmenters themselves may come from other generators
# (e.g. numpy / torch) that this call does not control - confirm if fully deterministic output is required.
random.seed(RANDOM_SEED)

# Output file
AUGMENTED_DATA_SPLIT = f"data/augmented_{SPLIT}.csv"

## Read sensitive data

In [None]:
# Load the processed sensitive texts; the first CSV column becomes the DataFrame index
# (presumably the `id` column, since a later cell groups by "id" - verify against the file).
df_sensitive = pd.read_csv(SENSITIVE_DATA_FILEPATH, index_col=0)

In [None]:
# Sanity check: preview the first three loaded rows.
df_sensitive.head(3)

In [None]:
# Peek at the title of the first row as a quick example of the raw data.
text = df_sensitive["title"].iloc[0]
text

In [None]:
# Build a single free-text field per row by applying `merge_title_perex_body` row-wise (axis=1),
# with a tqdm progress bar, and store it in the "text" column.
merged_texts = df_sensitive.progress_apply(merge_title_perex_body, axis=1)
df_sensitive["text"] = merged_texts
df_sensitive["text"].head(n=3)

### Split to sentences

In [None]:
# Tokenize every merged text into a list of sentences with NLTK's `sent_tokenize`
# and keep the lists in the "sentences" column.
sentences_per_text = df_sensitive["text"].progress_apply(tokenize.sent_tokenize)
df_sensitive["sentences"] = sentences_per_text
df_sensitive["sentences"].head(n=3)

## Augmentation

The idea of augmenting the data is that we split the text into sentences and slightly change each sentence.

Used augmentations:
- randomly swap words
- delete random word or random sequence of words
- replace some words with wordnet synonyms or paraphrase database synonyms
- insert or substitute words using a language model (roberta-base used)
- insert or substitute words using word-embedding vectors (fastText `crawl-300d-2M.vec` vectors used)

### Define augmentations

In [None]:
# Registry of the available word-level augmenters, keyed by a short descriptive name.
AUGMENTERS = {
    # Synonym replacement (WordNet / PPDB)
    "replace-synon-wordnet": naw.SynonymAug(aug_src="wordnet"),
    "replace-synon-ppdb": naw.SynonymAug(aug_src="ppdb", model_path="ppdb-2.0-s-all"),
    # Random word-level edits
    "swap-word": naw.RandomWordAug(action="swap"),
    # No explicit action: relies on RandomWordAug's default (presumably "delete", per the key name - confirm).
    "delete-word": naw.RandomWordAug(),
    "delete-seq-of-words": naw.RandomWordAug(action="crop"),
    # Contextual language model (roberta-base)
    "substitue-lm": naw.ContextualWordEmbsAug(
        model_path="roberta-base",
        action="substitute",
    ),
    "insert-lm": naw.ContextualWordEmbsAug(
        model_path="roberta-base",
        action="insert",
    ),
    # Static word embeddings; despite the "w2v" key names, the model loaded here is fastText (crawl-300d-2M.vec).
    "substitue-w2v": naw.WordEmbsAug(
        model_type="fasttext",
        model_path="crawl-300d-2M.vec",
        action="substitute",
    ),
    "insert-w2v": naw.WordEmbsAug(
        model_type="fasttext",
        model_path="crawl-300d-2M.vec",
        action="insert",
    ),
}

# Positional lookup (0 .. len(AUGMENTERS) - 1 -> augmenter name) so an augmenter can be drawn by random integer.
id_to_key = dict(enumerate(AUGMENTERS))

In [None]:
# Display the configured augmenters to verify that they were constructed as expected.
AUGMENTERS

In [None]:
def apply_random_augmentation(text):
    """Augment one sentence with an augmenter picked at random from ``AUGMENTERS``.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented sentence as a single string. If the chosen augmenter raises or
        returns an empty result, the original ``text`` is returned unchanged.
    """
    augmentation = id_to_key[randrange(len(AUGMENTERS))]
    try:
        augmented = AUGMENTERS[augmentation].augment(text)
    except Exception as exc:
        # Deliberate best-effort: a failing augmenter must not abort the whole run,
        # so report it and keep the sentence as it was.
        print(f"Exception for {augmentation} augmentation, using sentence without augmentation. {exc}")
        return text

    # Newer nlpaug releases return a list of outputs even for a single input string, while older
    # ones return a plain string. Normalise to a string so the sentences can later be joined.
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented

### Apply random augmentation on each sentence separately 

In [None]:
# Flatten the per-text sentence lists into one row per sentence; every sentence keeps the index
# label of the text it came from, which is what allows grouping the sentences back together later.
df_sentences = df_sensitive["sentences"].explode()

In [None]:
# Augment every sentence independently (with a progress bar) and name the resulting Series "text".
df_augmented = df_sentences.progress_apply(apply_random_augmentation).rename("text")

In [None]:
# Prefix the id of the original text with the SPLIT value so the source id stays recoverable.
# The new index is derived from `df_sentences` (the Series `df_augmented` was computed from, so both share
# the same index) rather than from `df_augmented.index` itself. This keeps the cell idempotent: re-running it
# does not stack a second SPLIT prefix onto already-prefixed ids.
df_augmented.index = df_sentences.index.map(lambda original_id: f"{SPLIT}{original_id}")

In [None]:
# Preview the first three augmented sentences together with their new ids.
df_augmented.head(3)

### Group sentences back and save augmentation split into file

In [None]:
# Re-assemble each text by joining its augmented sentences (in row order) into one string per id.
# Aggregating per group yields exactly one row per id; `sort=False` keeps the ids in order of first appearance.
# Broadcasting the joined text with `transform` and then calling `drop_duplicates` would de-duplicate by text
# value, silently dropping distinct ids whose joined texts happen to be identical.
df_augmented_texts = df_augmented.groupby("id", sort=False).agg(" ".join)
df_augmented_texts.to_csv(AUGMENTED_DATA_SPLIT)