This notebook is a demo of the [nlpaug](https://github.com/makcedward/nlpaug) package, which provides a variety of augmentations that supplement text data and introduce noise that may help your model generalize.

### Installation

In [None]:
! pip install nlpaug fairseq >> /dev/null

In [None]:
# Character-level (`nac`) and word-level (`naw`) augmenter modules.
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Sample input shared by the augmenter cells below.
# NOTE(review): looks like a product-listing title mixing English and
# Indonesian words — confirm this is representative of the target data.
test_sentence = "B33F My Food Shop Shopping Trolley - ST. 003 troli mainan edukasi anak keranjang belanja"

### Character Augmenter

1. keyboard : Augmenter that applies keyboard typo simulation to textual input.

In [None]:
# Keyboard augmenter: simulates typing errors in the input text.
# Every option is written on its own line so it is easy to locate and tweak.
aug = nac.KeyboardAug(
    name='Keyboard_Aug',
    # character-level settings
    aug_char_min=1,
    aug_char_max=10,
    aug_char_p=0.3,
    # word-level settings
    aug_word_p=0.3,
    aug_word_min=1,
    aug_word_max=10,
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    # which kinds of replacement characters are allowed
    include_special_char=True,
    include_numeric=True,
    include_upper_case=True,
    lang='en',
    verbose=0,
    stopwords_regex=None,
    model_path=None,
    min_char=4,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

2. ocr : Augmenter that applies OCR error simulation to textual input.

In [None]:
# OCR augmenter: simulates OCR recognition errors in the input text.
aug = nac.OcrAug(
    name='OCR_Aug',
    # character-level settings
    aug_char_min=1,
    aug_char_max=10,
    aug_char_p=0.3,
    # word-level settings
    aug_word_p=0.3,
    aug_word_min=1,
    aug_word_max=10,
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    verbose=0,
    stopwords_regex=None,
    min_char=1,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

3. random : Augmenter that applies random character errors to textual input.

In [None]:
# Random character augmenter: applies random character errors to the input
# text, here using the 'substitute' action.
#
# NOTE(review): the original call also passed `candidiates=None`. That keyword
# looks misspelled and may not exist under that name in every nlpaug release
# (a `TypeError` on construction if it does not). It was only being set to
# None, which is presumably the default, so it is omitted here — confirm
# against the installed nlpaug version.
aug = nac.RandomCharAug(
    action='substitute',
    name='RandomChar_Aug',
    # character-level settings
    aug_char_min=1,
    aug_char_max=10,
    aug_char_p=0.3,
    # word-level settings
    aug_word_p=0.3,
    aug_word_min=1,
    aug_word_max=10,
    # which kinds of replacement characters are allowed
    include_upper_case=True,
    include_lower_case=True,
    include_numeric=True,
    min_char=4,
    swap_mode='adjacent',
    spec_char='!@#$%^&*()_+',
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    verbose=0,
    stopwords_regex=None,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

### Word Augmenter

1. antonym : Augmenter that applies semantic-meaning-based substitution (antonyms) to textual input.

In [None]:
# Antonym augmenter, demonstrated on a short English phrase rather than on
# the shared `test_sentence`.
aug = naw.AntonymAug(
    name='Antonym_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    lang='eng',
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    verbose=0,
)

# Show the input phrase and its augmented version on consecutive lines.
test_sentence_aug = aug.augment("very interesting")
print("very interesting", test_sentence_aug, sep='\n')

2. back_translation : Augmenter that applies a word-level operation to textual input based on back translation.

**Downloading the models takes about 15 minutes.**

In [None]:
# Disabled by default: running this cell triggers the model download that the
# note above estimates at about 15 minutes. Uncomment the lines below to try
# the back-translation augmenter.
# aug = naw.BackTranslationAug(from_model_name='transformer.wmt19.en-de', to_model_name='transformer.wmt19.de-en', 
#                              from_model_checkpt='model1.pt', to_model_checkpt='model1.pt', tokenizer='moses', 
#                              bpe='fastbpe', is_load_from_github=True, name='BackTranslationAug', device='cpu', 
#                              force_reload=False, verbose=0)

# test_sentence_aug = aug.augment(test_sentence)
# print(test_sentence)
# print(test_sentence_aug)

3. context_word_embedding : Augmenter that applies a word-level operation to textual input based on contextual word embeddings.

In [None]:
# Contextual word-embedding augmenter: word-level 'substitute' action driven
# by the 'bert-base-uncased' model, run on the CPU.
aug = naw.ContextualWordEmbsAug(
    # model selection
    model_path='bert-base-uncased',
    model_type='',
    action='substitute',
    # sampling settings
    temperature=1.0,
    top_k=100,
    top_p=None,
    name='ContextualWordEmbs_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    stopwords=None,
    device='cpu',
    force_reload=False,
    optimize=None,
    stopwords_regex=None,
    verbose=0,
    silence=True,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

4. random : Augmenter that applies random word operations to textual input.

In [None]:
# Random word augmenter, configured here with the 'delete' action.
aug = naw.RandomWordAug(
    action='delete',
    name='RandomWord_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    # tokenisation / stopword handling
    stopwords=None,
    target_words=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    verbose=0,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

5. spelling : Augmenter that applies spelling error simulation to textual input.

In [None]:
# Spelling augmenter: simulates spelling mistakes in the input text.
# `dict_path=None` means no custom dictionary path is supplied here.
aug = naw.SpellingAug(
    dict_path=None,
    name='Spelling_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    include_reverse=True,
    stopwords_regex=None,
    verbose=0,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

6. split : Augmenter that applies a word-splitting operation to textual input.

In [None]:
# Split augmenter: applies a word-splitting operation to the input text.
aug = naw.SplitAug(
    name='Split_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    min_char=4,
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    verbose=0,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

7. synonym : Augmenter that applies semantic-meaning-based substitution (synonyms) to textual input.

In [None]:
# Synonym augmenter, using 'wordnet' as the synonym source.
aug = naw.SynonymAug(
    aug_src='wordnet',
    model_path=None,
    name='Synonym_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    lang='eng',
    # tokenisation / stopword handling
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    force_reload=False,
    verbose=0,
)

# Show the original sentence and its augmented version on consecutive lines.
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence, test_sentence_aug, sep='\n')

8. tfidf : Augmenter that applies TF-IDF-based operations to textual input.

**TfIdfAug has to be trained on your own data. You can refer to this notebook for the training steps:** https://github.com/makcedward/nlpaug/blob/master/example/tfidf-train_model.ipynb

In [None]:
# Disabled by default: as noted above, TfIdfAug has to be trained on your own
# data first. Presumably `model_path` must point to where that trained model
# was saved — TODO confirm. Uncomment once a trained model is available.
# aug = naw.TfIdfAug(model_path='.', action='substitute', name='TfIdf_Aug', aug_min=1, aug_max=10, aug_p=0.3, top_k=5, 
#                    stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=0)
# 
# test_sentence_aug = aug.augment(test_sentence)
# print(test_sentence)
# print(test_sentence_aug)

9. word_embs : Augmenter that applies an operation to textual input based on word embeddings.

In [None]:
# Disabled by default.
# NOTE(review): presumably `model_path` must point to a word2vec model that
# this notebook does not provide — confirm, then uncomment to run.
# aug = naw.WordEmbsAug(model_type='word2vec', model_path='.', model=None, action='substitute', name='WordEmbs_Aug', 
#                       aug_min=1, aug_max=10, aug_p=0.3, top_k=100, n_gram_separator='_', stopwords=None, tokenizer=None, 
#                       reverse_tokenizer=None, force_reload=False, stopwords_regex=None, verbose=0)

# test_sentence_aug = aug.augment(test_sentence)
# print(test_sentence)
# print(test_sentence_aug)

### References
* [GitHub](https://github.com/makcedward/nlpaug)
* [Augmenting the Data](https://www.kaggle.com/jpmiller/augmenting-the-data)