## Synonym Replacement for Text Augmentation

In [1]:
# Record the author plus the Python, IPython, and nltk versions used for this run
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -p nltk

Author: Sebastian Raschka

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.12.0

nltk: 3.8.1



In [2]:
import nltk

# Fetch the WordNet corpus that the synonym lookup below relies on
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sebastian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet

# NOTE(review): repeats the download from the previous cell; harmless, since the package is reported as already up-to-date
nltk.download('wordnet')

def get_synonyms(word):
    """Collect the WordNet lemma names of every synset that `word` belongs to.

    The names are returned in lookup order without de-duplication, so the
    list may contain repeats as well as `word` itself.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]


# Example lookup; the result keeps duplicates and the query word itself
get_synonyms("quickly")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sebastian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['quickly',
 'rapidly',
 'speedily',
 'chop-chop',
 'apace',
 'promptly',
 'quickly',
 'quick',
 'cursorily',
 'quickly']

In [4]:
# Punkt models are required by nltk.word_tokenize / nltk.sent_tokenize, which
# the cells below call; without them a fresh environment raises LookupError.
nltk.download('punkt')

# for part of speech tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sebastian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
def get_position_tags(text):
    """Tokenize `text` and return NLTK's part-of-speech tag for each token.

    Returns a list of (token, tag) tuples.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))
    
# Example: tag every token of a sample sentence
get_position_tags("The cat quickly jumped over the lazy dog.")

[('The', 'DT'),
 ('cat', 'NN'),
 ('quickly', 'RB'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]

In [6]:
import random
import re

import nltk
from nltk.corpus import wordnet

# Seed Python's RNG so the sampled words and chosen synonyms are reproducible
random.seed(123)


def synonym_replacement(text, num_replacement=2):
    """Return `text` with randomly chosen adverbs/adjectives swapped for WordNet synonyms.

    Parameters
    ----------
    text : str
        The text (typically a single sentence) to augment.
    num_replacement : int, default 2
        How many words to replace.

    Returns
    -------
    str
        The augmented text. `text` is returned unchanged when it contains
        fewer than `num_replacement` replaceable words.

    Notes
    -----
    `get_synonyms` also lists the looked-up word itself, so a "replacement"
    can leave a word as it was.
    """
    words = nltk.word_tokenize(text)

    # Tag nouns, adjectives, etc.
    pos_tags = nltk.pos_tag(words)

    # Only replace adverbs (RB) and adjectives (JJ) for simplicity here
    candidates = [word for word, pos in pos_tags if pos in ('RB', 'JJ')]

    # Always hand back a string: returning the token list here would break
    # callers that join the per-sentence results.
    if len(candidates) < num_replacement:
        return text

    # Randomly choose the words to be replaced
    words_to_replace = random.sample(candidates, num_replacement)

    # For each word to replace, we get its synonyms and choose one randomly
    for word in words_to_replace:
        synonyms = get_synonyms(word)
        if synonyms:
            # WordNet joins multi-word lemmas with underscores; turn them
            # back into spaces before inserting into running text.
            synonym = random.choice(synonyms).replace('_', ' ')
            # Match the word only as a whole token so that, e.g., "not" is
            # not rewritten inside "cannot". Lookarounds are used instead of
            # \b so tokens that start or end with punctuation still work.
            pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
            # A callable replacement keeps the synonym literal (no backslash
            # or group-reference interpretation).
            text = re.sub(pattern, lambda _match: synonym, text, count=1)

    return text

In [7]:
# Sample input: one sentence, wrapped in newlines by the triple-quoted literal.
text = """
The cat quickly jumped over the lazy dog.
"""

# Augment sentence by sentence, then stitch the results back into a paragraph.
sentences = nltk.sent_tokenize(text)
augmented_sentences = list(map(synonym_replacement, sentences))
augmented_paragraph = ' '.join(augmented_sentences)

print(augmented_paragraph)


The cat rapidly jumped over the work-shy dog.


**Compare original with augmented text**

In [8]:
import difflib


# Word-level diff: '-' marks tokens only in the original, '+' marks tokens
# only in the augmented text, and unprefixed tokens are shared.
d = difflib.Differ()
diff = d.compare(text.split(), augmented_paragraph.split())

print(*diff, sep='\n')

  The
  cat
- quickly
+ rapidly
  jumped
  over
  the
- lazy
+ work-shy
  dog.
