# 1. Synonym Replacement

Corpus : Collection of all words in documents

Vocabulary : All unique words in corpus

nltk : Natural Language Toolkit, a Python library for text processing

wordnet : Lexical database of English words, available through nltk

omw-1.4 : Open Multilingual WordNet data, which enables multilingual support in WordNet

punkt : Pre-trained tokenizer models that nltk uses for tokenization

In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [7]:
import random
import nltk
from nltk.corpus import wordnet

# Fetch the NLTK data packages used below (each call is a no-op when the
# package is already up to date, as the cell output shows).
# wordnet: lexical database queried by get_synonyms
nltk.download('wordnet')
# omw-1.4: multilingual WordNet data
nltk.download('omw-1.4')
# punkt: tokenizer models used for tokenization
nltk.download('punkt')
# punkt_tab: presumably the tokenizer tables required by newer NLTK releases — TODO confirm
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# text = "The movie was absolutely fantastic and enjoyable"

In [4]:
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    Every lemma of every synset (meaning) of ``word`` is collected.
    Underscores in multi-word lemma names are replaced by spaces, and any
    lemma equal to ``word`` itself (case-insensitive) is excluded.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A sorted list of unique synonym strings; empty when WordNet has no
        synonyms for ``word``. The list is sorted so that a seeded
        ``random.choice`` over it is reproducible.
    """
    target = word.lower()

    # A set removes duplicates: the same lemma often appears in several synsets
    synonyms = set()

    # Each synset is one meaning of the word
    for syn in wordnet.synsets(word):
        # Each synset has multiple lemmas (words sharing that meaning)
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')

            # This check must run for every lemma, i.e. inside the inner loop;
            # otherwise only the last lemma of each synset is ever considered.
            if synonym.lower() != target:
                synonyms.add(synonym)

    return sorted(synonyms)

In [5]:
def synonym_replacement(text, n=2):
    """Replace up to ``n`` distinct words of ``text`` with random synonyms.

    The text is tokenized with ``nltk.word_tokenize``. Purely alphabetic
    tokens are candidates; they are visited in random order and every
    occurrence of a chosen word is replaced by one randomly selected synonym
    from ``get_synonyms``. Words without synonyms are skipped.

    Args:
        text: The sentence to augment.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            every token unchanged.

    Returns:
        The tokens joined by single spaces, with replacements applied.
    """
    # Tokenize sentence into words
    # (str.split or spaCy are alternatives; spaCy handles terms like "5km"
    # or "New Delhi" better.)
    words = nltk.word_tokenize(text)

    # Nothing may be replaced when no replacements were requested
    if n <= 0:
        return ' '.join(words)

    # Work on a copy so the original token list stays intact
    new_words = words.copy()

    # Unique alphabetic tokens. dict.fromkeys de-duplicates while keeping
    # first-seen order, so the shuffle below is reproducible under
    # random.seed (a set's order varies with string hash randomisation).
    candidates = list(dict.fromkeys(word for word in words if word.isalpha()))
    random.shuffle(candidates)

    # Keep track of how many distinct words were replaced
    num_replaced = 0

    for word in candidates:
        synonyms = get_synonyms(word)

        # Skip words for which no synonym exists
        if not synonyms:
            continue

        # Choose a random synonym and substitute every occurrence of the word
        synonym = random.choice(synonyms)
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

        if num_replaced >= n:
            break

    return ' '.join(new_words)

In [8]:
# Demo: augment a sample sentence, replacing at most two distinct words with synonyms
original = "The movie was absolutely fantastic and enjoyable"
augmented = synonym_replacement(original, n=2)

# The `=` specifier prints each variable's name next to its repr
print(f"{original=} and {augmented=}")

original='The movie was absolutely fantastic and enjoyable' and augmented='The movie was absolutely wondrous and pleasurable'


In [9]:
def bigram_flip(text):
    """Swap one randomly chosen pair of adjacent tokens in ``text``.

    The text is tokenized with ``nltk.word_tokenize``; one bigram (pair of
    neighbouring tokens) is picked uniformly at random and its two tokens
    trade places.

    Args:
        text: The sentence to augment.

    Returns:
        The tokens joined by single spaces with one bigram flipped, or
        ``text`` unchanged when it has fewer than two tokens.
    """
    tokens = nltk.word_tokenize(text)

    # A bigram needs two tokens; with fewer there is nothing to flip
    if len(tokens) < 2:
        return text

    # Every index except the last can be the first token of a bigram
    bigram_starts = list(range(len(tokens) - 1))
    left = random.choice(bigram_starts)
    right = left + 1

    # Rebuild the token list with the selected pair in reversed order
    flipped = tokens[:left] + [tokens[right], tokens[left]] + tokens[right + 1:]

    return ' '.join(flipped)

In [10]:
# Example usage
original2 = "The movie was absolutely fantastic and enjoyable"
augmented2 = bigram_flip(original2)

print(f"{original2=} and {augmented2=}")

original2='The movie was absolutely fantastic and enjoyable' and augmented2='The movie was absolutely and fantastic enjoyable'


# 3. Back Translation

In [11]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m857.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


We translate the text into another language and then translate it back into the original language. The round trip often changes the wording of the sentence while preserving its meaning.

In [12]:
from deep_translator import GoogleTranslator

def back_translate_verbose(text, intermediate_lang='fr'):
    """Back-translate ``text`` through ``intermediate_lang`` and print each step.

    The text is translated into ``intermediate_lang`` (source language
    auto-detected) and the result is translated back into English. The
    original, intermediate and final sentences are printed.

    Args:
        text: The sentence to augment.
        intermediate_lang: Language used for the round trip (default ``'fr'``).

    Returns:
        The back-translated English text, or ``text`` unchanged if any
        translation step raises (the error is printed, not re-raised, so a
        failed request does not abort the notebook).
    """
    try:
        translated = GoogleTranslator(source='auto', target=intermediate_lang).translate(text)

        # The intermediate language is known here, so pass it explicitly
        # instead of relying on auto-detection, which can misfire on short text.
        back_translated = GoogleTranslator(source=intermediate_lang, target='en').translate(translated)

        print(f"Original: {text}")
        print(f"Translated: ({intermediate_lang}): {translated}")
        print(f"Back Translated (English):{back_translated}")

        return back_translated

    except Exception as e:
        # Deliberate best effort: fall back to the untouched input
        print("Translation error: ", e)
        return text

In [13]:
# Demo: round-trip a sample sentence through French; the function prints each step
original3 = "The movie was absolutely fantastic and enjoyable"
augmented3 = back_translate_verbose(original3, intermediate_lang='fr')

Original: The movie was absolutely fantastic and enjoyable
Translated: (fr): Le film était absolument fantastique et agréable
Back Translated (English):The film was absolutely fantastic and pleasant


# 4. Adding Noise

## a. Random character swaps

In [18]:
def add_noise(text, noise_level=0.1):
    """Inject typo-like noise by swapping random pairs of adjacent characters.

    ``int(len(text) * noise_level)`` swaps are performed, each at a random
    position. Swaps only reorder characters, so the result has the same
    length and the same characters as ``text``.

    Args:
        text: The string to perturb.
        noise_level: Number of swaps expressed as a fraction of the text
            length (default ``0.1``).

    Returns:
        The noisy string, or ``text`` unchanged when it has fewer than two
        characters (no adjacent pair exists to swap).
    """
    # Strings are immutable, so work on a mutable list of characters
    text_chars = list(text)

    # Without at least two characters there is no valid swap position;
    # random.randint(0, len - 2) would raise ValueError on the empty range.
    if len(text_chars) < 2:
        return text

    # Calculate number of noisy operations to perform
    num_noisy = int(len(text_chars) * noise_level)

    # Last index that still has a right-hand neighbour to swap with
    last_start = len(text_chars) - 2

    # Perform num_noisy random adjacent swaps
    for _ in range(num_noisy):
        idx = random.randint(0, last_start)

        # Swap characters
        text_chars[idx], text_chars[idx+1] = text_chars[idx+1], text_chars[idx]

    # Join without a separator so no extra characters are introduced and the
    # augmented sentence keeps the original length
    return ''.join(text_chars)

In [20]:
# Demo: perturb a sample sentence; the swap count is int(len(text) * noise_level)
original4 = "The movie was absolutely fantastic and enjoyable"
augmented4 = add_noise(original4, noise_level=0.1)

# The `=` specifier prints each variable's name next to its repr
print(f"{original4=} and {augmented4=}")


original4='The movie was absolutely fantastic and enjoyable' and augmented4='Th emovie was absolutley fantastic and enjoyable'
