<a href="https://colab.research.google.com/github/rhiosutoyo/Teaching-Deep-Learning-and-Its-Applications/blob/main/8_2_data_augmentation_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install Required Libraries
!pip install torch nltk translate



In [2]:
# Step 2: Import Libraries
import os
import urllib.request
import tarfile
import torch
import random
import nltk
from nltk.corpus import wordnet
from translate import Translator

# Fetch the WordNet corpus that get_synonyms() queries through wordnet.synsets().
nltk.download('wordnet')
# Also fetch the 'omw-1.4' package (presumably the Open Multilingual WordNet
# data needed by the installed NLTK WordNet reader -- TODO confirm it is required).
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Step 3: Create Data Augmentation Modules

# Random Insertion
def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` synonym-insertion attempts.

    The caller's list is never modified: the work happens on a copy, which
    ``add_word`` mutates in place once per attempt.
    """
    augmented = words.copy()
    attempts = range(n)
    for _attempt in attempts:
        add_word(augmented)
    return augmented

def add_word(new_words):
    """Insert a synonym of one randomly chosen word into ``new_words`` in place.

    Up to 10 randomly picked words are looked up with ``get_synonyms``; the
    first one that has at least one synonym supplies a randomly chosen
    synonym, which is inserted before a randomly chosen existing position.
    If the list is empty, or none of the attempts yields a synonym, the list
    is left unchanged.

    Args:
        new_words: list of word strings; mutated in place.

    Returns:
        None.
    """
    if not new_words:
        # No word to draw from; random.randint(0, -1) would raise ValueError.
        return

    max_attempts = 10
    for _ in range(max_attempts):
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        if synonyms:
            break
    else:
        # Every attempt came back empty: give up without modifying the list.
        return

    random_synonym = synonyms[random.randint(0, len(synonyms) - 1)]
    # The index is drawn from existing positions, so the synonym is never
    # appended after the last word.
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

# Random Deletion
def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    Args:
        words: list of word strings.
        p: deletion probability; a word is kept when a uniform draw from
            [0, 1] is strictly greater than ``p``.

    Returns:
        The input list itself when it has zero or one element (nothing can
        sensibly be deleted); otherwise a new list of the surviving words.
        If every word was deleted, a one-element list holding a randomly
        chosen original word is returned so the result is never empty.
    """
    # Empty input previously fell through to random.randint(0, -1) and raised.
    if len(words) <= 1:
        return words

    kept = [word for word in words if random.uniform(0, 1) > p]
    if not kept:
        return [random.choice(words)]
    return kept

# Random Swap
def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    The original list is left as-is; each round hands the working copy to
    ``swap_word`` and continues with whatever list it returns.
    """
    shuffled = words.copy()
    rounds = range(n)
    for _round in rounds:
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two distinct, randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: list of word strings; mutated in place.

    Returns:
        The same list object. Lists with fewer than two elements are
        returned unchanged because there is no pair of positions to swap.
    """
    if len(new_words) < 2:
        # Empty input used to raise from random.randint(0, -1); a single
        # element can never be swapped with a different position.
        return new_words

    last = len(new_words) - 1
    first_idx = random.randint(0, last)
    # Draw the second index from the remaining positions so it is always
    # distinct, instead of retrying a bounded number of times and possibly
    # giving up without swapping.
    second_idx = random.randint(0, last - 1)
    if second_idx >= first_idx:
        second_idx += 1

    new_words[first_idx], new_words[second_idx] = (
        new_words[second_idx],
        new_words[first_idx],
    )
    return new_words

# Synonym Replacement Helper
def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word`` as a list.

    Every lemma name of every synset returned by ``wordnet.synsets(word)``
    is normalised: underscores and hyphens become spaces, the text is
    lowercased, and any character other than a lowercase ASCII letter or a
    space is dropped. Candidates that end up empty after this filtering are
    skipped, and the word itself (compared case-insensitively) is excluded.

    Args:
        word: the word to look up.

    Returns:
        A list of unique synonym strings in arbitrary order; empty when
        nothing usable is found.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(ch for ch in candidate if ch in allowed_chars)
            # Filtering can leave stray edge spaces or nothing at all (e.g. a
            # purely numeric lemma); such a result would be inserted as a
            # blank "word" by the callers.
            candidate = candidate.strip()
            if candidate:
                synonyms.add(candidate)
    # Candidates are lowercased, so compare against the lowercased word;
    # otherwise "Movie" would keep "movie" as its own synonym.
    synonyms.discard(word.lower())
    return list(synonyms)

# Back Translation
def back_translation(sentence, src_language="en", tmp_language="fr"):
    """Translate ``sentence`` into ``tmp_language`` and back into ``src_language``.

    Two ``Translator`` objects are used, one per direction, and the text
    returned by the second translation is the result.
    """
    pivot_text = Translator(
        from_lang=src_language, to_lang=tmp_language
    ).translate(sentence)
    return Translator(
        from_lang=tmp_language, to_lang=src_language
    ).translate(pivot_text)

# Augment Sentence Function
def augment_sentence(sentence):
    """Build four augmented variants of ``sentence``.

    The sentence is split on whitespace, and the variants are returned in
    this order: one random insertion, random deletion with p=0.1, one random
    swap, and a back-translation of the untouched sentence.
    """
    tokens = sentence.split()
    with_insertion = ' '.join(random_insertion(tokens, 1))
    with_deletion = ' '.join(random_deletion(tokens, 0.1))
    with_swap = ' '.join(random_swap(tokens, 1))
    round_tripped = back_translation(sentence)
    return [with_insertion, with_deletion, with_swap, round_tripped]

In [4]:
# Step 4: Call Main Function to Test Augmentation
def main():
    """Print every sample review followed by its augmented variants."""
    positive_short = "The movie was fantastic! I really enjoyed it."
    positive_long = "The movie was a delight from start to finish. The captivating storyline and outstanding performances were complemented by breathtaking cinematography and a perfect soundtrack. The balance of humor and drama provided laughter and tears. Overall, it was a heartwarming and inspiring experience. Highly recommend for a meaningful and enjoyable watch."
    negative_short = "Absolutely terrible. Worst movie I've seen in years."
    negative_long = "This movie was a disappointment. The clichéd plot, one-dimensional characters, slow pacing, and cheesy dialogue were unbearable. Subpar acting and sloppy direction worsened it. Cheap visual effects and an unsatisfying ending made it a tedious experience. Not recommended."
    reviews = [positive_short, positive_long, negative_short, negative_long]

    for review in reviews:
        print(f"Original: {review}")
        variants = augment_sentence(review)
        # Number the variants from 1 in the printed output.
        for position, variant in enumerate(variants, start=1):
            print(f"Augmented {position}: {variant}")
        print()


# Run the demo only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()

Original: The movie was fantastic! I really enjoyed it.
Augmented 1: The movie was fantastic! film I really enjoyed it.
Augmented 2: The movie was fantastic! I really enjoyed it.
Augmented 3: The movie was fantastic! really I enjoyed it.
Augmented 4: The movie was fantastic! I really enjoyed it.

Original: The movie was a delight from start to finish. The captivating storyline and outstanding performances were complemented by breathtaking cinematography and a perfect soundtrack. The balance of humor and drama provided laughter and tears. Overall, it was a heartwarming and inspiring experience. Highly recommend for a meaningful and enjoyable watch.
Augmented 1: The movie was a delight from start to finish. The captivating storyline and outstanding performances were complemented by breathtaking cinematography and a perfect soundtrack. The balance of humor and drama provided laughter and tears. Overall, it was a heartwarming and inspiring experience. Highly recommend for a meaningful and 