In [1]:
import nltk
from nltk.corpus import wordnet
import random

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    if word in synonyms:
        synonyms.remove(word)  # Remove the original word from synonyms
    return list(synonyms)

def synonym_replacement(sentence, n):
    words = nltk.word_tokenize(sentence)
    replaced_sentence = words[:]
    words_indexes = list(range(len(words)))
    random.shuffle(words_indexes)
    
    replaced = 0
    for i in words_indexes:
        synonyms = get_synonyms(words[i])
        if synonyms:
            synonym = random.choice(synonyms)
            replaced_sentence[i] = synonym
            replaced += 1
            if replaced >= n:  # Replace up to n words
                break

    return ' '.join(replaced_sentence)

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
replaced_sentence = synonym_replacement(sentence, 3)  # Replace up to 3 words
print("Original sentence:", sentence)
print("Sentence after synonym replacement:", replaced_sentence)

[nltk_data] Downloading package wordnet to /home/raaif/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/raaif/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/raaif/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original sentence: The quick brown fox jumps over the lazy dog
Sentence after synonym replacement: The quick brown play_a_trick_on jumping over the work-shy dog


In [9]:
from transformers import MarianMTModel, MarianTokenizer

def back_translate(text, model_name_src_to_tgt, model_name_tgt_to_src):
    # Initialize the tokenizer and model for source to target language
    tokenizer_src_to_tgt = MarianTokenizer.from_pretrained(model_name_src_to_tgt)
    model_src_to_tgt = MarianMTModel.from_pretrained(model_name_src_to_tgt)
    
    # Translate from source to target language
    translated = model_src_to_tgt.generate(**tokenizer_src_to_tgt(text, return_tensors="pt", padding=True))
    
    # Decode the translated text
    tgt_text = tokenizer_src_to_tgt.decode(translated[0], skip_special_tokens=True)
    
    # Initialize the tokenizer and model for target to source language
    tokenizer_tgt_to_src = MarianTokenizer.from_pretrained(model_name_tgt_to_src)
    model_tgt_to_src = MarianMTModel.from_pretrained(model_name_tgt_to_src)
    
    # Translate back from target to source language
    back_translated = model_tgt_to_src.generate(**tokenizer_tgt_to_src(tgt_text, return_tensors="pt", padding=True))
    
    # Decode the back-translated text
    src_text = tokenizer_tgt_to_src.decode(back_translated[0], skip_special_tokens=True)
    
    return src_text

# Example usage
text = "The quick brown fox jumps over the lazy dog"
text2 = "A hungry man wants to eat a pizza from the local takeaway"
model_name_src_to_tgt = "Helsinki-NLP/opus-mt-en-fr"
model_name_tgt_to_src = "Helsinki-NLP/opus-mt-fr-en"

back_translated_text = back_translate(text2, model_name_src_to_tgt, model_name_tgt_to_src)
print("Original text:", text2)
print("Back-translated text:", back_translated_text)



Original text: A hungry man wants to eat a pizza from the local takeaway
Back-translated text: A hungry man wants to eat a pizza from the corner to take away


In [10]:
import random
import string

def inject_noise(sentence, noise_type='all', noise_level=0.1):
    """
    Injects noise into a given sentence. The type of noise can be specified.
    - sentence: The input sentence to which noise will be added.
    - noise_type: The type of noise to add ('insert', 'delete', 'substitute', or 'all').
    - noise_level: Fraction of characters to alter (between 0 and 1).
    """
    # Function to insert noise: Randomly adds a character within a word
    def insert_noise(word):
        if len(word) > 1:  # Avoid inserting into very short words
            insert_pos = random.randint(1, len(word)-1)  # Avoid inserting at the first position
            insert_char = random.choice(string.ascii_lowercase)
            return word[:insert_pos] + insert_char + word[insert_pos:]
        return word

    # Function to delete noise: Randomly removes a character from a word
    def delete_noise(word):
        if len(word) > 1:
            delete_pos = random.randint(0, len(word)-1)
            return word[:delete_pos] + word[delete_pos+1:]
        return word

    # Function to substitute noise: Replaces a character with a random character
    def substitute_noise(word):
        if len(word) > 1:
            substitute_pos = random.randint(0, len(word)-1)
            substitute_char = random.choice(string.ascii_lowercase)
            return word[:substitute_pos] + substitute_char + word[substitute_pos+1:]
        return word

    noise_functions = {
        'insert': insert_noise,
        'delete': delete_noise,
        'substitute': substitute_noise
    }

    words = sentence.split()
    num_words_to_change = max(1, int(len(words) * noise_level))  # Ensure at least one word is changed
    words_to_change = random.sample(words, num_words_to_change)

    for i, word in enumerate(words):
        if word in words_to_change:
            if noise_type == 'all':
                # Apply a random noise function
                noise_func = random.choice(list(noise_functions.values()))
                words[i] = noise_func(word)
            else:
                # Apply the specified noise function
                words[i] = noise_functions[noise_type](word)

    return ' '.join(words)

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
print("Original sentence:", sentence)
print("With insertion noise:", inject_noise(sentence, 'insert'))
print("With deletion noise:", inject_noise(sentence, 'delete'))
print("With substitution noise:", inject_noise(sentence, 'substitute'))
print("With random noise of all types:", inject_noise(sentence, 'all'))

Original sentence: The quick brown fox jumps over the lazy dog
With insertion noise: The quick brown fox jaumps over the lazy dog
With deletion noise: The quick brown fox jups over the lazy dog
With substitution noise: The quick brown fox jumps over the lazy dlg
With random noise of all types: The quick brown fox jumps over the lazy dag
