## Noise Injection for Text Augmentation

In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v

Author: Sebastian Raschka

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.12.0



### Random Character Insertion

In [5]:
import random
import string


def random_character_insertion(text, insertion_rate=0.1):
    """Return ``text`` with random ASCII letters spliced in at random spots.

    Args:
        text: The string to augment.
        insertion_rate: Fraction of the *original* length that determines how
            many letters are inserted; the count is
            ``int(len(text) * insertion_rate)`` and is fixed before any
            insertion happens.

    Returns:
        The augmented string. The input is returned unchanged when the
        computed insertion count is zero or negative.

    Randomness comes from the module-level ``random`` generator, so calling
    ``random.seed(...)`` beforehand makes the result reproducible.
    """
    remaining = int(len(text) * insertion_rate)
    noisy = text

    while remaining > 0:
        # randint includes both bounds, so a letter can also be appended
        # after the last character (cut == len(noisy)).
        cut = random.randint(0, len(noisy))
        letter = random.choice(string.ascii_letters)
        head, tail = noisy[:cut], noisy[cut:]
        noisy = head + letter + tail
        remaining -= 1

    return noisy

In [9]:
# Seed the module-level generator so the augmented sentence is reproducible.
random.seed(1)

text = "The cat jumped over the dog."
augmented_text = random_character_insertion(text=text)
print("Random Character Insertion:", augmented_text)

Random Character Insertion: The Kcat jumped over the doZg.


In [17]:
import difflib


# Strings are sequences of characters, so Differ compares them one character
# at a time; a leading '+' marks a character present only in augmented_text.
d = difflib.Differ()
diff = d.compare(text, augmented_text)

print(*diff, sep='\n')

  T
  h
  e
   
+ K
  c
  a
  t
   
  j
  u
  m
  p
  e
  d
   
  o
  v
  e
  r
   
  t
  h
  e
   
  d
  o
+ Z
  g
  .


### Random Character Deletion

In [18]:
import random

def random_character_deletion(text, deletion_rate=0.1):
    """Return ``text`` with randomly chosen characters removed.

    Args:
        text: The string to augment.
        deletion_rate: Fraction of the *original* length that determines how
            many characters are removed; the count is
            ``int(len(text) * deletion_rate)``.

    Returns:
        The shortened string. Deletion stops early once nothing is left, so a
        rate above 1 yields an empty string rather than an error.

    Randomness comes from the module-level ``random`` generator, so calling
    ``random.seed(...)`` beforehand makes the result reproducible.
    """
    budget = int(len(text) * deletion_rate)
    shortened = text
    removed = 0

    # Stop when the budget is used up or there is nothing left to delete.
    while removed < budget and len(shortened) > 0:
        victim = random.randint(0, len(shortened) - 1)
        shortened = shortened[:victim] + shortened[victim + 1:]
        removed += 1

    return shortened

In [19]:
random.seed(1)


text = "The cat jumped over the dog."
augmented_text = random_character_deletion(text)
print("Random Character Insertion:", augmented_text)

Random Character Insertion: The at jumped overthe dog.


In [20]:
import difflib


# Strings are sequences of characters, so Differ compares them one character
# at a time; a leading '-' marks a character present only in the original text.
d = difflib.Differ()
diff = d.compare(text, augmented_text)

print(*diff, sep='\n')

  T
  h
  e
   
- c
  a
  t
   
  j
  u
  m
  p
  e
  d
   
  o
  v
  e
  r
-  
  t
  h
  e
   
  d
  o
  g
  .


### Typo Introduction

In [21]:
import random

def typo_introduction(text, introduction_rate=0.1):
    """Return ``text`` with randomly chosen adjacent character pairs swapped.

    Args:
        text: The string to augment.
        introduction_rate: Fraction of the length that determines how many
            swaps are performed; the count is
            ``int(len(text) * introduction_rate)``.

    Returns:
        The string after all swaps. Inputs shorter than two characters are
        returned unchanged because there is no pair to swap.

    Randomness comes from the module-level ``random`` generator, so calling
    ``random.seed(...)`` beforehand makes the result reproducible.
    """
    swaps = int(len(text) * introduction_rate)

    # A swap never changes the length, so one up-front check covers every
    # iteration.
    if len(text) < 2:
        return text

    garbled = text
    last_start = len(text) - 2  # highest index that still has a right neighbour

    for _ in range(swaps):
        i = random.randint(0, last_start)
        left, right = garbled[i], garbled[i + 1]
        garbled = garbled[:i] + right + left + garbled[i + 2:]

    return garbled


In [22]:
random.seed(1)


text = "The cat jumped over the dog."
augmented_text = typo_introduction(text)
print("Random Character Insertion:", augmented_text)

Random Character Insertion: The act jumped ove rthe dog.


In [23]:
import difflib


# Strings are sequences of characters, so Differ compares them one character
# at a time; a swapped pair shows up as one '+' line and one '-' line.
d = difflib.Differ()
diff = d.compare(text, augmented_text)

print(*diff, sep='\n')

  T
  h
  e
   
+ a
  c
- a
  t
   
  j
  u
  m
  p
  e
  d
   
  o
  v
  e
+  
  r
-  
  t
  h
  e
   
  d
  o
  g
  .
