# EDA DATA AUGMENTATION

In [4]:
#Importing Libraries
import pandas as pd

In [5]:
# Install/upgrade NLTK into the running kernel's environment.
# The explicit %pip magic is used because a bare `pip ...` line only works
# through IPython automagic, which is applied to single-line cells only.
%pip install -U nltk

Requirement already up-to-date: nltk in /usr/local/lib/python3.7/dist-packages (3.6.2)


In [6]:
# Read train.xlsx (path is relative to the working directory) into a DataFrame.
df = pd.read_excel("train.xlsx")

In [7]:
# Display the loaded frame; the recorded output shows the Polarity and Review columns.
df

Unnamed: 0,Polarity,Review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...
...,...,...
1995,4,I love Olive or Twist. Absolutely love it. As ...
1996,1,I've actually had OK experiences at this place...
1997,4,Girls night had to happen. We decided to go to...
1998,1,Owner or manager was very rude when we present...


In [8]:
import nltk
# Fetch the WordNet corpus; wordnet.synsets() is used further down for synonym lookup.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [9]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 2)

In [10]:
# Select every row that repeats an earlier row (the first occurrence is not flagged)
duplicate_rows_df = df.loc[df.duplicated(keep="first")]
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (0, 2)


In [11]:
# Keep only the first occurrence of each repeated row, then preview the top rows
df = df.drop_duplicates(keep="first")
df.head()

Unnamed: 0,Polarity,Review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...


In [12]:
# Count the missing values in each column.
print(df.isna().sum())

Polarity    0
Review      0
dtype: int64


In [13]:
# Remove every row that has at least one missing value, then count what is left.
df = df.dropna(axis=0, how="any")
df.count()

Polarity    2000
Review      2000
dtype: int64

In [14]:
# Re-check the per-column missing-value counts after the drop.
print(df.isna().sum())

Polarity    0
Review      0
dtype: int64


In [15]:
from nltk.corpus import wordnet
def get_synonyms(word):
    """Return WordNet synonyms of ``word`` as a list of lowercase strings.

    Every lemma name of every synset returned by ``wordnet.synsets(word)`` is
    normalised: underscores and hyphens become spaces, the text is lowercased,
    any character other than a-z or space is dropped, and surrounding blanks
    are stripped.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of distinct normalised synonyms. The word itself
        (compared case-insensitively) and candidates that normalise to an
        empty string are excluded. The list is empty when nothing usable is
        found.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()

    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars).strip()
            # A lemma made only of digits/punctuation collapses to "" here and
            # must not be offered as a replacement word.
            if synonym:
                synonyms.add(synonym)

    # Candidates were lowercased above, so also discard the lowercased form of
    # the input; otherwise "Good" would get "good" back as its own synonym.
    synonyms.discard(word)
    synonyms.discard(word.lower())

    # Sorting makes the result independent of set iteration order.
    return sorted(synonyms)

In [16]:
# Fetch the stop-word corpus read below via stopwords.words('english').
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
import random

In [18]:
import nltk
from nltk.corpus import stopwords
# Copy NLTK's English stop-word list into a plain list; synonym_replacement
# consults it to skip words that should not be replaced.
stop_words = list(stopwords.words('english'))
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## 1) Synonym Replacement

In [19]:
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words of a sentence with synonyms.

    Args:
        words: The sentence as a single string; it is split on whitespace.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced by the same synonym. Values <= 0 replace
            nothing.

    Returns:
        The (possibly modified) words joined with single spaces.
    """
    tokens = words.split()
    new_words = tokens.copy()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below is reproducible under random.seed(); set order is not.
    candidates = [word for word in dict.fromkeys(tokens) if word not in stop_words]
    random.shuffle(candidates)

    num_replaced = 0
    for random_word in candidates:
        # Check the budget before replacing so that n <= 0 changes nothing.
        if num_replaced >= n:
            break

        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1

    return ' '.join(new_words)

## 2) Random Deletion

In [20]:
def random_deletion(words, p):
    """Randomly drop words from a sentence, each with probability ``p``.

    Args:
        words: The sentence as a single string; it is split on whitespace.
        p: Per-word deletion probability. A word is kept when a fresh draw
            from ``random.uniform(0, 1)`` is greater than ``p``.

    Returns:
        The surviving words joined with single spaces. The result is always a
        string: a sentence of at most one word is returned as-is (a blank
        sentence yields ""), and if every word was deleted, one randomly
        chosen original word is returned instead.
    """
    tokens = words.split()

    # Nothing to delete from an empty or one-word sentence. Return a string
    # (not the token list) so every path has the same return type, and so an
    # empty sentence never reaches random.randint(0, -1).
    if len(tokens) <= 1:
        return ' '.join(tokens)

    # Keep each word independently with probability 1 - p.
    kept = [word for word in tokens if random.uniform(0, 1) > p]

    # If everything was deleted, fall back to a single random original word.
    if not kept:
        return random.choice(tokens)

    return ' '.join(kept)

In [21]:
def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    The first position is drawn once; the second is redrawn while it equals
    the first. Once four draws of the second position have been made the list
    is returned without swapping.

    Args:
        new_words: List of words; modified in place.

    Returns:
        The same list object, with two entries exchanged when a swap happened.
        Lists with fewer than two items are returned unchanged.
    """
    # Fewer than two items: nothing to swap. This also keeps an empty list
    # away from random.randint(0, -1), which raises ValueError.
    if len(new_words) < 2:
        return new_words

    last_idx = len(new_words) - 1
    random_idx_1 = random.randint(0, last_idx)
    random_idx_2 = random_idx_1
    counter = 0

    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last_idx)
        counter += 1

        # Bound the number of redraws so the loop always terminates.
        if counter > 3:
            return new_words

    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

## 3) Random Swap

In [22]:
def random_swap(words, n):
    """Apply ``n`` random pairwise word swaps to a sentence.

    Args:
        words: The sentence as a single string; it is split on whitespace.
        n: How many times swap_word() is applied.

    Returns:
        The reordered words joined with single spaces.
    """
    tokens = words.split()
    shuffled = tokens[:]

    # Each pass hands the working list to swap_word and keeps what it returns.
    for _step in range(n):
        shuffled = swap_word(shuffled)

    return ' '.join(shuffled)

## 4) Random Insertion

In [23]:
def random_insertion(words, n):
    """Make ``n`` attempts to insert a synonym into a sentence.

    Args:
        words: The sentence as a single string; it is split on whitespace.
        n: How many times add_word() is called on the working word list.

    Returns:
        The resulting words joined with single spaces.
    """
    tokens = words.split()
    expanded = tokens[:]

    # add_word mutates the list it is given, so its return value is not used.
    for _attempt in range(n):
        add_word(expanded)

    return ' '.join(expanded)

In [24]:
def add_word(new_words):
    """Insert a synonym of a randomly chosen word into ``new_words`` in place.

    Words are picked at random and looked up with get_synonyms() until one
    has at least one synonym. The function gives up, leaving the list
    unchanged, once 10 lookups have been made. An empty list is also left
    unchanged.

    Args:
        new_words: List of word strings; modified in place.

    Returns:
        None.
    """
    # An empty list has no word to look up, and random.randint(0, -1) would
    # raise ValueError.
    if not new_words:
        return

    synonyms = []
    counter = 0

    while len(synonyms) < 1:
        random_word = random.choice(new_words)
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return

    # Pick the synonym at random rather than always taking the first entry.
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

In [25]:
# Empty frame with the two output columns; aug() later returns extended copies of it.
eda_df = pd.DataFrame(columns=["Polarity", "Review"])

# Calling the Different Augmentations

In [26]:
def aug(sent, n, p, polarity, eda_df):
    """Return ``eda_df`` extended with a review and its four EDA variants.

    Rows are added in this order: original sentence, synonym replacement (SR),
    random deletion (RD), random swap (RS), random insertion (RI). All five
    rows carry the same ``polarity``.

    Args:
        sent: The review text.
        n: Word count passed to synonym_replacement, random_swap and
            random_insertion.
        p: Deletion probability passed to random_deletion.
        polarity: Value stored in the "Polarity" column of every new row.
        eda_df: DataFrame to extend; it is not modified in place.

    Returns:
        A new DataFrame holding the rows of ``eda_df`` followed by the five
        new rows, re-indexed from 0.
    """
    reviews = [
        sent,                          # original sentence
        synonym_replacement(sent, n),  # SR augmented sentence
        random_deletion(sent, p),      # RD augmented sentence
        random_swap(sent, n),          # RS augmented sentence
        random_insertion(sent, n),     # RI augmented sentence
    ]
    new_rows = pd.DataFrame(
        [{"Polarity": polarity, "Review": review} for review in reviews]
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement,
    # and one concat of five rows is cheaper than five separate appends.
    if len(eda_df) == 0:
        # Skip the row-less frame so it does not take part in dtype
        # resolution, but keep any columns it declares.
        columns = eda_df.columns.union(new_rows.columns, sort=False)
        return new_rows.reindex(columns=columns)

    return pd.concat([eda_df, new_rows], ignore_index=True)

In [27]:
new_words = 4       # number of words added/replaced/swapped per augmentation
prob_del = 0.3      # probability of deleting each word

# eda_df accumulates across calls, so re-running this cell without first
# re-creating eda_df appends all the rows a second time.
for polarity, review in zip(df["Polarity"], df["Review"]):
    eda_df = aug(review, new_words, prob_del, polarity, eda_df)

In [28]:
# (rows, columns) of the source frame, for comparison with eda_df below.
df.shape

(2000, 2)

In [29]:
# aug() adds five rows per review (the original plus four variants).
eda_df.shape

(10000, 2)

In [30]:
# Display the augmented frame.
eda_df

Unnamed: 0,Polarity,Review
0,5,dr. goldberg offers everything i look for in a...
1,5,md goldberg offers everything i look for in a ...
2,5,goldberg everything for in general practitione...
3,5,dr. goldberg offers everything i look for in a...
4,5,dr. goldberg offers everything i look for in a...
...,...,...
9995,5,"A beautiful little bar with an exciting \""mart..."
9996,5,"A beautiful little bar with an exciting \""mart..."
9997,5,"beautiful little an exciting \""martini\"" list ..."
9998,5,"A exciting the zone with an beautiful \""martin..."


EDA AUGMENTATION EXAMPLE

In [31]:
def aug_example(sent,n,p):
    """Print a sentence followed by one sample of each augmentation.

    Args:
        sent: The sentence to augment.
        n: Word count handed to synonym_replacement, random_swap and
            random_insertion.
        p: Deletion probability handed to random_deletion.
    """
    # Each producer is called only when its line is about to be printed, so
    # the lines are generated and printed strictly in this order.
    report = (
        (" Original Sentence : ", lambda: sent),
        (" SR Augmented Sentence : ", lambda: synonym_replacement(sent, n)),
        (" RD Augmented Sentence : ", lambda: random_deletion(sent, p)),
        (" RS Augmented Sentence : ", lambda: random_swap(sent, n)),
        (" RI Augmented Sentence : ", lambda: random_insertion(sent, n)),
    )
    for prefix, produce in report:
        print(f"{prefix}{produce()}")

In [32]:
# Run the demo on the review stored under index label 4
sentence = df.loc[4, "Review"]
aug_example(sentence, n=4, p=0.3)

 Original Sentence : I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there