In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset from and
# write the result to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed
tf.keras.backend.clear_session()
from tensorflow.keras.layers import Input, Softmax, RNN, Dense, Embedding, LSTM, concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
import numpy as np
import matplotlib.pyplot as plt
from keras.regularizers import l2

In [3]:
# Load the parallel corpus from the mounted Drive. As the preview below shows,
# each row pairs a noisy `corrupted_text` message with its clean `english_text` version.

data = pd.read_csv('/content/drive/MyDrive/seq2seq/data.csv')
data.head()

Unnamed: 0,corrupted_text,english_text
0,"U wan me to ""chop"" seat 4 u nt?\n",Do you want me to reserve seat for you or not?\n
1,Yup. U reaching. We order some durian pastry a...,Yeap. You reaching? We ordered some Durian pas...
2,They become more ex oredi... Mine is like 25.....,They become more expensive already. Mine is li...
3,I'm thai. what do u do?\n,I'm Thai. What do you do?\n
4,Hi! How did your week go? Haven heard from you...,Hi! How did your week go? Haven't heard from y...


## Data Augmentation

In [4]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[K     |████████████████████████████████| 405 kB 5.3 MB/s 
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.7


In [5]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
from nlpaug.util.file.download import DownloadUtil

In [6]:
# Fetch the NLTK POS tagger and WordNet corpora.
# NOTE(review): presumably needed by the nlpaug word-level augmenters -- confirm
# which augmenter actually requires them.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
# reference: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
# One augmenter per noise type, each built with the library defaults.
# The short descriptions below follow the nlpaug documentation -- confirm them
# against the installed version.

# character level: keyboard-neighbour typos
aug_key_char = nac.KeyboardAug()

# character level: OCR-style character confusions
aug_ocr = nac.OcrAug()

# character level: random character swaps
aug_swap = nac.RandomCharAug(action="swap")

# word level: common misspellings
aug_spelling = naw.SpellingAug()
# augmented_texts = aug.augment(text, n=3)

In [8]:
# Keyboard-typo variant of every clean sentence, kept in the same order as
# `data.english_text` so the two can be paired up position by position.
aug_char_dist = [
    aug_key_char.augment(sentence)
    for sentence in tqdm(data.english_text.values)
]


100%|██████████| 2000/2000 [00:00<00:00, 3852.58it/s]


In [9]:
# Pair each keyboard-augmented sentence with the clean sentence at the same position.
# `zip` stops at the shorter input, so a length mismatch is truncated silently.
char_df = pd.DataFrame(list(zip(aug_char_dist, data.english_text.values)), columns=['corrupted_text', 'english_text'])

# NOTE: `data` is rebound to the enlarged frame, so this cell is not idempotent --
# running it again appends another copy of `char_df`. The cells that follow
# iterate over this enlarged `data`, not over the frame read from disk.
data = pd.concat([data, char_df])
data.head()

Unnamed: 0,corrupted_text,english_text
0,"U wan me to ""chop"" seat 4 u nt?\n",Do you want me to reserve seat for you or not?\n
1,Yup. U reaching. We order some durian pastry a...,Yeap. You reaching? We ordered some Durian pas...
2,They become more ex oredi... Mine is like 25.....,They become more expensive already. Mine is li...
3,I'm thai. what do u do?\n,I'm Thai. What do you do?\n
4,Hi! How did your week go? Haven heard from you...,Hi! How did your week go? Haven't heard from y...


In [10]:
# Accumulators for three further noise types; entry i of each list is produced
# from data.english_text.values[i].
aug_swap_list = list()
aug_ocr_list = list()
aug_spelling_list = list()

# `data` at this point already contains the keyboard-augmented rows concatenated
# earlier, so every clean sentence is visited more than once (hence the
# 4000-iteration progress bar for a 2000-row source file).
for text in tqdm(data.english_text.values):
    aug_swap_list.append(aug_swap.augment(text))
    aug_ocr_list.append(aug_ocr.augment(text))
    aug_spelling_list.append(aug_spelling.augment(text))

100%|██████████| 4000/4000 [00:02<00:00, 1395.13it/s]


In [11]:
# Build one (corrupted_text, english_text) frame per augmenter. Every frame pairs
# the augmented sentences with the clean sentences at the same positions.
clean_sentences = data.english_text.values
swap_df, ocr_df, spelling_df = (
    pd.DataFrame(list(zip(noisy_sentences, clean_sentences)), columns=['corrupted_text', 'english_text'])
    for noisy_sentences in (aug_swap_list, aug_ocr_list, aug_spelling_list)
)


In [12]:
# Final training frame: the rows already in `data` (original + keyboard-noise pairs)
# followed by the swap and OCR variants. `ignore_index=True` gives every row a unique
# 0..N-1 label instead of repeating each source frame's own index labels.
# NOTE(review): `spelling_df` is built in the previous cell but is not concatenated
# here -- confirm whether leaving it out is intentional.
data_df = pd.concat([data, swap_df, ocr_df], ignore_index=True)
data_df.head()

Unnamed: 0,corrupted_text,english_text
0,"U wan me to ""chop"" seat 4 u nt?\n",Do you want me to reserve seat for you or not?\n
1,Yup. U reaching. We order some durian pastry a...,Yeap. You reaching? We ordered some Durian pas...
2,They become more ex oredi... Mine is like 25.....,They become more expensive already. Mine is li...
3,I'm thai. what do u do?\n,I'm Thai. What do you do?\n
4,Hi! How did your week go? Haven heard from you...,Hi! How did your week go? Haven't heard from y...


## Preprocessing

In [13]:
# reference: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

import re

# Ordered (pattern, replacement) pairs. The two irregular forms come first because
# the generic "n't" rule would otherwise turn them into "wo not" / "ca not".
_CONTRACTION_RULES = (
    # specific
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    # general
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontracted(phrase):
    """
        Expand common English contractions in `phrase`.

        The rules are applied one after another, in table order, to the whole
        string. Matching is case-sensitive and only recognises the ASCII
        apostrophe. Every "'s" is rewritten to " is", possessives included.

        input: string
        output: string with the matched contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [14]:
def preprocess(text):
    """
        Normalise a raw message into lowercase, space-separated alphanumeric tokens.

        Steps: lowercase the text, expand contractions with `decontracted`,
        replace every character outside [A-Za-z0-9] with a space, collapse runs
        of whitespace into one space and trim both ends.

        input: string
        output: string containing only lowercase letters, digits and single spaces.
    """

    # lowercase first: the contraction patterns in `decontracted` are lowercase
    text = text.lower()
    # decontraction - expanding the words like : i'll -> i will, he'd -> he would
    text = decontracted(text)
    # every character that is not an ASCII letter or digit (punctuation, newlines,
    # underscores, non-ASCII characters) becomes a space
    text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    # A separate ' _ ' clean-up used to follow here. It could never match because
    # underscores are already replaced by the substitution above, so it was dropped.
    # Raw strings keep '\s' from being read as an (invalid) string escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs to a single space

    return text

In [15]:
# Quick sanity check of the cleaning function on one of the sample rows shown earlier.
preprocess("I'm thai. what do u do?\n")

'i am thai what do u do'

In [16]:
# Clean both sides of every sentence pair with the same `preprocess` function.
for column in ('corrupted_text', 'english_text'):
    data_df[column] = data_df[column].apply(preprocess)

In [17]:
# Preview the pairs after cleaning.
data_df.head()

Unnamed: 0,corrupted_text,english_text
0,u wan me to chop seat 4 u nt,do you want me to reserve seat for you or not
1,yup u reaching we order some durian pastry alr...,yeap you reaching we ordered some durian pastr...
2,they become more ex oredi mine is like 25 so h...,they become more expensive already mine is lik...
3,i am thai what do u do,i am thai what do you do
4,hi how did your week go haven heard from you f...,hi how did your week go have not heard from yo...


In [18]:
# Shuffle the rows so the original and augmented pairs, which were concatenated
# block after block, end up interleaved. The fixed `random_state` makes the row
# order repeatable for a given input frame (the augmentation itself is still unseeded).
# `reset_index(drop=True)` then renumbers the rows 0..N-1.
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
# (rows, columns) of the final dataset.
data_df.shape

(12000, 2)

In [20]:
# Persist the cleaned, shuffled pairs to Drive as CSV; index=False keeps the
# row index out of the file.
data_df.to_csv('/content/drive/MyDrive/seq2seq/data_preprocessed_post_analysis0.csv', index=False)