In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from nltk.corpus import stopwords



#### Load data

In [2]:
train_path = "../train.csv"
train = pd.read_csv(train_path)

#### Split data to train & val

In [3]:
X = train.iloc[:,0:4]
y = train.iloc[:,4]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6090, 4) (1523, 4) (6090,) (1523,)


#### Word Augmentation - Using synonym

In [5]:
counter = Counter(y_train)
gap = counter[0] - counter[1] #Number of difference between 1 and 0 = # of augmentated examples

In [6]:
aug = naw.SynonymAug(stopwords=stopwords.words('english')) #skip stop words

In [7]:
print("***********Example of Word Augmentation:***************")
index = X_train.index[0:5]
for i in index:
    print(X_train.iloc[i]['text'])
    print(aug.augment(X_train.iloc[i]['text']))  
    print('---------------------')

***********Example of Word Augmentation:***************
-  Pandemonium In Aba As Woman Delivers Baby Without Face http://t.co/36GccAPaak http://t.co/nqjZS6wkuN
- Pandemonium In Aba As Woman Delivers Baby Without Human face http: / / t. cobalt / 36GccAPaak http: / / t. cobalt / nqjZS6wkuN
---------------------
Couples having less sex... for fear it'll be a let down: Internet movies and books saying how sex 'ought to be' pÛ_ http://t.co/c1xhIzPrAd
Couple having less sex activity. .. for fear it ' ll be a let down: Internet movies and books saying how sexual activity ' ought to be ' p  Û_ http: / / t. co / c1xhIzPrAd
---------------------
#hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/xV3D9bPjHi #prebreak #best
# hot Funtenna: hijacking electronic computer to send information as speech sound waves [Black person Hat 2015] hypertext transfer protocol: / / t. cobalt / xV3D9bPjHi # prebreak # best
---------------------
Share Large sinkhole swall

#### Sentence Augmentation - Random Sentence Augmentation.

In [8]:
aug = nas.RandomSentAug()

In [9]:
print("***********Example of Word Augmentation:***************")
index = X_train.index[0:5]
for i in index:
    print(X_train.iloc[i]['text'])
    print(aug.augment(X_train.iloc[i]['text']))  
    print('---------------------')

***********Example of Word Augmentation:***************
-  Pandemonium In Aba As Woman Delivers Baby Without Face http://t.co/36GccAPaak http://t.co/nqjZS6wkuN
-  Pandemonium In Aba As Woman Delivers Baby Without Face http://t.co/36GccAPaak http://t.co/nqjZS6wkuN
---------------------
Couples having less sex... for fear it'll be a let down: Internet movies and books saying how sex 'ought to be' pÛ_ http://t.co/c1xhIzPrAd
Couples having less sex... for fear it'll be a let down: Internet movies and books saying how sex 'ought to be' pÛ_ http://t.co/c1xhIzPrAd
---------------------
#hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/xV3D9bPjHi #prebreak #best
#hot  Funtenna: hijacking computers to send data as sound waves [Black Hat 2015] http://t.co/xV3D9bPjHi #prebreak #best
---------------------
Share Large sinkhole swallows entire pond in Lowndes County Georgia A largeÛ_ http://t.co/HvBJ30aj9s #YoNews
Share Large sinkhole swallows entire pon

#### It seems that Sentence Augmentation is better

In [10]:
one = y_train[y_train== 1]
# Randomly select samples with label == 1 to augment
sample_index = np.random.choice(one.index,gap)
sample_df = X.iloc[sample_index,:].copy()

def map_text(text):
    """
    Define text augmentation function
    """
    return aug.augment(text)

sample_df['text'] = sample_df['text'].apply(map_text)

#reset index
sample_df = sample_df.reset_index().drop('index',axis=1)


In [12]:
#export
sample_df.to_csv("../Q6_output/Q6_generated.csv",index = None)