In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install nlpaug

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nlpaug.augmenter.word as naw
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Use seaborn's 'darkgrid' style for the figures drawn in this notebook.
sns.set_style('darkgrid')

In [None]:
# Read train.csv from the hatred-speech dataset directory and preview the first rows.
train_csv_path = '../input/twitter-sentiment-analysis-hatred-speech/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Bar chart of the number of rows per label in the original data.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data, x='label', ax=ax)
# Labels are set after plotting so they override the ones countplot assigns.
ax.set_title('Counting the labels', fontsize=13)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Tweets without hate speech represent almost 93% of the dataset.

# Per-label row counts, computed once and reused for every column below.
label_counts = data['label'].value_counts()

counts = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    # Share of all rows, expressed as a fraction between 0 and 1.
    'Percentage': label_counts.values / len(data)
})

counts.head()

## Data Augmentation

### Random Over Sampler (ROS)

In [None]:
# Random over-sampling of the minority class: the raw tweet text is the only
# feature column and `label` is the target.
ros_features = data[['tweet']]
ros_labels = data['label']

ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(ros_features, ros_labels)

# Put the resampled tweets and labels back side by side in one frame.
data_resampled_ros = pd.concat((X_resampled, y_resampled), axis='columns')
data_resampled_ros.head()

In [None]:
# Bar chart of the number of rows per label after random over-sampling.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data_resampled_ros, x='label', ax=ax)
# Labels are set after plotting so they override the ones countplot assigns.
ax.set_title('Counting the labels', fontsize=13)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Label distribution of the randomly over-sampled frame.
label_counts = data_resampled_ros['label'].value_counts()

counts = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    # Share of all rows, expressed as a fraction between 0 and 1.
    'Percentage': label_counts.values / len(data_resampled_ros)
})

counts.head()

### SMOTE

In [None]:
# First of all we need to tokenize the tweets so we can augmentate them

data_smote = data.copy()
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_smote)

data_smote['tweet'] = tokenizer.texts_to_sequences(data_smote['tweet'])
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

data_smote['tweet'] = pad_sequences(data_smote['tweet'], padding='post', maxlen=maxlen)

In [None]:
smote = SMOTE(random_state=42, sampling_strategy='minority')
X_resampled, y_resampled = smote.fit_resample(data_smote[['tweet']], data_smote['label'])
data_resampled_smote = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# Bar chart of the number of rows per label after SMOTE.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data_resampled_smote, x='label', ax=ax)
# Labels are set after plotting so they override the ones countplot assigns.
ax.set_title('Counting the labels', fontsize=13)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Label distribution of the SMOTE-resampled frame.
label_counts = data_resampled_smote['label'].value_counts()

counts = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    # Share of all rows, expressed as a fraction between 0 and 1.
    'Percentage': label_counts.values / len(data_resampled_smote)
})

counts.head()

### SMOTETomek

In [None]:
# First of all we need to tokenize the tweets so we can augmentate them

data_smotetomek = data.copy()
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_smotetomek)

data_smotetomek['tweet'] = tokenizer.texts_to_sequences(data_smotetomek['tweet'])
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

data_smotetomek['tweet'] = pad_sequences(data_smotetomek['tweet'], padding='post', maxlen=maxlen)

In [None]:
tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = tomek.fit_resample(data_smotetomek[['tweet']], data_smotetomek['label'])
data_resampled_smotetomek = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# Bar chart of the number of rows per label after SMOTETomek.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data_resampled_smotetomek, x='label', ax=ax)
# Labels are set after plotting so they override the ones countplot assigns.
ax.set_title('Counting the labels', fontsize=13)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Label distribution of the SMOTETomek-resampled frame.
label_counts = data_resampled_smotetomek['label'].value_counts()

counts = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    # Divide by this frame's own row count so the fractions sum to 1; the
    # SMOTE frame is a different object and need not have the same length.
    'Percentage': label_counts.values / data_resampled_smotetomek.shape[0]
})

counts.head()

### NLPAUG

In [None]:
# Grow the minority class (label == 1) with synonym-substituted copies of its
# tweets, then append those copies to a copy of the original data.
data_resampled_nlpaug = data.copy()

# Word-level synonym augmenter using WordNet as its synonym source.
aug = naw.SynonymAug(aug_src='wordnet')

minority_data = data_resampled_nlpaug[data_resampled_nlpaug['label'] == 1]
texts = minority_data['tweet'].tolist()

aug_texts = []
for text in texts:
    # Request 12 augmented variants of this tweet and collect all of them.
    augmented_texts = aug.augment(text, n=12)
    aug_texts.extend(augmented_texts)

print(len(aug_texts))

# Every augmented tweet carries the minority label.
temp = pd.DataFrame({'tweet': aug_texts}).assign(label=1)

# Stack the new rows under the originals and rebuild a clean 0..n-1 index.
data_resampled_nlpaug = (
    pd.concat([data_resampled_nlpaug, temp], axis=0)
    .reset_index()
    .drop(columns=['index'])
)
del temp, minority_data

In [None]:
# Bar chart of the number of rows per label after nlpaug synonym augmentation.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data_resampled_nlpaug, x='label', ax=ax)
# Labels are set after plotting so they override the ones countplot assigns.
ax.set_title('Counting the labels', fontsize=13)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()  # render the figure without echoing the last call's return value

In [None]:
# Label distribution of the nlpaug-augmented frame.
label_counts = data_resampled_nlpaug['label'].value_counts()

counts = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    # Share of all rows, expressed as a fraction between 0 and 1.
    'Percentage': label_counts.values / len(data_resampled_nlpaug)
})

counts.head()