In [None]:
# System
import os

# Time
import time
import datetime

# Numerical
import numpy as np
import pandas as pd

# Tools
import itertools
from collections import Counter

# NLP
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from pywsd.utils import lemmatize_sentence

# Preprocessing
from sklearn import preprocessing
from sklearn.utils import class_weight as cw
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Model Selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Evaluation Metrics
from sklearn import metrics 
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report

# Deep Learing Preprocessing - Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical

# Deep Learning Model - Keras
from keras.models import Model
from keras.models import Sequential

# Deep Learning Model - Keras - CNN
from keras.layers import Conv1D, Conv2D, Convolution1D, MaxPooling1D, SeparableConv1D, SpatialDropout1D, \
    GlobalAvgPool1D, GlobalMaxPool1D, GlobalMaxPooling1D 
from keras.layers.pooling import _GlobalPooling1D
from keras.layers import MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling2D

# Deep Learning Model - Keras - RNN
from keras.layers import Embedding, LSTM, Bidirectional

# Deep Learning Model - Keras - General
from keras.layers import Input, Add, concatenate, Dense, Activation, BatchNormalization, Dropout, Flatten
from keras.layers import LeakyReLU, PReLU, Lambda, Multiply



# Deep Learning Parameters - Keras
from keras.optimizers import RMSprop, Adam

# Deep Learning Callbacs - Keras
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 

warnings.filterwarnings("ignore", category=DeprecationWarning)
 
%matplotlib inline



In [None]:

# Load the labelled tweets from the Excel workbook.
# `read_excel` takes no text-encoding argument (recent pandas raises a
# TypeError on `encoding=`), so the keyword is dropped.
data = pd.read_excel("hatespeechdata.xlsx")


In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()



Initial data cleaning requirements that we can think of after looking at the top 5 records:
The Twitter handles are already masked as @user due to privacy concerns, so they give hardly any information about the nature of the tweet. We can also get rid of punctuation, numbers and special characters, since they would not help in differentiating between kinds of tweets. Most of the shorter words do not add much value either (for example, ‘pdx’, ‘his’, ‘all’), so we will try to remove them from our data as well. Once we have executed the above three steps, we can split every tweet into individual words, or tokens, which is an essential step in any NLP task. We also need to reduce words to their roots, e.g. ‘loved’, ‘loves’ and ‘loving’. These terms are often used in the same context. If we can reduce them to their root word, which is ‘love’, then we can reduce the total number of unique words in our data without losing a significant amount of information.

Given below is a user-defined function that removes unwanted text patterns from a piece of text.

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches replaced by ``''``.
    """
    # Substitute with the pattern itself. Re-using each matched string as a
    # new regex would misread metacharacters in the match (e.g. "(" or "+")
    # and let a short match strip the prefix of a longer one.
    return re.sub(pattern, '', input_txt)

In [None]:
# remove twitter handles (@user)
# Raw string so that `\w` reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
data['tidy_tweet'] = np.vectorize(remove_pattern)(data['v2'], r"@[\w]*")

In [None]:
# remove hashtags (#tag)
# Operate on `tidy_tweet`, not the raw `v2` column: re-reading `v2` would
# overwrite `tidy_tweet` and throw away the handle removal done before.
data['tidy_tweet'] = np.vectorize(remove_pattern)(data['tidy_tweet'], r"#[\w]*")

In [None]:
# remove special characters, numbers, punctuations
# `regex=True` is required: without it newer pandas treats the pattern as a
# literal string and nothing would be replaced.
data['tidy_tweet'] = data['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [None]:
# Preview the frame after the pattern removals written to `tidy_tweet`.
data.head()

In [None]:
# Drop short words: keep only whitespace-separated tokens longer than
# three characters and glue them back together with single spaces.
data['tidy_tweet'] = data['tidy_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

In [None]:
# Preview the frame after the short-word filter on `tidy_tweet`.
data.head()

In [None]:
def printHead(df=None, n=5):
    """Return the first ``n`` rows of a frame so the notebook can display them.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to preview. When omitted, the notebook-level ``data`` frame is
        used, which keeps the original zero-argument call working.
    n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The leading rows. Previously the result of ``head()`` was discarded
        and the function returned ``None``, so nothing was ever shown.
    """
    frame = data if df is None else df
    return frame.head(n)
In [None]:
# Tokenization: split every cleaned tweet on whitespace into a word list.
tokenized_tweet = data['tidy_tweet'].apply(lambda tweet: tweet.split())
tokenized_tweet.head()

# One long string holding every cleaned tweet, separated by single spaces.
all_words = ' '.join(data['tidy_tweet'])

In [None]:
#stemming
# Import the one class that is needed instead of `import *`, so the origin
# of `PorterStemmer` is explicit and the notebook namespace stays clean.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Replace every token in every tweet by its Porter stem.
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_tweet.head()


In [None]:
# Join each stemmed token list back into one space-separated string.
# `apply` works for any index, whereas `tokenized_tweet[i]` is a label
# lookup that only works with a 0..n-1 index. It also leaves
# `tokenized_tweet` as token lists, so re-running this cell is safe.
data['tidy_tweet'] = tokenized_tweet.apply(' '.join)

In [None]:
# Preview the frame after the stemmed tokens were written back to `tidy_tweet`.
data.head()

In [None]:
# Labels (`v1`) and cleaned tweet text (`tidy_tweet`) used for modelling.
tags = data["v1"]
texts = data["tidy_tweet"]
# Show only the beginning of the joined corpus; printing the whole string
# floods the notebook output.
print(all_words[:1000])

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras import metrics

In [None]:
# Vocabulary size handed to the Keras tokenizer via `num_words`.
num_max = 1000
# Encode the labels as integers.
le = LabelEncoder()
tags = le.fit_transform(tags)
# Fit the tokenizer on the cleaned tweets and build a per-tweet count matrix.
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
# Fixed typo: the method is `texts_to_matrix` (`texts_to_matri` raised AttributeError).
mat_texts = tok.texts_to_matrix(texts, mode='count')



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; the fixed `random_state` makes the
# split (and therefore every reported metric) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    texts, tags, test_size=0.2, random_state=42
)
mat_texts_tr = tok.texts_to_matrix(x_train, mode='count')
mat_texts_tst = tok.texts_to_matrix(x_test, mode='count')

# Sequence inputs for the neural models: word-index sequences padded or
# truncated to `max_len` entries.
max_len = 100
x_train = tok.texts_to_sequences(x_train)
x_test = tok.texts_to_sequences(x_test)
# Fixed typo: `pad_sequences` (`pad_sequence` does not exist in keras.preprocessing.sequence).
cnn_texts_mat = sequence.pad_sequences(x_train, maxlen=max_len)
cnn_texts_mat_tst = sequence.pad_sequences(x_test, maxlen=max_len)

In [None]:
def get_cnn_model_v2(vocab_size=1000, embedding_dim=50, seq_len=None):
    """Build and compile the 1-D convolutional binary text classifier.

    Layer stack: Embedding -> Dropout(0.2) -> Conv1D(64 filters, width 3,
    ReLU) -> GlobalMaxPooling1D -> Dense(256) -> Dropout(0.2) -> ReLU ->
    Dense(1) -> sigmoid. The model summary is printed as a side effect.

    Parameters
    ----------
    vocab_size : int, default 1000
        Input dimension of the embedding layer; should match the
        ``num_words`` used by the tokenizer.
    embedding_dim : int, default 50
        Size of each embedding vector.
    seq_len : int, optional
        Length of the padded input sequences. Defaults to the notebook-level
        ``max_len`` so the original zero-argument call is unchanged.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy, Adam, and the metrics
        ``'acc'`` and ``metrics.binary_accuracy``.
    """
    if seq_len is None:
        seq_len = max_len
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 3, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', metrics.binary_accuracy])
    return model

# Build the CNN and train it for 10 epochs on the padded training
# sequences, holding back 20% of them for validation.
m_CNN_v2 = get_cnn_model_v2()
history_cnn_v2 = m_CNN_v2.fit(
    cnn_texts_mat,
    y_train,
    epochs=10,
    batch_size=60,
    validation_split=0.2,
)

# Per-epoch training curves, pulled out of the history for plotting.
acc, val_acc, loss, val_loss = (
    history_cnn_v2.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(len(acc))

In [None]:
# Plotting libraries; `sns.set()` is called so seaborn's styling is applied
# to the matplotlib figures that follow.
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns; sns.set()

In [None]:
# Accuracy per epoch: training curve in red, validation curve in green.
for series, colour, legend_text in (
    (acc, 'red', 'training acc'),
    (val_acc, 'green', 'validation acc'),
):
    plt.plot(epochs, series, '-', color=colour, label=legend_text)
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

In [None]:
# Loss per epoch: training curve in red, validation curve in green.
# The legend labels now say "loss"; they previously read "acc" although
# these are the loss curves.
plt.plot(epochs, loss, '-', color='red', label='training loss')
plt.plot(epochs, val_loss, '-', color='green', label='validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted probabilities from the sigmoid output. `predict` replaces
# `predict_proba` / `predict_classes`, which newer Keras no longer provides.
proba_cnn_v2 = m_CNN_v2.predict(cnn_texts_mat_tst)
# Class labels: threshold the probability at 0.5 and flatten to 1-D.
pred = (proba_cnn_v2 > 0.5).astype("int32").ravel()
# `evaluate` returns [loss, metric_1, ...]; a separate name avoids
# clobbering the per-epoch `acc` list used by the plotting cells.
test_scores_cnn = m_CNN_v2.evaluate(cnn_texts_mat_tst, y_test)
print("Test loss is {0:.4f} accuracy is {1:.4f}  ".format(test_scores_cnn[0], test_scores_cnn[1]))
# scikit-learn expects (y_true, y_pred): rows are true classes, columns predictions.
print(confusion_matrix(y_test, pred))

In [None]:
def plot_heatmap(cm,title):
    """Show a two-class confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : array-like
        2x2 matrix of integer counts; rows and columns are both labelled
        'normal' and 'hate', in that order.
    title : str
        Title placed above the heatmap.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    class_names = ['normal', 'hate']
    frame = pd.DataFrame(cm, index=class_names)
    frame.columns = list(class_names)

    axis = plt.axes()
    sns.heatmap(frame, annot=True, fmt="d", linewidths=.5, ax=axis)
    axis.set_title(title)
    plt.show()

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred); the arguments
# were swapped, which transposed the matrix.
plot_heatmap(confusion_matrix(y_test, pred),'CNN')

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall were reported the wrong way round.
print(classification_report(y_test, pred))

In [None]:
# Plotting libraries for the LSTM section; `sns.set()` is called so
# seaborn's styling is applied to the matplotlib figures that follow.
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns; sns.set()

In [None]:
def get_LSTM_model(vocab_size=1000, embedding_dim=32, lstm_units=32):
    """Build and compile the LSTM binary text classifier.

    Layer stack: Embedding -> LSTM -> Dense(1, sigmoid).

    Parameters
    ----------
    vocab_size : int, default 1000
        Input dimension of the embedding layer; should match the
        ``num_words`` used by the tokenizer.
    embedding_dim : int, default 32
        Size of each embedding vector.
    lstm_units : int, default 32
        Number of units in the LSTM layer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with RMSprop, binary cross-entropy and the ``'acc'``
        metric. The defaults reproduce the original fixed architecture.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(LSTM(lstm_units))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    return model
# Build the LSTM and train it for 10 epochs on the same padded training
# sequences, holding back 20% of them for validation.
m_LSTM = get_LSTM_model()
history_lstm = m_LSTM.fit(
    cnn_texts_mat,
    y_train,
    epochs=10,
    batch_size=60,
    validation_split=0.20,
)

# Per-epoch training curves, pulled out of the history for plotting.
acc, val_acc, loss, val_loss = (
    history_lstm.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(len(acc))

In [None]:
# Accuracy per epoch: training curve in red, validation curve in green.
for series, colour, legend_text in (
    (acc, 'red', 'training acc'),
    (val_acc, 'green', 'validation acc'),
):
    plt.plot(epochs, series, '-', color=colour, label=legend_text)
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

In [None]:
# Loss per epoch: training curve in red, validation curve in green.
# The legend labels now say "loss"; they previously read "acc" although
# these are the loss curves.
plt.plot(epochs, loss, '-', color='red', label='training loss')
plt.plot(epochs, val_loss, '-', color='green', label='validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted probabilities from the sigmoid output. `predict` replaces
# `predict_proba` / `predict_classes`, which newer Keras no longer provides.
proba_lstm = m_LSTM.predict(cnn_texts_mat_tst)
# Class labels: threshold the probability at 0.5 and flatten to 1-D.
pred = (proba_lstm > 0.5).astype("int32").ravel()
# `evaluate` returns [loss, metric_1, ...]; a separate name avoids
# clobbering the per-epoch `acc` list used by the plotting cells.
test_scores_lstm = m_LSTM.evaluate(cnn_texts_mat_tst, y_test)
print("Test loss is {0:.2f} accuracy is {1:.4f}  ".format(test_scores_lstm[0], test_scores_lstm[1]))
# scikit-learn expects (y_true, y_pred): rows are true classes, columns predictions.
print(confusion_matrix(y_test, pred))

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred); the arguments
# were swapped, which transposed the matrix.
plot_heatmap(confusion_matrix(y_test, pred),'LSTM')

In [None]:
from sklearn.metrics import roc_curve

# Flattened LSTM scores on the test set, then the ROC curve points
# computed against the true test labels.
y_pred_lstm = np.ravel(m_LSTM.predict(cnn_texts_mat_tst))
fpr_lstm, tpr_lstm, thresholds_lstm = roc_curve(y_true=y_test, y_score=y_pred_lstm)

In [None]:
# Area under the LSTM ROC curve, computed from its (fpr, tpr) points.
from sklearn.metrics import auc

auc_lstm = auc(x=fpr_lstm, y=tpr_lstm)
auc_lstm

In [None]:
from sklearn.metrics import auc, roc_curve

# Flattened CNN scores on the test set, the ROC curve points computed
# against the true test labels, and the area under that curve.
y_pred_CNN = np.ravel(m_CNN_v2.predict(cnn_texts_mat_tst))
fpr_CNN, tpr_CNN, thresholds_CNN = roc_curve(y_true=y_test, y_score=y_pred_CNN)
auc_CNN = auc(x=fpr_CNN, y=tpr_CNN)
auc_CNN

In [None]:
# ROC curve of the LSTM over the full [0, 1] range, with the chance
# diagonal drawn as a dashed black line for reference.
plt.figure(2)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lstm, tpr_lstm, label='LSTM (area = {:.3f})'.format(auc_lstm))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# The axes span 0..1, so the former "(zoomed in at top left)" title was wrong.
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [None]:
# ROC curve of the CNN over the full [0, 1] range, with the chance
# diagonal drawn as a dashed black line for reference.
plt.figure(2)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.plot([0, 1], [0, 1], 'k--')
# Fixed typo: the true-positive rates are in `tpr_CNN` (`tr_CNN` is undefined).
plt.plot(fpr_CNN, tpr_CNN, label='CNN (area = {:.3f})'.format(auc_CNN))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()