<a href="https://colab.research.google.com/github/shakil1819/NLTK-LSTM-Based-Hate-Speech-Detection/blob/main/LSTM_%2B_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""LSTM_Twitter_dataset.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/github/shakil1819/NLTK-LSTM-Based-Hate-Speech-Detection/blob/main/LSTM_Twitter_dataset.ipynb

# Importing Libraries
"""

import itertools
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from keras.layers import Dense, Input, LSTM, SpatialDropout1D, Embedding
from keras.models import Sequential, Model
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras import utils
from tensorflow.keras.utils import to_categorical
from wordcloud import WordCloud

"""# Reading the dataset"""

# Fetch the labelled tweets CSV from the project repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/shakil1819/NLTK-LSTM-Based-Hate-Speech-Detection/main/Dataset/labeled_data.csv'
)
# Pull out the two columns used downstream as plain Python lists.
text, clas = (df[column].tolist() for column in ('tweet', 'class'))
df.head()

"""# creating a new dataframe for easy text processing"""

# Rebuild a two-column frame (tweet, class) from the extracted lists.
df = pd.DataFrame(list(zip(text, clas)), columns=['tweet', 'class'])

"""# Finding if there is any missing data"""

print(df.isnull().sum())

"""# Converting the data into lower case."""

df['tweet'] = df['tweet'].str.lower()

"""# removing punctuations"""

punctuation_signs = list("?:!.,;")

for punct_sign in punctuation_signs:
    # regex=False: "?" and "." are regex metacharacters, so they must be
    # matched literally; being explicit also keeps the result identical
    # across pandas versions (the default for `regex` changed in 2.0).
    df['tweet'] = df['tweet'].str.replace(punct_sign, '', regex=False)

"""# Removing '\\n' and '\\t', extra spaces, quoting text, and possessive 's."""

df['tweet'] = df['tweet'].str.replace('\n', ' ', regex=False)
df['tweet'] = df['tweet'].str.replace('\t', ' ', regex=False)
df['tweet'] = df['tweet'].str.replace("    ", " ", regex=False)
df['tweet'] = df['tweet'].str.replace('"', '', regex=False)
df['tweet'] = df['tweet'].str.replace("'s", "", regex=False)

"""# removing stop-words"""

nltk.download('stopwords')
stop_words = list(stopwords.words('english'))
for stop_word in stop_words:
    # Whole-word match only. regex=True is required: with literal matching
    # (the pandas >= 2.0 default) the "\b" anchors would be searched for as
    # text and no stop word would ever be removed.
    regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
    df['tweet'] = df['tweet'].str.replace(regex_stopword, '', regex=True)

"""# Using Bag of Words approach for final data Preparation."""

# Bag-of-words features: vocabulary capped at 75 terms.
cv = CountVectorizer(max_features=75)
bow_counts = cv.fit_transform(df['tweet'])
X, y = bow_counts.toarray(), df['class']

"""# Splitting the Data using Stratified split"""

# 70/30 split that preserves the class proportions of `y` in both parts.
split = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current pyplot axes.

    Args:
        cm: 2-D array of counts; rows are labelled as true classes and
            columns as predicted classes.
        classes: sequence of tick labels, one per row/column of `cm`.
        normalize: if True, each row is divided by its sum before plotting,
            so both the colours and the annotations show per-row fractions.
        title: title placed above the plot.
        cmap: matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour bar and the cell
    # annotations all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of producing NaN/inf.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are rounded for readability; raw counts are shown as-is.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

"""# Using Random Forest Classifier as the Model and printing evaluating it using confusion matrix"""

def fit_evaluate_save(estimator, model_path):
    """Fit `estimator` on the training split, report test accuracy, plot and persist it.

    Reads the module-level `X_train`, `X_test`, `y_train` and `y_test`.

    Args:
        estimator: unfitted scikit-learn classifier.
        model_path: file name the fitted estimator is written to with joblib.

    Returns:
        Tuple `(fitted_estimator, test_predictions, accuracy, confusion_matrix)`.
    """
    fitted = estimator.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print("accuracy is: ", score)
    matrix = confusion_matrix(y_test, predictions)
    # Start a new figure per model; otherwise successive matrices are drawn
    # on top of each other when the cells run together.
    plt.figure()
    plot_confusion_matrix(matrix, classes=range(3),
                          title='Confusion matrix (%s)' % type(fitted).__name__)
    dump(fitted, model_path)
    return fitted, predictions, score, matrix


# random_state is fixed (same seed as the train/test split) so that the
# reported accuracies are reproducible across runs.
clf, y_pred, accuracy, CM = fit_evaluate_save(
    RandomForestClassifier(n_estimators=10, random_state=42), 'rf.joblib')

"""# Using Decision tree as the Model and printing evaluating it using confusion matrix"""

clf, y_pred, accuracy, CM = fit_evaluate_save(
    tree.DecisionTreeClassifier(random_state=42), 'decision.joblib')

"""# Using AdaBoost Classifier as the Model and printing evaluating it using confusion matrix"""

clf, y_pred, accuracy, CM = fit_evaluate_save(
    AdaBoostClassifier(n_estimators=100, random_state=42), 'ada.joblib')

"""# Converting the labels into categorical format"""

# One-hot encode the integer class labels for the softmax output.
# The ndim guard makes the cell safe to re-run: labels that are already
# one-hot (2-D) are not encoded a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=3, dtype='float32')
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes=3, dtype='float32')

"""# Creating and Training an LSTM Model"""

model = Sequential()
# NOTE(review): X_train holds CountVectorizer term counts, so the Embedding
# layer is looking up *counts* as if they were token ids; 232337 looks like
# a vocabulary size carried over from a tokenised pipeline - confirm intent.
model.add(Embedding(232337, 100, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(20, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
# Three mutually exclusive classes with one-hot targets and a softmax output
# call for categorical cross-entropy; binary cross-entropy scores each output
# unit as an independent yes/no problem.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=batch_size)

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
# 1-based epoch numbers for the x axis (kept separate from the `epochs` count).
epoch_axis = range(1, len(acc) + 1)

# Open a fresh figure so the curves are not drawn over an earlier plot.
plt.figure()
plt.plot(epoch_axis, acc, 'bo', label='Training accuracy')
plt.plot(epoch_axis, val_acc, 'r', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.figure()
plt.plot(epoch_axis, loss, 'bo', label='Training loss')
plt.plot(epoch_axis, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

"""# Saving the LSTM Model"""

model.save('lstm.h5')

tweet    0
class    0
dtype: int64


  df['tweet'] = df['tweet'].str.replace(punct_sign, '')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df['tweet'] = df['tweet'].str.replace(regex_stopword, '')


accuracy is:  0.8407531943510423
accuracy is:  0.8221923335574983
accuracy is:  0.8468056489576328
Epoch 1/5
 28/272 [==>...........................] - ETA: 2:39 - loss: 0.5728 - accuracy: 0.7606

# Incorporating BERT embeddings

In [None]:
# Import BERT tokenizer and embeddings
#!pip install pytorch-pretrained-bert
!pip install transformers
from transformers import BertTokenizer, BertModel

# Load pretrained BERT
bert = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize tweets using BERT tokenizer
encoded_tweets = [bert_tokenizer.encode_plus(tweet,
                   max_length=128,
                   pad_to_max_length=True)
                 for tweet in df['tweet'].values]

import torch
tweet_tokens = {'input_ids': torch.tensor([x['input_ids'] for x in encoded_tweets]),
                'attention_mask': torch.tensor([x['attention_mask'] for x in encoded_tweets])}
# Extract BERT embeddings for tokens
tweet_embeddings = bert(tweet_tokens['input_ids'])['last_hidden_state']

# Build model input using tweet BERT embeddings
tweet_input = Input(shape=(MAX_LEN, 768), dtype='float32')

# Pass tweet embeddings to LSTM
x = LSTM(64)(tweet_embeddings)

# Rest of the model same as before
x = Dense(32, activation='relu')(x)
out = Dense(3, activation='softmax')(x)

model = Model(tweet_input, out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Train model on tweet embeddings same way
model.fit(tweet_embeddings, y_train,
          epochs=10, batch_size=64)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


# Visualization

In [None]:
# Model evaluation
# NOTE(review): `X_test_bert` must hold the BERT embeddings of the held-out
# tweets, row-aligned with `y_test` - confirm it is defined before this runs.
y_pred = model.predict(X_test_bert)   # per-class probabilities
y_pred_labels = y_pred.argmax(1)

# `y_test` may be integer labels or one-hot rows; reduce it to class ids so
# it can be compared with the argmax predictions.
y_true = np.asarray(y_test)
if y_true.ndim > 1:
    y_true = y_true.argmax(axis=1)

display_labels = list(range(3))

# Accuracy
acc = accuracy_score(y_true, y_pred_labels)
print("Test Accuracy:", acc)

# Loss and Accuracy plots
# Read the curves from the model being evaluated (Keras stores the History
# of the latest fit on the model) rather than from a module-level `history`
# that may belong to an earlier model; accept either accuracy key name.
curves = model.history.history
acc_key = 'acc' if 'acc' in curves else 'accuracy'
plt.figure()
plt.plot(curves['loss'])
plt.plot(curves[acc_key])
plt.title('Model loss and accuracy')
plt.ylabel('Loss/Accuracy')
plt.xlabel('Epoch')
plt.legend(['Loss', 'Accuracy'], loc='upper left')
plt.show()

# Confusion matrix
cm = confusion_matrix(y_true, y_pred_labels, labels=display_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot(cmap=plt.cm.Blues)

# Classification report
print(classification_report(y_true, y_pred_labels))

# ROC Curve
# One-vs-rest curve per class, built from the predicted probabilities
# (`plot_roc_curve` only accepted fitted scikit-learn estimators and has
# been removed from scikit-learn).
fig, ax = plt.subplots()
for class_id in display_labels:
    fpr, tpr, _ = roc_curve((y_true == class_id).astype(int), y_pred[:, class_id])
    ax.plot(fpr, tpr, label='class %d (AUC = %.2f)' % (class_id, auc(fpr, tpr)))
ax.plot([0, 1], [0, 1], 'k--', label='chance')
ax.set_title('ROC Curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right')
plt.show()