<a href="https://colab.research.google.com/github/rakibulhaque9954/Comment_Flag_LSTM_Model/blob/main/lstm_comment_flagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import pandas as pd### data manipulation
import datetime
import pathlib
import io
import os
import re
import string
import time
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from keras.models import Model
from keras.layers import Layer
from keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from keras.optimizers import Adam

# Dataset Preparation

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/Colab\ Notebooks/train.csv /content/




In [4]:
df = pd.read_csv('train.csv')



In [5]:
df.sample(5)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
35114,5dc3503c26fd5246,"East New York, Brooklyn vs. New Lots, Brooklyn...",0,0,0,0,0,0
33277,589e4b69392c99df,"""\n\n Invitation \n\n For Wikipedia's 11th An...",0,0,0,0,0,0
39795,6a4237d006dba76b,2007 (UTC)\n\nHave now split the section into ...,0,0,0,0,0,0
39357,69062e50266408f3,My wife said she used to read these. She feel...,0,0,0,0,0,0
125358,9e8ee0af14a4c1c5,"""== Turkey etc. ==\n\nHallo !\n\nTurkey and th...",0,0,0,0,0,0


In [6]:
df.shape

(159571, 8)

In [7]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [8]:
X.shape
y.shape


(159571, 6)

In [9]:
X.head()



0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [10]:
print(y)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [11]:
def standardization(input_data):
  """Input: Raw Data(review, etc)
     Output: Standardized Data(review, etc)
     conversion to lower case for data regualrity,
     removal of html tags,
     regex means regularization text,
     removal of punctuations,
     removal of special characters,
     remove accented characters,
     stemming for example: discussion, discussed, discussing are the same word from the root discuss
     lemmatization for example: tensed when reduced to stem its tens which makes no sense, thats why lemmatization
     is used to provide the base for of the word, in this case tensed is turned into a lemma which is tense(again this two methods
     have their own use cases depending on the scenario)
  """
  lower_case = tf.strings.lower(input_data)
  html_tags = tf.strings.regex_replace(lower_case, '<[^>]+>', ' ')
  output = tf.strings.regex_replace(html_tags, '[%s]' % re.escape(string.punctuation), ' ')


  return output

In [12]:
VOCABULARY_SIZE = 200000

In [13]:
vectorize_layer = TextVectorization(
    max_tokens=VOCABULARY_SIZE,
    standardize=standardization,
    output_mode='int',
    output_sequence_length=500)



In [14]:
vectorize_layer.adapt(X.values)
vectorized_text = vectorize_layer(X.values)




In [15]:

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks



In [16]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [49]:
model = tf.keras.Sequential()
# Create the embedding layer
model.add(Embedding(VOCABULARY_SIZE + 1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(6, activation='sigmoid'))

In [50]:
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 6)                 774       
                                                      

In [51]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])



In [53]:
history = model.fit(train, epochs=20, validation_data=val)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:


model.evaluate(test)




[0.004912625066936016, 0.994546115398407]

In [55]:


classname = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [56]:
import re
input_text = vectorize_layer('go hit yourself')
input_text = np.expand_dims(input_text, axis = 0)
res = model.predict(input_text)
pred = (res > 0.5).astype(int)
print(pred)
# print(classname[pred])
# batch_X, batch_y = test.as_numpy_iterator().next()
# (model.predict(batch_X) > 0.5).astype(int)
# predictions = (model.predict(batch_X) > 0.5).astype(int)
# for pred in predictions:
#     print(classname[pred[0]])


[[1 0 1 0 1 0]]


In [60]:
import re
full_model = tf.keras.models.Sequential([
    vectorize_layer,
    model
])
full_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 500)               0         
 ctorization)                                                    
                                                                 
 sequential_2 (Sequential)   (None, 6)                 6491686   
                                                                 
Total params: 6491686 (24.76 MB)
Trainable params: 6491686 (24.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
full_model.save('/content/drive/MyDrive/lstm_flagger_full')



In [68]:
custom_objects = {
    'TextVectorization': tf.keras.layers.TextVectorization,
    'standardization': standardization
}

In [69]:
reloaded_model = tf.keras.models.load_model('/content/drive/MyDrive/lstm_flagger_full', custom_objects=custom_objects)




In [70]:
text = np.array(['go die'])
text = np.expand_dims(text, axis=0)
res = reloaded_model.predict(text)
pred = (res > 0.5).astype(int)
print(pred)

[[1 0 0 1 0 0]]


In [57]:
model.save('/content/drive/MyDrive/lstm_flagger.h5')

  saving_api.save_model(


In [None]:
from keras.metrics import Precision, Recall, CategoricalAccuracy
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [58]:
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np
from keras.metrics import Precision, Recall

# Define your model's output number of classes
num_classes = 6  # Make sure to set this to the number of classes you have

# Instantiate the metrics
pre = Precision()
re = Recall()

# These lists will store all the true and predicted labels for later computation
true_labels = []
predicted_labels = []

for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_test, y_test = batch
    # Make predictions
    y_pred = model.predict(X_test)

    # Flatten the predictions for Precision and Recall
    y_test_flat = y_test.flatten()
    y_pred_flat = y_pred.flatten()

    # Update Precision and Recall states
    pre.update_state(y_test_flat, np.round(y_pred_flat))
    re.update_state(y_test_flat, np.round(y_pred_flat))

    # Store the true and predicted labels for F1 and ROC AUC
    true_labels.append(y_test)
    predicted_labels.append(y_pred)

# Now concatenate to form a single array for F1 and ROC AUC
true_labels = np.concatenate(true_labels, axis=0)
predicted_labels = np.concatenate(predicted_labels, axis=0)

# Calculate F1 score for each class
f1_scores = [f1_score(true_labels[:, i], np.round(predicted_labels[:, i]), average='binary') for i in range(num_classes)]

# Calculate ROC AUC for each class
roc_auc_scores = [roc_auc_score(true_labels[:, i], predicted_labels[:, i]) for i in range(num_classes)]

# Calculate the macro-average F1 score and the average ROC AUC
average_f1_score = np.mean(f1_scores)
average_roc_auc = np.mean(roc_auc_scores)

# Print out Precision and Recall
precision_result = pre.result().numpy()
recall_result = re.result().numpy()

print("Precision:", precision_result)
print("Recall:", recall_result)
print("F1 Scores per class:", f1_scores)
print("ROC AUC Scores per class:", roc_auc_scores)
print("Average F1 Score:", average_f1_score)
print("Average ROC AUC Score:", average_roc_auc)


Precision: 0.9815121
Recall: 0.9700027
F1 Scores per class: [0.9888570518943013, 0.8372093023255814, 0.9795221843003413, 0.9056603773584904, 0.980975029726516, 0.9504950495049506]
ROC AUC Scores per class: [0.9999398234416232, 0.999341997923109, 0.9999269498472373, 0.999914623624991, 0.9999253143206359, 0.9998893215739206]
Average F1 Score: 0.9404531658516969
Average ROC AUC Score: 0.9998230051219196


In [35]:
# print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.9024651646614075, Recall:0.934257984161377, Accuracy:0.520561695098877
