<a href="https://colab.research.google.com/github/rakibulhaque9954/Comment_Flag_LSTM_Model/blob/main/lstm_comment_flagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import pandas as pd### data manipulation
import datetime
import pathlib
import io
import os
import re
import string
import time
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from keras.models import Model
from keras.layers import Layer
from keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from keras.optimizers import Adam

# Dataset Preparation

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/Colab\ Notebooks/train.csv /content/




In [4]:
df = pd.read_csv('train.csv')



In [16]:
df.sample(5)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
13210,22f74a13587a44e7,No worries. Good job on the Alan Scott article...,0,0,0,0,0,0
126859,a666dfea05083005,Another Three Revert Rule warning on Southern ...,0,0,0,0,0,0
1324,039b2a3155b61e28,No matter how many times I rethink it I cannot...,0,0,0,0,0,0
147261,396e51c0d5dd9842,""" \n\nBlake's 7\nHello, Barbacana, and welcom...",0,0,0,0,0,0
125367,9e95adfa776417f0,There! I got a little (very little) info on ba...,0,0,0,0,0,0


In [12]:
df.shape

(159571, 8)

In [5]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [19]:
X.shape
y.shape


(159571, 6)

In [20]:
X.head()



0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [23]:
print(y)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [6]:
def standardization(input_data):
  """Input: Raw Data(review, etc)
     Output: Standardized Data(review, etc)
     conversion to lower case for data regualrity,
     removal of html tags,
     regex means regularization text,
     removal of punctuations,
     removal of special characters,
     remove accented characters,
     stemming for example: discussion, discussed, discussing are the same word from the root discuss
     lemmatization for example: tensed when reduced to stem its tens which makes no sense, thats why lemmatization
     is used to provide the base for of the word, in this case tensed is turned into a lemma which is tense(again this two methods
     have their own use cases depending on the scenario)
  """
  lower_case = tf.strings.lower(input_data)
  html_tags = tf.strings.regex_replace(lower_case, '<[^>]+>', ' ')
  output = tf.strings.regex_replace(html_tags, '[%s]' % re.escape(string.punctuation), ' ')


  return output

In [7]:
VOCABULARY_SIZE = 200000

In [8]:
vectorize_layer = TextVectorization(
    max_tokens=VOCABULARY_SIZE,
    standardize=standardization,
    output_mode='int',
    output_sequence_length=1800)



In [9]:
vectorize_layer.adapt(X.values)
vectorized_text = vectorize_layer(X.values)




In [10]:

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks



In [11]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [12]:
model = tf.keras.Sequential()
# Create the embedding layer
model.add(Embedding(VOCABULARY_SIZE + 1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(6, activation='sigmoid'))

In [34]:
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [13]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])



In [14]:
history = model.fit(train, epochs=1, validation_data=val)





In [18]:


model.evaluate(test)




[0.04314778000116348, 0.993166983127594]

In [21]:

input_text = vectorize_layer('You freaking suck! I am going to hit you.')
input_text = np.expand_dims(input_text, axis = 0)
res = model.predict(input_text)
(res > 0.5).astype(int)
batch_X, batch_y = test.as_numpy_iterator().next()
(model.predict(batch_X) > 0.5).astype(int)
res.shape



(1, 6)

In [22]:
from keras.metrics import Precision, Recall, CategoricalAccuracy
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [23]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8140080571174622, Recall:0.6987917423248291, Accuracy:0.48846539855003357
