<h1> Multi Label toxic comments using RNN(GRU)</h1>

In [4]:
# %pip install tensorflow pandas matplotlib scikit-learn pandas numpy

In [5]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [6]:
df = pd.read_csv('data//train.csv')

In [7]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
df.head(1)['comment_text']

0    Explanation\nWhy the edits made under my usern...
Name: comment_text, dtype: object

In [9]:
df.query('identity_hate == 1')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
105,00472b8e2d38d1ea,A pair of jew-hating weiner nazi schmucks.,1,0,1,0,1,1
176,006b94add72ed61c,I think that your a Fagget get a oife and burn...,1,0,1,1,1,1
218,008e0818dde894fb,"Kill all niggers. \n\nI have hard, that others...",1,0,1,0,1,1
238,0097dd5c29bf7a15,u r a tw@ fuck off u gay boy.U r smelly.Fuck u...,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...
159281,fb726deec64157bd,LoL!! \n\nyou're GAY!! you will never know how...,1,1,1,0,1,1
159336,fc3efa2f6f025f6d,"Oh, fuck off. The pansy Jew would just whine a...",1,0,1,0,1,1
159400,fd052883fa6a8697,"Shalom \n\nSemite, get the fuck out of here. I...",1,1,1,1,1,1
159449,fdce660ddcd6d7ca,I think he is a gay fag!!!,1,0,0,0,0,1


In [10]:
df.columns[2:]                                             

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

### Preprocessing

In [11]:
from tensorflow.keras.layers import TextVectorization

In [12]:
x = df['comment_text'] # all the text data(the input values)
y = df[df.columns[2:]].values # labels for each text data as an array of integers(the target values)

In [13]:
MAX_FEATURES = 200000 # No. of unique words to be stored in the model.

In [14]:
vectorizer  = TextVectorization(max_tokens = MAX_FEATURES,
                                output_sequence_length = 1800,
                                output_mode = 'int')  
# max_tokens - Total number of tokens/words to store 
# output_sequance_length - Max length of one sentance(no. of words)
# output_mode = int - the words are mapped to integers.

In [15]:
# the textVectorization lowercases and removes punctuation.

In [16]:
vectorizer.adapt(x.values) # x.values converts df into numpy array
# .adapt is ued to build a vocabulary (does a little bit of preprocessing for us)
# the vocabulary is ordered by the frequency of that word.

In [17]:
vectorizer('Hello World ,  opop you')[:4] # '1' token means tht the word was not seen at adapt time.
# 0 means that there is no word at that position

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([288, 263,   1,   7], dtype=int64)>

In [18]:
vectorized_text = vectorizer(x.values)
vectorized_text #(159571 - no. of sentences,1800 - max no. of words in a sentence set by us)

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

Creating a Data pipeline

In [19]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y)) # create a tf.data.datset object where input is vectorized text and output is multilabels (supervised learning)
dataset = dataset.cache()  # catches the dataset in memory which saves time as we dont have to load data  from the disk after each epoch 
dataset = dataset.shuffle(160000)  # shuflle the dataset so that NN does not earn the order of input which may result in overfitting
dataset = dataset.batch(32) # create batches of the datset helps in parallel processing 32 is the batch size
dataset = dataset.prefetch(8) # tells how many batches have to be prepared before the end of execution of current batch

In [20]:
train = dataset.take(int(len(dataset)*0.7))  # get the first 70% of the data as train data take takes in the number of batches to extract
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2)) # skip the fisrt 70% as it is training data, of the 30% first 20% is taken as validation data
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1)) # the data after 90% is left for testing

In [21]:
train_generator = train.as_numpy_iterator()
batch_x,batch_y  = train_generator.next() # numpy generator is an iterator that can be used to iterate through the batches in iterator

In [22]:
batch_x.shape # 32 - Batch size ,1800 - make words per sentence cap set by us


(32, 1800)

In [23]:
batch_y.shape # 32 batch size ,6 number of labels            

(32, 6)

In [24]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

### Create Sequential Model

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,GRU

In [26]:
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():

    model = Sequential() # instantiate a Sequntial Model

    # embedding layer
    model.add(Embedding(MAX_FEATURES+1,32)) # +1 to handle out-of-vocab tokens that might appear later in real world scenarios 
    # 32 - no. of features the embedding of one token contains


    #Bidirectional LSTM 
    model.add(Bidirectional(GRU(32,activation = 'tanh')))

    # three dense(fully connected) layers that will do feature extraction
    model.add(Dense(128,activation = 'relu'))
    model.add(Dense(256,activation = 'relu'))
    model.add(Dense(128,activation = 'relu'))

    # final layer 
    model.add(Dense(6,activation = 'sigmoid')) # why 6 nodes - becasue the final output has 6 classes and because we are classifying the activation func is sigmoid
    # sigmoid converts any value between 0 and 1 which actually is our output format for the 6 labels
    
    model.compile(loss = 'BinaryCrossentropy',optimizer = 'Adam')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [27]:
model.summary() 
# Why LSTM has 64 nodes because two LSTM have been created due to it being bidirectional (one for forward propogation and one for back)

In [28]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


Only comment if executing first time.

In [29]:
# history = model.fit(train,epochs = 10,validation_data = val )

In [30]:
# model.save('biGRU.h5')

In [31]:
from tensorflow.keras.models import load_model
model = load_model('biGRU.h5') 



In [32]:
# from matplotlib import pyplot as plt

In [33]:
# plt.figure(figsize = (10,10))
# pd.DataFrame(history.history).plot()
# plt.show()

<h3>Evaluate The Model</h3>

In [34]:
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [35]:
pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [36]:
for batch in test.as_numpy_iterator():  #the test dataset has some no. of batches of size 16 each
    x_test,y_test = batch
    y_pred = (model.predict(x_test)>0.5).astype(int)
    
    y_test = y_test.flatten()
    y_pred = y_pred.flatten()
    
    pre.update_state(y_test,y_pred) # after calculating for the particular batch it updates the overall metrics for all the batches passed till that point 
    rec.update_state(y_test,y_pred) # at the end we will have these metric values for the entire test database even though
    acc.update_state(y_test,y_pred)
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 389ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9

In [37]:
print(f"Precision = {pre.result().numpy()}, Recall = {rec.result().numpy()},Accuracy = {acc.result().numpy()}")

Precision = 0.8707948923110962, Recall = 0.857519805431366,Accuracy = 0.8373494148254395


### Make Predictions 

In [38]:
input = "I  will kill you"

In [39]:
input_text = vectorizer(input)

In [40]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([  8,  44, 950, ...,   0,   0,   0], dtype=int64)>

In [41]:
predictions = model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step


In [42]:
predictions

array([[0.9923997 , 0.06405305, 0.01577269, 0.6505518 , 0.03707685,
        0.03473576]], dtype=float32)

In [43]:
labels  = df.columns[2:]

In [44]:
labels

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [45]:
for i in range(0,len(labels)):
    print(f"{labels[i]}:{predictions[0][i]>0.5}")

toxic:True
severe_toxic:False
obscene:False
threat:True
insult:False
identity_hate:False
