IMPORTING LIBRARIES

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.layers import TextVectorization

READING THE DATA

In [7]:
df_train=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
df_test=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')
df_testlabel=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test_labels.csv')

In [8]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
df_train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
df_train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

Splitting the data into features and comments

In [11]:
X=df_train['comment_text']
y=df_train[df_train.columns[2:]].values

Preprocessing layer for mapping text features to integer sequences

In [12]:
#number of words
max_features=100000 

Initializing the text vectorization layers 

In [13]:
vectorizer=TextVectorization(max_tokens=max_features,output_sequence_length=900,output_mode='int')

Teach the vectorizer vocabulary

In [14]:
vectorizer.adapt(X.values)

Getting the comments as form of integer arrays

In [15]:
vectorized_text = vectorizer(X.values)
vectorized_text

<tf.Tensor: shape=(159571, 900), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

Creating tensorflow data pipeline using map,cache,shuffle,batch and prefetch

In [16]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 900), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.int64, name=None))>

Creating training, testing and validation data

In [17]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

Creating model

In [18]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(max_features+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

Compiling

In [19]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

Summary

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          3200032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

Running the model

In [21]:
history = model.fit(train, epochs=1, validation_data=val)



Preprocessing the testing data 

In [22]:
X_test=df_test['comment_text']

In [23]:
vectorizer.adapt(X_test.values)

In [24]:
input_text = vectorizer(X_test)

In [25]:
result=model.predict(input_text)
result

array([[5.22749722e-02, 9.51005131e-05, 5.95942140e-03, 1.56790018e-03,
        1.80419087e-02, 2.07683444e-03],
       [2.15822458e-03, 2.92833107e-07, 2.06857920e-04, 3.22074047e-05,
        7.23391771e-04, 4.20071374e-05],
       [1.71756148e-02, 2.62295980e-05, 2.68772244e-03, 7.12722540e-04,
        6.82350993e-03, 8.42034817e-04],
       ...,
       [9.51260328e-04, 5.06756130e-08, 7.53703716e-05, 9.62138620e-06,
        3.00019979e-04, 1.28710435e-05],
       [9.65416431e-04, 4.99126109e-08, 7.48544044e-05, 9.51695074e-06,
        3.01033258e-04, 1.27350631e-05],
       [7.95790553e-03, 4.37971994e-06, 9.68843699e-04, 2.06530094e-04,
        2.89219618e-03, 2.54422426e-04]], dtype=float32)

Evaluating the model using Precision, Recall and Categorical Accuracy

In [26]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [27]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()
for batch in test.as_numpy_iterator(): 
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8407940864562988, Recall:0.6237366199493408, Accuracy:0.4603811502456665


In [28]:
final_result=pd.DataFrame(result)
final_result

Unnamed: 0,0,1,2,3,4,5
0,0.052275,9.510051e-05,0.005959,0.001568,0.018042,0.002077
1,0.002158,2.928331e-07,0.000207,0.000032,0.000723,0.000042
2,0.017176,2.622960e-05,0.002688,0.000713,0.006824,0.000842
3,0.002947,5.739696e-07,0.000302,0.000051,0.001014,0.000065
4,0.019841,3.457710e-05,0.003140,0.000854,0.007949,0.001011
...,...,...,...,...,...,...
153159,0.016542,2.361660e-05,0.002520,0.000661,0.006537,0.000783
153160,0.000441,1.074076e-08,0.000031,0.000003,0.000133,0.000005
153161,0.000951,5.067561e-08,0.000075,0.000010,0.000300,0.000013
153162,0.000965,4.991261e-08,0.000075,0.000010,0.000301,0.000013


In [29]:
final_result.to_csv('prediction.csv')