# Importing Dataset and Libraries

In [1]:
import os
import pandas as pd

import numpy as np

import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("C:\\Users\\Gaurav Tripathi\\Desktop\\AD 3\\train.csv")

In [3]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


# Preprocessing

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X = df['comment_text']
y = df[df.columns[2:]].values
X
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [6]:
MAX_FEATURES = 200000
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [7]:
vectorizer.adapt(X.values)

In [8]:
type(X.values)
vectorizer("hello there how are you doing")[:6]

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([288,  41,  73,  20,   7, 273], dtype=int64)>

In [9]:
vectorized_text = vectorizer(X.values)


In [10]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [11]:
# we are creating tensorflow data pipe line

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)   # helps to prevent bottelnecking 

In [12]:
batch_x, batch_y=dataset.as_numpy_iterator().next()

In [13]:
print(batch_x)
batch_x.shape

[[   425     41    171 ...      0      0      0]
 [    30    281    141 ...      0      0      0]
 [   171  37178     13 ...      0      0      0]
 ...
 [  3106   1371     32 ...      0      0      0]
 [  1927    803   1540 ...      0      0      0]
 [196116      8     25 ...      0      0      0]]


(16, 1800)

In [14]:
train = dataset.take(int(len(dataset)*.7))   # this gives 70% of the data to training set

val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))  # 20% for validation 

test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # 10% for testing 

print(len(train))
print(len(val))
print(len(test))

6981
1994
997


In [15]:
train_generator=train.as_numpy_iterator()

In [16]:
train_generator.next()

(array([[  139,   224,     5, ...,     0,     0,     0],
        [   67,    18,     6, ...,     0,     0,     0],
        [   20,     7,   832, ...,     0,     0,     0],
        ...,
        [   14,   266,     6, ...,     0,     0,     0],
        [   21,  1778,    77, ...,     0,     0,     0],
        [68857,    64, 26730, ...,     0,     0,     0]], dtype=int64),
 array([[1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0]], dtype=int64))

# Create Sequential Model

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [18]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM (long short-term menory) Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [19]:
model.compile(optimizer='Adam', loss='binary_crossentropy')
model.summary()


In [21]:
history=model.fit(train, epochs=5, validation_data=val)

Epoch 1/5
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3685s[0m 527ms/step - loss: 0.0825 - val_loss: 0.0468
Epoch 2/5
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3647s[0m 522ms/step - loss: 0.0469 - val_loss: 0.0411
Epoch 3/5
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3490s[0m 500ms/step - loss: 0.0423 - val_loss: 0.0354
Epoch 4/5
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3382s[0m 484ms/step - loss: 0.0369 - val_loss: 0.0322
Epoch 5/5
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3607s[0m 517ms/step - loss: 0.0323 - val_loss: 0.0301


In [20]:
from matplotlib import pyplot as plt

# Make Predictions

In [21]:
input_text = vectorizer('i will kill you someday')
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([  8,  44, 950, ...,   0,   0,   0], dtype=int64)>

In [22]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [23]:
res = model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


In [24]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [25]:
arr=(res > 0.5).astype(int)
arr>0.5


array([[ True,  True,  True, False,  True, False]])

# Evaluate Model

In [63]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [64]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [99]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [100]:
print(f'Precision: {pre.result().numpy()},\n Recall:{re.result().numpy()},\n Accuracy:{acc.result().numpy()}')

Precision: 0.060814134776592255,
 Recall:0.7598828673362732,
 Accuracy:0.0521564707159996


# Test and Gradio

In [26]:
import gradio as gr
import tensorflow as tf

In [27]:
model.save('toxicity.keras')


In [28]:
import gradio as gr

In [29]:
def greet(comment):
    input_text=vectorizer(comment)
    batch_X, batch_y = test.as_numpy_iterator().next()
    res = model.predict(np.expand_dims(input_text,0))
    
    
    col1=["Toxic","Sever_Toxic","Obscene","Threat","Insult","Identity_Hate"]
    text = ' '
    for idx, col in enumerate(col1):
        text += '{}: {}\n'.format(col, res[0][idx]>0.5)
    
    return text
    
    

demo = gr.Interface(
    fn=greet,
    inputs=["text"],
    outputs=["text"],
)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.28.2, however version 4.29.0 is available, please upgrade.
--------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
