# Comment Toxicity Model with Gradio App

## Step 0: Install Dependencies and Bring On Data

In [1]:
pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
df = pd.read_csv(
os.path.join('CommentToxicity-main','jigsaw-toxic-comment-classification-challenge','train.csv','train.csv')
)

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [83]:
# Toxic Comments
df[df['threat']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
79,003217c3eb469ba9,Hi! I am back again!\nLast warning!\nStop undo...,1,0,0,1,0,0
176,006b94add72ed61c,I think that your a Fagget get a oife and burn...,1,0,1,1,1,1
600,0199d6af27b715f3,I'm also a sock puppet of this account...SUPRI...,1,0,0,1,0,0
802,02230885017a50c5,"Fuck you, Smith. Please have me notified when ...",1,0,1,1,1,0
1017,02c6e41e4b317ac3,WOULDN'T BE THE FIRST TIME BITCH. FUCK YOU I'L...,1,1,1,1,1,1


In [5]:
df.iloc[7]['comment_text']

"Your vandalism to the Matt Shirvington article has been reverted.  Please don't do it again, or you will be banned."

In [6]:
df[df.columns[2:]].iloc[7]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 7, dtype: int64

In [7]:
# A Toxic Comment
df.iloc[16]['comment_text']

"Bye! \n\nDon't look, come or think of comming back! Tosser."

In [8]:
df[df.columns[2:]].iloc[16]

toxic            1
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 16, dtype: int64

## Step 1: Preprocessing The Data

In [9]:
from tensorflow.keras.layers import TextVectorization

In [10]:
# Vectorization and Tokenization 

In [11]:
# Split out our data into comments and features
df['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [12]:
# df[df.columns[2:]]
# df[df.columns[2:]].values
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [13]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [14]:
MAX_FEATURES = 200000 # Number of Words in the Vocab

In [15]:
vectorizer = TextVectorization(
    max_tokens = MAX_FEATURES,
    output_sequence_length=1800,
    output_mode='int'
)

In [16]:
vectorizer.adapt(X.values)

In [17]:
vectorizer('Hello world, life is great!')[0:7]

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([288, 263, 306,   9, 275,   0,   0], dtype=int64)>

In [18]:
# vectorizer.get_vocabulary()

In [19]:
vectorized_text = vectorizer(X.values)

In [20]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [21]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [22]:
batch_X,batch_y = dataset.as_numpy_iterator().next()

In [23]:
batch_y.shape

(16, 6)

In [24]:
int(len(dataset)*0.7)

6981

In [25]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

## Step 2: Create a Sequential Model 

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [30]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [31]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [33]:
# Run About 5-10 epochs
history = model.fit(train, epochs=1, validation_data=val)



In [34]:
history.history

{'loss': [0.061934374272823334], 'val_loss': [0.04580554738640785]}

In [None]:
plt.figure(figsize(8,5))
pd.DataFrame(history.history).plot()
plt.show()

## Step 3: Make Predictions

In [40]:
input_text = vectorizer('You freaking suck! I am gonna kill you.')

In [41]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([   7, 7158,  397, ...,    0,    0,    0], dtype=int64)>

In [37]:
# Making Our Prediction

In [38]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [42]:
model.predict(np.expand_dims(input_text,0))



array([[0.98005337, 0.3890112 , 0.8946436 , 0.07185384, 0.76219624,
        0.28174257]], dtype=float32)

In [43]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [64]:
batch_X,batch_y = test.as_numpy_iterator().next()

In [65]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [66]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

## Step 4: Evaluate The Model 

In [72]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [73]:
 pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [74]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)















In [75]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8382193446159363, Recall:0.6666666865348816, Accuracy:0.48445335030555725


## Test and Gradio

In [76]:
!pip install gradio jinja2

Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-3.34.0-py3-none-any.whl (20.0 MB)
                                              0.0/20.0 MB ? eta -:--:--
                                              0.5/20.0 MB 14.2 MB/s eta 0:00:02
     --                                       1.3/20.0 MB 16.9 MB/s eta 0:00:02
     ----                                     2.1/20.0 MB 16.6 MB/s eta 0:00:02
     ------                                   3.1/20.0 MB 18.0 MB/s eta 0:00:01
     -------                                  3.9/20.0 MB 19.3 MB/s eta 0:00:01
     ----------                               5.0/20.0 MB 18.9 MB/s eta 0:00:01
     ------------                             6.1/20.0 MB 19.5 MB/s eta 0:00:01
     -------------                            6.9/20.0 MB 19.3 MB/s eta 0:00:01
     ---------------                          7.8/20.0 MB 19.3 MB/s eta 0:00:01
     -----------------                        8.7/20.0 MB



In [77]:
import tensorflow as tf
import gradio as gr

In [78]:
model.save('toxicity1.h5')

In [79]:
model = tf.keras.models.load_model('toxicity1.h5')

In [80]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [81]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

  super().__init__(
  super().__init__(


In [82]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://9aa12a2a3977423e23.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




