dataset from kaggle 
url:https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df_train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
df_train.shape

(159571, 8)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
df_train.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
from tensorflow.keras.layers import TextVectorization

In [9]:
x = df_train['comment_text']
y = df_train[df_train.columns[2:]].values

In [10]:
MAX_FEATURES = 50000

In [11]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,standardize='lower_and_strip_punctuation',split='whitespace',
                               ngrams=(1,2),output_mode='int',output_sequence_length=700,vocabulary_size=5000)

In [12]:
vectorizer.adapt(x.values)

In [13]:
vectorized_text = vectorizer(x.values)

In [14]:
vectorized_text

<tf.Tensor: shape=(159571, 700), dtype=int64, numpy=
array([[ 1027,    85,     2, ...,     0,     0,     0],
       [    1,    58,  5471, ...,     0,     0,     0],
       [  605,   631,    79, ...,     0,     0,     0],
       ...,
       [    1, 22416,   527, ...,     0,     0,     0],
       [    5,    12,   809, ...,     0,     0,     0],
       [    5,     8,   163, ...,     0,     0,     0]], dtype=int64)>

In [15]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) 

In [16]:
batch_x,batch_y = dataset.as_numpy_iterator().next()

In [17]:
len(dataset)

9974

In [18]:
int(len(dataset)*.8)

7979

In [19]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.3))

In [20]:
len(train)

6981

In [21]:
len(val)

2992

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [23]:
model = Sequential()

model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))

model.add(Dense(6, activation='sigmoid'))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          1600032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 256)               16640     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                        

In [25]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam',metrics=['accuracy'])

In [26]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy')>0.98):
            print("\nReached 98% accuracy so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()

In [27]:
history = model.fit(train,epochs=10,validation_data = val,callbacks=[callbacks]) 

Epoch 1/10
Epoch 2/10
Reached 98% accuracy so cancelling training!


In [28]:
df_test = pd.read_csv('test.csv')

In [29]:
df_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [30]:
df_test.shape

(153164, 2)

In [31]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [32]:
test = df_test['comment_text']

In [33]:
test.head()

0    Yo bitch Ja Rule is more succesful then you'll...
1    == From RfC == \n\n The title is fine as it is...
2    " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3    :If you have a look back at the source, the in...
4            I don't anonymously edit articles at all.
Name: comment_text, dtype: object

In [34]:
vectorizer.adapt(test.values)

In [35]:
vectorized_test_text = vectorizer(test.values)

In [36]:
vectorized_test_text

<tf.Tensor: shape=(153164, 700), dtype=int64, numpy=
array([[ 2801,   282,  8829, ...,     0,     0,     0],
       [   32,  1890,     2, ...,     0,     0,     0],
       [  121,     1, 42753, ...,     0,     0,     0],
       ...,
       [    1,  1988,     9, ...,     0,     0,     0],
       [   52,     5,     2, ...,     0,     0,     0],
       [  233,   303,    22, ...,     0,     0,     0]], dtype=int64)>

In [37]:
test_results = model.predict(vectorized_test_text)



In [38]:
(test_results > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [39]:
test_results.shape

(153164, 6)

In [40]:
df_test_labels = pd.read_csv('test_labels.csv')

In [41]:
df_test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


In [None]:
yTrue =  df_test_labels[df_test_labels.columns[1:]].values

In [None]:
yTrue = (yTrue - np.min(yTrue)) / (np.max(yTrue) - np.min(yTrue))

In [None]:
yTrue

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
yTrue.shape

(153164, 6)

In [None]:
yhat = test_results.flatten()
y_true = yTrue.flatten() 

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
pre.update_state(y_true, yhat)
re.update_state(y_true, yhat)
acc.update_state(y_true, yhat)

<tf.Variable 'UnreadVariable' shape=() dtype=float32, numpy=1.0>

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.4342195987701416, Recall:0.024633467197418213, Accuracy:0.0


In [None]:
import gradio

In [None]:
model.save('comment_toxicity.h5')

In [None]:
input_str = vectorizer('i hate you so much i am coming to kill you')

In [None]:
res = model.predict(np.expand_dims(input_str,0))



In [None]:
res

array([[0.7702635 , 0.01856275, 0.3336253 , 0.02559903, 0.3471496 ,
        0.06040946]], dtype=float32)

In [59]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df_train.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [60]:
interface = gradio.Interface(fn=score_comment,inputs='text',outputs='text')

In [61]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://95d8bda2-f2b5-4250.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




