dataset from kaggle 
url:https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df_train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
df_train.shape

(159571, 8)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
df_train.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
import re

def cleaning_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub('<.*?>','',text)
    text = re.sub('[^\w\d\s]','',text)
    text = re.sub('^\s+|\s+?$','',text)
    return text

In [9]:
import spacy

nlp = spacy.load("en_core_web_sm") 

def text_preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)

In [10]:
df_train['comment_text'] = [cleaning_text(text) for text in df_train['comment_text']]

In [11]:
df_train['comment_text'] = [text_preprocess(text) for text in df_train['comment_text']]

In [12]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation edit username Hardcore Metallica F...,0,0,0,0,0,0
1,000103f0d9cfb60f,D aww match background colour m seemingly st...,0,0,0,0,0,0
2,000113f07ec002fd,hey man m try edit war s guy constantly re...,0,0,0,0,0,0
3,0001b41b1c6bb37e,t real suggestion improvement wonder sectio...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page s,0,0,0,0,0,0


In [13]:
x = df_train['comment_text']
y = df_train[df_train.columns[2:]].values

In [14]:
MAX_FEATURES = 50000

In [15]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=MAX_FEATURES,ngrams=(1,2),output_mode='int',
                               output_sequence_length=700)

In [16]:
vectorizer.adapt(x.values)

In [17]:
vectorized_text = vectorizer(x.values)

In [18]:
vectorized_text

<tf.Tensor: shape=(159571, 700), dtype=int64, numpy=
array([[  441,     8,   481, ...,     0,     0,     0],
       [   62, 31419,   899, ...,     0,     0,     0],
       [  256,   191,    16, ...,     0,     0,     0],
       ...,
       [    1,  9087,     2, ...,     0,     0,     0],
       [   33,     9,   115, ...,     0,     0,     0],
       [   14,     4,    10, ...,     0,     0,     0]], dtype=int64)>

In [19]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) 

In [20]:
batch_x,batch_y = dataset.as_numpy_iterator().next()

In [21]:
len(dataset)

9974

In [22]:
int(len(dataset)*.7)

6981

In [23]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [24]:
len(train)

6981

In [25]:
len(val)

1994

In [26]:
len(test)

997

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [28]:
model = Sequential()

model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))

model.add(Dense(6, activation='sigmoid'))

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          1600032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 256)               16640     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                        

In [30]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam',metrics=['accuracy'])

In [31]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy')>0.98):
            print("\nReached 98% accuracy so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()

In [32]:
history = model.fit(train,epochs=10,validation_data = val,callbacks=[callbacks]) 

Epoch 1/10
Epoch 2/10
Reached 98% accuracy so cancelling training!


In [33]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)

    y_true = y_true.flatten()
    yhat = yhat.flatten() 

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8408525586128235, Recall:0.6835114359855652, Accuracy:0.476429283618927


In [34]:
import gradio

In [35]:
model.save('comment_toxicity.h5')

In [36]:
input_str = vectorizer('i hate you so much i am coming to kill you')

In [37]:
res = model.predict(np.expand_dims(input_str,0))



In [38]:
res

array([[0.48210755, 0.01113031, 0.13770162, 0.02511684, 0.23189002,
        0.07169584]], dtype=float32)

In [39]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df_train.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [40]:
interface = gradio.Interface(fn=score_comment,inputs='text',outputs='text')

In [41]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://484cc46d-d3ba-4921.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




