#### Importing libraries

In [46]:
import os
import matplotlib.pyplot as plt
import numpy as np 
import tensorflow as tf
import pandas as pd

In [47]:
df = pd.read_csv(os.path.join('/Users/rahulsharma/Desktop/toxicComment/dataset','train.csv'))

In [48]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [49]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [50]:
from tensorflow.keras.layers import TextVectorization

In [51]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [52]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [53]:
MAX_FEATURES = 200000 # no of words in the vocab

In [54]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=1800, output_mode = 'int')

Now that the vocab layer has been created, call `adapt` on the text-only dataset to create the vocabulary.

In [55]:
vectorizer.adapt(X.values)

In [56]:
vectorizer('hello world this code is written in python')[:8]

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 288,  263,   14, 1349,    9,  367,   11, 9696])>

In the line `vectorized_text = vectorizer(X.values)`, you are using the vectorizer object to transform your input data into a numerical representation.

In [57]:
vectorized_text = vectorizer(X.values)

Creating a tensorflow data pipeline

In [58]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

getting one batch out of the dataset

In [59]:
dataset.as_numpy_iterator().next()

(array([[     8,    127,      2, ...,      0,      0,      0],
        [     5,    225,    479, ...,      0,      0,      0],
        [  1419,     29,    324, ...,      0,      0,      0],
        ...,
        [  1514,    865,    562, ...,      0,      0,      0],
        [    48,     15,     95, ...,      0,      0,      0],
        [     2,     24, 173467, ...,      0,      0,      0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [60]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.1))

In [61]:
len(train), len(test), len(val)

(6981, 997, 1994)

In [62]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [63]:
y.shape

(159571, 6)

`Bidirectional` - it allows you to pass information in both the direction. it is useful because words prior to another word might have a meaning and it also might modify the meaning of the sentence. 

Note - Final layer because `y.shape` has output in 6 different values so we want to map the output is the same style 


In [64]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [71]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # 


In [72]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 6)                 774       
                                                      

In [73]:
history = model.fit(train, epochs=1, validation_data=val)
history



<keras.src.callbacks.History at 0x2dde5aec0>

Predicting

In [78]:
text = vectorizer('you freaking suck!')

In [80]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [79]:
res = model.predict(np.expand_dims(text,0))
res



array([[0.99433744, 0.11346626, 0.92900115, 0.01295206, 0.75436985,
        0.07119527]], dtype=float32)

Evaluate models

In [81]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [82]:
for batches in test.as_numpy_iterator():
    # Unpack the batch
    x_true, y_true = batches
    # Making a prediction
    yhat = model.predict(x_true)
    # Flatten the values
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)



In [84]:
print("Precision: {}".format(pre.result().numpy()))
print("Recall: {}".format(re.result().numpy()))
print("CategoricalAccuracy: {}".format(acc.result().numpy()))

Precision: 0.8548812866210938
Recall: 0.6480000019073486
CategoricalAccuracy: 0.47943830490112305


Gradio app

In [85]:
!pip install jinja2


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [86]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.2
    Uninstalling pip-23.3.2:
      Successfully uninstalled pip-23.3.2
Successfully installed pip-24.0


In [87]:
import gradio as gr 

In [88]:
model.save('toxicity.h5')

  saving_api.save_model(


In [90]:
model = tf.keras.models.load_model('toxicity.h5')

In [91]:
input_str = vectorizer('hey i freaken hate you!')
res = model.predict(np.expand_dims(input_str,0))
res
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')
interface.launch(share=True)



  inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
  inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
  inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://d05b2f51eb7a8d5903.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




