In [None]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [130]:
#Importing all required packages
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
import tensorflow as tf

In [131]:
#Reading the csv file into a pandas dataframe
df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')

In [133]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
495,014b44616d8cb457,"Sarek of Vulcan: Unfortunately for you, you ca...",0,0,0,0,0,0
496,014bb932bd289352,Keep your chin up! Darwinism was not accepted ...,0,0,0,0,0,0
497,014c96f873db11ff,"""""""Nazi filth"""" is impolite 04:27, 20 Jan 200...",1,0,0,0,1,0
498,014d00c8f2a76df4,Interesting. I checked the other case number K...,0,0,0,0,0,0
499,014f50b1f448cb2d,The transcluded part of the GA review doesn't ...,0,0,0,0,0,0


In [134]:
#Assigning the comment text column from the dataframe (df) to X, and the target labels (all columns from index 2 to the end) to y. This separates the input text and the corresponding labels for classification.
X = df['comment_text']
y = df[df.columns[2:]].values

In [135]:
#Defining the maximum number of features
MAX_FEATURES = 200000

In [136]:
#Creating a TextVectorization layer for encoding and adapting the training dataset
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_mode='int')

In [137]:
vectorizer.adapt(X.values)

In [138]:
#Creating another TextVectorization layer to fit and transform data, and also handle sequences of varying lengths
vectorized_text = vectorizer(X.values)

In [139]:
vocab = vectorizer.get_vocabulary()
word_index = dict(zip(vocab, range(len(vocab))))
max_sequence_length = max([len(x.split()) for x in X.values])

In [140]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=max_sequence_length,
                               output_mode='int')

In [141]:
vectorizer.adapt(X.values)

In [142]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [143]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [144]:
#Defining a Bidirectional LSTM model
model = Sequential()

model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [145]:
#Compiling the model using Binary Cross Entropy loss function and Adam optimizer
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [146]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_12 (Dense)            (None, 128)               8320      
                                                                 
 dense_13 (Dense)            (None, 256)               33024     
                                                                 
 dense_14 (Dense)            (None, 128)               32896     
                                                                 
 dense_15 (Dense)            (None, 6)                 774       
                                                      

In [147]:
#Fitting the training data to the model for 1 epoch
history = model.fit(train, epochs=1, validation_data=val)



In [148]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [149]:
np.expand_dims(input_text,0)

array([[8, 1, 1, ..., 0, 0, 0]])

In [150]:
model.predict(np.expand_dims(input_text,0))



array([[0.00160883, 0.01007908, 0.01723169, 0.00052744, 0.00674852,
        0.00081864]], dtype=float32)

In [151]:
res = model.predict(np.expand_dims(input_text,0))



In [152]:
(res > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0]])

In [153]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [154]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [155]:
res.shape

(1, 6)

In [156]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [157]:
#Calculate Precision, Recall and Categorical Accuracy scores for the model
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [158]:
print(f'Precision:{pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision:0.0, Recall:0.0, Accuracy:0.0


In [None]:
!pip install gradio jinja2

In [160]:
model.save('toxicity.h5')

In [161]:
model = tf.keras.models.load_model('toxicity.h5')

In [162]:
input_str = vectorizer('hey i freaken hate you!')

In [163]:
res = model.predict(np.expand_dims(input_str,0))



In [181]:
res

array([[0.00154681, 0.00980252, 0.0167864 , 0.0005068 , 0.00658554,
        0.00078404]], dtype=float32)

In [182]:
import gradio as gr

In [198]:
#Defining a 'score_comment' function to predict toxicity labels for an input comment, vectorize the comment, get predictions, and format the results as a string.
from io import BytesIO
import base64

def score_comment(comment, category=None, custom=None):
    vectorized_comment = vectorizer([comment])
    
    if category and custom:
        # Add custom phrase or word to the input data
        X_custom = np.zeros((1, max_sequence_length), dtype=np.int64)
        custom_text = '{} {}'.format(category, custom)
        vectorized_custom = vectorizer([custom_text])
        X_custom[:, :len(vectorized_custom[0])] = vectorized_custom
        X = np.vstack([vectorized_comment, X_custom])
    else:
        X = vectorized_comment
    
    results = model.predict(X)[-1]

    # Create a bar chart of category probabilities
    fig, ax = plt.subplots()
    y_pos = np.arange(len(df.columns[2:]))
    ax.barh(y_pos, results, align='center')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(df.columns[2:])
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Probability')
    ax.set_title('Category Probabilities')

    # Save the plot to a buffer
    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)

    # Convert the buffer to a base64 encoded string
    encoded_img = base64.b64encode(buf.getvalue()).decode('utf-8')
    plt.close(fig)

    # Convert base64 image to numpy array
    nparr = np.frombuffer(base64.b64decode(encoded_img), np.uint8)
    img_np = cv2.imdecode(nparr, cv2.IMREAD_UNCHANGED)

    return img_np

In [206]:
#Creating a 'score_comment' function with additional inputs for custom categories and visual outputs of predicted probabilities.
interface = gr.Interface(fn=score_comment,
                         inputs=[gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                                 gr.inputs.CheckboxGroup(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], label='Categories'),
                                 gr.inputs.Textbox(label='Custom phrase or word', optional=True)],
                         outputs=gr.outputs.Image(type='pil'))

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",
  "Usage of gradio.outputs is deprecated, and will not be supported in the future, please import your components from gradio.components",


In [207]:
pip install opencv-python-headless

[0mNote: you may need to restart the kernel to use updated packages.


In [208]:
import cv2

In [209]:
#Creating an interface for the model using Gradio to take user input, display predictions and probability distributions for different categories, and provide a visual output.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7872
Running on public URL: https://6f4ea68e74010a704f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


