In [1]:
# importing libraries
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

In [2]:
# Load the comments dataset into a pandas DataFrame.
# The CSV location can be overridden with the COMMENTS_DATA_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used, so existing setups keep working.
data_path=os.environ.get("COMMENTS_DATA_CSV",r"C:\Users\Rohit Mourya\Downloads\comments_data.csv")
data=pd.read_csv(data_path)

In [3]:
# Preview the first five rows of the dataset
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Count the missing (null) values in each column of the dataset
data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
# Check the size of the dataset as (rows, columns)
data.shape

(159571, 8)

In [6]:
# Clean the raw comment_text column: every character that is not an ASCII
# letter or digit is replaced by a space, then the text is lowercased.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# NOTE: stemming and stop-word removal are not applied to the corpus; the
# stemmer instance is kept only so the name `ps` stays defined.
ps=PorterStemmer()
non_alnum=re.compile('[^a-zA-Z0-9]')
# Iterate over the column values directly. The previous
# `data['comment_text'][i]` was a label lookup driven by range(len(...)),
# which only works while the DataFrame keeps its default 0..n-1 index.
corpus=[non_alnum.sub(' ',comment).lower() for comment in data['comment_text']]

In [25]:
# Inspect the first cleaned comment as a sanity check
corpus[0]

'explanation why the edits made under my username hardcore metallica fan were reverted  they weren t vandalisms  just closure on some gas after i voted at new york dolls fac  and please don t remove the template from the talk page since i m retired now 89 205 38 27'

In [8]:
# Hash-encode each cleaned comment into a list of integer word indices
# using Keras' one_hot helper with a vocabulary of voc_size slots.
import tensorflow
from tensorflow.keras.preprocessing.text import one_hot
voc_size=20000
# NOTE(review): per the Keras documentation one_hot relies on Python's built-in
# `hash`, which is not stable across interpreter sessions — confirm before
# reusing a trained model in a fresh kernel.
oh_rpr=[
    one_hot(comment,voc_size)
    for comment in corpus
]

In [27]:
# Inspect the encoded word indices of the sixth comment
oh_rpr[5]

[17873, 6460, 18195, 12604, 16211, 5832, 16356, 18129, 16211, 5803]

In [10]:
# Bring every encoded comment to a uniform length of 50 indices.
# Shorter sequences receive leading zeros ('pre' padding).
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_docs=pad_sequences(
    oh_rpr,
    maxlen=50,
    padding='pre',
)

In [11]:
# Inspect the padded index matrix (one row per comment)
padded_docs

array([[11692,  3255, 16356, ..., 17742,  1110, 15304],
       [    0,     0,     0, ..., 16311,  9360,   330],
       [    0,     0,     0, ..., 16356, 18343,  7906],
       ...,
       [    0,     0,     0, ...,  5873, 14687,  3977],
       [    0,     0,     0, ..., 16629,  9038,  5468],
       [    0,     0,     0, ...,  4695,  9061,  2753]])

In [12]:
# Separate the independent variable (padded sequences) from the dependent
# variables (the six toxicity labels).
x=padded_docs
# Select the label columns by name instead of by position (`iloc[:,2:]`), so
# the targets cannot silently shift if the CSV gains or loses a leading column.
label_cols=['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y=data[label_cols].values

In [13]:
# Inspect the label matrix (one column per toxicity label)
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [14]:
# Split the data into training (80%) and testing (20%) sets.
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle, and therefore every metric computed
# on the held-out set, reproducible across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [15]:
# importing libraries for building deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Bidirectional,LSTM

In [16]:
# Build the classifier with TensorFlow's Keras Sequential API.
model=Sequential([
    # Embedding layer: the vocabulary size comes from voc_size (the same value
    # passed to one_hot) instead of a repeated literal 20000, so the two
    # cannot drift apart.
    Embedding(voc_size,50),
    # Bidirectional LSTM layer
    Bidirectional(LSTM(32,activation='tanh')),
    # Fully connected layers
    Dense(128,activation='relu'),
    Dense(256,activation='relu'),
    Dense(128,activation='relu'),
    # One sigmoid unit per toxicity label
    Dense(6,activation='sigmoid'),
])

In [17]:
# Compile the model with the Adam optimizer and binary cross-entropy loss.
# 'binary_accuracy' is requested explicitly: with a 6-unit output Keras
# resolves the generic 'accuracy' alias to categorical (argmax) accuracy,
# which is not meaningful for multi-label targets.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['binary_accuracy'])

In [18]:
# Fit the model on the training split for 3 epochs, holding out 20% of the
# training data for validation.
model.fit(
    x_train,
    y_train,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 23ms/step - accuracy: 0.9442 - loss: 0.0989 - val_accuracy: 0.9949 - val_loss: 0.0596
Epoch 2/3
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 22ms/step - accuracy: 0.9842 - loss: 0.0507 - val_accuracy: 0.9949 - val_loss: 0.0581
Epoch 3/3
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 22ms/step - accuracy: 0.9942 - loss: 0.0421 - val_accuracy: 0.9949 - val_loss: 0.0638


<keras.src.callbacks.history.History at 0x2280be9e9d0>

In [19]:
# Print a summary of the model architecture
model.summary()

In [20]:
# Evaluate the trained model on the held-out test data;
# the result lists the loss followed by the compiled metric.
model.evaluate(x_test,y_test)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9935 - loss: 0.0615


[0.06306039541959763, 0.99357670545578]

In [21]:
# Defining a function for predicting toxicity levels based on user input

def sentences(sent):
    """Print the predicted probability, as a whole percent, of each toxicity label.

    Args:
        sent: Raw comment text entered by the user.

    Returns:
        None. One "<label>: <percent> %" line per label is written to stdout.
    """
    # Apply the same cleaning that built the training corpus (non-alphanumerics
    # -> spaces, then lowercase). Without it, tokens such as "don't" are hashed
    # differently at inference time than they were during training.
    cleaned=re.sub('[^a-zA-Z0-9]',' ',sent).lower()

    encoded=[one_hot(cleaned,voc_size)]
    padded=pad_sequences(encoded,padding='pre',maxlen=50)
    z=model.predict(padded)

    # Label names in the order of the model's six outputs.
    labels=('toxic','severe_toxic','obscene','threat','insult','identity_hate')
    for label,score in zip(labels,z[0]):
        print(label+':',round(score*100),'%')
    
# Read a comment interactively and print its predicted toxicity percentages.
# NOTE: input() waits for the user to type something, so this cell pauses a "Run All".
sent=input("Enter your comment:")
sentences(sent)

Enter your comment:They went to college.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step
toxic: 5 %
severe_toxic: 0 %
obsence: 0 %
threat: 0 %
insult: 1 %
identity_hate:: 0 %


In [22]:
# Creating a Gradio interface with input as text and output as text, displaying toxicity predictions in terms of percentage.
import gradio as gr

def predict(text):
    """Return a text report with the predicted percentage for each toxicity label.

    Args:
        text: Raw comment text to score.

    Returns:
        A newline-separated string with one "<Label>: <percent>%" line per
        label, each percentage rounded to two decimals.
    """
    # Mirror the training-time cleaning (non-alphanumerics -> spaces, then
    # lowercase) so words are hashed to the same indices the model was
    # trained on; lowercasing alone leaves tokens such as "don't" intact.
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', text).lower()

    encoded = [one_hot(cleaned, voc_size)]
    padded = pad_sequences(encoded, padding='pre', maxlen=50)
    z = model.predict(padded)

    # Display names in the order of the model's six outputs.
    labels = ("Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate")
    return "\n".join(
        f"{label}: {round(score * 100, 2)}%" for label, score in zip(labels, z[0])
    )

# Wrap predict() in a simple text-in / text-out Gradio UI
interface = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Toxicity Analyzer",
    description="Enter a comment below to predict its toxicity levels.",
)
# Start the UI; share=True additionally requests a temporary public URL
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7869
Running on public URL: https://104b312fe2e6191dca.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [23]:
# Creating a Gradio interface with input as text and output as text, displaying toxicity predictions.
import gradio as gr

def predict(text):
    """Return a text report flagging each toxicity label as True or False.

    A label is flagged True when its predicted probability, expressed as a
    percentage rounded to two decimals, is at least 30.

    NOTE: this redefines the `predict` declared in the earlier Gradio cell;
    once this cell runs, the name refers to this boolean-report version.

    Args:
        text: Raw comment text to score.

    Returns:
        A newline-separated string with one "<Label>: <True|False>" line per label.
    """
    threshold = 30  # percent at or above which a label is reported as True

    # Mirror the training-time cleaning (non-alphanumerics -> spaces, then
    # lowercase) so words are hashed to the same indices the model was
    # trained on; lowercasing alone leaves tokens such as "don't" intact.
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', text).lower()

    encoded = [one_hot(cleaned, voc_size)]
    padded = pad_sequences(encoded, padding='pre', maxlen=50)
    z = model.predict(padded)

    # Display names in the order of the model's six outputs.
    labels = ("Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate")
    return "\n".join(
        f"{label}: {bool(round(score * 100, 2) >= threshold)}"
        for label, score in zip(labels, z[0])
    )

# Wrap the boolean-report predict() in a text-in / text-out Gradio UI
interface = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Toxicity Analyzer",
    description="Enter a comment below to predict its toxicity levels.",
)
# Start the UI; share=True additionally requests a temporary public URL
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7870
Running on public URL: https://8f68a48f841d811fda.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
