In [2]:
!pip install numpy pandas tensorflow scikit-learn gradio





[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, GRU, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split

df = pd.read_csv('final_extended_toxic_comments_train.csv')

df['comment_text'].fillna("", inplace=True)

MAX_NUM_WORDS = 10000  
MAX_SEQUENCE_LENGTH = 100 
EMBEDDING_DIM = 100  

# Tokenizer to convert text to sequences of integers
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['comment_text'].values)
sequences = tokenizer.texts_to_sequences(df['comment_text'].values)

# Pad the sequences to ensure consistent input size
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels for multi-label classification
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the hybrid CNN + RNN (LSTM) model
model = Sequential()

# Embedding layer
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))

# CNN part 
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# RNN part 
model.add(Bidirectional(LSTM(100, return_sequences=True)))

# Optional: Add another RNN layer (GRU in this case)
model.add(GRU(100))

# Dense layers for output
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5)) 

# Output layer with sigmoid activation for multi-label classification
model.add(Dense(6, activation='sigmoid'))  # 6 output units for 6 labels

# Compile the model (for multi-label classification, use binary crossentropy)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val), verbose=1)

model.save('toxic_comment_model.h5')

import pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['comment_text'].fillna("", inplace=True)


Epoch 1/5




[1m2884/2884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 45ms/step - accuracy: 0.7961 - loss: 0.2013 - val_accuracy: 0.9795 - val_loss: 0.1124
Epoch 2/5
[1m2884/2884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 45ms/step - accuracy: 0.9514 - loss: 0.1051 - val_accuracy: 0.9783 - val_loss: 0.0803
Epoch 3/5
[1m2884/2884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 45ms/step - accuracy: 0.8769 - loss: 0.0722 - val_accuracy: 0.9698 - val_loss: 0.0627
Epoch 4/5
[1m2884/2884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 45ms/step - accuracy: 0.7440 - loss: 0.0511 - val_accuracy: 0.9481 - val_loss: 0.0548
Epoch 5/5
[1m2884/2884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 46ms/step - accuracy: 0.7761 - loss: 0.0385 - val_accuracy: 0.9533 - val_loss: 0.0458




Model and tokenizer saved!


In [4]:
!pip install gradio





[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
!pip install gradio --upgrade #Upgrade Gradio to the latest version

import numpy as np
import gradio as gr
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


model = load_model('toxic_comment_model.h5')

with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

MAX_SEQUENCE_LENGTH = 100
toxicity_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Function to predict toxicity of a comment
def predict_toxicity(comment):
    # Tokenize and pad the input comment
    test_sequences = tokenizer.texts_to_sequences([comment])
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Make predictions
    prediction = model.predict(test_data)[0]

    # Create a dictionary of results
    result = {label: round(pred, 2) for label, pred in zip(toxicity_labels, prediction)}

    return result

interface = gr.Interface(
    fn=predict_toxicity,
    inputs=gr.Textbox(lines=2, placeholder="Enter a comment to check for toxicity"), #Change to gr.Textbox
    outputs="json",
    title="Comment Toxicity Detector",
    description="Enter a comment, and this tool will predict if the comment contains various types of toxicity like 'toxic', 'severe toxic', 'obscene', 'threat', 'insult', or 'identity hate'.",
)

interface.launch()

ERROR: Invalid requirement: '#Upgrade'

[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 437ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
