In [1]:
!pip install gradio transformers tensorflow pandas numpy scikit-learn



In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import gradio as gr
from transformers import TFDistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from google.colab import drive


In [3]:
# ✅ Mount Google Drive
drive.mount('/content/drive')

# ✅ Load Dataset
file_path = '/content/drive/MyDrive/archive (8)/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv'
df = pd.read_csv(file_path)

# ✅ Extract Target Labels
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = df[target_columns].values  # Extract labels


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# ✅ Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# ✅ Split dataset before tokenization
train_texts, test_texts, y_train, y_test = train_test_split(df["comment_text"], y, test_size=0.2, random_state=42)

# ✅ Function to tokenize texts
def encode_texts(texts):
    encodings = tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="tf"
    )
    return encodings["input_ids"].numpy(), encodings["attention_mask"].numpy()

# ✅ Tokenize train & test sets
X_train_encoded, train_masks = encode_texts(train_texts)
X_test_encoded, test_masks = encode_texts(test_texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
BATCH_SIZE = 32

train_dataset = tf.data.Dataset.from_tensor_slices(((X_train_encoded, train_masks), y_train)) \
                         .shuffle(10000) \
                         .batch(BATCH_SIZE) \
                         .prefetch(tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices(((X_test_encoded, test_masks), y_test)) \
                        .batch(BATCH_SIZE) \
                        .prefetch(tf.data.AUTOTUNE)


In [6]:
# ✅ Load DistilBERT Model Before Calling `bert_embedding`
bert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [7]:
from tensorflow.keras.layers import Layer

class BertEmbeddingLayer(Layer):
    def __init__(self, bert_model, **kwargs):
        super(BertEmbeddingLayer, self).__init__(**kwargs)
        self.bert_model = bert_model  # Save the model reference

    def call(self, inputs):
        input_ids, attention_mask = inputs
        return self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]

    def get_config(self):  # Required for saving and loading the model
        config = super().get_config()
        return config


In [8]:
from tensorflow.keras.layers import Input, Lambda, Dense
from tensorflow.keras.models import Model

# ✅ Define Model Inputs
input_ids = Input(shape=(64,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(64,), dtype=tf.int32, name="attention_mask")

# ✅ Replace Lambda with Custom Layer
bert_output = BertEmbeddingLayer(bert_model, name="bert_embedding")([input_ids, attention_mask])

# ✅ Fully Connected Layers
x = Dense(128, activation="relu")(bert_output)
x = Dense(64, activation="relu")(x)
output = Dense(len(target_columns), activation="sigmoid")(x)

# ✅ Build Model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

model.summary()

In [17]:
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam

# 1️⃣ Change Learning Rate (Lower it)
lr_schedule = ExponentialDecay(1e-4, decay_steps=5000, decay_rate=0.9)
optimizer = Adam(learning_rate=lr_schedule)

# 2️⃣ Change Loss Function (If Data is Balanced)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# 3️⃣ Compile Model Again
model.compile(
    optimizer=optimizer,
    loss=loss,  # Use Binary Crossentropy
    metrics=[tf.keras.metrics.AUC()]
)


In [18]:
# ✅ Define `bert_embedding()` Function
def bert_embedding(inputs):
    input_ids, attention_mask = inputs
    return bert_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]

In [19]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Flatten labels for class weight computation
y_train_flat = y_train.flatten()
class_weights = compute_class_weight("balanced", classes=np.unique(y_train_flat), y=y_train_flat)

# Convert to dictionary format for TensorFlow
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}


In [20]:
history = model.fit(train_dataset, validation_data=test_dataset, epochs=10)

Epoch 1/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 74ms/step - auc_1: 0.8348 - loss: 0.1902 - val_auc_1: 0.9674 - val_loss: 0.0640
Epoch 2/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 67ms/step - auc_1: 0.9694 - loss: 0.0626 - val_auc_1: 0.9704 - val_loss: 0.0613
Epoch 3/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 67ms/step - auc_1: 0.9714 - loss: 0.0606 - val_auc_1: 0.9706 - val_loss: 0.0600
Epoch 4/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 67ms/step - auc_1: 0.9726 - loss: 0.0593 - val_auc_1: 0.9737 - val_loss: 0.0601
Epoch 5/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 74ms/step - auc_1: 0.9735 - loss: 0.0580 - val_auc_1: 0.9746 - val_loss: 0.0603
Epoch 6/10
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 68ms/step - auc_1: 0.9748 - loss: 0.0572 - val_auc_1: 0.9749 - val_loss: 0.0586
Epoch 7/10
[1m1527/3990[0m [32m

KeyboardInterrupt: 

In [21]:
from tensorflow.keras.layers import Layer
import tensorflow as tf
import keras.saving
import transformers

@keras.saving.register_keras_serializable()
class BertEmbeddingLayer(Layer):
    def __init__(self, bert_model_name="distilbert-base-uncased", **kwargs):
        super(BertEmbeddingLayer, self).__init__(**kwargs)
        self.bert_model = transformers.TFAutoModel.from_pretrained(bert_model_name)

    def call(self, inputs):
        input_ids, attention_mask = inputs
        return self.bert_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]

    def get_config(self):
        config = super().get_config()
        config.update({"bert_model_name": "distilbert-base-uncased"})  # Save model name
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)




In [22]:
# ✅ Save Model
model.save("toxicity.keras")



In [23]:
from tensorflow.keras.models import load_model

# ✅ Load model with the registered custom layer
model = load_model("toxicity.keras", custom_objects={"BertEmbeddingLayer": BertEmbeddingLayer})



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [25]:
import numpy as np
from tensorflow.keras.metrics import AUC, Precision, Recall # Import AUC

# ✅ Get model predictions
y_probs = model.predict(test_dataset)

# ✅ Tune Threshold for Best AUC
thresholds = np.arange(0.1, 0.9, 0.05)  # Try different thresholds
best_auc = 0
best_threshold = 0.5

auc = AUC() # Initialize AUC before the loop

for t in thresholds:
    y_pred = (y_probs > t).astype(int)
    auc.update_state(y_test, y_pred)
    if auc.result().numpy() > best_auc:
        best_auc = auc.result().numpy()
        best_threshold = t

print(f"Best AUC: {best_auc:.4f} at Threshold: {best_threshold}")

# ✅ Apply the best threshold for final predictions
yhat = (y_probs > best_threshold).astype(int)

# ✅ Compute Metrics
y_true = y_test.flatten()
yhat = yhat.flatten()

auc = AUC() # Re-initialize AUC for final computation
precision = Precision()
recall = Recall()

auc.update_state(y_true, yhat)
precision.update_state(y_true, yhat)
recall.update_state(y_true, yhat)

print(f"AUC: {auc.result().numpy():.4f}")
print(f"Precision: {precision.result().numpy():.4f}")
print(f"Recall: {recall.result().numpy():.4f}")

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 53ms/step
Best AUC: 0.8939 at Threshold: 0.1
AUC: 0.8939
Precision: 0.5174
Recall: 0.8171


In [54]:
# ✅ Define Prediction Function for Gradio
def score_comment(comment):
    tokens = tokenizer([comment], padding="max_length", truncation=True, max_length=64, return_tensors="tf")
    input_ids = tf.convert_to_tensor(tokens["input_ids"])
    attention_mask = tf.convert_to_tensor(tokens["attention_mask"])

    prediction = model.predict([input_ids, attention_mask])[0]

    labels = ["toxic💀", "severe_toxic☠️", "obscene🙊", "threat⚠️", "insult🤡", "identity_hate🚫"]
    return {labels[i]: f"{prediction[i]:.2f}" for i in range(len(labels))}

# ✅ Gradio Interface
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder="💬Enter a comment"),
    outputs=gr.Label(),
    title="💢Toxic Comment Classifier",
    description="Enter a comment to check toxicity levels... 🤔💬⚠️"
)

interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8df445eda1eb04208b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [51]:
import gradio as gr
import tensorflow as tf
import numpy as np
from transformers import DistilBertTokenizer
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model("toxicity.keras", custom_objects={"BertEmbeddingLayer": BertEmbeddingLayer})

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define label categories
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def classify_comment(comment):
    tokens = tokenizer([comment], padding="max_length", truncation=True, max_length=64, return_tensors="tf")
    input_ids = tf.convert_to_tensor(tokens["input_ids"])
    attention_mask = tf.convert_to_tensor(tokens["attention_mask"])

    prediction = model.predict([input_ids, attention_mask])[0]

    result = {labels[i]: f"{round(prediction[i] * 100, 1)}%" for i in range(len(labels))}
    return result

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🛑 **Toxic Comment Classifier**
    **Enter a comment to check its toxicity levels.**
    The model predicts the probability of various toxic traits.
    """)

    comment_input = gr.Textbox(label='💬 Enter your comment:', placeholder='Type here...', lines=2)
    submit_btn = gr.Button("🚀 Analyze", variant="primary")
    clear_btn = gr.Button("❌ Clear")
    output = gr.Label("📊 Toxicity Analysis Results")

    submit_btn.click(classify_comment, inputs=[comment_input], outputs=[output])
    clear_btn.click(lambda: "", inputs=[], outputs=[comment_input])

demo.launch(share=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://60783c07ebdada2388.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


