In [1]:
!pip install gradio
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, LSTM, Dense, Embedding, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Training CSV on the mounted Drive -- point this at your own copy
file_path = '/content/drive/MyDrive/Dataset/train/train.csv'

# Read the CSV, discard rows that have no comment, then coerce the remaining
# comments to str
df = pd.read_csv(file_path).dropna(subset=['comment_text'])
df['comment_text'] = df['comment_text'].astype(str)

# Model input is the comment text; every column from index 2 onward is treated
# as a label and collapsed to 0/1 (any value above zero becomes 1)
x = df['comment_text']
y = np.where(df.iloc[:, 2:].values > 0, 1, 0)

# Vectorization parameters
SEQ_LENGTH = 100      # Length of sequences after vectorization
MAX_FEATURES = 20000  # Max unique words in vocabulary

# Text cleaning applied to every comment before it is vectorized
def preprocess_text(text):
    """Normalize a raw comment before vectorization.

    The text is lowercased, each newline is replaced by a single space, and
    every character that is neither alphanumeric nor whitespace is dropped.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string.
    """
    flattened = text.lower().replace('\n', ' ')
    return ''.join(ch for ch in flattened if ch.isalnum() or ch.isspace())

# Apply preprocessing
x = x.apply(preprocess_text)

# Text vectorization: map each comment to SEQ_LENGTH integer token ids drawn
# from a vocabulary capped at MAX_FEATURES entries
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=SEQ_LENGTH,
    output_mode='int'
)
# NOTE(review): the vocabulary is adapted on every comment, including the rows
# that end up in the validation and test splits below.
vectorizer.adapt(x.values)
vectorized_text = vectorizer(x.values)

# Convert the vectorized text into a NumPy array for compatibility with train_test_split
vectorized_text = np.array(vectorized_text)

# Hold out 30% of the rows, then divide that hold-out 67/33 into validation and test
x_train, x_temp, y_train, y_temp = train_test_split(vectorized_text, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.33, random_state=42)

# Training pipeline. Model.fit does not shuffle tf.data input, so the dataset
# reshuffles itself every epoch; cache() sits before shuffle() so the cached
# elements are re-ordered on each pass instead of freezing one batch order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(buffer_size=len(x_train), seed=42)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)  # tf.data.experimental.AUTOTUNE is the deprecated alias
)

# Validation and test pipelines keep a fixed order
val_dataset = (
    tf.data.Dataset.from_tensor_slices((x_val, y_val))
    .batch(16)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(16)

# Build the model: token ids -> embeddings -> LSTM -> dense head with one
# sigmoid unit per label column (multi-label classification)
model = tf.keras.Sequential([
    Input(shape=(SEQ_LENGTH,)),
    Embedding(MAX_FEATURES + 1, 32),  # Embedding layer
    LSTM(64, activation='tanh', return_sequences=False),  # LSTM layer
    Dense(128, activation='relu'),  # Fully connected layer
    Dense(y.shape[1], activation='sigmoid')  # Output layer for multi-label classification
])

# Compile the model.
# The 'accuracy' string shortcut is resolved by Keras from the output shape:
# with more than one output unit it becomes categorical (argmax) accuracy,
# which is not meaningful for independent 0/1 labels. BinaryAccuracy thresholds
# every label separately; it keeps the name "accuracy" so the history keys
# ('accuracy', 'val_accuracy') are unchanged.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.AUC(name="auc"),
    ]
)

# Early stopping callback: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    train_dataset,
    epochs=5,  # Number of epochs
    validation_data=val_dataset,
    callbacks=[early_stopping]
)

# Score the held-out test split, which is otherwise built but never used
test_metrics = model.evaluate(test_dataset, return_dict=True)

# Save the model
model_path = 'toxicity_model.keras'
model.save(model_path)

# Reload the model for prediction
model = tf.keras.models.load_model(model_path)

# Function to score a comment
def score_comment(comment):
    """Classify one comment against every toxicity label.

    Args:
        comment: Raw comment text.

    Returns:
        A multi-line string starting with a header line followed by one
        "<label>: Toxic" or "<label>: Non-Toxic" line per label column, or an
        error message when ``comment`` is not a non-blank string.
    """
    # Reject anything that is not a string with visible content
    is_usable = isinstance(comment, str) and bool(comment.strip())
    if not is_usable:
        return "Invalid input. Please enter a valid comment."

    # Clean and vectorize the comment as a batch of one, then keep its score row
    scores = model.predict(vectorizer([preprocess_text(comment)]))[0]

    # Label names are the DataFrame columns from index 2 onward; a score above
    # 0.5 marks the label as present
    verdicts = [
        f"{label}: {'Toxic' if scores[idx] > 0.5 else 'Non-Toxic'}\n"
        for idx, label in enumerate(df.columns[2:].values)
    ]
    return "Toxicity Classification:\n" + "".join(verdicts)

# Optional Gradio front end for trying the classifier interactively
import gradio as gr

# Sample comments offered as examples in the UI
example_comments = [
    ["You're an idiot!"],
    ["Have a great day!"],
    ["I will kill you!"],
    ["I love you."],
]

# Blurb passed as the interface description
interface_description = (
    "This model classifies comments into multiple toxicity categories. "
    "Enter a comment to see if it is toxic, obscene, a threat, etc."
)

# Two-line free-text input for the comment
comment_box = gr.Textbox(lines=2, placeholder='Enter comment here...')

interface = gr.Interface(
    fn=score_comment,
    inputs=comment_box,
    outputs='text',
    title="Toxic Comment Classifier",
    description=interface_description,
    examples=example_comments,
)

# Start the app; share=True asks Gradio for a public share link
interface.launch(share=True)


Collecting gradio
  Downloading gradio-5.16.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

