In [None]:
#!pip install tensorflow
#!pip install nltk

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import pickle
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus, then build the English stop-word set.
# clean_text() reads `stop_words` as a module-level global.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Text cleaning function
def clean_text(text):
    """Normalise a raw comment string before tokenisation.

    Steps: lowercase; replace every character that is not a letter, digit,
    space or one of ``! ? . , '`` with a space; replace all-digit tokens with
    the placeholder ``[Number Detected]``; drop tokens found in the
    module-level ``stop_words`` set.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    lowered = text.lower()
    sanitised = re.sub(r"[^a-zA-Z0-9!?.,' ]", " ", lowered)

    with_placeholders = " ".join(
        "[Number Detected]" if re.fullmatch(r"\d+", token) else token
        for token in sanitised.split()
    )

    # Split again after substitution: the placeholder contains a space, so its
    # two halves are checked against the stop-word set as separate tokens.
    kept_tokens = [
        token for token in with_placeholders.split() if token not in stop_words
    ]
    return " ".join(kept_tokens)

# Model creation function (adjusted to output only 3 labels)
def create_model(input_length, vocab_size, embedding_dim=100):
    """Build and compile the Conv1D + (Bi)LSTM multi-label classifier.

    Args:
        input_length: Number of token ids in each padded input sequence.
        vocab_size: Number of rows in the embedding table (Embedding input_dim).
        embedding_dim: Width of each embedding vector.

    Returns:
        A compiled ``Sequential`` model with three sigmoid outputs trained with
        binary cross-entropy.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer; the
        # `input_length` argument of Embedding is deprecated in Keras 3.
        tf.keras.Input(shape=(input_length,)),
        Embedding(vocab_size, embedding_dim),
        SpatialDropout1D(0.2),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        Bidirectional(LSTM(100, return_sequences=True)),
        Dropout(0.3),
        LSTM(100),
        Dropout(0.3),
        Dense(3, activation='sigmoid')  # Multi-label output
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        # Pin the metric explicitly: the bare 'accuracy' string can resolve to
        # categorical (argmax) accuracy for a 3-wide output, which is
        # misleading for independent labels. The name stays 'accuracy' so
        # logged keys ('accuracy' / 'val_accuracy') are unchanged.
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    return model

# Preprocess text data
def preprocess_text(texts, tokenizer=None, max_len=100):
    """Convert texts into fixed-length integer sequences.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: An already-fitted tokenizer. When ``None``, a new
            ``Tokenizer(num_words=20000)`` is fitted on ``texts``.
        max_len: Length every sequence is padded or truncated to; truncation
            removes tokens from the end of the sequence.

    Returns:
        ``(data, tokenizer)``: the padded sequences and the tokenizer used.
    """
    active_tokenizer = tokenizer
    if active_tokenizer is None:
        active_tokenizer = Tokenizer(num_words=20000)
        active_tokenizer.fit_on_texts(texts)

    padded = pad_sequences(
        active_tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        truncating='post',
    )
    return padded, active_tokenizer

# Load and prepare training data
def load_data(preprocessed_csv_path='train_preprocessed.csv', max_len=100):
    """Load the training CSV and return cleaned texts with binary labels.

    Args:
        preprocessed_csv_path: Path to a CSV containing the columns
            ``comment_text``, ``toxic``, ``obscene`` and ``insult``.
        max_len: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        ``(texts, labels)``: an array of cleaned comment strings and an integer
        array with one 0/1 column per label (values > 0.5 become 1).

    Raises:
        FileNotFoundError: If ``preprocessed_csv_path`` does not exist.
    """
    print(f"Loading preprocessed data from {preprocessed_csv_path}...")
    if not os.path.exists(preprocessed_csv_path):
        raise FileNotFoundError("Preprocessed CSV file not found.")
    df = pd.read_csv(preprocessed_csv_path)

    # Empty cells are read back from CSV as NaN. Fill them before astype(str),
    # otherwise each missing comment becomes the literal token "nan".
    df['comment_text'] = df['comment_text'].fillna('').astype(str).apply(clean_text)
    texts = df['comment_text'].values
    labels = df[['toxic', 'obscene', 'insult']].values
    labels = (labels > 0.5).astype(int)
    return texts, labels

# Train the model
def train_model(preprocessed_csv_path='train_preprocessed.csv', model_path='model.h5', tokenizer_path='tokenizer.pkl', epochs=5, batch_size=32, max_len=100):
    """Train the toxicity classifier and persist the model and tokenizer.

    Args:
        preprocessed_csv_path: CSV passed to ``load_data``.
        model_path: Where ``ModelCheckpoint`` writes the best model (lowest
            ``val_loss``).
        tokenizer_path: Where the fitted tokenizer is pickled.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        max_len: Sequence length used for padding/truncation.
    """
    texts, labels = load_data(preprocessed_csv_path, max_len)
    data, tokenizer = preprocess_text(texts, max_len=max_len)

    # Save the tokenizer before training so any checkpoint written to
    # `model_path` has its matching tokenizer even if training is interrupted.
    with open(tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f)

    # word_index holds every word seen, but the tokenizer only emits ids below
    # num_words. Cap the embedding table accordingly instead of allocating
    # rows that can never be looked up.
    vocab_size = len(tokenizer.word_index) + 1
    if tokenizer.num_words:
        vocab_size = min(vocab_size, tokenizer.num_words)
    model = create_model(input_length=data.shape[1], vocab_size=vocab_size)

    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

    model.fit(data, labels, epochs=epochs, batch_size=batch_size, validation_split=0.2,
              callbacks=[checkpoint, early_stopping, reduce_lr], verbose=1)

    print("Model training complete and saved.")

# Load trained model
def load_model_and_tokenizer(model_path='model.h5', tokenizer_path='tokenizer.pkl'):
    """Load the saved Keras model and the pickled tokenizer from disk.

    Args:
        model_path: Path of the saved model file.
        tokenizer_path: Path of the pickled tokenizer.

    Returns:
        ``(model, tokenizer)``.
    """
    loaded_model = tf.keras.models.load_model(model_path)

    # NOTE(review): unpickling can execute arbitrary code; only load tokenizer
    # files produced by this notebook's own training run.
    with open(tokenizer_path, 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)

    return loaded_model, loaded_tokenizer

# Predict toxicity
def predict_toxicity(model, tokenizer, text, max_len=100):
    """Score a single piece of text for each toxicity category.

    Args:
        model: Object exposing ``predict(batch)``; the first row of its result
            is used.
        tokenizer: Fitted tokenizer forwarded to ``preprocess_text``.
        text: Raw input string; it is passed through ``clean_text`` first.
        max_len: Sequence length forwarded to ``preprocess_text``.

    Returns:
        Dict mapping "Toxic", "Obscene" and "Insult" to float scores.
    """
    cleaned = clean_text(text)
    batch, _ = preprocess_text([cleaned], tokenizer, max_len)
    scores = model.predict(batch)[0]

    result = {}
    for category, score in zip(["Toxic", "Obscene", "Insult"], scores):
        result[category] = float(score)
    return result

# Main training execution: runs train_model() with its default arguments.
# NOTE(review): the recorded cell output reports "Epoch 1/8" while
# train_model() defaults to epochs=5, so the output appears to come from a
# different revision of this cell; confirm before relying on it.
if __name__ == "__main__":
    train_model()

Loading preprocessed data from train_preprocessed.csv...




Epoch 1/8
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step - accuracy: 0.9361 - loss: 0.1247
Epoch 1: val_loss improved from inf to 0.07562, saving model to model.h5




[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1472s[0m 367ms/step - accuracy: 0.9361 - loss: 0.1247 - val_accuracy: 0.9945 - val_loss: 0.0756 - learning_rate: 0.0010
Epoch 2/8
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 364ms/step - accuracy: 0.9897 - loss: 0.0673
Epoch 2: val_loss did not improve from 0.07562
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1551s[0m 374ms/step - accuracy: 0.9897 - loss: 0.0673 - val_accuracy: 0.9943 - val_loss: 0.0759 - learning_rate: 0.0010
Epoch 3/8
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359ms/step - accuracy: 0.9909 - loss: 0.0549
Epoch 3: val_loss did not improve from 0.07562

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1502s[0m 369ms/step - accuracy: 0.9909 - loss: 0.0549 - val_accuracy: 0.9942 -

In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [19]:
import os
# List the working directory rather than a hardcoded Colab path: model.h5 and
# tokenizer.pkl are loaded via relative paths, so this is where they must be.
print(sorted(os.listdir()))

['.config', 'train_preprocessed.csv', 'tokenizer.pkl', 'model.h5', '.ipynb_checkpoints', 'sample_data']


In [20]:
import gradio as gr
import matplotlib.pyplot as plt
import os
import sys
import pickle
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Text cleaning function
def clean_text(text):
    """Normalise a raw comment string before tokenisation.

    Steps: lowercase; replace every character that is not a letter, digit,
    space or one of ``! ? . , '`` with a space; replace all-digit tokens with
    the placeholder ``[Number Detected]``; drop tokens found in ``stop_words``.

    Note: ``stop_words`` is not defined in this cell; it is the module-level
    set created in the notebook's first cell, which must have been run.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    lowered = text.lower()
    sanitised = re.sub(r"[^a-zA-Z0-9!?.,' ]", " ", lowered)

    with_placeholders = " ".join(
        "[Number Detected]" if re.fullmatch(r"\d+", token) else token
        for token in sanitised.split()
    )

    # Split again after substitution: the placeholder contains a space, so its
    # two halves are checked against the stop-word set as separate tokens.
    kept_tokens = [
        token for token in with_placeholders.split() if token not in stop_words
    ]
    return " ".join(kept_tokens)

# Preprocess text data
def preprocess_text(texts, tokenizer=None, max_len=100):
    """Convert texts into fixed-length integer sequences.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: An already-fitted tokenizer. When ``None``, a new
            ``Tokenizer(num_words=20000)`` is fitted on ``texts``.
        max_len: Length every sequence is padded or truncated to; truncation
            removes tokens from the end of the sequence.

    Returns:
        ``(data, tokenizer)``: the padded sequences and the tokenizer used.
    """
    active_tokenizer = tokenizer
    if active_tokenizer is None:
        active_tokenizer = Tokenizer(num_words=20000)
        active_tokenizer.fit_on_texts(texts)

    padded = pad_sequences(
        active_tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        truncating='post',
    )
    return padded, active_tokenizer

# Load the trained model and tokenizer
def load_model_and_tokenizer(model_path='model.h5', tokenizer_path='tokenizer.pkl'):
    """Load the saved Keras model and the pickled tokenizer from disk.

    Args:
        model_path: Path of the saved model file.
        tokenizer_path: Path of the pickled tokenizer.

    Returns:
        ``(model, tokenizer)``.
    """
    loaded_model = load_model(model_path)

    # NOTE(review): unpickling can execute arbitrary code; only load tokenizer
    # files produced by this notebook's own training run.
    with open(tokenizer_path, 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)

    return loaded_model, loaded_tokenizer

# Toxicity prediction function
def predict_toxicity(model, tokenizer, text, max_len=100):
    """Score a single piece of text for each toxicity category.

    Args:
        model: Object exposing ``predict(batch)``; the first row of its result
            is used.
        tokenizer: Fitted tokenizer forwarded to ``preprocess_text``.
        text: Raw input string; it is passed through ``clean_text`` first.
        max_len: Sequence length forwarded to ``preprocess_text``.

    Returns:
        Dict mapping "Toxic", "Obscene" and "Insult" to float scores.
    """
    cleaned = clean_text(text)
    batch, _ = preprocess_text([cleaned], tokenizer, max_len)
    scores = model.predict(batch)[0]

    result = {}
    for category, score in zip(["Toxic", "Obscene", "Insult"], scores):
        result[category] = float(score)
    return result

# Toxicity prediction function for Gradio interface
def toxicity_predictor(text, chart_type):
    """Gradio callback: score ``text`` and draw the scores as a chart.

    Args:
        text: Raw user input from the textbox (``None`` is treated as empty).
        chart_type: Either "Bar Chart" or "Pie Chart".

    Returns:
        ``(figure, None)`` on success, otherwise ``(None, message)`` when the
        input is rejected, no significant toxicity is found, or an error
        occurs while predicting or plotting.
    """
    text = (text or "").strip()

    if not text:
        return None, "No input detected. Please provide some text."
    if len(text) > 200:
        return None, "Input text is too long. Please shorten your text."
    if not chart_type:
        return None, "Please select a chart type (Bar Chart or Pie Chart)."
    if chart_type not in ("Bar Chart", "Pie Chart"):
        # Previously an unknown value fell through and returned a blank figure.
        return None, f"Unsupported chart type: {chart_type}"

    fig = None
    try:
        result = predict_toxicity(model, tokenizer, text)
        labels, percentages = zip(*[(k, round(v * 100, 2)) for k, v in result.items()])

        if sum(percentages) < 1:
            return None, "No significant toxicity detected."

        filtered_data = None
        if chart_type == "Pie Chart":
            # Decide whether there is anything to plot before allocating a figure.
            filtered_data = [(p, l) for p, l in zip(percentages, labels) if p > 0]
            if not filtered_data:
                return None, "No significant toxicity detected."

        fig, ax = plt.subplots(figsize=(8, 4))

        if chart_type == "Bar Chart":
            ax.bar(labels, percentages, color=['red' if p > 50 else 'skyblue' for p in percentages])
            ax.set_ylabel('Percentage')
            ax.set_title('Toxicity Prediction')
            ax.set_ylim(0, 100)

            for i, v in enumerate(percentages):
                ax.text(i, v + 2, f"{v:.2f}%", ha='center', fontsize=10)
        else:
            # Largest slice first so it receives the first (red) colour.
            sorted_data = sorted(filtered_data, reverse=True)
            sorted_percentages, sorted_labels = zip(*sorted_data)
            colors = ["red", "orange", "skyblue"][:len(sorted_percentages)]

            ax.pie(sorted_percentages, labels=sorted_labels, autopct='%1.1f%%', colors=colors)
            ax.set_title('Toxicity Prediction')

        return fig, None

    except Exception as e:
        return None, f"An unexpected error occurred: {str(e)}"
    finally:
        # pyplot keeps every figure it creates alive until it is closed, so a
        # long-running app would otherwise leak one figure per request. The
        # returned Figure object can still be rendered by the caller.
        if fig is not None:
            plt.close(fig)

# Load model and tokenizer once, at cell execution time. Both paths are
# relative, so model.h5 and tokenizer.pkl must be in the working directory.
# toxicity_predictor() reads `model` and `tokenizer` as module-level globals.
model, tokenizer = load_model_and_tokenizer(model_path='model.h5', tokenizer_path='tokenizer.pkl')

# Gradio app layout: two inputs (text, chart type) and two outputs (plot,
# message), wired to toxicity_predictor.
comment_input = gr.Textbox(lines=3, placeholder="Enter your text here...")
chart_type_input = gr.Radio(
    choices=["Bar Chart", "Pie Chart"],
    label="Select Chart Type",
    value="Bar Chart",
)
chart_output = gr.Plot()  # Graph output
message_output = gr.Textbox(label="Error Message", interactive=False)  # Error message display

iface = gr.Interface(
    fn=toxicity_predictor,
    inputs=[comment_input, chart_type_input],
    outputs=[chart_output, message_output],
    title="CleanComment",
)

# Launch app.
# share=True asks Gradio for a public share link (the recorded output shows a
# *.gradio.live URL), so the interface is reachable from outside this machine.
if __name__ == "__main__":
    iface.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4c98b3f416a11366a5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
