In [13]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn opencv-python-headless

# Importing required packages
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import cv2
from matplotlib import pyplot as plt
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

Collecting tensorflow-gpu
  Using cached tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting protobuf<3.20,>=3.9.2
  Using cached protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Building wheels for collected packages: tensorflow-gpu
  Building wheel for tensorflow-gpu (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[18 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 36, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/tmp/pip-install-689u0aax/tensorflow-gpu_2fe6009a60c246a89c73882ed7ff4aac/setup.py", line 37, in <module>
  [31m   [0m Exception:
  [31m   [0m 
  [31m   [0m The "tensorflow-gpu" package has been removed!

In [14]:
!pip install gradio jinja2

[0m

In [15]:
import gradio as gr

In [16]:
# Reading the dataset into a pandas dataframe
file_path = 'train.csv'

try:
    df = pd.read_csv(file_path)
    X = df['comment_text']
    y = df[df.columns[2:]].values
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"Dataset not found at '{file_path}'. Please place the 'train.csv' file in the same folder as the notebook.")

In [17]:
# Defining the maximum number of features
MAX_FEATURES = 200000

In [18]:
# Creating a TextVectorization layer for encoding and adapting the training dataset
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=max([len(x.split()) for x in X.values]),
                               output_mode='int')
vectorizer.adapt(X.values)

In [19]:
# Creating a TensorFlow dataset from the vectorized text and labels
dataset = tf.data.Dataset.from_tensor_slices((vectorizer(X.values), y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps with bottlenecks

In [20]:
# Splitting dataset into train, validation, and test sets
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [21]:
# Defining a Bidirectional LSTM model
model = Sequential([
    Embedding(MAX_FEATURES+1, 32),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid')
])

In [22]:
# Compiling the model using Binary Cross Entropy loss function and Adam optimizer
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [23]:
# Define model file path
model_filepath = '/kaggle/input/toxicityweights/toxicity.h5'

In [24]:
# Check if a pre-trained model exists and load it
if os.path.exists(model_filepath):
    print("Loading pre-trained model...")
    model = tf.keras.models.load_model(model_filepath)
else:
    print("Training the model...")
    history = model.fit(train, epochs=1, validation_data=val)
    model.save(model_filepath)

Loading pre-trained model...


In [25]:
# Evaluation metrics
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
# Calculate Precision, Recall, and Categorical Accuracy scores for the model
for batch in test.as_numpy_iterator(): 
    X_true, y_true = batch
    yhat = model.predict(X_true)
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [27]:
print(f'Precision:{pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision:0.8507959842681885, Recall:0.6643538475036621, Accuracy:0.5245737433433533


In [28]:
# Defining a 'score_comment' function to predict toxicity labels for an input comment,
# vectorize the comment, get predictions, and format the results as a string.
from io import BytesIO
import base64

def score_comment(comment, category=None, custom=None):
    vectorized_comment = vectorizer([comment])
    
    if category and custom:
        # Add custom phrase or word to the input data
        sequence_length = vectorized_comment.shape[1]
        X_custom = np.zeros((1, sequence_length), dtype=np.int64)
        custom_text = '{} {}'.format(category, custom)
        vectorized_custom = vectorizer([custom_text])
        X_custom[:, :len(vectorized_custom[0])] = vectorized_custom
        X = np.vstack([vectorized_comment, X_custom])
    else:
        X = vectorized_comment
    
    results = model.predict(X)[-1]

    # Create a bar chart of category probabilities
    fig, ax = plt.subplots()
    y_pos = np.arange(len(df.columns[2:]))
    ax.barh(y_pos, results, align='center')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(df.columns[2:])
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Probability')
    ax.set_title('Category Probabilities')

    # Save the plot to a buffer
    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)

    # Convert the buffer to a base64 encoded string
    encoded_img = base64.b64encode(buf.getvalue()).decode('utf-8')
    plt.close(fig)

    # Convert base64 image to numpy array
    nparr = np.frombuffer(base64.b64decode(encoded_img), np.uint8)
    img_np = cv2.imdecode(nparr, cv2.IMREAD_UNCHANGED)

    return img_np

In [29]:
# Creating an interface for the model using Gradio to take user input,
# display predictions and probability distributions for different categories,
# and provide a visual output.
interface = gr.Interface(fn=score_comment,
                         inputs=[gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                                 gr.inputs.CheckboxGroup(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], label='Categories'),
                                 gr.inputs.Textbox(label='Custom phrase or word', optional=True)],
                         outputs=gr.outputs.Image(type='pil'))

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",
  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",
  "Usage of gradio.outputs is deprecated, and will not be supported in the future, please import your components from gradio.components",


In [30]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://18ca153da8b1f68eb3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


