In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import TextVectorization

In [49]:
train_data_path = 'jigsaw-toxic-comment-classification-challenge/train.csv/train.csv'
test_data_path = 'jigsaw-toxic-comment-classification-challenge/test.csv/test.csv'
test_labels_path = 'jigsaw-toxic-comment-classification-challenge/test_labels.csv/test_labels.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)
test_labels_df = pd.read_csv(test_labels_path)

# Check that the test data and labels align (same order)
test_df = test_df.merge(test_labels_df, on='id')



In [60]:
actual_labels = test_labels_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [52]:
MAX_FEATURES = 2000  # Adjust if needed
OUTPUT_SEQUENCE_LENGTH = 1800  # Adjust if needed


In [53]:
model = load_model('./boss.h5')

In [54]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
                               output_mode='int')

In [55]:
vectorizer.adapt(train_df['comment_text'].values)

In [56]:
def make_prediction(text):
    # Vectorize input text
    vectorized_text = vectorizer([text])  # Vectorize input text

    # Correctly format the input for prediction (remove unnecessary np.expand_dims)
    # Ensure that the vectorized_text is in the format the model expects: (batch_size, sequence_length)
    prediction = model.predict(vectorized_text)  # Remove the np.expand_dims

    # Convert probabilities to binary outputs
    predicted_classes = (prediction > 0.5).astype(int)
    return predicted_classes


In [57]:
sample_text = "This is a great example to test!"
prediction = make_prediction(sample_text)
print(f"Prediction for '{sample_text}': {prediction}")

Prediction for 'This is a great example to test!': [[0 0 0 0 0 0]]


In [58]:
def predict_test_data(test_df):
    # Vectorize the test data comments
    test_vectorized = vectorizer(test_df['comment_text'].values)

    # Create a Dataset from the vectorized data
    test_dataset = tf.data.Dataset.from_tensor_slices(test_vectorized)
    
    # Batch the dataset
    test_dataset = test_dataset.batch(32)

    # Predict in batches (better for large datasets)
    predictions = model.predict(test_dataset)

    # Convert probabilities to binary outputs
    predicted_classes = (predictions > 0.5).astype(int)
    return predicted_classes

# Get predictions for all test data
test_predictions = predict_test_data(test_df)
print("Predictions on test data completed.")


Predictions on test data completed.


In [4]:
import tensorflow as tf

# Convert actual labels and predictions to tensors for computation
actual_labels_tensor = tf.convert_to_tensor(actual_labels, dtype=tf.int32)
predictions_tensor = tf.convert_to_tensor(test_predictions, dtype=tf.int32)

# Calculate the accuracy
accuracy = tf.metrics.Accuracy()
accuracy.update_state(actual_labels_tensor, predictions_tensor)
overall_accuracy = accuracy.result().numpy()
print(f"Overall Accuracy: {overall_accuracy}")


Overall Accuracy: 0.89


In [22]:


# Interactive input for user to test comments
while True:
    input_text = input("Enter a comment to test for toxicity (type 'exit' to quit): ")
    if input_text.lower() == 'exit':
        break
    result = make_prediction(input_text)
    print(f"The comment is: {result}")

The comment is: Not Toxic
The comment is: Toxic
The comment is: Not Toxic
The comment is: Not Toxic
The comment is: Not Toxic
