<a href="https://colab.research.google.com/github/satwikmishra11/Cyber_Sakhi/blob/main/models/commentclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow pandas matplotlib scikit-learn




In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV loaded later in this notebook is read from
# '//content//train.csv.zip', which is not under /content/drive -- confirm
# whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


In [None]:

# Location of the zipped training CSV; read_csv is handed the archive path
# directly, without a separate unzip step.
train_csv_path = '//content//train.csv.zip'
df = pd.read_csv(train_csv_path)

# Quick sanity check: show the first rows (id, comment text, label columns).
preview = df.head()
print(preview)


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [None]:
from tensorflow.keras.layers import TextVectorization

# Inputs are the raw comments; targets are every column after `id` and
# `comment_text` (the binary label columns shown in the preview above).
X = df['comment_text']
y = df[df.columns[2:]].values

# Maximum vocabulary size handed to the vectorizer (also used to size the
# Embedding layer of the model).
MAX_FEATURES = 200000
# Every comment is padded/truncated to this many token ids.
SEQUENCE_LENGTH = 1800

# Coerce every comment to `str` exactly once and reuse the same array for both
# fitting the vocabulary and vectorizing. Previously `adapt` saw
# `X.values.astype(str)` while the layer call saw the raw `X.values`, so the
# two steps could disagree on non-string cells. Using pandas' astype keeps an
# object array of Python strings rather than a fixed-width unicode array.
comment_texts = X.astype(str).values

# Text Vectorization layer: maps each comment to a fixed-length int sequence.
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=SEQUENCE_LENGTH,
                               output_mode='int')

# Learn the vocabulary from the comments.
vectorizer.adapt(comment_texts)

# Convert the text into its numerical (token-id) representation.
vectorized_text = vectorizer(comment_texts)


In [None]:
# Build the tf.data pipeline and carve it into train / validation / test.
SEED = 42               # fixes the shuffle order so the splits are reproducible
SHUFFLE_BUFFER = 160000 # NOTE(review): presumably >= number of rows so the shuffle is uniform -- confirm
BATCH_SIZE = 16

# Create a TensorFlow dataset from input and labels
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))

dataset = dataset.cache()  # Cache for efficiency
# Shuffle ONCE with a fixed order. With the default
# reshuffle_each_iteration=True the order changes every time the dataset is
# iterated, so the take()/skip() splits below would contain different examples
# on each epoch and train/val/test would leak into each other.
dataset = dataset.shuffle(SHUFFLE_BUFFER, seed=SEED,
                          reshuffle_each_iteration=False)
dataset = dataset.batch(BATCH_SIZE)  # Batch processing
dataset = dataset.prefetch(8)        # Prefetch for faster processing

# Train / validation / test split (70% / 20% / remainder), measured in batches.
# len() reads the dataset's known cardinality instead of iterating every batch
# just to count them.
size = len(dataset)
train_size = int(size * 0.7)
val_size = int(size * 0.2)

# Only the training batches are re-ordered between epochs; the membership of
# each split stays fixed because the upstream shuffle order never changes.
train = dataset.take(train_size).shuffle(256)
val = dataset.skip(train_size).take(val_size)
test = dataset.skip(train_size + val_size)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

# Widths of the fully connected ReLU layers that sit between the recurrent
# encoder and the output layer, in order.
hidden_units = (128, 256, 128)

# Declare the whole stack at once:
#   token ids -> 32-d embeddings -> bidirectional LSTM -> dense head -> 6 sigmoids
model = Sequential([
    # Embedding table sized MAX_FEATURES+1 rows, 32 dimensions per token
    Embedding(MAX_FEATURES+1, 32),
    # Sequence encoder reading the comment in both directions
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor
    *[Dense(units, activation='relu') for units in hidden_units],
    # One independent sigmoid per label (multi-label classification)
    Dense(6, activation='sigmoid'),
])

# Binary cross-entropy is applied per output unit; Adam with default settings.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

# Print the layer-by-layer summary
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has failed to improve for 2 consecutive epochs,
# and roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Fit for at most 10 epochs, validating on `val` after each one; the returned
# History object keeps the per-epoch loss curves.
history = model.fit(
    train,
    validation_data=val,
    epochs=10,
    callbacks=[early_stop],
)


Epoch 1/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m685s[0m 98ms/step - loss: 0.0571 - val_loss: 0.0440
Epoch 2/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m740s[0m 98ms/step - loss: 0.0461 - val_loss: 0.0414
Epoch 3/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m681s[0m 98ms/step - loss: 0.0406 - val_loss: 0.0360
Epoch 4/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m683s[0m 98ms/step - loss: 0.0362 - val_loss: 0.0314
Epoch 5/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m675s[0m 97ms/step - loss: 0.0326 - val_loss: 0.0286
Epoch 6/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m680s[0m 97ms/step - loss: 0.0291 - val_loss: 0.0259
Epoch 7/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m689s[0m 99ms/step - loss: 0.0263 - val_loss: 0.0231
Epoch 8/10
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m736s[0m 98ms/step - loss: 0.0242 - val_loss: 0.0204


In [None]:
# CategoricalAccuracy stays imported so any other cell relying on the name
# keeps working; it is no longer used for this evaluation.
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, BinaryAccuracy

# Streaming metrics accumulated over every test batch.
precision = Precision()
recall = Recall()
# The model emits independent sigmoid probabilities per label, so accuracy is
# measured element-wise with BinaryAccuracy. CategoricalAccuracy compares
# argmax positions, which on flattened multi-label vectors yields a single,
# meaningless comparison per batch.
accuracy = BinaryAccuracy()

# Evaluate on the test dataset
for X_true, y_true in test.as_numpy_iterator():
    # predict_on_batch runs one forward pass without the per-call progress
    # bar that model.predict prints for every batch.
    y_pred = model.predict_on_batch(X_true)

    # Arrays keep their (batch, labels) layout; each metric compares
    # predictions and targets element by element.
    precision.update_state(y_true, y_pred)
    recall.update_state(y_true, y_pred)
    accuracy.update_state(y_true, y_pred)

print(f'🔹 Precision: {precision.result().numpy():.4f}')
print(f'🔹 Recall: {recall.result().numpy():.4f}')
print(f'🔹 Accuracy: {accuracy.result().numpy():.4f}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7

In [None]:

# Persist the trained model in the native Keras format.
# NOTE(review): this writes under /content, not under the mounted
# /content/drive -- confirm the file does not need to outlive the session.
model_path = "/content/toxic_comment_model.keras"
model.save(model_path)
