In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


# Load your dataset (adjust the path if necessary)
df = pd.read_csv('/content/labelled_comments.csv')

# Preprocessing: Encoding the labels (0 and 1 for non-cyberbullying and cyberbullying)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Splitting the dataset into train and test sets
X = df['preprocessed_comments'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization: Convert words to tokens and pad sequences for equal length
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_sequence_length = 500
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Build the RNN model with LSTM layer
model = Sequential()

# Embedding layer (turns word indices into dense vectors)
model.add(Embedding(input_dim=5000, output_dim=128))

# LSTM layer (Recurrent layer to learn dependencies in the text)
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32, return_sequences=False)))

# Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Dense layer with a single output node (for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_data=(X_test_padded, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# You can save the model for later use
model.save('cyberbullying_detection_model.h5')


Epoch 1/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1063s[0m 15s/step - accuracy: 0.6032 - loss: 0.6463 - val_accuracy: 0.7518 - val_loss: 0.5202
Epoch 2/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1098s[0m 15s/step - accuracy: 0.8618 - loss: 0.3622 - val_accuracy: 0.7500 - val_loss: 0.4912
Epoch 3/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1045s[0m 15s/step - accuracy: 0.9348 - loss: 0.2233 - val_accuracy: 0.7600 - val_loss: 0.5985
Epoch 4/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1102s[0m 15s/step - accuracy: 0.9403 - loss: 0.1702 - val_accuracy: 0.7382 - val_loss: 0.6713
Epoch 5/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1038s[0m 15s/step - accuracy: 0.9462 - loss: 0.1650 - val_accuracy: 0.7500 - val_loss: 0.6693
Epoch 6/10
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1044s[0m 15s/step - accuracy: 0.9647 - loss: 0.1215 - val_accuracy: 0.7545 - val_loss: 0.7700
Epoch 7/10
[1m69/69[



Accuracy: 74.36%


In [None]:
import pickle


In [None]:
# Persist the fitted tokenizer with pickle so the same word-index mapping
# can be reloaded later alongside the saved model.
TOKENIZER_PATH = 'tokenizer.pkl'

with open(TOKENIZER_PATH, mode='wb') as pickle_handle:
    pickle.dump(tokenizer, pickle_handle)

print("Tokenizer saved as 'tokenizer.pkl'")


Tokenizer saved as 'tokenizer.pkl'
