 Importing Libraries and Loading the Data:


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, Dense, Dropout, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords
nltk.download('stopwords')

# Load the dataset
file_path = '/content/HateSpeechDetection (Balanced dataset).csv'  # Update the file path
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(df.head())


  Platform                                            Comment  Hateful
0   Reddit  Damn I thought they had strict gun laws in Ger...        0
1   Reddit  I dont care about what it stands for or anythi...        0
2   Reddit                  It's not a group it's an idea lol        0
3   Reddit                          So it's not just America!        0
4   Reddit  The dog is a spectacular dancer considering he...        0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing:

In [4]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing function to the comments
df['Comment'] = df['Comment'].apply(preprocess_text)

# Display the first few rows of the preprocessed dataframe
print(df.head())


  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


Tokenization and Padding:


In [5]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)

# Pad the sequences
max_length = 100  # Define the maximum length for padding
X_padded = pad_sequences(X_tokenized, maxlen=max_length, padding='post')

# Display the shape of the padded data
print(f'Padded data shape: {X_padded.shape}')


Padded data shape: (3000, 100)


Balancing the Dataset with RandomOverSampler:

In [6]:
# Use RandomOverSampler to balance the dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_padded, y)

# Display the shape of the resampled data
print(f'Resampled data shape: {X_resampled.shape}')


Resampled data shape: (4800, 100)


Splitting the Data:

In [7]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print(f'Training samples: {len(X_train)}')
print(f'Testing samples: {len(X_test)}')


Training samples: 3840
Testing samples: 960


Building the Model:

In [8]:
# Define the model
embedding_dim = 128  # Dimension of the embedding vectors
model = Sequential()

# Add embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length))

# Add spatial dropout layer
model.add(SpatialDropout1D(0.2))

# Add convolutional layers
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Add bidirectional LSTM layer
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))

# Add dense layers
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          792704    
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 128)          0         
 Dropout1D)                                                      
                                                                 
 conv1d (Conv1D)             (None, 96, 64)            41024     
                                                                 
 max_pooling1d (MaxPooling1  (None, 48, 64)            0         
 D)                                                              
                                                                 
 bidirectional (Bidirection  (None, 200)               132000    
 al)                                                             
                                                        

Training the Model:

In [9]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Evaluating the Model:

In [10]:
# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Print classification report
print(classification_report(y_test, y_pred))


Test Accuracy: 0.9750000238418579
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       481
           1       0.97      0.99      0.98       479

    accuracy                           0.97       960
   macro avg       0.98      0.98      0.97       960
weighted avg       0.98      0.97      0.97       960

