CNN model:   

In [169]:
# Import necessary libraries
import pandas as pd
# For data manipulation and analysis
import numpy as np
 # For numerical computations
from tensorflow.keras.preprocessing.text import Tokenizer
# For converting text into sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences
 # For padding sequences to the same length
from tensorflow.keras.models import Sequential
  # For creating a linear stack of neural network layers
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, SpatialDropout1D, Flatten
 # Different layers for the neural network
from tensorflow.keras.optimizers import Adam
# Optimizer for the model
from tensorflow.keras.callbacks import EarlyStopping
 # To stop training when a monitored metric stops improving
from tensorflow.keras.regularizers import l2
 # Regularizer to prevent overfitting
from sklearn.model_selection import train_test_split
 # For splitting data into training and testing sets
from sklearn.metrics import classification_report
# For generating a report showing the main classification metrics
from imblearn.over_sampling import RandomOverSampler
 # For balancing the dataset by oversampling
import nltk
  # For natural language processing
from nltk.corpus import stopwords
 # For removing common words that do not carry significant meaning
import string
 # For string operations

# Download stopwords
nltk.download('stopwords')  # Download the list of stopwords from NLTK

# Load the dataset
file_path = '/content/HateSpeechDetection (Balanced dataset).csv'  # File path to the dataset
df = pd.read_csv(file_path)  # Load the dataset into a pandas DataFrame

# Data Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    stop_words = set(stopwords.words('english'))  # Set of stopwords in English
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply the preprocessing function to the comments
df['Comment'] = df['Comment'].apply(preprocess_text)  # Preprocess the 'Comment' column

# Split the data into features and labels
X = df['Comment']  # Features are the comments
y = df['Hateful']  # Labels are whether the comment is hateful or not

# Tokenize the text data
tokenizer = Tokenizer()  # Initialize the Tokenizer
tokenizer.fit_on_texts(X)  # Fit the tokenizer on the text data
X_tokenized = tokenizer.texts_to_sequences(X)  # Convert text to sequences of integers

# Pad the sequences
max_length = 100  # Define maximum length of sequences
X_padded = pad_sequences(X_tokenized, maxlen=max_length, padding='post')  # Pad sequences to the same length

# Use RandomOverSampler to balance the dataset
desired_samples_per_class = 5000  # Desired number of samples per class
sampling_strategy = {0: desired_samples_per_class, 1: desired_samples_per_class}  # Define sampling strategy
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)  # Initialize RandomOverSampler
X_resampled, y_resampled = ros.fit_resample(X_padded, y)  # Resample the dataset

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)  # Split data

# Define the CNN model
embedding_dim = 32  # Define embedding dimension
cnn_model = Sequential()  # Initialize the Sequential model
cnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length))  # Add embedding layer
cnn_model.add(SpatialDropout1D(0.2))  # Add spatial dropout layer
cnn_model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))  # Add Conv1D layer
cnn_model.add(MaxPooling1D(pool_size=2))  # Add max pooling layer
cnn_model.add(Flatten())  # Flatten the input
cnn_model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # Add dense layer with L2 regularization
cnn_model.add(Dropout(0.5))  # Add dropout layer
cnn_model.add(Dense(1, activation='sigmoid'))  # Add output layer

# Compile the CNN model
optimizer = Adam(learning_rate=0.001)  # Define the optimizer
cnn_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])  # Compile the model

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)  # Early stopping

# Train the CNN model
cnn_history = cnn_model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.2, verbose=1, callbacks=[early_stopping])  # Train the model

# Evaluate the CNN model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test, y_test)  # Evaluate the model
print(f'Test Accuracy: {cnn_accuracy}')  # Print test accuracy

# Generate predictions and print classification report
y_pred = (cnn_model.predict(X_test) > 0.5).astype("int32")  # Generate predictions
print(classification_report(y_test, y_pred))  # Print classification report


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/3




Epoch 2/3
Epoch 3/3
Test Accuracy: 0.9950000047683716
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1001
           1       0.99      1.00      1.00       999

    accuracy                           0.99      2000
   macro avg       1.00      1.00      0.99      2000
weighted avg       1.00      0.99      0.99      2000



Again building the model by following these steps:



Further reducing the number of filters in Conv1D.

Further reducing the number of neurons in Dense layers.

Reducing the learning rate.

Increasing the dropout rate.

Simplify the model architecture.


In [168]:
# Import necessary libraries
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords
nltk.download('stopwords')

# Load the dataset
file_path = '/content/HateSpeechDetection (Balanced dataset).csv'
df = pd.read_csv(file_path)
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Platform,Comment,Hateful
0,Reddit,Damn I thought they had strict gun laws in Ger...,0
1,Reddit,I dont care about what it stands for or anythi...,0
2,Reddit,It's not a group it's an idea lol,0
3,Reddit,So it's not just America!,0
4,Reddit,The dog is a spectacular dancer considering he...,0


 Data Preprocessing

In [156]:
# Data Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text


# Apply the preprocessing function to the comments
df['Comment'] = df['Comment'].apply(preprocess_text)
print("Data preprocessing completed.")
print(df.head())



Data preprocessing completed.
  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


Tokenize and Pad Sequences

In [157]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)

# Pad the sequences
max_length = 100
X_padded = pad_sequences(X_tokenized, maxlen=max_length, padding='post')
print("Data preprocessing completed.")
print(df.head())



Data preprocessing completed.
  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


 Tokenize and Pad Sequences

In [158]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)

# Pad the sequences
max_length = 100
X_padded = pad_sequences(X_tokenized, maxlen=max_length, padding='post')
print("Tokenization and padding completed.")
print(X_padded[:5])


Tokenization and padding completed.
[[ 190  148 2423 1488 1489  581    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2   57 1071   93 1490    1 2424    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    

Balance the Dataset:

In [159]:
# Use RandomOverSampler to balance the dataset
desired_samples_per_class = 5000
sampling_strategy = {0: desired_samples_per_class, 1: desired_samples_per_class}

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_padded, y)
# Print the results
print("Dataset balanced using RandomOverSampler.")
print("Class distribution after resampling:")
print(pd.Series(y_resampled).value_counts())



Dataset balanced using RandomOverSampler.
Class distribution after resampling:
Hateful
0    5000
1    5000
Name: count, dtype: int64




 Split the Data

In [160]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
print("Data split into training and testing sets.")
print(f"Training set size: {len(X_train)}, Testing set size: {len(X_test)}")


Data split into training and testing sets.
Training set size: 8000, Testing set size: 2000


Define the Model:

In [161]:
# Define the CNN model
embedding_dim = 32  # Further reduced embedding dimension
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length))
cnn_model.add(Conv1D(filters=16, kernel_size=5, activation='relu'))  # Further reduced filters
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # Further reduced neurons
cnn_model.add(Dropout(0.3))  # Increased dropout rate
cnn_model.add(Dense(1, activation='sigmoid'))
print("Model defined successfully.")
cnn_model.summary()


Model defined successfully.
Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_27 (Embedding)    (None, 100, 32)           198176    
                                                                 
 conv1d_38 (Conv1D)          (None, 96, 16)            2576      
                                                                 
 max_pooling1d_23 (MaxPooli  (None, 48, 16)            0         
 ng1D)                                                           
                                                                 
 flatten_6 (Flatten)         (None, 768)               0         
                                                                 
 dense_52 (Dense)            (None, 32)                24608     
                                                                 
 dropout_26 (Dropout)        (None, 32)                0         
                         

Compile the Model

In [162]:

# Compile the CNN model
optimizer = Adam(learning_rate=0.0001)  # Reduced learning rate
cnn_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])


Train the Model

In [163]:
from tensorflow.keras.callbacks import EarlyStopping
# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

# Train the CNN model
cnn_history = cnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, verbose=1, callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Evaluate the Model

In [164]:

# Evaluate the CNN model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {cnn_accuracy}')

# Generate predictions and print classification report
y_pred = (cnn_model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.8665000200271606
              precision    recall  f1-score   support

           0       0.81      0.96      0.88      1001
           1       0.95      0.77      0.85       999

    accuracy                           0.87      2000
   macro avg       0.88      0.87      0.87      2000
weighted avg       0.88      0.87      0.87      2000

