In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Conv1D, MaxPooling1D, GlobalMaxPooling1D,Flatten
from sklearn.metrics import classification_report, accuracy_score


### **Load and Preprocess Data**

In [3]:
df = pd.read_csv('Reddit_Encoded.csv')

In [4]:
df.head()

Unnamed: 0,comment,hate_speech,lemmatized_comment,document_vector_flat
0,subsection retarded hungarians ohh boy brace l...,1,subsection retard hungarians ohh boy brace liv...,"0.014043219,-0.01809359,0.017145459,0.08062436..."
1,hiii just got work Foundation and grounding ma...,0,hiii just get work Foundation and ground mainl...,"-0.0030388932,-0.035133556,0.020659983,0.07383..."
2,wow guess soyboys every country,0,wow guess soyboys every country,"0.017362628,0.005587179,0.0297773,0.109146975,..."
3,owen benjamins soyboy song goes every country ...,0,owen benjamins soyboy song go every country amaze,"0.018085241,0.0011954829,2.8959475e-05,0.07601..."
4,yall hear sumn means live small town rn for w...,0,yall hear sumn mean live small town rn for wor...,"0.023993038,-0.00060867134,0.005239945,0.05989..."


In [10]:
# Convert the document vectors from string to numerical form
df['document_vector'] = df['document_vector_flat'].apply(lambda x: np.fromstring(x, sep=','))

# Extract features and labels
X = np.array(df['document_vector'].tolist())
y = df['hate_speech'].values

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Reshape data for CNN (add a channel dimension)
X = X.reshape(X.shape[0], X.shape[1], 1)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## ***Deep Learning Models***

**CNN Model**

In [17]:
# Define the model
model = Sequential()

# Add a Conv1D layer
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))

# Add a MaxPooling layer
model.add(MaxPooling1D(pool_size=2))

# Add a Flatten layer to convert 3D data to 1D
model.add(Flatten())

# Add a Dense layer
model.add(Dense(128, activation='relu'))

# Add a Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Add the output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### **Train and Evaluate Model**

In [15]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
score = model.evaluate(X_val, y_val, verbose=0)
print(f"CNN Validation Accuracy: {score[1]}")

Epoch 1/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 48ms/step - accuracy: 0.8710 - loss: 0.3178 - val_accuracy: 0.8557 - val_loss: 0.3730
Epoch 2/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 49ms/step - accuracy: 0.8692 - loss: 0.3174 - val_accuracy: 0.8515 - val_loss: 0.3758
Epoch 3/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 49ms/step - accuracy: 0.8703 - loss: 0.3151 - val_accuracy: 0.8562 - val_loss: 0.3699
Epoch 4/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 59ms/step - accuracy: 0.8764 - loss: 0.3035 - val_accuracy: 0.8530 - val_loss: 0.3880
Epoch 5/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 51ms/step - accuracy: 0.8727 - loss: 0.3173 - val_accuracy: 0.8605 - val_loss: 0.3721
Epoch 6/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 45ms/step - accuracy: 0.8770 - loss: 0.2973 - val_accuracy: 0.8562 - val_loss: 0.3947
Epoch 7/10
[1m5

In [16]:
# Predict the labels for the validation data
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype(int).flatten()

# Generate and print the classification report
report = classification_report(y_val, y_pred, target_names=['Not Hate Speech', 'Hate Speech'])
print(report)

[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step
                 precision    recall  f1-score   support

Not Hate Speech       0.90      0.92      0.91      3369
    Hate Speech       0.72      0.66      0.69      1074

       accuracy                           0.86      4443
      macro avg       0.81      0.79      0.80      4443
   weighted avg       0.85      0.86      0.86      4443



**LSTM**

In [19]:
# Define the LSTM model
model = Sequential()

# Add an LSTM layer
model.add(LSTM(units=128, input_shape=(X_train.shape[1], 1)))

# Add a Dense layer with 128 units
model.add(Dense(128, activation='relu'))

# Add a Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Add the output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()


  super().__init__(**kwargs)


### **Train and Evaluate Model**

In [20]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
score = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Accuracy: {score[1]}")


Epoch 1/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 417ms/step - accuracy: 0.7616 - loss: 0.5594 - val_accuracy: 0.7583 - val_loss: 0.5406
Epoch 2/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 407ms/step - accuracy: 0.7631 - loss: 0.5368 - val_accuracy: 0.7583 - val_loss: 0.5350
Epoch 3/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 378ms/step - accuracy: 0.7636 - loss: 0.5330 - val_accuracy: 0.7657 - val_loss: 0.5264
Epoch 4/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 377ms/step - accuracy: 0.7677 - loss: 0.5263 - val_accuracy: 0.7625 - val_loss: 0.5265
Epoch 5/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 374ms/step - accuracy: 0.7708 - loss: 0.5206 - val_accuracy: 0.7698 - val_loss: 0.5207
Epoch 6/10
[1m556/556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 376ms/step - accuracy: 0.7678 - loss: 0.5226 - val_accuracy: 0.7691 - val_loss: 0.5203
Epoc

In [21]:
# Predict the labels for the validation data
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype(int).flatten()

# Generate and print the classification report
report = classification_report(y_val, y_pred, target_names=['Not Hate Speech', 'Hate Speech'])
print(report)


[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 134ms/step
                 precision    recall  f1-score   support

Not Hate Speech       0.78      0.98      0.87      3369
    Hate Speech       0.63      0.12      0.20      1074

       accuracy                           0.77      4443
      macro avg       0.70      0.55      0.53      4443
   weighted avg       0.74      0.77      0.70      4443



#### CNN performing better than LSTM