Importing Libraries and Loading the Dataset.


We start by importing the necessary libraries and loading the dataset.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Download the NLTK stopword corpus used by the preprocessing step
nltk.download('stopwords')

# Load the dataset
file_path = '/content/HateSpeechDetection (Balanced dataset).csv'  # Update the file path
df = pd.read_csv(file_path)

# Fail fast with a clear message if the CSV lacks the columns the rest of the
# notebook reads, instead of surfacing a bare KeyError several cells later.
required_columns = {'Comment', 'Hateful'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{file_path!r} is missing required column(s): {sorted(missing_columns)}"
    )

# Display the first few rows of the dataframe
print(df.head())


  Platform                                            Comment  Hateful
0   Reddit  Damn I thought they had strict gun laws in Ger...        0
1   Reddit  I dont care about what it stands for or anythi...        0
2   Reddit                  It's not a group it's an idea lol        0
3   Reddit                          So it's not just America!        0
4   Reddit  The dog is a spectacular dancer considering he...        0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing

Next, we preprocess the text data by converting it to lowercase and removing punctuation and stopwords.

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled lazily by _english_stop_words() so the corpus is read only once.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalize a single comment before vectorization.

    The text is lowercased, stripped of ASCII punctuation, and filtered of
    English stopwords; the surviving tokens are re-joined with single spaces.

    Args:
        text: The raw comment. Missing values (None or float NaN, which is how
            pandas represents an empty CSV cell) are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned comment as a string (``''`` when nothing remains).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove stopwords (the set is cached, so this is not re-read per row)
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run every comment through preprocess_text, then write the cleaned text back
# over the original column.
cleaned_comments = df['Comment'].apply(preprocess_text)
df['Comment'] = cleaned_comments

# Preview the cleaned dataframe
print(df.head())


  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


Splitting the Data

We split the data into features and labels and then into training and testing sets.

In [None]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Split the data into training and testing sets.
# stratify=y keeps the hateful / non-hateful ratio the same in both partitions,
# so the test metrics are not skewed by a chance imbalance in the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'Training samples: {len(X_train)}')
print(f'Testing samples: {len(X_test)}')


Training samples: 2400
Testing samples: 600


Vectorizing the Text Data

We transform the text data into TF-IDF feature vectors.

In [None]:
# Learn the TF-IDF vocabulary (at most 5000 features) from the training
# comments only, then encode the held-out comments with that fitted vectorizer.
tfidf_settings = {'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


Training and Evaluating Different Models with Suboptimal Hyperparameters:


Logistic Regression


We define a logistic regression model with suboptimal hyperparameters and train it.

In [None]:
# Define the model and suboptimal hyperparameters
lr_model = LogisticRegression(C=0.001)  # Small C = very strong regularization

# Train the model
lr_model.fit(X_train_vec, y_train)

# Make predictions
lr_predictions = lr_model.predict(X_test_vec)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, lr_predictions)
print('Logistic Regression Accuracy:', lr_accuracy)
# zero_division=0 reports 0.0 for a class the model never predicts, without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, lr_predictions, zero_division=0))


Logistic Regression Accuracy: 0.8233333333333334
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Machine

We define a Support Vector Machine model with suboptimal hyperparameters and train it.

In [None]:
# Define the model and suboptimal hyperparameters.
# gamma is omitted: it only affects the 'rbf', 'poly' and 'sigmoid' kernels,
# so passing it alongside kernel='linear' had no effect.
svm_model = SVC(C=0.001, kernel='linear')  # Small C = very strong regularization; linear kernel

# Train the model
svm_model.fit(X_train_vec, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test_vec)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_predictions)
print('Support Vector Machine Accuracy:', svm_accuracy)
# zero_division=0 reports 0.0 for a class the model never predicts, without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, svm_predictions, zero_division=0))


Support Vector Machine Accuracy: 0.8233333333333334
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Machine

This cell repeats the Support Vector Machine experiment from the previous section: it defines the same model with the same suboptimal hyperparameters and trains it again.

In [None]:
# NOTE: this cell repeats the preceding Support Vector Machine cell; it
# retrains the same model and overwrites svm_model / svm_predictions /
# svm_accuracy with equivalent values.

# Define the model and suboptimal hyperparameters.
# gamma is omitted: it only affects the 'rbf', 'poly' and 'sigmoid' kernels,
# so passing it alongside kernel='linear' had no effect.
svm_model = SVC(C=0.001, kernel='linear')  # Small C = very strong regularization; linear kernel

# Train the model
svm_model.fit(X_train_vec, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test_vec)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_predictions)
print('Support Vector Machine Accuracy:', svm_accuracy)
# zero_division=0 reports 0.0 for a class the model never predicts, without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, svm_predictions, zero_division=0))


Support Vector Machine Accuracy: 0.8233333333333334
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest

We define a random forest model with suboptimal hyperparameters and train it.

In [None]:
# Define the model and suboptimal hyperparameters.
# random_state pins the bootstrap sampling and feature selection so the
# reported metrics are reproducible across runs (same seed as the data split).
rf_model = RandomForestClassifier(
    n_estimators=10,  # Fewer trees
    max_depth=5,      # Shallow depth
    random_state=42,
)

# Train the model
rf_model.fit(X_train_vec, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test_vec)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)
# zero_division=0 reports 0.0 for a class the model never predicts, without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, rf_predictions, zero_division=0))


Random Forest Accuracy: 0.8233333333333334
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Summary:

In [None]:
# Gather each model's results once so the accuracy list and the detailed
# reports are driven by the same data instead of copy-pasted print calls.
model_results = [
    ("Logistic Regression", lr_accuracy, lr_predictions),
    ("Support Vector Machine", svm_accuracy, svm_predictions),
    ("Random Forest", rf_accuracy, rf_predictions),
]

print("\nSummary of Model Evaluations:")
for model_name, model_accuracy, model_predictions in model_results:
    print(f"{model_name} Accuracy: {model_accuracy}")

print("\nDetailed Classification Reports:")
for model_name, model_accuracy, model_predictions in model_results:
    print(f"{model_name} Report:")
    # zero_division=0 reports 0.0 for a class a model never predicts, without
    # emitting an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, model_predictions, zero_division=0))



Summary of Model Evaluations:
Logistic Regression Accuracy: 0.8233333333333334
Support Vector Machine Accuracy: 0.8233333333333334
Random Forest Accuracy: 0.8233333333333334

Detailed Classification Reports:
Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600

Support Vector Machine Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       494
           1       0.00      0.00      0.00       106

    accuracy                           0.82       600
   macro avg       0.41      0.50      0.45       600
weighted avg       0.68      0.82      0.74       600

Random Forest Report:
              precision    recall  f1-score   support

 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
