Importing Libraries and Loading the Dataset:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords
nltk.download('stopwords')

# Load the dataset
file_path = '/content/HateSpeechDetection (Balanced dataset).csv'  # Update the file path
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(df.head())


  Platform                                            Comment  Hateful
0   Reddit  Damn I thought they had strict gun laws in Ger...        0
1   Reddit  I dont care about what it stands for or anythi...        0
2   Reddit                  It's not a group it's an idea lol        0
3   Reddit                          So it's not just America!        0
4   Reddit  The dog is a spectacular dancer considering he...        0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Data Preprocessing

Preprocess the text data by removing punctuation, converting to lowercase, and removing stopwords.

In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing function to the comments
df['Comment'] = df['Comment'].apply(preprocess_text)

# Display the first few rows of the preprocessed dataframe
print(df.head())


  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


 Splitting the Data:

Split the dataset into training and testing sets.



In [None]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Training samples: {len(X_train)}')
print(f'Testing samples: {len(X_test)}')


Training samples: 2400
Testing samples: 600


 Vectorizing the Text Data:
 Convert the text data into numerical data using TF-IDF vectorization

In [None]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


 Training and Evaluating Different Models:

 Train and evaluate different machine learning models: Logistic Regression, Support Vector Machine, and Random Forest. Including hyperparameter tuning using GridSearchCV.

Logistic Regression:


In [None]:
# Define the model and hyperparameters
lr_model = LogisticRegression()
lr_params = {'C': [0.01, 0.1, 1, 10, 100]}

# Perform grid search
lr_grid = GridSearchCV(lr_model, lr_params, cv=5, scoring='accuracy')
lr_grid.fit(X_train_vec, y_train)

# Make predictions
lr_predictions = lr_grid.predict(X_test_vec)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, lr_predictions)
print('Logistic Regression Accuracy:', lr_accuracy)
print('Best Parameters:', lr_grid.best_params_)
print(classification_report(y_test, lr_predictions))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.9433333333333334
Best Parameters: {'C': 100}
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       494
           1       0.93      0.74      0.82       106

    accuracy                           0.94       600
   macro avg       0.94      0.86      0.89       600
weighted avg       0.94      0.94      0.94       600



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Support Vector Machine:



In [None]:
# Define the model and hyperparameters
svm_model = SVC()
svm_params = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf', 'linear']}

# Perform grid search
svm_grid = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy')
svm_grid.fit(X_train_vec, y_train)

# Make predictions
svm_predictions = svm_grid.predict(X_test_vec)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_predictions)
print('Support Vector Machine Accuracy:', svm_accuracy)
print('Best Parameters:', svm_grid.best_params_)
print(classification_report(y_test, svm_predictions))


Support Vector Machine Accuracy: 0.945
Best Parameters: {'C': 10, 'gamma': 1, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       494
           1       0.92      0.75      0.83       106

    accuracy                           0.94       600
   macro avg       0.93      0.87      0.90       600
weighted avg       0.94      0.94      0.94       600



Random Forest:



In [None]:
# Define the model and hyperparameters
rf_model = RandomForestClassifier()
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}

# Perform grid search
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train_vec, y_train)

# Make predictions
rf_predictions = rf_grid.predict(X_test_vec)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)
print('Best Parameters:', rf_grid.best_params_)
print(classification_report(y_test, rf_predictions))


Random Forest Accuracy: 0.9366666666666666
Best Parameters: {'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       494
           1       0.94      0.69      0.79       106

    accuracy                           0.94       600
   macro avg       0.94      0.84      0.88       600
weighted avg       0.94      0.94      0.93       600



Summary of Model Performance:

In [None]:
print('Model Performance Summary:')
print(f'Logistic Regression Accuracy: {lr_accuracy}')
print(f'Support Vector Machine Accuracy: {svm_accuracy}')
print(f'Random Forest Accuracy: {rf_accuracy}')


Model Performance Summary:
Logistic Regression Accuracy: 0.9466666666666667
Support Vector Machine Accuracy: 0.945
Random Forest Accuracy: 0.9366666666666666
