Importing Libraries and Loading the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords (the English list is needed by the text-cleaning step)
nltk.download('stopwords')

# Location of the labelled comments CSV. This is a Colab upload path; change it
# here, in one place, when running the notebook anywhere else.
DATA_PATH = '/content/HateSpeechDetection (Balanced dataset).csv'

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message if the file lacks the columns the rest of
# the notebook reads, instead of surfacing a bare KeyError several cells later.
missing_columns = {'Comment', 'Hateful'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing expected column(s): {sorted(missing_columns)}"
    )

# Display the first few rows of the dataframe
print(df.head())


  Platform                                            Comment  Hateful
0   Reddit  Damn I thought they had strict gun laws in Ger...        0
1   Reddit  I dont care about what it stands for or anythi...        0
2   Reddit                  It's not a group it's an idea lol        0
3   Reddit                          So it's not just America!        0
4   Reddit  The dog is a spectacular dancer considering he...        0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing:

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache of the NLTK English stop words (see _english_stop_words).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lowercase a comment, drop stop words and strip ASCII punctuation.

    Args:
        text: The raw comment. ``None`` and float ``NaN`` (how pandas represents
            an empty CSV cell) are treated as an empty comment; any other
            non-string value is converted with ``str()``.
        stop_words: Optional container of lowercase words to remove. When
            omitted, the NLTK English stop-word list is used (loaded once and
            cached, rather than rebuilt for every row).

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # Missing cells would otherwise crash on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for token in str(text).lower().split():
        bare = token.translate(_PUNCT_TABLE)
        if not bare:
            # Token was pure punctuation.
            continue
        # Check the punctuation-free form (as before) AND the form with only
        # the surrounding punctuation trimmed, so contractions such as "don't"
        # still match their stop-word entry instead of surviving as "dont".
        if bare in stop_words or token.strip(string.punctuation) in stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)

# Run every comment through preprocess_text and store the cleaned text back in
# the same 'Comment' column (the raw text is overwritten).
df['Comment'] = df['Comment'].map(preprocess_text)

# Preview the cleaned comments
print(df.head())


  Platform                                           Comment  Hateful
0   Reddit              damn thought strict gun laws germany        0
1   Reddit  dont care stands anything connected like shields        0
2   Reddit                                    group idea lol        0
3   Reddit                                           america        0
4   Reddit  dog spectacular dancer considering two left feet        0


Splitting the Data:

In [None]:
# Split the data into features and labels
X = df['Comment']
y = df['Hateful']

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# The labels are not evenly distributed (the recorded test split contains 494
# non-hateful vs 106 hateful comments), so stratify on y to keep the same class
# ratio in both the training and the testing set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'Training samples: {len(X_train)}')
print(f'Testing samples: {len(X_test)}')


Training samples: 2400
Testing samples: 600


Vectorizing the Text Data

In [None]:
# Upper bound on the TF-IDF vocabulary size.
max_vocab_terms = 5000

# Learn the vocabulary and IDF weights from the training comments only, and
# encode those comments in the same pass.
vectorizer = TfidfVectorizer(max_features=max_vocab_terms)
X_train_vec = vectorizer.fit_transform(X_train)

# Encode the held-out comments with the already-fitted vocabulary.
X_test_vec = vectorizer.transform(X_test)


Training and Evaluating Different Models

Logistic Regression

In [None]:
# Fit a default-configured logistic regression on the TF-IDF training features
# (fit() returns the estimator itself).
lr_model = LogisticRegression().fit(X_train_vec, y_train)

# Predict labels for the held-out comments
lr_predictions = lr_model.predict(X_test_vec)

# Compute overall accuracy and the per-class precision/recall/F1 table,
# then print both.
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_report = classification_report(y_test, lr_predictions)
print('Logistic Regression Accuracy:', lr_accuracy)
print(lr_report)


Logistic Regression Accuracy: 0.8816666666666667
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       494
           1       1.00      0.33      0.50       106

    accuracy                           0.88       600
   macro avg       0.94      0.67      0.71       600
weighted avg       0.90      0.88      0.86       600



Support Vector Machine

In [None]:
# Fit a default-configured support vector classifier on the TF-IDF training
# features (fit() returns the estimator itself).
svm_model = SVC().fit(X_train_vec, y_train)

# Predict labels for the held-out comments
svm_predictions = svm_model.predict(X_test_vec)

# Compute overall accuracy and the per-class precision/recall/F1 table,
# then print both.
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)
print('Support Vector Machine Accuracy:', svm_accuracy)
print(svm_report)


Support Vector Machine Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       494
           1       0.98      0.50      0.66       106

    accuracy                           0.91       600
   macro avg       0.94      0.75      0.81       600
weighted avg       0.92      0.91      0.90       600



Random Forest

In [None]:
# Train a Random Forest model.
# A random forest draws bootstrap samples and random feature subsets, so it is
# seeded (with the same value used for the train/test split) to make the
# reported metrics reproducible across Restart & Run All.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vec, y_train)

# Make predictions on the held-out comments
rf_predictions = rf_model.predict(X_test_vec)

# Evaluate the model: overall accuracy plus per-class precision/recall/F1
rf_accuracy = accuracy_score(y_test, rf_predictions)
print('Random Forest Accuracy:', rf_accuracy)
print(classification_report(y_test, rf_predictions))


Random Forest Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       494
           1       0.94      0.71      0.81       106

    accuracy                           0.94       600
   macro avg       0.94      0.85      0.89       600
weighted avg       0.94      0.94      0.94       600



Summary of Model Performance:

In [None]:
# Print the test-set accuracy of each model, one per line, in training order.
print('Model Performance Summary:')
for model_name, model_accuracy in (
    ('Logistic Regression', lr_accuracy),
    ('Support Vector Machine', svm_accuracy),
    ('Random Forest', rf_accuracy),
):
    print(f'{model_name} Accuracy: {model_accuracy}')


Model Performance Summary:
Logistic Regression Accuracy: 0.8816666666666667
Support Vector Machine Accuracy: 0.91
Random Forest Accuracy: 0.94
