 2: One-Hot Encoding and TF-IDF Vectorization

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = '/content/HateSpeechDetection (preprocessed).csv'
data = pd.read_csv(file_path)

# One-Hot Encode the 'Platform' column
platform_one_hot = pd.get_dummies(data['Platform'], prefix='Platform')

# TF-IDF Vectorize the 'Comment' column
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
comments_tfidf = tfidf_vectorizer.fit_transform(data['Comment']).toarray()

# Combine encoded features
encoded_data = pd.DataFrame(comments_tfidf, columns=tfidf_vectorizer.get_feature_names_out())
encoded_data = pd.concat([encoded_data, platform_one_hot], axis=1)
encoded_data['Hateful'] = data['Hateful']

# Split data into training and testing sets
X = encoded_data.drop('Hateful', axis=1)
y = encoded_data['Hateful']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy (One-Hot Encoding + TF-IDF Vectorization): {accuracy}')
print('Classification Report:')
print(report)


Accuracy (One-Hot Encoding + TF-IDF Vectorization): 0.8783333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       494
           1       1.00      0.31      0.47       106

    accuracy                           0.88       600
   macro avg       0.94      0.66      0.70       600
weighted avg       0.89      0.88      0.85       600

