3: Frequency Encoding and Count Vectorization

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = '/content/HateSpeechDetection (preprocessed).csv'
data = pd.read_csv(file_path)

# Frequency Encode the 'Platform' column
data['Platform_encoded'] = data['Platform'].map(data['Platform'].value_counts())

# Count Vectorize the 'Comment' column
count_vectorizer = CountVectorizer(max_features=5000)
comments_count = count_vectorizer.fit_transform(data['Comment']).toarray()

# Combine encoded features
encoded_data = pd.DataFrame(comments_count, columns=count_vectorizer.get_feature_names_out())
encoded_data['Platform_encoded'] = data['Platform_encoded']
encoded_data['Hateful'] = data['Hateful']

# Split data into training and testing sets
X = encoded_data.drop('Hateful', axis=1)
y = encoded_data['Hateful']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy (Frequency Encoding + Count Vectorization): {accuracy}')
print('Classification Report:')
print(report)


Accuracy (Frequency Encoding + Count Vectorization): 0.9283333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       494
           1       0.97      0.61      0.75       106

    accuracy                           0.93       600
   macro avg       0.95      0.80      0.85       600
weighted avg       0.93      0.93      0.92       600



THUS WE HAVE GONE THROUGH THESE ENCODING METHODS:

Method 1:
Label Encoding + Count Vectorization

Method 2:
One-Hot Encoding + TF-IDF Vectorization

Method 3:
Frequency Encoding + Count Vectorization