# Benchamark Score Evaluation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

## Machine Learning Model

In [2]:
df = pd.read_csv('/Users/CRYPTO/Infosys Internship - Hate Speech Detection/Tokenization & Encoding/Reddit_Encoded.csv')

In [3]:
df.head()

Unnamed: 0,comment,hate_speech,lemmatized_comment,document_vector_flat
0,a subsection of retarded hungarians ohh boy br...,1,a subsection of retard hungarians ohh boy brac...,"0.015068972,-0.10067213,-0.0020921142,0.005855..."
1,hiii just got off work Foundation and groundin...,0,hiii just get off work Foundation and ground b...,"-0.011435382,-0.026568763,0.0116622485,0.07178..."
2,wow i guess soyboys are the same in every country,0,wow i guess soyboys be the same in every country,"-0.0065610385,-0.01601213,0.0042292415,0.09200..."
3,owen benjamins soyboy song goes for every coun...,0,owen benjamins soyboy song go for every countr...,"0.0023773473,-0.02649879,-0.0071647125,0.05887..."
4,yall hear sumn by all means i live in a small...,0,yall hear sumn by all mean i live in a small t...,"0.012283095,-0.020542147,0.0040409532,0.044450..."


In [4]:
# Convert the document vectors from their string representation back into numerical form
df['document_vector'] = df['document_vector_flat'].apply(lambda x: np.fromstring(x, sep=','))

# Ensure the conversion was successful
print(df['document_vector'].head())

0    [0.015068972, -0.10067213, -0.0020921142, 0.00...
1    [-0.011435382, -0.026568763, 0.0116622485, 0.0...
2    [-0.0065610385, -0.01601213, 0.0042292415, 0.0...
3    [0.0023773473, -0.02649879, -0.0071647125, 0.0...
4    [0.012283095, -0.020542147, 0.0040409532, 0.04...
Name: document_vector, dtype: object


In [5]:
# Extract features (document vectors) and labels (hate speech)
X = np.array(df['document_vector'].tolist())
y = df['hate_speech']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

## SVM Classifier

In [7]:
# Initialize the SVM
model = SVC(kernel='rbf', gamma='scale', random_state=42)

# Train the classifier
model.fit(X_train_resampled, y_train_resampled)

In [8]:
# Make predictions
y_pred = model.predict(X_test)

In [9]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.82
Precision: 0.59
Recall: 0.78
F1-Score: 0.67


In [10]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.83      0.87      3399
           1       0.59      0.78      0.67      1047

    accuracy                           0.82      4446
   macro avg       0.76      0.81      0.77      4446
weighted avg       0.85      0.82      0.83      4446



## HyperParameter Tuning

In [11]:
from sklearn.model_selection import GridSearchCV

In [15]:
param_grid = {
    'C': [0.1, 1, 10, 100],        # Regularization parameter
    'gamma': ['scale', 'auto'],   # Kernel coefficient for 'rbf'
    'kernel': ['rbf']             # Kernel type
}

In [13]:
# Initialize the SVM
svc = SVC(random_state=42)

In [16]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, scoring='recall', verbose=1, n_jobs= -1)

# Perform GridSearchCV to find the best parameters
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and best score
print("Best Parameters found: ", grid_search.best_params_)
print("Best Cross-validation Accuracy: {:.2f}".format(grid_search.best_score_))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters found:  {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-validation Accuracy: 0.91


In [17]:
# Use the best model found by GridSearchCV
best_svc = grid_search.best_estimator_

# Train the best model on the full training data
best_svc.fit(X_train, y_train)

# Make predictions with the best model
y_pred = best_svc.predict(X_test)

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.85
Precision: 0.73
Recall: 0.55
F1-Score: 0.63


In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3399
           1       0.73      0.55      0.63      1047

    accuracy                           0.85      4446
   macro avg       0.80      0.75      0.77      4446
weighted avg       0.84      0.85      0.84      4446



In [20]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [21]:
benchmark_score = evaluate_model(best_svc, X_train_resampled, y_train_resampled, X_test, y_test)

print(f"Benchmark score (SVM): {benchmark_score:.4f}")

Benchmark score (SVM): 0.8187
