In [6]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv("Preprocessed_cleaned_Final_dataset.csv")
df['clean_text'] = df['clean_text'].fillna('')

In [8]:
# Create the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = vectorizer.fit_transform(df["clean_text"])
x = tfidf_matrix

# Convert the labels to numerical using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['IsHatespeech'])

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
# Define models and their hyperparameter grids
models = {
    'Logistic Regression': (LogisticRegression(), {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}),
    'SVM': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [None, 5, 10], 'criterion': ['gini', 'entropy']})
}


In [11]:
# Perform hyperparameter tuning for each model
for model_name, (model, param_grid) in models.items():
    print(f"Tuning {model_name}...")

Tuning Logistic Regression...
Tuning Random Forest...
Tuning SVM...
Tuning Decision Tree...


In [12]:

    random_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy', n_iter=5)
    random_search.fit(X_train, y_train)

In [15]:
    # Print best parameters and score
print(f"Best parameters for {model_name}: {random_search.best_params_}")
print(f"Best accuracy for {model_name}: {random_search.best_score_}")

Best parameters for Decision Tree: {'max_depth': None, 'criterion': 'gini'}
Best accuracy for Decision Tree: 0.6928027007901344
