In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score



# Replace missing texts with empty strings before vectorizing.
df['clean_text'] = df['clean_text'].fillna('')

# Convert the labels to numerical using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['IsHatespeech'])

# Split the raw texts BEFORE vectorizing. With the same number of samples,
# test_size and random_state, the same rows land in each partition as when
# the TF-IDF matrix itself was split, but the vectorizer can now be fitted
# on the training rows only.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42
)

# Create the vectorizer and learn the vocabulary / IDF weights from the
# training texts only, so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix kept under its original names for any later code that
# relies on them; it is built with the training-only vocabulary.
tfidf_matrix = vectorizer.transform(df["clean_text"])
x = tfidf_matrix

# Define models and their hyperparameter grids.
# Each entry maps a display name to (unfitted estimator, parameter grid).
models = {
    # The default 'lbfgs' solver rejects penalty='l1', so every 'l1' candidate
    # would fail to fit during the search. 'liblinear' supports both 'l1' and
    # 'l2'. NOTE(review): liblinear is aimed at binary / one-vs-rest problems;
    # consider solver='saga' if the target turns out to be multiclass.
    'Logistic Regression': (
        LogisticRegression(solver='liblinear'),
        {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
    ),
    # Fixed seeds make the tree-based models reproducible between runs.
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    ),
    'SVM': (
        SVC(),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    ),
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 5, 10], 'criterion': ['gini', 'entropy']},
    ),
}

# Perform hyperparameter tuning for each model
for model_name, (model, param_grid) in models.items():
    print(f"Tuning {model_name}...")

    # Sample 5 parameter combinations from the grid and score each with
    # 5-fold cross-validated accuracy. The fixed random_state makes the
    # sampled candidates (and therefore the reported results) repeatable.
    random_search = RandomizedSearchCV(
        model,
        param_grid,
        n_iter=5,
        cv=5,
        scoring='accuracy',
        random_state=42,
    )
    random_search.fit(X_train, y_train)

    # Print best parameters and score (mean cross-validated accuracy)
    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best accuracy for {model_name}: {random_search.best_score_}")

    # Evaluate on test set; predict() delegates to the best estimator,
    # which the search refits on the full training data by default.
    y_pred = random_search.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {model_name}: {test_accuracy}")
    print("------")