In [16]:
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.utils.validation import check_is_fitted

# Read the training split (described as balanced) and the test split from disk
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


  train_data = pd.read_csv('train.csv')


In [14]:
def _drop_blank_comments(frame):
    """Clean the comment column of `frame` and drop rows that have no text.

    Missing comments are replaced by '' and every value is cast to str
    (written back onto `frame`), then rows whose comment is empty or
    whitespace-only are filtered out of the returned frame.
    """
    frame['lemmatized_comment'] = frame['lemmatized_comment'].fillna('').astype(str)
    has_text = frame['lemmatized_comment'].str.strip() != ''
    return frame[has_text]


# Apply the same cleaning to both splits
train_data = _drop_blank_comments(train_data)
test_data = _drop_blank_comments(test_data)

# Text goes to the vectorizer, the 'hate_speech' column is the target
X_train_text, y_train = train_data['lemmatized_comment'], train_data['hate_speech']
X_test_text, y_test = test_data['lemmatized_comment'], test_data['hate_speech']


In [17]:
# TF-IDF features: the vectorizer is fitted on the training text only and the
# test text is then transformed with that already-fitted vectorizer
tfidf_settings = {'max_features': 5000, 'stop_words': 'english', 'min_df': 1}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Untrained SVM classifier with a fixed seed
svm_model = SVC(random_state=42)


In [20]:
# Search space for the SVM grid search: four C values x two gamma settings,
# RBF kernel only
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['rbf'],
    gamma=['scale', 'auto'],
)

# Function to tune hyperparameters
def tune_hyperparameters(model, param_grid, X_train, y_train,
                         cv=5, scoring='accuracy', n_jobs=-1):
    """Grid-search `param_grid` for `model` using cross-validation.

    Args:
        model: Estimator handed to GridSearchCV.
        param_grid: Parameter grid handed to GridSearchCV.
        X_train: Training features passed to the search's `fit`.
        y_train: Training labels passed to the search's `fit`.
        cv: Cross-validation setting forwarded to GridSearchCV (default 5).
        scoring: Scoring setting forwarded to GridSearchCV
            (default 'accuracy').
        n_jobs: Parallelism setting forwarded to GridSearchCV (default -1).

    Returns:
        The search's `best_estimator_` when the search succeeds. If anything
        in the search raises, the error is printed (not re-raised) and the
        `model` argument is returned as-is, so callers always get an
        estimator back but must not assume it has been trained.
    """
    print("\n--- Tuning SVM ---")
    try:
        grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
        grid_search.fit(X_train, y_train)
        print(f"Best parameters for SVM: {grid_search.best_params_}")
        return grid_search.best_estimator_
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to the
        # untuned estimator rather than aborting the notebook run.
        print(f"Error tuning SVM: {e}")
        return model



In [21]:
# Tune the hyperparameters for SVM
tuned_svm_model = tune_hyperparameters(svm_model, param_grid, X_train, y_train)

# GridSearchCV refits the best estimator on the full training set by default,
# so a successful search already returns a trained model. Only fit here when
# tuning failed and handed back the untrained fallback model; this avoids
# training the SVM a second time for no change in the result.
try:
    check_is_fitted(tuned_svm_model)
except NotFittedError:
    tuned_svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = tuned_svm_model.predict(X_test)

# Display the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")



--- Tuning SVM ---
Best parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.94      3386
           1       0.88      0.67      0.76      1057

    accuracy                           0.90      4443
   macro avg       0.89      0.82      0.85      4443
weighted avg       0.90      0.90      0.89      4443

Accuracy: 0.8983
