In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.sparse import csr_matrix


In [2]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
'''
# Separate features and labels
X_train = train_data.drop(columns=['hate_speech', 'lemmatized_comment'])
y_train = train_data['hate_speech']

X_test = test_data.drop(columns=['hate_speech', 'lemmatized_comment'])
y_test = test_data['hate_speech']

# Convert DataFrames to numpy arrays if necessary
X_train = X_train.values
X_test = X_test.values
'''

  train_data = pd.read_csv('train.csv')


"\n# Separate features and labels\nX_train = train_data.drop(columns=['hate_speech', 'lemmatized_comment'])\ny_train = train_data['hate_speech']\n\nX_test = test_data.drop(columns=['hate_speech', 'lemmatized_comment'])\ny_test = test_data['hate_speech']\n\n# Convert DataFrames to numpy arrays if necessary\nX_train = X_train.values\nX_test = X_test.values\n"

In [3]:
# Ensure text column is string type and handle missing values
train_data['lemmatized_comment'] = train_data['lemmatized_comment'].fillna('').astype(str)
test_data['lemmatized_comment'] = test_data['lemmatized_comment'].fillna('').astype(str)

# Drop rows with empty text data
train_data = train_data[train_data['lemmatized_comment'].str.strip() != '']
test_data = test_data[test_data['lemmatized_comment'].str.strip() != '']

# Separate features and labels
X_train_text = train_data['lemmatized_comment']
y_train = train_data['hate_speech']

X_test_text = test_data['lemmatized_comment']
y_test = test_data['hate_speech']


In [4]:
# Use TfidfVectorizer to transform text data into feature vectors
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', min_df=1)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)


In [5]:
# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": MultinomialNB(),
    "SGD Classifier": SGDClassifier(max_iter=1000, random_state=42)
}

# Store accuracy results
results = {}

In [6]:
# Function to evaluate models
def evaluate_model(model_name, model, X_train_vec, y_train, X_test_vec, y_test):
    print(f"\n--- {model_name} ---")
    try:
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name] = accuracy
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
    except Exception as e:
        print(f"Error evaluating {model_name}: {e}")

In [7]:
# Train and evaluate models
for model_name, model in models.items():
    evaluate_model(model_name, model, X_train, y_train, X_test, y_test)

# Determine and print the best model
if results:
    best_model = max(results, key=results.get)
    print(f"\nThe best model is: {best_model} with an accuracy of {results[best_model]:.4f}")
else:
    print("No models were evaluated. Please check the data and preprocessing steps.")


--- Random Forest ---
Accuracy: 0.9113
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3386
           1       0.84      0.77      0.81      1057

    accuracy                           0.91      4443
   macro avg       0.89      0.86      0.87      4443
weighted avg       0.91      0.91      0.91      4443


--- Logistic Regression ---
Accuracy: 0.8841
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      3386
           1       0.88      0.60      0.71      1057

    accuracy                           0.88      4443
   macro avg       0.88      0.79      0.82      4443
weighted avg       0.88      0.88      0.88      4443


--- Support Vector Machine ---
Accuracy: 0.9023
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      3386
           1       0.86      0.70

### Adding Hyperparameter Tuning

In [8]:
from sklearn.model_selection import GridSearchCV


In [9]:
# Parameter grid for hyperparameter tuning
param_grid = {
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    "Support Vector Machine": {
        'C': [100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7]
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "Naive Bayes": {
        'alpha': [0.01, 0.1, 1, 10, 100]
    },
    "SGD Classifier": {
        'loss': ['hinge', 'log', 'perceptron'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'max_iter': [1000, 2000]
    }
}

# Store accuracy results after tuning
tuned_results = {}

In [10]:
# Function to tune hyperparameters
def tune_hyperparameters(model_name, model, param_grid, X_train_vec, y_train):
    print(f"\n--- Tuning {model_name} ---")
    try:
        grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train_vec, y_train)
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        return grid_search.best_estimator_
    except Exception as e:
        print(f"Error tuning {model_name}: {e}")
        return model


In [11]:
# Train and evaluate models after tuning
for model_name, model in models.items():
    tuned_model = tune_hyperparameters(model_name, model, param_grid, X_train, y_train)
    if isinstance(tuned_model, str):
        continue
    evaluate_model(model_name, tuned_model, X_train, y_train, X_test, y_test)
    tuned_results[model_name] = results[model_name]



--- Tuning Random Forest ---
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}

--- Random Forest ---
Accuracy: 0.9124
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      3386
           1       0.84      0.78      0.81      1057

    accuracy                           0.91      4443
   macro avg       0.89      0.87      0.88      4443
weighted avg       0.91      0.91      0.91      4443


--- Tuning Logistic Regression ---
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

--- Logistic Regression ---
Accuracy: 0.8951
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      3386
           1       0.86      0.66      0.75      1057

    accuracy                           0.90      4443
   macro avg       0.88      0.82      0.84      4443
weighted avg 

120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Pyt

Best parameters for SGD Classifier: {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l1'}

--- SGD Classifier ---
Accuracy: 0.9104
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3386
           1       0.86      0.74      0.80      1057

    accuracy                           0.91      4443
   macro avg       0.89      0.85      0.87      4443
weighted avg       0.91      0.91      0.91      4443



In [12]:
# Determine and print the best tuned model
if tuned_results:
    best_tuned_model = max(tuned_results, key=tuned_results.get)
    print(f"\nThe best tuned model is: {best_tuned_model} with an accuracy of {tuned_results[best_tuned_model]:.4f}")
else:
    print("No models were evaluated after tuning. Please check the data and preprocessing steps.")


The best tuned model is: Gradient Boosting with an accuracy of 0.9136
