In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load the train and test CSV files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Split the train data into features (X_train) and labels (y_train)
X_train = train_df.iloc[:, :-2]  # All columns except the last two (features)
y_train = train_df['hate_speech']  # The 'hate_speech' column (labels)

# Split the test data into features (X_test) and labels (y_test)
X_test = test_df.iloc[:, :-2]  # All columns except the last two (features)
y_test = test_df['hate_speech']  # The 'hate_speech' column (labels)


  train_df = pd.read_csv('train.csv')


In [4]:
# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
# Define the models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "SGD Classifier": SGDClassifier(max_iter=1000, random_state=42)
}

# Store accuracy results
results = {}


In [5]:

# Apply each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Store the results
    results[model_name] = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    # Print the results for each model
    print(f'{model_name} Model:')
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('--------------------------------------------------')


Random Forest Model:
Accuracy: 0.8125140670717983
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      3386
           1       0.64      0.50      0.56      1057

    accuracy                           0.81      4443
   macro avg       0.74      0.70      0.72      4443
weighted avg       0.80      0.81      0.80      4443

--------------------------------------------------
Logistic Regression Model:
Accuracy: 0.7996848975917173
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      3386
           1       0.56      0.74      0.64      1057

    accuracy                           0.80      4443
   macro avg       0.74      0.78      0.75      4443
weighted avg       0.83      0.80      0.81      4443

--------------------------------------------------
Support Vector Machine Model:
Accuracy: 0.8035111411208643
Classification Report:
          

### Adding Hyperparameter Tuning

In [1]:
from sklearn.model_selection import GridSearchCV


In [6]:
param_grid = {
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    "Support Vector Machine": {
        'C': [100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7]
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "SGD Classifier": {
        'loss': ['hinge', 'log', 'perceptron'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'max_iter': [1000, 2000]
    }
}


In [7]:
tuned_results = {}

for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    param_grid_model = param_grid[model_name]
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_model, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best estimator
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Get classification report
    class_report = classification_report(y_test, y_pred)
    
    # Store the result
    tuned_results[model_name] = {
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'classification_report': class_report
    }
    
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} accuracy: {accuracy:.4f}")
    print(f"{model_name} classification report:\n{class_report}")

# Store the tuned results for further analysis
results = tuned_results


Tuning Random Forest...
Random Forest best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest accuracy: 0.8177
Random Forest classification report:
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      3386
           1       0.65      0.51      0.57      1057

    accuracy                           0.82      4443
   macro avg       0.75      0.71      0.73      4443
weighted avg       0.81      0.82      0.81      4443

Tuning Logistic Regression...




Logistic Regression best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression accuracy: 0.7997
Logistic Regression classification report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      3386
           1       0.56      0.74      0.64      1057

    accuracy                           0.80      4443
   macro avg       0.74      0.78      0.75      4443
weighted avg       0.83      0.80      0.81      4443

Tuning Support Vector Machine...
Support Vector Machine best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Support Vector Machine accuracy: 0.8296
Support Vector Machine classification report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3386
           1       0.66      0.60      0.62      1057

    accuracy                           0.83      4443
   macro avg       0.77      0.75      0.76      4443
weighted avg       0.82      

120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
47 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Aman Shekhar Sachan\AppData\Roaming\Pyt

SGD Classifier best parameters: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
SGD Classifier accuracy: 0.8087
SGD Classifier classification report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3386
           1       0.58      0.73      0.65      1057

    accuracy                           0.81      4443
   macro avg       0.74      0.78      0.76      4443
weighted avg       0.83      0.81      0.82      4443



In [8]:
for model_name, results in tuned_results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {results['best_params']}")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Classification Report:\n{results['classification_report']}\n")


Model: Random Forest
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.8177
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      3386
           1       0.65      0.51      0.57      1057

    accuracy                           0.82      4443
   macro avg       0.75      0.71      0.73      4443
weighted avg       0.81      0.82      0.81      4443


Model: Logistic Regression
Best Parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7997
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      3386
           1       0.56      0.74      0.64      1057

    accuracy                           0.80      4443
   macro avg       0.74      0.78      0.75      4443
weighted avg       0.83      0.80      0.81      4443


Model: Support Vector Machine
Best Parameters: {'C': 100, 'gamma': 