In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,roc_auc_score

In [2]:

# Define the chunk size
chunk_size = 1000
file_path = "balanced_train_data_chunked.csv"

# Initialize an empty list to store the data
data_chunks = []

# Load data in chunks
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    data_chunks.append(chunk)

# Combine the chunks into a single DataFrame
balanced_train_data = pd.concat(data_chunks, ignore_index=True)


In [3]:
# Separate features (X) and target (y)
X = balanced_train_data.drop(columns=['hate_speech'])
y = balanced_train_data['hate_speech']


In [4]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## Random Forest Model

In [5]:

# Initialize and train Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_clf.predict(X_val)

# Evaluate the model
print("Random Forest Performance:")
print(classification_report(y_val, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_val, y_pred_rf))


Random Forest Performance:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      2699
           1       0.93      0.94      0.94      2726

    accuracy                           0.94      5425
   macro avg       0.94      0.94      0.94      5425
weighted avg       0.94      0.94      0.94      5425

ROC-AUC Score: 0.9363828536804887


In [6]:
import warnings
warnings.filterwarnings('ignore')

#### Hyperparameter tuning

In [7]:
# Initialize and define the parameter grid for Random Forest
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Batch size and number of batches
batch_size = 1000
num_batches = len(X_train) // batch_size + 1  # Adjust to cover all samples

best_score = -1
best_params = None
best_model = None

In [8]:
# Perform batched GridSearchCV
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(X_train))
    
    X_batch = X_train.iloc[start_idx:end_idx]
    y_batch = y_train.iloc[start_idx:end_idx]
    
    grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_batch, y_batch)
    
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)
print("Best ROC-AUC score: ", best_score)



Best parameters found:  {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 300}
Best ROC-AUC score:  0.9016380763736628


In [9]:
# Evaluate on validation set
X_val_scaled = StandardScaler().fit_transform(X_val)  # Assuming X_val fits in memory
y_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]
y_pred = best_model.predict(X_val_scaled)

roc_auc = roc_auc_score(y_val, y_pred_proba)
report = classification_report(y_val, y_pred, target_names=['Not Hate Speech', 'Hate Speech'])

print(f"Validation ROC-AUC Score: {roc_auc}")
print(f"Validation Classification Report:\n{report}")

Validation ROC-AUC Score: 0.7725966819590527
Validation Classification Report:
                 precision    recall  f1-score   support

Not Hate Speech       0.67      0.91      0.77      2699
    Hate Speech       0.86      0.56      0.68      2726

       accuracy                           0.74      5425
      macro avg       0.77      0.74      0.73      5425
   weighted avg       0.77      0.74      0.73      5425

