In [1]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.model_selection import train_test_split


# Define the chunk size
chunk_size = 1000
file_path = "balanced_train_data_chunked.csv"

# Initialize the model
svm_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3)

# Initialize StandardScaler for normalization
scaler = StandardScaler()


In [2]:

# Read the first chunk to setup the validation set
first_chunk = pd.read_csv(file_path, chunksize=chunk_size).__next__()

# Split the first chunk into train and validation sets
X = first_chunk.drop(columns=['hate_speech'])
y = first_chunk['hate_speech']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [3]:
# Define class weights manually
class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1], y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Train the scaler on the initial training set
scaler.partial_fit(X_train)

# Initialize empty arrays to store the validation data
X_val_full = X_val
y_val_full = y_val


In [4]:


# Train the model incrementally on each chunk
first_chunk_flag = True
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    X_chunk = chunk.drop(columns=['hate_speech'])
    y_chunk = chunk['hate_speech']
    
    # Exclude the first chunk as it is already used for splitting
    if first_chunk_flag:
        first_chunk_flag = False
        continue
    
    # Scale the data using the partial fit scaler
    scaler.partial_fit(X_chunk)
    X_chunk_scaled = scaler.transform(X_chunk)
    
    # Compute sample weights for the class imbalance
    sample_weights = np.array([class_weight_dict[label] for label in y_chunk])
    
    # Incrementally train the SVM model
    svm_model.partial_fit(X_chunk_scaled, y_chunk, classes=[0, 1], sample_weight=sample_weights)



In [6]:
# Load the entire validation set if not too large
X_val_full_scaled = scaler.transform(X_val_full)

# Predict and evaluate the model
y_pred_proba = svm_model.decision_function(X_val_full_scaled)
auc_score = roc_auc_score(y_val_full, y_pred_proba)

# Generate and print the classification report
y_pred = svm_model.predict(X_val_full_scaled)
report = classification_report(y_val_full, y_pred, target_names=['Non-Hate Speech', 'Hate Speech'])
print("Classification Report:")
print(report)
print(f"ROC-AUC Score: {auc_score}")



Classification Report:
                 precision    recall  f1-score   support

Non-Hate Speech       0.89      0.60      0.72       154
    Hate Speech       0.36      0.76      0.49        46

       accuracy                           0.64       200
      macro avg       0.63      0.68      0.61       200
   weighted avg       0.77      0.64      0.67       200

ROC-AUC Score: 0.7586109542631281
