In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Define the chunk size
chunk_size = 1000
file_path = "balanced_train_data_chunked.csv"

# Initialize an empty list to store the data
data_chunks = []

# Load data in chunks
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    data_chunks.append(chunk)

# Combine the chunks into a single DataFrame
balanced_train_data = pd.concat(data_chunks, ignore_index=True)


In [2]:
# Separate features (X) and target (y)
X = balanced_train_data.drop(columns=['hate_speech'])
y = balanced_train_data['hate_speech']


In [3]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


#### Apply Machine Learning Models
### Logistic Regression


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Initialize and train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(X_val)

# Evaluate the model
print("Logistic Regression Performance:")
print(classification_report(y_val, y_pred_log_reg))
print("ROC-AUC Score:", roc_auc_score(y_val, y_pred_log_reg))


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      2699
           1       0.86      0.85      0.86      2726

    accuracy                           0.86      5425
   macro avg       0.86      0.86      0.86      5425
weighted avg       0.86      0.86      0.86      5425

ROC-AUC Score: 0.8562635083725746


### Random Forest


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_clf.predict(X_val)

# Evaluate the model
print("Random Forest Performance:")
print(classification_report(y_val, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_val, y_pred_rf))


Random Forest Performance:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      2699
           1       0.93      0.94      0.94      2726

    accuracy                           0.93      5425
   macro avg       0.93      0.93      0.93      5425
weighted avg       0.93      0.93      0.93      5425

ROC-AUC Score: 0.9347357530587265


### Naive Bayes


In [6]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train Naive Bayes model
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_clf.predict(X_val)

# Evaluate the model
print("Naive Bayes Performance:")
print(classification_report(y_val, y_pred_nb))
print("ROC-AUC Score:", roc_auc_score(y_val, y_pred_nb))


Naive Bayes Performance:
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      2699
           1       0.81      0.84      0.82      2726

    accuracy                           0.82      5425
   macro avg       0.82      0.82      0.82      5425
weighted avg       0.82      0.82      0.82      5425

ROC-AUC Score: 0.817413422052188


#### Apply Deep Learning Model
### LSTM Model

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming max_length is the length of the longest sequence
max_length = 100

# Padding the sequences
X_train_padded = pad_sequences(X_train.values, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(X_val.values, maxlen=max_length, padding='post')

# Build LSTM model
lstm_model = Sequential([
    Embedding(input_dim=16626, output_dim=128, input_length=max_length),  # Adjust input_dim to match your vocabulary size
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_val_padded, y_val))

# Evaluate the model
loss, accuracy = lstm_model.evaluate(X_val_padded, y_val)
print(f"LSTM Validation Accuracy: {accuracy}")




Epoch 1/5
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 75ms/step - accuracy: 0.4950 - loss: 0.6942 - val_accuracy: 0.4975 - val_loss: 0.6932
Epoch 2/5
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 74ms/step - accuracy: 0.4949 - loss: 0.6932 - val_accuracy: 0.4975 - val_loss: 0.6932
Epoch 3/5
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 74ms/step - accuracy: 0.4952 - loss: 0.6933 - val_accuracy: 0.5025 - val_loss: 0.6931
Epoch 4/5
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 75ms/step - accuracy: 0.4928 - loss: 0.6932 - val_accuracy: 0.4975 - val_loss: 0.6931
Epoch 5/5
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 75ms/step - accuracy: 0.4952 - loss: 0.6932 - val_accuracy: 0.4975 - val_loss: 0.6932
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.5065 - loss: 0.6931
LSTM Validation Accuracy: 0.49751150608062744


In [8]:
print("Model Comparison:")
print(f"Logistic Regression - ROC-AUC: {roc_auc_score(y_val, y_pred_log_reg)}")
print(f"Random Forest - ROC-AUC: {roc_auc_score(y_val, y_pred_rf)}")
print(f"Naive Bayes - ROC-AUC: {roc_auc_score(y_val, y_pred_nb)}")
print(f"LSTM - Validation Accuracy: {accuracy}")


Model Comparison:
Logistic Regression - ROC-AUC: 0.8562635083725746
Random Forest - ROC-AUC: 0.9347357530587265
Naive Bayes - ROC-AUC: 0.817413422052188
LSTM - Validation Accuracy: 0.49751150608062744


#### Random Forest model has the highest ROC-AUC score, which indicates it is performing the best on this dataset.

### Hyperparameter Tuning
#### Optimize Random Forest: Although Random Forest performed well, you might achieve even better results by fine-tuning its hyperparameters. Use techniques like GridSearchCV or RandomizedSearchCV to explore optimal settings for n_estimators, max_depth, min_samples_split, etc.

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Example of splitting X_train and y_train into batches
batch_size = 1000
num_batches = len(X_train) // batch_size

best_score = -1
best_params = {}

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(X_train))
    
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]
    
    grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_batch, y_batch)
    
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_params = grid_search.best_params_

print("Best parameters found: ", best_params)
print("Best ROC-AUC score: ", best_score)


Best parameters found:  {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 200}
Best ROC-AUC score:  0.9079026442307694


### Model Validation
#### Cross-Validation: Use k-fold cross-validation to ensure that your model is not overfitting and generalizes well across different data splits.


In [13]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Example of splitting X and y into batches
batch_size = 1000
num_batches = len(X) // batch_size

cv_scores = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(X))
    
    X_batch = X[start_idx:end_idx]
    y_batch = y[start_idx:end_idx]
    
    scores = cross_val_score(rf_clf, X_batch, y_batch, cv=5, scoring='roc_auc')
    cv_scores.extend(scores)

# Filter out nan values
cv_scores = [score for score in cv_scores if not np.isnan(score)]

if cv_scores:
    print("Cross-validated ROC-AUC scores:", cv_scores)
    print("Mean ROC-AUC score:", np.mean(cv_scores))
else:
    print("No valid ROC-AUC scores found.")



Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 373, in _score
    y_pred = method_caller(clf, "decision_function", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 390, in _score
    y_pred = self._select_proba_binary(y_pred, clf.classes_)
             ^^^

Cross-validated ROC-AUC scores: [0.8307526881720431, 0.895115753811406, 0.8306041784302653, 0.8107707509881423, 0.8501552795031057, 0.8883304195804196, 0.8606497668997669, 0.7863490675990676, 0.8566433566433567, 0.8114695340501792, 0.8678208872201363, 0.8845084132943958, 0.8767209011264081, 0.8578779029342234, 0.8992598684210524, 0.8002518145459931, 0.8128425418456524, 0.8338764627462597, 0.8110650274033475, 0.875, 0.8783201223751912, 0.7174479166666666, 0.7711759868421053, 0.8880893640350876, 0.8003015350877193, 0.8474551971326164, 0.7746236559139785, 0.7630824372759857, 0.8550537634408603, 0.8654574592074591, 0.8971333333333333, 0.8551333333333333, 0.858224084335721, 0.890660900121638, 0.8399783754561426, 0.821433861650126, 0.8478003258776478, 0.8766108724633387, 0.8397274477855132, 0.8469551282051283, 0.8224864413850645, 0.8333333333333334, 0.767749451754386, 0.8499177631578947, 0.8509457236842106, 0.8498, 0.8872666666666666, 0.8654000000000001, 0.8303066192920121, 0.845111198841952

Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 373, in _score
    y_pred = method_caller(clf, "decision_function", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 390, in _score
    y_pred = self._select_proba_binary(y_pred, clf.classes_)
             ^^^

In [14]:
print("Best parameters found: ", best_params)
print("Best ROC-AUC score: ", best_score)

print("Cross-validated ROC-AUC scores:", cv_scores)
print("Mean ROC-AUC score:", np.mean(cv_scores))


Best parameters found:  {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 200}
Best ROC-AUC score:  0.9079026442307694
Cross-validated ROC-AUC scores: [0.8307526881720431, 0.895115753811406, 0.8306041784302653, 0.8107707509881423, 0.8501552795031057, 0.8883304195804196, 0.8606497668997669, 0.7863490675990676, 0.8566433566433567, 0.8114695340501792, 0.8678208872201363, 0.8845084132943958, 0.8767209011264081, 0.8578779029342234, 0.8992598684210524, 0.8002518145459931, 0.8128425418456524, 0.8338764627462597, 0.8110650274033475, 0.875, 0.8783201223751912, 0.7174479166666666, 0.7711759868421053, 0.8880893640350876, 0.8003015350877193, 0.8474551971326164, 0.7746236559139785, 0.7630824372759857, 0.8550537634408603, 0.8654574592074591, 0.8971333333333333, 0.8551333333333333, 0.858224084335721, 0.890660900121638, 0.8399783754561426, 0.821433861650126, 0.8478003258776478, 0.8766108724633387, 0.8397274477855132, 0.8469551282051283, 0.8224864413850645, 0.8333333333333334, 0.767749451754386