In [1]:
# =========================
# Bengali Hate Speech Detection - Random Forest
# Option A (Minimal + Kaggle Output + Inference Time)
# =========================

import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.utils import resample
import joblib
import re
import os
import time

import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# Setup
# -----------------------------
print("üì¶ Downloading NLTK stopwords...")
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("üáßüá© Bengali Hate Speech Detection Pipeline (Minimal)")

# -----------------------------
# Load Dataset
# -----------------------------
print("üì• Loading Bengali hate speech dataset...")
df = pd.read_csv("/kaggle/input/bagla-hate-spech/Bengali hate speech .csv")
print(f"üìä Dataset shape: {df.shape}")
print(f"üìã Columns: {df.columns.tolist()}")

# -----------------------------
# Handle missing values
# -----------------------------
df = df.dropna()
print(f"‚úÖ Shape after dropping missing values: {df.shape}")

# -----------------------------
# Identify text and target columns
# -----------------------------
text_column = 'sentence'
target_column = 'hate'

# -----------------------------
# Label encoding
# -----------------------------
print("üî¢ Encoding labels...")
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df[target_column])
print("üìã Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# -----------------------------
# Create output folder
# -----------------------------
output_dir = "/kaggle/working/bengali_model_outputs"
os.makedirs(output_dir, exist_ok=True)
joblib.dump(label_encoder, f"{output_dir}/label_encoder.pkl")

# -----------------------------
# Clean Bengali text
# -----------------------------
def clean_bengali_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'[^\u0980-\u09FF\s‡ß¶-‡ßØ0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text if len(text) >= 3 else ""

df['clean_text'] = df[text_column].apply(clean_bengali_text)
df = df[df['clean_text'] != ""]
print(f"‚úÖ Shape after cleaning: {df.shape}")

# -----------------------------
# Balance dataset (optional)
# -----------------------------
class_counts = df['encoded_label'].value_counts()
if class_counts.max() / class_counts.min() > 1.5:
    print("‚öñÔ∏è Applying oversampling to balance classes...")
    max_count = class_counts.max()
    balanced_df = pd.DataFrame()
    for label in df['encoded_label'].unique():
        df_label = df[df['encoded_label'] == label]
        upsampled = resample(df_label, replace=True, n_samples=max_count, random_state=42)
        balanced_df = pd.concat([balanced_df, upsampled])
    df = balanced_df.copy()
print(f"‚úÖ Final dataset shape: {df.shape}")

# -----------------------------
# TF-IDF
# -----------------------------
print("üî§ Extracting TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    lowercase=False,
    token_pattern=r'[\u0980-\u09FF]+'
)
X = tfidf.fit_transform(df['clean_text'])
y = df['encoded_label']

joblib.dump(tfidf, f"{output_dir}/tfidf_vectorizer.pkl")

# -----------------------------
# Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# -----------------------------
# Train Random Forest
# -----------------------------
print("\nüöÄ Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Save model
joblib.dump(rf_model, f"{output_dir}/random_forest_model.pkl")

# -----------------------------
# Evaluate model
# -----------------------------
y_pred = rf_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"‚úÖ Test Accuracy: {acc:.4f}")

# Classification report
clf_report = classification_report(y_test, y_pred, output_dict=True)
print("\nüßæ Classification Report:\n", classification_report(y_test, y_pred))
pd.DataFrame(clf_report).transpose().to_csv(f"{output_dir}/classification_report.csv", index=True)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig(f"{output_dir}/confusion_matrix.png", dpi=300)
plt.close()

# ROC AUC
roc_auc = roc_auc_score(y_test, y_pred)
print(f"üî• ROC AUC Score: {roc_auc:.4f}")

# -----------------------------
# Feature importance
# -----------------------------
importances = rf_model.feature_importances_
top_n = 30
indices = np.argsort(importances)[::-1][:top_n]
plt.figure(figsize=(8,6))
plt.title("Top 30 Feature Importances")
plt.bar(range(top_n), importances[indices], align='center')
plt.xticks(range(top_n), [tfidf.get_feature_names_out()[i] for i in indices], rotation=90)
plt.tight_layout()
plt.savefig(f"{output_dir}/feature_importances.png", dpi=300)
plt.close()

# -----------------------------
# Sample inference + average inference time
# -----------------------------
samples = ["‡¶è‡¶ü‡¶æ ‡¶ñ‡ßÅ‡¶¨‡¶á ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞", "‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶¨‡¶æ‡¶ú‡ßá", "‡¶Ü‡¶ú‡¶ï‡ßá‡¶∞ ‡¶ñ‡ßá‡¶≤‡¶æ ‡¶ñ‡ßÅ‡¶¨ ‡¶≠‡¶æ‡¶≤‡ßã ‡¶õ‡¶ø‡¶≤"]
vec_samples = tfidf.transform([clean_bengali_text(t) for t in samples])

start_time = time.time()
preds = rf_model.predict(vec_samples)
end_time = time.time()

inference_time = end_time - start_time
avg_inference_time = inference_time / len(samples)
print("\nüß™ Sample Predictions:")
for text, p in zip(samples, preds):
    print(f"{text} ‚Üí {label_encoder.inverse_transform([p])[0]}")

print(f"‚è± Total inference time: {inference_time:.4f}s")
print(f"‚è± Average inference time per sample: {avg_inference_time:.4f}s")

# Save predictions and inference times
pred_df = pd.DataFrame({'text': samples, 'predicted_label': [label_encoder.inverse_transform([p])[0] for p in preds]})
pred_df.to_csv(f"{output_dir}/sample_predictions.csv", index=False)

with open(f"{output_dir}/inference_times.txt", "w") as f:
    f.write(f"Total inference time: {inference_time:.4f}s\n")
    f.write(f"Average inference time per sample: {avg_inference_time:.4f}s\n")

print(f"\nüíæ All outputs saved in {output_dir}")


üì¶ Downloading NLTK stopwords...
üáßüá© Bengali Hate Speech Detection Pipeline (Minimal)
üì• Loading Bengali hate speech dataset...
üìä Dataset shape: (30000, 3)
üìã Columns: ['sentence', 'hate', 'category']
‚úÖ Shape after dropping missing values: (30000, 3)
üî¢ Encoding labels...
üìã Label mapping: {0: 0, 1: 1}
‚úÖ Shape after cleaning: (29902, 5)
‚öñÔ∏è Applying oversampling to balance classes...
‚úÖ Final dataset shape: (39820, 5)
üî§ Extracting TF-IDF features...
Train samples: 31856, Test samples: 7964

üöÄ Training Random Forest...
‚úÖ Test Accuracy: 0.9461

üßæ Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.95      3982
           1       0.94      0.95      0.95      3982

    accuracy                           0.95      7964
   macro avg       0.95      0.95      0.95      7964
weighted avg       0.95      0.95      0.95      7964

üî• ROC AUC Score: 0.9461

üß™ Sample Predictions:
‡¶è‡¶ü‡¶