In [1]:
# Final corrected pipeline: Advanced preprocessing + Random Forest + conference-ready metrics
# Saves outputs to /kaggle/working/random_forest_outputs

import time
import json
import os
import re
import string
import joblib
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score, log_loss,
    matthews_corrcoef, cohen_kappa_score, balanced_accuracy_score
)

# -------------------------
# Config
# -------------------------
INPUT_CSV = "/kaggle/input/cyberbullying-classification/cyberbullying_tweets.csv"
OUTPUT_DIR = "/kaggle/working/random_forest_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
sns.set_palette("husl")
plt.style.use("seaborn-v0_8")

# -------------------------
# NLTK setup
# -------------------------
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

# -------------------------
# Load dataset (assumes columns: tweet_text, cyberbullying_type)
# -------------------------
print("üì• Loading dataset...")
df = pd.read_csv(INPUT_CSV)
print(f"Loaded dataset shape: {df.shape}")
print("Columns:", list(df.columns))

# Validate expected columns
TEXT_COL = "tweet_text"
TARGET_COL = "cyberbullying_type"
if TEXT_COL not in df.columns or TARGET_COL not in df.columns:
    raise ValueError(f"Expected columns '{TEXT_COL}' and '{TARGET_COL}' in CSV. Found: {list(df.columns)}")

# -------------------------
# Advanced preprocessing
# -------------------------
def advanced_preprocess(text):
    if pd.isna(text):
        return ""
    s = str(text).lower()
    # remove urls, mentions, hashtags
    s = re.sub(r"http\S+|www\.\S+|@\w+|#\w+", " ", s)
    # remove punctuation
    s = s.translate(str.maketrans("", "", string.punctuation))
    # remove digits
    s = re.sub(r"\d+", " ", s)
    # normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    # tokenize
    tokens = s.split()
    # remove stopwords and single-char tokens, apply stemming
    tokens = [ps.stem(tok) for tok in tokens if tok not in stop_words and len(tok) > 1]
    return " ".join(tokens)

print("üßπ Preprocessing texts (advanced)...")
df["clean_text"] = df[TEXT_COL].apply(advanced_preprocess)
# drop empty rows after cleaning
df = df[df["clean_text"].str.len() > 0].reset_index(drop=True)
print(f"After cleaning: {df.shape}")

# -------------------------
# Encode labels
# -------------------------
le = LabelEncoder()
df["label_enc"] = le.fit_transform(df[TARGET_COL])
joblib.dump(le, f"{OUTPUT_DIR}/label_encoder.pkl")
print("üî¢ Class mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# Save dataset info (pre/post)
dataset_info = {
    "original_rows": int(pd.read_csv(INPUT_CSV).shape[0]),
    "rows_after_cleaning": int(df.shape[0]),
    "text_column": TEXT_COL,
    "target_column": TARGET_COL,
    "classes": list(le.classes_),
    "class_counts": {str(k): int(v) for k, v in df["label_enc"].value_counts().to_dict().items()}
}
with open(f"{OUTPUT_DIR}/dataset_info.json", "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, indent=4, ensure_ascii=False)

# -------------------------
# TF-IDF (fit on cleaned text)
# -------------------------
print("üî§ Fitting TF-IDF...")
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=2, max_df=0.95)
X = tfidf.fit_transform(df["clean_text"])
y = df["label_enc"].values
joblib.dump(tfidf, f"{OUTPUT_DIR}/tfidf_vectorizer.pkl")

# -------------------------
# Train-test split
# -------------------------
print("üîÄ Train-test split...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# -------------------------
# Train Random Forest
# -------------------------
print("üå≤ Training Random Forest (no GridSearch)...")
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, class_weight="balanced")
t0 = time.time()
rf.fit(X_train, y_train)
t1 = time.time()
train_time = t1 - t0
joblib.dump(rf, f"{OUTPUT_DIR}/random_forest_model.pkl")
print(f"‚úÖ Training completed in {train_time:.2f} s")

# -------------------------
# Basic evaluation
# -------------------------
print("üìä Evaluating...")
y_pred = rf.predict(X_test)
try:
    y_proba = rf.predict_proba(X_test)
except Exception:
    y_proba = None

acc = accuracy_score(y_test, y_pred)
prec_macro = precision_score(y_test, y_pred, average="macro", zero_division=0)
rec_macro = recall_score(y_test, y_pred, average="macro", zero_division=0)
f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
prec_weight = precision_score(y_test, y_pred, average="weighted", zero_division=0)
rec_weight = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1_weight = f1_score(y_test, y_pred, average="weighted", zero_division=0)
mcc = matthews_corrcoef(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)
try:
    ll = float(log_loss(y_test, y_proba)) if y_proba is not None else None
except Exception:
    ll = None

# classification report (txt + csv)
clf_rep_txt = classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0)
with open(f"{OUTPUT_DIR}/classification_report_rf.txt", "w", encoding="utf-8") as f:
    f.write(clf_rep_txt)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose().to_csv(f"{OUTPUT_DIR}/classification_report_rf.csv")

# confusion matrix (csv + png)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm, index=le.classes_, columns=le.classes_).to_csv(f"{OUTPUT_DIR}/confusion_matrix_rf.csv")
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_, cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Random Forest")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/confusion_matrix_rf.png", dpi=300)
plt.close()

# -------------------------
# ROC & AUC (per-class) and PR & AUPRC
# -------------------------
n_classes = len(le.classes_)
y_test_bin = label_binarize(y_test, classes=range(n_classes))
roc_auc_per_class = {}
pr_auc_per_class = {}
roc_auc_macro = None
avg_precision_macro = None

if y_proba is not None:
    plt.figure(figsize=(7,6))
    aucs = []
    for i in range(n_classes):
        try:
            fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
            auc_i = roc_auc_score(y_test_bin[:, i], y_proba[:, i])
            aucs.append(auc_i)
            plt.plot(fpr, tpr, label=f"{le.classes_[i]} (AUC={auc_i:.3f})")
            roc_auc_per_class[le.classes_[i]] = float(auc_i)
        except Exception:
            continue
    if len(aucs) > 0:
        roc_auc_macro = float(np.mean(aucs))
    plt.plot([0,1],[0,1],"k--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves - Random Forest (per class)")
    plt.legend(loc="lower right", fontsize="small")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/roc_curve_rf.png", dpi=300)
    plt.close()

    # Precision-Recall
    plt.figure(figsize=(7,6))
    aps = []
    for i in range(n_classes):
        try:
            precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_proba[:, i])
            ap_i = average_precision_score(y_test_bin[:, i], y_proba[:, i])
            aps.append(ap_i)
            plt.plot(recall, precision, label=f"{le.classes_[i]} (AP={ap_i:.3f})")
            pr_auc_per_class[le.classes_[i]] = float(ap_i)
        except Exception:
            continue
    if len(aps) > 0:
        avg_precision_macro = float(np.mean(aps))
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curves - Random Forest (per class)")
    plt.legend(loc="lower left", fontsize="small")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/pr_curve_rf.png", dpi=300)
    plt.close()

# -------------------------
# Feature importances (top-k)
# -------------------------
try:
    feat_imp = rf.feature_importances_
    feat_names = tfidf.get_feature_names_out()
    fi_df = pd.DataFrame({"feature": feat_names, "importance": feat_imp})
    fi_df = fi_df.sort_values("importance", ascending=False)
    fi_df.to_csv(f"{OUTPUT_DIR}/feature_importances_rf.csv", index=False)
    top_n = min(30, len(fi_df))
    plt.figure(figsize=(8, max(4, top_n//2)))
    plt.barh(fi_df["feature"].values[:top_n][::-1], fi_df["importance"].values[:top_n][::-1])
    plt.title("Top feature importances - Random Forest")
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/feature_importances_rf.png", dpi=300)
    plt.close()
except Exception as e:
    print("Feature importances skipped or error:", e)

# -------------------------
# Learning curve (guarded)
# -------------------------
try:
    print("‚è≥ Computing learning curve (this may take time)...")
    train_sizes, train_scores, test_scores = learning_curve(rf, X, y, cv=5, n_jobs=-1,
                                                            train_sizes=np.linspace(0.1,1.0,5), scoring="accuracy")
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    plt.figure(figsize=(8,6))
    plt.plot(train_sizes, train_mean, 'o-', label='Train')
    plt.plot(train_sizes, test_mean, 'o-', label='CV')
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")
    plt.title("Learning Curve - Random Forest")
    plt.legend(loc='best')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/learning_curve_rf.png", dpi=300)
    plt.close()
except Exception as e:
    print("Learning curve computation skipped or failed:", e)

# -------------------------
# Cross-validation summary
# -------------------------
try:
    cv_scores = cross_val_score(rf, X, y, cv=5, scoring="accuracy", n_jobs=-1)
    cv_summary = {"cv_scores": [float(s) for s in cv_scores.tolist()], "cv_mean": float(np.mean(cv_scores)), "cv_std": float(np.std(cv_scores))}
    with open(f"{OUTPUT_DIR}/cv_summary_rf.json", "w", encoding="utf-8") as f:
        json.dump(cv_summary, f, indent=4, ensure_ascii=False)
except Exception as e:
    print("CV summary skipped or failed:", e)

# -------------------------
# Inference timing: 3 samples and full test set
# -------------------------
sample_texts = [
    "I love this community, it's very helpful",
    "You're an idiot, go away",
    "Women don't belong in tech"
]
sample_clean = [advanced_preprocess(t) for t in sample_texts]
sample_vec = tfidf.transform(sample_clean)

# predict batch timing
t0 = time.perf_counter()
sample_preds = rf.predict(sample_vec)
t1 = time.perf_counter()
total_batch_pred = t1 - t0
avg_batch_pred = total_batch_pred / len(sample_texts)

# predict loop timing
t0 = time.perf_counter()
for i in range(len(sample_texts)):
    _ = rf.predict(sample_vec[i])
t1 = time.perf_counter()
avg_loop_pred = (t1 - t0) / len(sample_texts)

# full test timing
t0 = time.perf_counter()
_ = rf.predict(X_test)
t1 = time.perf_counter()
total_test_pred = t1 - t0
avg_test_pred = total_test_pred / X_test.shape[0]

# predict_proba timings if available
total_proba_batch = None
avg_proba_batch = None
total_proba_test = None
avg_proba_test = None
if hasattr(rf, "predict_proba"):
    t0 = time.perf_counter()
    _ = rf.predict_proba(sample_vec)
    t1 = time.perf_counter()
    total_proba_batch = t1 - t0
    avg_proba_batch = total_proba_batch / len(sample_texts)
    t0 = time.perf_counter()
    _ = rf.predict_proba(X_test)
    t1 = time.perf_counter()
    total_proba_test = t1 - t0
    avg_proba_test = total_proba_test / X_test.shape[0]

inference_info = {
    "sample_texts": sample_texts,
    "num_samples": len(sample_texts),
    "total_batch_predict_sec": float(total_batch_pred),
    "avg_batch_predict_sec_per_sample": float(avg_batch_pred),
    "avg_loop_predict_sec_per_sample": float(avg_loop_pred),
    "total_test_predict_sec": float(total_test_pred),
    "avg_test_predict_sec_per_sample": float(avg_test_pred),
    "total_proba_batch_sec": float(total_proba_batch) if total_proba_batch is not None else None,
    "avg_proba_batch_sec_per_sample": float(avg_proba_batch) if avg_proba_batch is not None else None,
    "total_proba_test_sec": float(total_proba_test) if total_proba_test is not None else None,
    "avg_proba_test_sec_per_sample": float(avg_proba_test) if avg_proba_test is not None else None
}
with open(f"{OUTPUT_DIR}/inference_time_rf.json", "w", encoding="utf-8") as f:
    json.dump(inference_info, f, indent=4, ensure_ascii=False)
with open(f"{OUTPUT_DIR}/inference_time_rf.txt", "w", encoding="utf-8") as f:
    for k, v in inference_info.items():
        f.write(f"{k}: {v}\n")

# -------------------------
# Model size
# -------------------------
model_path = Path(f"{OUTPUT_DIR}/random_forest_model.pkl")
model_size_mb = model_path.stat().st_size / (1024*1024)
with open(f"{OUTPUT_DIR}/model_size_rf.txt", "w", encoding="utf-8") as f:
    f.write(f"model_size_mb: {model_size_mb:.4f}\n")

# -------------------------
# Metrics summary JSON
# -------------------------
metrics_summary = {
    "accuracy": float(acc),
    "precision_macro": float(prec_macro),
    "recall_macro": float(rec_macro),
    "f1_macro": float(f1_macro),
    "precision_weighted": float(prec_weight),
    "recall_weighted": float(rec_weight),
    "f1_weighted": float(f1_weight),
    "mcc": float(mcc),
    "cohen_kappa": float(kappa),
    "balanced_accuracy": float(bal_acc),
    "log_loss": float(ll) if ll is not None else None,
    "roc_auc_macro": float(roc_auc_macro) if roc_auc_macro is not None else None,
    "average_precision_macro": float(avg_precision_macro) if avg_precision_macro is not None else None,
    "train_time_sec": float(train_time),
    "model_size_mb": float(model_size_mb),
    "n_classes": int(n_classes),
    "n_features": int(X.shape[1]),
    "train_samples": int(X_train.shape[0]),
    "test_samples": int(X_test.shape[0])
}
with open(f"{OUTPUT_DIR}/metrics_summary_rf.json", "w", encoding="utf-8") as f:
    json.dump(metrics_summary, f, indent=4, ensure_ascii=False)

# -------------------------
# Sample predictions saved
# -------------------------
sample_pred_labels = le.inverse_transform(sample_preds)
sample_df = pd.DataFrame({"text": sample_texts, "cleaned": sample_clean, "predicted": sample_pred_labels})
sample_df.to_csv(f"{OUTPUT_DIR}/sample_predictions_rf.csv", index=False)

# -------------------------
# Final summary print
# -------------------------
print("\n==== METRICS SUMMARY ====")
for k, v in metrics_summary.items():
    print(f"{k}: {v}")
print("\nFiles written to:", OUTPUT_DIR)


üì• Loading dataset...
Loaded dataset shape: (47692, 2)
Columns: ['tweet_text', 'cyberbullying_type']
üßπ Preprocessing texts (advanced)...
After cleaning: (47252, 3)
üî¢ Class mapping: {'age': 0, 'ethnicity': 1, 'gender': 2, 'not_cyberbullying': 3, 'other_cyberbullying': 4, 'religion': 5}
üî§ Fitting TF-IDF...
üîÄ Train-test split...
Train size: 37801, Test size: 9451
üå≤ Training Random Forest (no GridSearch)...
‚úÖ Training completed in 57.66 s
üìä Evaluating...
‚è≥ Computing learning curve (this may take time)...

==== METRICS SUMMARY ====
accuracy: 0.8106020526928367
precision_macro: 0.8095936352438643
recall_macro: 0.8076337092831797
f1_macro: 0.8081756353679146
precision_weighted: 0.8128873442490423
recall_weighted: 0.8106020526928367
f1_weighted: 0.8113175172261858
mcc: 0.772968715468762
cohen_kappa: 0.7727324417796563
balanced_accuracy: 0.8076337092831797
log_loss: 0.553700817262767
roc_auc_macro: 0.9566576399025016
average_precision_macro: 0.8233209156377993
train_time