# Commulyzer
This notebook is used to train and export the toxicity classifiers on the cleaned Reddit comments.

In [None]:
import json, csv
from pathlib import Path
from typing import List, Dict

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import (
    precision_recall_curve, roc_auc_score, average_precision_score,
    f1_score, precision_score, recall_score
)
import joblib

try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
except Exception as e:
    print("iterative-stratification not found. Install with: pip install iterative-stratification")
    raise

try:
    import torch
    from torch.utils.data import Dataset
    from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                              Trainer, TrainingArguments, set_seed)
    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False

# --- Input locations ---------------------------------------------------------
DATA_PROCESSED = Path("data/processed")
MERGED_DIR = DATA_PROCESSED / "merged"
LABELED_DATA_PATH = MERGED_DIR / "merged_comments_labeled_cleaned.csv"

# --- Output locations --------------------------------------------------------
OUTPUTS = Path("outputs")
MODELS = OUTPUTS / "models"
REPORTS = OUTPUTS / "reports"

# Create every working directory up front so later cells can write without checking.
for p in (DATA_PROCESSED, MERGED_DIR, OUTPUTS, MODELS, REPORTS):
    p.mkdir(parents=True, exist_ok=True)

# Fail fast when the labeled input file is absent.
if not LABELED_DATA_PATH.exists():
    raise FileNotFoundError(
        f"Cleaned dataset not found: {LABELED_DATA_PATH}. Run merge/label/clean scripts first."
    )

# Names of the binary target columns, in the order used for every label matrix.
LABELS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
    "racism",
]
print("Using cleaned dataset:", LABELED_DATA_PATH)
print("Using labels:", LABELS)

  from .autonotebook import tqdm as notebook_tqdm
  return f
  return self._get_more_data(ov, maxsize)
  return f
  return self._get_more_data(ov, maxsize)


Using cleaned dataset: data\processed\merged\merged_comments_labeled_cleaned.csv
Using labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'racism']


In [2]:

def normalize_text(s: str) -> str:
    """Collapse all whitespace in ``s`` to single spaces.

    Carriage returns, newlines and any other whitespace runs become one space,
    and leading/trailing whitespace is removed.

    Args:
        s: Raw text. Anything that is not a ``str`` is treated as missing.

    Returns:
        The normalised text, or ``""`` when ``s`` is not a string.
    """
    if isinstance(s, str):
        # split() with no separator breaks on every whitespace character
        # (including \r and \n) and discards empty pieces at both ends.
        return " ".join(s.split())
    return ""

def safe_read_csv(path: str) -> pd.DataFrame:
    """Read a CSV file as an all-string frame, never raising.

    Every column is loaded with ``dtype=str`` and ``keep_default_na=False``.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded frame, or an empty ``DataFrame`` if reading failed for any
        reason (the failure is printed, not raised).
    """
    try:
        frame = pd.read_csv(path, dtype=str, keep_default_na=False, quoting=csv.QUOTE_MINIMAL)
    except Exception as e:
        # Best effort by design: report the problem and let the caller check `.empty`.
        print(f"Failed to read {path}: {e}")
        frame = pd.DataFrame()
    return frame

def save_df(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as CSV (no index column) and print a confirmation.

    Missing parent directories are created first.

    Args:
        df: Frame to persist.
        path: Destination file.
    """
    parent_dir = Path(path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, quoting=csv.QUOTE_MINIMAL)
    row_count = len(df)
    print(f"Saved: {path} ({row_count} rows)")


In [3]:
def prepare_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw labeled-comment frame into a clean, de-duplicated training frame.

    Steps:
      1. Ensure the expected metadata columns exist (missing ones are filled with "").
      2. Ensure every label in ``LABELS`` exists as a 0/1 integer column, falling
         back to ``<label>_bin`` when present and to 0 otherwise.
      3. Drop empty, ``[deleted]`` and ``[removed]`` bodies, add ``body_clean``
         (whitespace-normalised body) and drop rows whose cleaned text is empty
         or starts with ``http``.
      4. De-duplicate: rows that carry a ``comment_id`` are unique per id; rows
         without one are unique per ``body_clean``.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Raw frame, typically read with every column as ``str``.

    Returns:
        The filtered frame with integer label columns and a ``body_clean`` column.
    """
    need_cols = ["source_subreddit","post_id","comment_id","created_utc","score","body","permalink","post_rank","post_title","subreddit"]

    # Work on a copy so the columns added below never leak into the caller's frame.
    df = df.copy()

    for c in need_cols:
        if c not in df.columns:
            df[c] = ""

    for lab in LABELS:
        if lab not in df.columns and f"{lab}_bin" in df.columns:
            df[lab] = df[f"{lab}_bin"]
        if lab not in df.columns:
            df[lab] = "0"
        # Keep the first digit found in the cell ("1", "1.0", ...) and clamp to {0, 1}.
        # expand=False makes extract() return a Series instead of a one-column frame.
        df[lab] = (
            df[lab].astype(str).str.extract(r"(\d)", expand=False).fillna("0").astype(int).clip(0, 1)
        )

    df["body"] = df["body"].astype(str)
    df = df[~df["body"].isin(["", "[deleted]", "[removed]"])].copy()
    df["body_clean"] = df["body"].map(normalize_text)
    df = df[df["body_clean"] != ""]
    df = df[~df["body_clean"].str.startswith("http")].copy()

    # `comment_id` always exists here (it is created above when absent), so testing for
    # the column cannot tell whether real ids are available. Decide per row instead:
    # rows with an id are de-duplicated on it, rows with a blank id on their cleaned text.
    # Otherwise a frame without ids would collapse to one row on the shared "" id.
    has_id = df["comment_id"].fillna("").astype(str).str.strip() != ""
    dup_by_id = has_id & df["comment_id"].duplicated()
    dup_by_text = ~has_id & df["body_clean"].where(~has_id).duplicated()
    return df[~(dup_by_id | dup_by_text)]

def load_clean_dataset(path: Path) -> pd.DataFrame:
    """Read the labeled CSV at ``path`` and run it through ``prepare_frame``.

    Args:
        path: Location of the cleaned, labeled comments file.

    Returns:
        The prepared frame.

    Raises:
        ValueError: If the file yields no rows (including when it could not be read).
    """
    print(f"Loading cleaned dataset from {path} ...")
    raw = safe_read_csv(str(path))
    if raw.empty:
        raise ValueError(f"No rows found in {path}. Ensure the file contains labeled comments.")

    cleaned = prepare_frame(raw)
    print(f"Prepared rows: {len(cleaned)}")
    return cleaned

# Load, filter and de-duplicate the labeled comments once; the split cell starts from `merged`.
merged = load_clean_dataset(LABELED_DATA_PATH)
print("Merged shape:", merged.shape)
# Last expression of the cell: shown as a preview of the first three rows.
merged.head(3)

Loading cleaned dataset from data\processed\merged\merged_comments_labeled_cleaned.csv ...
Prepared rows: 677139
Merged shape: (677139, 38)
Prepared rows: 677139
Merged shape: (677139, 38)


Unnamed: 0,source_subreddit,post_id,post_rank,post_title,subreddit,comment_id,parent_id,author,created_utc,created_iso,...,racism_bin,labels,toxic,severe_toxic,obscene,threat,insult,identity_hate,racism,body_clean
0,StellaSora,1olnny0,187,Revenue,StellaSora,nmobnux,t1_nmjancc,98NINJA98,1762073636.0,2025-11-02T08:53:56,...,0,toxic|obscene,1,0,1,0,0,0,0,Lmao 99% of the population don't play stella s...
1,StellaSora,1olnny0,187,Revenue,StellaSora,nml3c3l,t1_nmj7b1l,avelineaurora,1762026690.0,2025-11-01T19:51:30,...,0,toxic|obscene,1,0,1,0,0,0,0,They haven't even done any fixes yet... It's l...
2,StellaSora,1olnny0,187,Revenue,StellaSora,nmk65tt,t1_nmjgnvk,TankedCat,1762016350.0,2025-11-01T16:59:10,...,0,toxic|obscene,1,0,1,0,0,0,0,honestly the ‘war on yuri’ is rooted in the pl...


In [4]:
def thread_stratified_split(df: pd.DataFrame, label_cols: List[str], group_col: str="post_id",
                            val_size: float=0.15, test_size: float=0.15, seed: int=42):
    """Split comments into train/val/test so that no group spans two partitions.

    Groups (by default posts) are stratified on their aggregated labels: a group
    carries a label when any of its rows does. The first fold of a
    ``max(3, int(1 / (val_size + test_size)))``-fold multilabel-stratified split
    is held out, and that hold-out is then halved into validation and test by a
    2-fold split. ``val_size`` and ``test_size`` therefore only determine the
    number of outer folds; the hold-out is ``1 / n_splits`` of the groups.

    Args:
        df: Frame containing ``group_col`` and every column in ``label_cols``.
        label_cols: Binary label columns to stratify on.
        group_col: Column identifying the group a row belongs to.
        val_size: Target validation fraction (used only to derive the fold count).
        test_size: Target test fraction (used only to derive the fold count).
        seed: Random state of the outer split; the inner split uses ``seed + 1``.

    Returns:
        ``(train_df, val_df, test_df)`` as independent copies of the matching rows.
    """
    # One row per group with the element-wise maximum of each label.
    per_group = df.groupby(group_col)[label_cols].max().reset_index()
    positions = np.arange(len(per_group)).reshape(-1, 1)
    group_labels = per_group[label_cols].values

    # Train vs. hold-out
    fold_count = max(3, int(1/(test_size+val_size)))
    outer = MultilabelStratifiedKFold(n_splits=fold_count, shuffle=True, random_state=seed)
    train_idx, holdout_idx = next(outer.split(positions, group_labels))

    # Hold-out -> validation / test halves (indices are relative to the hold-out).
    inner = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=seed+1)
    val_rel, test_rel = next(inner.split(positions[holdout_idx], group_labels[holdout_idx]))

    # Map group positions back to group ids, then to the original rows.
    group_ids = per_group[group_col].values
    partitions = []
    for idx in (train_idx, holdout_idx[val_rel], holdout_idx[test_rel]):
        wanted = set(group_ids[idx])
        partitions.append(df[df[group_col].isin(wanted)].copy())
    train_df, val_df, test_df = partitions

    print(f"Split sizes: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")
    return train_df, val_df, test_df

# Work on a copy so `merged` is left as loaded.
df = merged.copy()
# Make sure every label column is integer-typed before stratifying on it.
for lab in LABELS: df[lab] = df[lab].astype(int)

# Split by `post_id`, so all comments of one post land in the same partition.
train_df, val_df, test_df = thread_stratified_split(df, LABELS, group_col="post_id")
# Persist the partitions; the baseline training cell reads these CSV files back from disk.
save_df(train_df, DATA_PROCESSED / "train.csv")
save_df(val_df,   DATA_PROCESSED / "val.csv")
save_df(test_df,  DATA_PROCESSED / "test.csv")

Split sizes: train=455385, val=110036, test=111718
Saved: data\processed\train.csv (455385 rows)
Saved: data\processed\train.csv (455385 rows)
Saved: data\processed\val.csv (110036 rows)
Saved: data\processed\val.csv (110036 rows)
Saved: data\processed\test.csv (111718 rows)
Saved: data\processed\test.csv (111718 rows)


In [5]:

def calibrate_thresholds(y_true: np.ndarray, y_prob: np.ndarray, label_names: List[str]) -> Dict[str, float]:
    """Pick, per label, the decision threshold that maximises F1.

    Candidates are the 19 evenly spaced values from 0.05 to 0.95. The lowest
    candidate reaching the best F1 wins. When no candidate achieves a positive
    F1 (for example, the label has no positive rows in ``y_true``) the neutral
    default 0.5 is kept rather than the lowest candidate, which would flag
    almost every row.

    Args:
        y_true: Binary ground truth, one column per entry of ``label_names``.
        y_prob: Predicted probabilities with the same layout as ``y_true``.
        label_names: Label name for each column.

    Returns:
        Mapping of label name to the chosen threshold.
    """
    candidates = np.linspace(0.05, 0.95, 19)
    thresholds = {}
    for i, lab in enumerate(label_names):
        truth, scores = y_true[:, i], y_prob[:, i]
        # Start at F1 = 0 so that only a candidate with strictly positive F1 can
        # displace the 0.5 default.
        best_t, best_f1 = 0.5, 0.0
        for t in candidates:
            f1 = f1_score(truth, (scores >= t).astype(int), zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        thresholds[lab] = float(best_t)
    return thresholds

def apply_thresholds(y_prob: np.ndarray, thresholds: Dict[str, float], label_names: List[str]) -> np.ndarray:
    """Convert probabilities to 0/1 decisions using one threshold per label.

    Args:
        y_prob: Probabilities, one column per entry of ``label_names``.
        thresholds: Label name -> threshold; labels missing from the mapping use 0.5.
        label_names: Label name for each column, in column order.

    Returns:
        Integer array shaped like ``y_prob`` holding 1 where the probability is
        at or above the label's threshold. Columns beyond ``len(label_names)``
        stay 0.
    """
    decisions = np.zeros_like(y_prob, dtype=int)
    for col, name in enumerate(label_names):
        cutoff = thresholds.get(name, 0.5)
        # The boolean comparison is cast to 0/1 on assignment into the int array.
        decisions[:, col] = y_prob[:, col] >= cutoff
    return decisions

def evaluate_all(y_true, y_prob, label_names, thresholds=None, title=""):
    """Print and return a per-label and aggregate metric report.

    For each label the report lists average precision and ROC-AUC (computed on
    the probabilities) plus precision, recall and F1 (computed on thresholded
    predictions), followed by the threshold used. AP is ``nan`` for a label
    with no positives; ROC-AUC is ``nan`` when only one class is present.

    Args:
        y_true: Binary ground truth, one column per entry of ``label_names``.
        y_prob: Predicted probabilities with the same layout.
        label_names: Label name for each column.
        thresholds: Optional label -> threshold mapping. Labels missing from it
            (or all labels when it is ``None``) use 0.5.
        title: Heading placed above the metrics.

    Returns:
        The report text that was printed.
    """
    # Resolve one threshold per label up front so the predictions and the value
    # printed in the report always agree. Labels absent from the mapping default
    # to 0.5, matching apply_thresholds, rather than raising KeyError.
    provided = thresholds or {}
    resolved = {lab: provided.get(lab, 0.5) for lab in label_names}
    y_pred = apply_thresholds(y_prob, resolved, label_names)

    lines = []
    for i, lab in enumerate(label_names):
        truth, scores, decided = y_true[:, i], y_prob[:, i], y_pred[:, i]
        ap  = average_precision_score(truth, scores) if truth.sum() > 0 else float("nan")
        roc = roc_auc_score(truth, scores) if len(np.unique(truth)) > 1 else float("nan")
        p = precision_score(truth, decided, zero_division=0)
        r = recall_score(truth, decided, zero_division=0)
        f = f1_score(truth, decided, zero_division=0)
        lines.append(f"{lab:15s}  AP={ap:.3f}  ROC-AUC={roc:.3f}  P={p:.3f} R={r:.3f} F1={f:.3f} thr={resolved[lab]:.2f}")

    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    summary = f"\n{title}\nMicro-F1={micro_f1:.3f}  Macro-F1={macro_f1:.3f}\n" + "\n".join(lines)
    print(summary)
    return summary


In [7]:
# Baseline model: TF-IDF features + one-vs-rest logistic regression.
# The splits are re-read from disk with every column as a string and without
# pandas' default NaN conversion.
read_kwargs = dict(dtype=str, keep_default_na=False)

train_df = pd.read_csv(DATA_PROCESSED / "train.csv", **read_kwargs)
val_df   = pd.read_csv(DATA_PROCESSED / "val.csv", **read_kwargs)
test_df  = pd.read_csv(DATA_PROCESSED / "test.csv", **read_kwargs)

# Remove rows whose cleaned text is empty or missing. `frame` is the same object as the
# matching *_df variable, so the in-place drop also changes train_df / val_df / test_df.
for split_name, frame in [("train", train_df), ("val", val_df), ("test", test_df)]:
    missing = frame["body_clean"].eq("").sum() + frame["body_clean"].isna().sum()
    if missing:
        print(f"[{split_name}] dropping {missing} rows with empty body_clean")
        frame.drop(frame[frame["body_clean"].isna() | frame["body_clean"].eq("")].index, inplace=True)
    frame["body_clean"] = frame["body_clean"].astype(str)

# Label matrices: one integer column per entry of LABELS (the CSV values are strings).
Y_train = train_df[LABELS].values.astype(int)
Y_val   = val_df[LABELS].values.astype(int)
Y_test  = test_df[LABELS].values.astype(int)

# Fit the vocabulary on the training split only; val/test are merely transformed.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, sublinear_tf=True)
X_train = vectorizer.fit_transform(train_df["body_clean"])
X_val   = vectorizer.transform(val_df["body_clean"])
X_test  = vectorizer.transform(test_df["body_clean"])

# One independent binary classifier per label.
clf = OneVsRestClassifier(LogisticRegression(max_iter=4000, class_weight="balanced"))
clf.fit(X_train, Y_train)

# Thresholds are chosen on the validation split; the test split is used only for the report.
val_prob = clf.predict_proba(X_val)
thresholds = calibrate_thresholds(Y_val, val_prob, LABELS)
test_prob = clf.predict_proba(X_test)

report = evaluate_all(Y_test, test_prob, LABELS, thresholds, title="[Baseline] Test metrics (calibrated)")
Path("outputs/reports").mkdir(parents=True, exist_ok=True)
Path("outputs/reports/baseline_report.txt").write_text(report, encoding="utf-8")

# Save artifacts: vectorizer, classifier, label order and calibrated thresholds,
# i.e. the four files that load_baseline() reads back.
model_dir = Path("outputs/models/baseline"); model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(vectorizer, model_dir / "vectorizer.joblib")
joblib.dump(clf, model_dir / "ovr_lr.joblib")
(model_dir / "labels.txt").write_text("\n".join(LABELS), encoding="utf-8")
(model_dir / "thresholds.json").write_text(json.dumps(thresholds, indent=2), encoding="utf-8")
print("Baseline artifacts saved to", model_dir)


[Baseline] Test metrics (calibrated)
Micro-F1=0.962  Macro-F1=0.841
toxic            AP=0.982  ROC-AUC=0.997  P=0.961 R=0.964 F1=0.962 thr=0.60
severe_toxic     AP=0.629  ROC-AUC=0.996  P=0.615 R=0.571 F1=0.593 thr=0.65
obscene          AP=0.985  ROC-AUC=0.998  P=0.962 R=0.987 F1=0.975 thr=0.55
threat           AP=0.702  ROC-AUC=0.997  P=0.690 R=0.667 F1=0.678 thr=0.85
insult           AP=0.977  ROC-AUC=0.999  P=0.934 R=0.968 F1=0.951 thr=0.70
identity_hate    AP=0.855  ROC-AUC=0.984  P=0.783 R=0.856 F1=0.818 thr=0.20
racism           AP=0.889  ROC-AUC=0.986  P=0.912 R=0.908 F1=0.910 thr=0.40
Baseline artifacts saved to outputs\models\baseline
Baseline artifacts saved to outputs\models\baseline


In [8]:

# Optional transformer fine-tuning; runs only when torch and transformers were importable.
print("HF available:", HF_AVAILABLE)
if HF_AVAILABLE:
    set_seed(42)
    # Fine-tuning hyper-parameters.
    model_name = "distilbert-base-uncased"; max_len = 256; batch_size = 16; epochs = 3; lr = 2e-5
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    class ToxDataset(Dataset):
        """Torch dataset pairing each cleaned comment with its multi-hot label vector."""

        def __init__(self, df):
            self.texts = df["body_clean"].tolist()
            self.labels = df[LABELS].values.astype(float)

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            # Tokenise one comment on access, padded/truncated to `max_len` tokens.
            enc = tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
            # Drop the leading dimension that return_tensors="pt" adds for a single text.
            item = {k: v.squeeze(0) for k, v in enc.items()}
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

    train_ds, val_ds, test_ds = ToxDataset(train_df), ToxDataset(val_df), ToxDataset(test_df)

    # Per-label positive counts. The baseline cell re-reads the splits with dtype=str, and
    # summing string columns concatenates them instead of counting, so cast to int first.
    pos = train_df[LABELS].astype(int).sum(axis=0).values.astype(float)
    total = len(train_df)
    # Weight positives by the negative/positive ratio, bounded to [1, 20].
    pos_weight = (total - pos) / np.clip(pos, 1.0, None)
    pos_weight = np.clip(pos_weight, 1.0, 20.0)
    pos_weight_t = torch.tensor(pos_weight, dtype=torch.float)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(LABELS), problem_type="multi_label_classification")

    def custom_loss(outputs, labels):
        """Binary cross-entropy on the logits, with the per-label positive weights."""
        logits = outputs.logits
        return torch.nn.functional.binary_cross_entropy_with_logits(logits, labels, pos_weight=pos_weight_t.to(logits.device))

    def compute_metrics(eval_pred):
        """Per-label average precision plus micro/macro F1 at a fixed 0.5 threshold."""
        logits, labels = eval_pred
        probs = 1/(1+np.exp(-logits))
        metrics = {}
        for i, lab in enumerate(LABELS):
            # Average precision is skipped for labels with no positives in this split.
            if labels[:, i].sum() > 0:
                ap = average_precision_score(labels[:, i], probs[:, i])
                metrics[f"ap_{lab}"] = ap
        preds = (probs >= 0.5).astype(int)
        metrics["micro_f1@0.5"] = f1_score(labels, preds, average="micro", zero_division=0)
        metrics["macro_f1@0.5"] = f1_score(labels, preds, average="macro", zero_division=0)
        return metrics

    # NOTE(review): `evaluation_strategy` (here) and `tokenizer=` (Trainer call below) have been
    # renamed in newer transformers releases; confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="outputs/models/transformer/checkpoints",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="micro_f1@0.5",
        logging_steps=50,
        report_to="none"
    )

    class BCELossTrainer(Trainer):
        """Trainer that swaps the model's built-in loss for the weighted BCE above."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # **kwargs absorbs extra keyword arguments (such as `num_items_in_batch`) that
            # newer Trainer versions pass to compute_loss; they are not needed here.
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            loss = custom_loss(outputs, labels)
            return (loss, outputs) if return_outputs else loss

    trainer = BCELossTrainer(
        model=model, args=training_args,
        train_dataset=train_ds, eval_dataset=val_ds,
        tokenizer=tokenizer, compute_metrics=compute_metrics
    )
    trainer.train()

    # Calibrate per-label thresholds on the validation split.
    val_logits = trainer.predict(val_ds).predictions
    val_probs = 1/(1+np.exp(-val_logits))
    thr = calibrate_thresholds(val_df[LABELS].values.astype(int), val_probs, LABELS)

    # Report on the test split using the calibrated thresholds.
    test_logits = trainer.predict(test_ds).predictions
    test_probs = 1/(1+np.exp(-test_logits))
    report_t = evaluate_all(test_df[LABELS].values.astype(int), test_probs, LABELS, thr, title="[Transformer] Test metrics (calibrated)")
    Path("outputs/reports/transformer_report.txt").write_text(report_t, encoding="utf-8")

    # Save model, tokenizer, label order and thresholds: the files load_transformer() reads.
    tdir = Path("outputs/models/transformer/final"); tdir.mkdir(parents=True, exist_ok=True)
    trainer.model.save_pretrained(str(tdir)); tokenizer.save_pretrained(str(tdir))
    (tdir / "labels.txt").write_text("\n".join(LABELS), encoding="utf-8")
    (tdir / "thresholds.json").write_text(json.dumps(thr, indent=2), encoding="utf-8")
    print("Transformer artifacts saved to", tdir)
else:
    print("Skipping transformer section (transformers/torch unavailable).")


HF available: False
Skipping transformer section (transformers/torch unavailable).


In [9]:

from typing import Tuple

def load_baseline(model_dir=Path("outputs/models/baseline")) -> Tuple:
    """Load the saved baseline artifacts from ``model_dir``.

    Args:
        model_dir: Directory holding ``vectorizer.joblib``, ``ovr_lr.joblib``,
            ``labels.txt`` and ``thresholds.json``.

    Returns:
        ``(vectorizer, classifier, labels, thresholds)`` where ``labels`` is the
        list of lines in ``labels.txt`` and ``thresholds`` the parsed JSON object.
    """
    # NOTE: joblib.load unpickles its input; only point this at trusted artifact directories.
    vectorizer = joblib.load(model_dir / "vectorizer.joblib")
    classifier = joblib.load(model_dir / "ovr_lr.joblib")

    labels_text = (model_dir / "labels.txt").read_text(encoding="utf-8")
    thresholds_text = (model_dir / "thresholds.json").read_text(encoding="utf-8")
    return vectorizer, classifier, labels_text.splitlines(), json.loads(thresholds_text)

def infer_baseline(texts: List[str], model_dir=Path("outputs/models/baseline")):
    """Score ``texts`` with the saved baseline model.

    The artifacts are loaded from ``model_dir`` on every call. Each text is
    whitespace-normalised before being vectorised.

    Args:
        texts: Raw comment strings.
        model_dir: Directory containing the baseline artifacts.

    Returns:
        ``(labels, prob, preds)``: the label names, the probability matrix from
        the classifier, and the 0/1 decisions after applying the saved thresholds.
    """
    vec, clf, labels, thresholds = load_baseline(model_dir)
    cleaned = list(map(normalize_text, texts))
    prob = clf.predict_proba(vec.transform(cleaned))
    return labels, prob, apply_thresholds(prob, thresholds, labels)

def load_transformer(model_dir=Path("outputs/models/transformer/final")):
    """Load the saved transformer artifacts from ``model_dir``.

    Args:
        model_dir: Directory holding the saved model and tokenizer plus
            ``labels.txt`` and ``thresholds.json``.

    Returns:
        ``(tokenizer, model, labels, thresholds)`` where ``labels`` is the list
        of lines in ``labels.txt`` and ``thresholds`` the parsed JSON object.
    """
    location = str(model_dir)
    tok = AutoTokenizer.from_pretrained(location)
    mdl = AutoModelForSequenceClassification.from_pretrained(location)

    labels_text = (model_dir / "labels.txt").read_text(encoding="utf-8")
    thresholds_text = (model_dir / "thresholds.json").read_text(encoding="utf-8")
    return tok, mdl, labels_text.splitlines(), json.loads(thresholds_text)

def infer_transformer(texts: List[str], model_dir=Path("outputs/models/transformer/final"), max_len=256):
    """Score ``texts`` with the saved transformer model.

    The artifacts are loaded from ``model_dir`` on every call. The texts are
    tokenised as one padded batch, truncated to ``max_len`` tokens.

    Args:
        texts: Raw comment strings.
        model_dir: Directory containing the transformer artifacts.
        max_len: Maximum token length per text.

    Returns:
        ``(labels, probs, preds)``: the label names, the sigmoid probabilities as
        a NumPy array, and the 0/1 decisions after applying the saved thresholds.
    """
    tok, mdl, labels, thresholds = load_transformer(model_dir)
    batch = tok(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")

    mdl.eval()
    with torch.no_grad():
        # The first element of the model output holds the logits.
        outputs = mdl(**dict(batch.items()))
        probs = torch.sigmoid(outputs[0]).cpu().numpy()

    return labels, probs, apply_thresholds(probs, thresholds, labels)


In [1]:
# Smoke test of the saved baseline model on a few hand-written comments.
# Depends on `infer_baseline` from the inference-helpers cell; the recorded NameError
# comes from running this cell in a kernel where that cell had not been executed.
sample_texts = [
    "I'm so happy with this patch and the dev team did an amazing job!",
    "You're such an idiot, go crawl back to your cave.",
    "I'm a hater.",
    "Bro I hate this game.",
    "Bruh."
]

labels, probs, preds = infer_baseline(sample_texts)

# Print, per text, the labels whose thresholded prediction is 1 and every label's probability.
for text, prob_row, pred_row in zip(sample_texts, probs, preds):
    active = [lab for lab, is_on in zip(labels, pred_row) if is_on]
    print("TEXT:", text)
    print("Active labels:", active or ["none"])
    print("Probabilities:", {lab: round(float(p), 3) for lab, p in zip(labels, prob_row)})
    print("-" * 60)

NameError: name 'infer_baseline' is not defined