In [1]:
from pathlib import Path
import time
import pandas as pd, numpy as np, json, joblib
from types import SimpleNamespace as NS

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    average_precision_score, roc_auc_score, classification_report,
    precision_recall_curve, f1_score, precision_score, recall_score,
    confusion_matrix
)
from sklearn.svm import LinearSVC

from scipy.sparse import csr_matrix

In [2]:
PROJECT_DIR = Path.cwd().parent
DATA_DIR = PROJECT_DIR / "data"
ARTIFACTS_DIR = PROJECT_DIR / "artifacts"

ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42

In [3]:
# Fits a linear logistic regression with elastic-net (SAGA)
def fit_calibrated_lr(Xtr, ytr, *, cv=3, C=1.0, l1_ratio=0.5, max_iter=1000, tol=1e-3, rs=42):
    """Fit an elastic-net logistic regression (SAGA) wrapped in sigmoid calibration.

    The base estimator is handed unfitted to ``CalibratedClassifierCV`` with
    ``cv`` folds, so both the model and its calibration are learned from the
    data passed in. Prints the wall-clock fit time and returns the fitted
    calibrated classifier.
    """
    lr_params = dict(
        penalty="elasticnet",
        solver="saga",
        C=C,
        l1_ratio=l1_ratio,
        class_weight="balanced",
        max_iter=max_iter,
        tol=tol,
        n_jobs=-1,
        random_state=rs,
        verbose=0,
    )
    calibrated = CalibratedClassifierCV(
        estimator=LogisticRegression(**lr_params), method="sigmoid", cv=cv
    )
    started = time.time()
    calibrated.fit(Xtr, ytr)
    elapsed = time.time() - started
    print(f"[fit] calibrated LR in {elapsed:.1f}s")
    return calibrated

# Threshold that maximizes F1
def tune_f1(y_true, scores):
    """Return ``(threshold, f1)`` for the precision-recall point with the highest F1."""
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # The small epsilon keeps the ratio finite where precision + recall == 0.
    f1_curve = (2 * precision * recall) / (precision + recall + 1e-12)
    # The final PR point has no matching threshold, so it is left out of the search.
    i_best = int(np.nanargmax(f1_curve[:-1]))
    best_thr = float(thresholds[i_best])
    best_f1 = float(f1_curve[i_best])
    return best_thr, best_f1

def tune_precision_floor(y_true, scores, target_p=0.95):
    """Return the threshold with the most recall among points with precision >= target_p.

    If no point on the curve reaches ``target_p``, the threshold of the
    highest-precision point is returned instead.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # Drop the final PR point, which has no associated threshold.
    precision, recall = precision[:-1], recall[:-1]
    eligible = np.flatnonzero(precision >= target_p)
    if eligible.size == 0:
        chosen = int(np.argmax(precision))
    else:
        chosen = eligible[np.argmax(recall[eligible])]
    return float(thresholds[chosen])

# Computes PR-AUC, ROC-AUC, F1, precision, and recall
def eval_binary(y_true, scores, thr):
    """Score a binary classifier at a fixed decision threshold.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    scores : array-like of scores; larger means more likely positive.
    thr : float; rows with ``score >= thr`` are predicted positive.

    Returns
    -------
    dict of floats with keys ``pr_auc``, ``roc_auc``, ``f1``, ``precision``
    and ``recall``. The two AUCs are computed from the raw scores; the other
    three from the thresholded predictions.
    """
    # Accept plain lists as well as arrays: the comparison below needs an ndarray.
    scores = np.asarray(scores)
    yhat = (scores >= thr).astype(int)
    return dict(
        pr_auc=float(average_precision_score(y_true, scores)),
        roc_auc=float(roc_auc_score(y_true, scores)),
        # zero_division=0 matches precision/recall below, so a threshold that
        # predicts no positives yields 0.0 without an undefined-metric warning.
        f1=float(f1_score(y_true, yhat, zero_division=0)),
        precision=float(precision_score(y_true, yhat, zero_division=0)),
        recall=float(recall_score(y_true, yhat, zero_division=0)),
    )

# Precision@k
def p_at_frac(y_true, scores, frac=0.10):
    """Precision among the top ``frac`` share of rows when ranked by score.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    scores : array-like of scores, same length as ``y_true``.
    frac : fraction of rows to keep (default 0.10).

    Returns
    -------
    ``(precision, k)`` where ``k`` is the number of rows actually in the slice:
    at least one, never more than the number of rows. Empty input returns
    ``(nan, 0)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``scores`` differ in length.
    """
    # Convert up front so indexing below is positional even for a pandas
    # Series (whose integer index would otherwise be treated as labels)
    # and so plain lists work too.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)
    n = y_true.shape[0]
    if scores.shape[0] != n:
        raise ValueError(f"y_true has {n} rows but scores has {scores.shape[0]}")
    if n == 0:
        return float("nan"), 0
    # Clamp so the reported k is the size of the slice that is really evaluated.
    k = min(n, max(1, int(frac * n)))
    top = np.argsort(-scores)[:k]
    return float(y_true[top].mean()), k

In [4]:
# Cyberbullying (CB) artifacts; per the original notes, X and cues are dicts
# keyed by train/dev/test holding CSR matrices.
cb = NS()
cb.X = joblib.load(ARTIFACTS_DIR / "cb_word_char_cues.joblib")
cb.cues = joblib.load(ARTIFACTS_DIR / "cb_cues_scaled.joblib")
cb.cue_names = joblib.load(ARTIFACTS_DIR / "cb_cue_columns.joblib")

# Labels come from the cleaned parquet, sliced positionally by the frozen split indices.
with open(ARTIFACTS_DIR / "cb_splits.json") as f:
    cb_splits = json.load(f)
cb_df = pd.read_parquet(DATA_DIR / "cleaned_cyberbullying.parquet")

lab = cb_df["is_bullying"].astype(int)

cb.y = {split: lab.iloc[cb_splits[split]].values for split in ("train", "dev", "test")}
print("CB shapes:", *(cb.X[split].shape for split in ("train", "dev", "test")))

CB shapes: (33313, 57429) (4759, 57429) (9518, 57429)


In [5]:
# Toxicity (TX) artifacts: features, scaled cues, cue column names, label order, label matrix
tx = NS()
tx.X = joblib.load(ARTIFACTS_DIR / "tx_word_char_cues.joblib")
tx.cues = joblib.load(ARTIFACTS_DIR / "tx_cues_scaled.joblib")
tx.cue_names = joblib.load(ARTIFACTS_DIR / "tx_cue_columns.joblib")
tx.labels = joblib.load(ARTIFACTS_DIR / "tx_label_order.joblib")   # ["toxic",...]
tx.Y = joblib.load(ARTIFACTS_DIR / "tx_multilabel.joblib")

# "Any label" binary target: 1 when the row-wise maximum over label columns is positive.
tx.y_any = {}
for s in ["train", "dev", "test"]:
    tx.y_any[s] = (tx.Y[s].max(axis=1) > 0).astype(int)

print("TX shapes:", *(tx.X[part].shape for part in ("train", "dev", "test")))

TX shapes: (111699, 80019) (15957, 80019) (31915, 80019)


In [6]:
# CB binary: fit on train, tune the F1 and P>=0.95 thresholds on dev, evaluate on test
cb.model = fit_calibrated_lr(cb.X["train"], cb.y["train"], cv=3, max_iter=1000, rs=RANDOM_STATE)

# Dev probabilities drive both operating points.
probs_dv = cb.model.predict_proba(cb.X["dev"])[:, 1]
cb.thr_f1, cb.f1_dev = tune_f1(cb.y["dev"], probs_dv)
cb.thr_p95 = tune_precision_floor(cb.y["dev"], probs_dv, target_p=0.95)
print(f"[CB|dev] thr_f1={cb.thr_f1:.3f}  thr_p95={cb.thr_p95:.3f}")

# Test set: score once, then evaluate at each frozen threshold.
probs_te = cb.model.predict_proba(cb.X["test"])[:, 1]
cb.metrics_test_f1 = eval_binary(cb.y["test"], probs_te, cb.thr_f1)
cb.metrics_test_p95 = eval_binary(cb.y["test"], probs_te, cb.thr_p95)
p10, k = p_at_frac(cb.y["test"], probs_te, 0.10)
print(f"[CB|test@F1] {cb.metrics_test_f1}  P@{k}={p10:.3f}")
print(f"[CB|test@P>=0.95] {cb.metrics_test_p95}")

# Persist the model together with both thresholds, then the test metrics.
cb_bundle = {
    "model": cb.model,
    "dev_threshold": cb.thr_f1,
    "dev_threshold_precision_floor": {"target_precision": 0.95, "threshold": cb.thr_p95},
}
joblib.dump(cb_bundle, ARTIFACTS_DIR / "cb_bin_logreg_calibrated.joblib")

cb_test_report = {
    "mode": "f1",
    "metrics": cb.metrics_test_f1,
    "alt_mode": "p95",
    "alt_metrics": cb.metrics_test_p95,
}
with open(ARTIFACTS_DIR / "cb_bin_test_metrics.json", "w") as f:
    json.dump(cb_test_report, f, indent=2)
print("Saved CB artifacts.")

[fit] calibrated LR in 123.3s
[CB|dev] thr_f1=0.327  thr_p95=0.724
[CB|test@F1] {'pr_auc': 0.9827113711005413, 'roc_auc': 0.9143103857583286, 'f1': 0.9256880193293653, 'precision': 0.8691899070385126, 'recall': 0.9900415983864869}  P@951=1.000
[CB|test@P>=0.95] {'pr_auc': 0.9827113711005413, 'roc_auc': 0.9143103857583286, 'f1': 0.8951310861423221, 'precision': 0.9534121669753526, 'recall': 0.8435648556662044}
Saved CB artifacts.


In [7]:
# CB cues: fit a small cue-only logistic model and read off its coefficients
Xp_cb_tr = cb.cues["train"]
Xp_cb_dv = cb.cues["dev"]
Xp_cb_te = cb.cues["test"]

coef_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.5,
    C=1.0,
    class_weight="balanced",
    max_iter=2000,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    tol=1e-4,
)
coef_model.fit(Xp_cb_tr, cb.y["train"])

# Sorted ascending: most negative coefficients first, most positive last.
coef_s = pd.Series(coef_model.coef_.ravel(), index=cb.cue_names).sort_values()
most_negative, most_positive = coef_s.head(8), coef_s.tail(8)
print("\n[CB | Cues] decreasing toxicity:")
print(most_negative)
print("\n[CB | Cues] increasing toxicity:")
print(most_positive)


[CB | Cues] decreasing toxicity:
rate_hashtags      -6.279397
rate_bangs         -1.720294
has_hedge_bigram   -0.620773
had_mention        -0.375999
had_url            -0.261370
rate_polite        -0.069340
rate_2nd            0.000000
rate_hedge          0.154060
dtype: float64

[CB | Cues] increasing toxicity:
rate_qmarks      1.162086
has_profane      1.163737
rate_general     1.235823
rate_negate      2.031033
has_identity     2.310345
avg_tok_len      4.136148
rate_profane     4.948788
n_tokens        32.584218
dtype: float64


#### Cyberbullying (Twitter) — baseline summary

**Goal** I’m testing whether simple, human-interpretable cues (profanity, threats, second-person “you,” identity mentions, negation, politeness, etc.) meaningfully separate harmful from non-harmful tweets, and I’m comparing them to standard text features (TF-IDF). Scores are calibrated on the train split so that thresholds chosen on dev translate cleanly to decisions, PR-AUC, precision@k, and later fusion.

**Model & protocol** I trained a linear logistic regression with an elastic-net penalty on the combined representation (word 1–2g TF-IDF + char 3–5g TF-IDF + MaxAbs-scaled cue features). Probability calibration was fit only on train (no dev leakage). I selected two operating thresholds on dev: (i) F1-optimal for a balanced view, and (ii) a stricter precision-first point (P≥0.95). These thresholds were then frozen and applied to test.

**Dev behavior** The calibrated scores rank examples cleanly (ROC-AUC ≈ 0.912, PR-AUC ≈ 0.983). The F1-optimal threshold on dev is 0.327, which is expected to be below 0.5 given the high positive prevalence; at this cut the precision–recall trade-off is best for F1. On test, the top 10% of examples by score are all true positives (P@10% = 1.00), which is ideal for triage.

**Test results (frozen thresholds)** With the dev F1 threshold (0.327), test performance is PR-AUC 0.983, ROC-AUC 0.914, and F1 0.926 with P 0.869 and R 0.990. In plain terms: the model misses very little toxicity and accepts some false alarms, which is what an F1-oriented operating point does in a positive-heavy corpus; the top-10% triage slice remains perfect (P@10% = 1.00). With the precision-first threshold (P≥0.95; 0.724), test precision rises to 0.953 while recall drops to 0.844 (F1 0.895). PR-AUC/ROC-AUC stay unchanged because they do not depend on the threshold; we’re simply choosing a stricter decision rule for a lower false-positive rate.

**Interpretable cues (cue-only model)** Reading coefficients under MaxAbs scaling (each cue ≈0–1 range) shows how cues shift toxicity odds in this dataset. Among the lexical cues, profanity is the strongest positive signal (rate_profane, has_profane), followed by identity terms, negation, generalizers (“all,” “those people”), and question-mark density; these align with the hostility/targeting constructs. The length/style controls carry the largest positive weights overall (n_tokens ≈ 32.6, avg_tok_len ≈ 4.1), so longer tweets tend to be labeled toxic more often in this corpus; I treat these as stylistic correlates rather than psychological cues. On the protective side, hashtags, exclamation marks, hedge bigrams, @-mentions, and URLs lean negative—consistent with non-toxic announcement/promo styles being common in this dataset—and politeness is mildly negative. Second-person pronouns contribute nothing once the stronger cues are present (the elastic-net penalty shrinks rate_2nd to exactly zero). These are associations specific to this dataset and scaling; they’re not causal claims.

**Takeaway** The calibrated, linear model on combined features delivers strong, reproducible performance with transparent operating points. The cue coefficients line up with the social-psych design and help explain why the model works. I save both thresholds (F1-optimal and P≥0.95) so downstream analysis can pick recall-first or precision-first behavior without retraining.

In [8]:
# TX multilabel baseline (fast SGD + prefit calibration); thresholds & test metrics
def fit_fast_calibrated_binary(Xtr, ytr, *, calib_frac=0.10, rs=42):
    """Fit an SGD logistic model on part of train and sigmoid-calibrate it on the rest.

    ``Xtr``/``ytr`` are split into a fit partition and a calibration partition
    (``calib_frac`` of the rows), so calibration uses training data only.

    Returns
    -------
    ``(calibrated_model, fit_seconds, calibration_seconds)``
    """
    # split TRAIN into fit/calibration (train-only, no dev leakage)
    ytr = np.asarray(ytr)
    counts = np.bincount(ytr, minlength=2)
    # Stratify only when the rarer class is large enough to appear in the
    # calibration slice; otherwise fall back to a plain random split.
    strat = ytr if (counts.min() >= 2 and counts.min() * calib_frac >= 1) else None
    X_fit, X_cal, y_fit, y_cal = train_test_split(
        Xtr, ytr, test_size=calib_frac, stratify=strat, random_state=rs
    )
    base = SGDClassifier(
        loss="log_loss", penalty="elasticnet",
        alpha=1e-4, l1_ratio=0.15, class_weight="balanced",
        max_iter=20, tol=1e-3, early_stopping=True, n_iter_no_change=3,
        validation_fraction=0.1, average=True, random_state=rs
    )
    t0 = time.time()
    base.fit(X_fit, y_fit)
    t_fit = time.time() - t0

    # Prefit calibration. FrozenEstimator lives in sklearn.frozen (scikit-learn
    # >= 1.6) and supersedes cv="prefit". Only a missing module triggers the
    # fallback, so a genuine constructor error is no longer silently swallowed.
    try:
        from sklearn.frozen import FrozenEstimator
    except ImportError:
        cal = CalibratedClassifierCV(base, method="sigmoid", cv="prefit")
    else:
        cal = CalibratedClassifierCV(FrozenEstimator(base), method="sigmoid")

    t1 = time.time()
    cal.fit(X_cal, y_cal)
    t_cal = time.time() - t1
    return cal, t_fit, t_cal

# One calibrated binary classifier per TX label (one-vs-rest).
n_labels = len(tx.labels)
print(f"Training {n_labels} fast calibrated classifiers…")
tx.models = []
# Loop variable is `label`, not `lab`, so the CB label Series named `lab` is not overwritten.
for j, label in enumerate(tx.labels):
    m, t_fit, t_cal = fit_fast_calibrated_binary(tx.X["train"], tx.Y["train"][:, j], rs=RANDOM_STATE)
    tx.models.append(m)
    print(f"  [{j+1}/{n_labels}] {label:<13} fit={t_fit:.1f}s calib={t_cal:.1f}s")

# dev probs → F1-opt thresholds (one per label)
probs_dev = np.column_stack([m.predict_proba(tx.X["dev"])[:, 1] for m in tx.models])
tx.thr_f1 = {
    label: tune_f1(tx.Y["dev"][:, j], probs_dev[:, j])[0]
    for j, label in enumerate(tx.labels)
}

# test probs → thresholded predictions → metrics
probs_te = np.column_stack([m.predict_proba(tx.X["test"])[:, 1] for m in tx.models])
Yhat_te = np.column_stack(
    [(probs_te[:, j] >= tx.thr_f1[label]).astype(int) for j, label in enumerate(tx.labels)]
)

# f1_score is already imported in the notebook's import cell; no re-import needed here.
tx.micro_f1 = float(f1_score(tx.Y["test"], Yhat_te, average="micro", zero_division=0))
tx.macro_f1 = float(f1_score(tx.Y["test"], Yhat_te, average="macro", zero_division=0))
print(f"[TX|test @F1-opt] micro-F1={tx.micro_f1:.3f}  macro-F1={tx.macro_f1:.3f}")

# save models + thresholds + metrics
joblib.dump(
    {"models": tx.models, "label_order": tx.labels, "thresholds_dev": tx.thr_f1,
     "mode": "f1_opt", "estimator": "SGD(log_loss)+sigmoid prefit"},
    ARTIFACTS_DIR / "tx_ovr_sgd_calibrated.joblib"
)
with open(ARTIFACTS_DIR / "tx_multilabel_test_metrics_f1.json", "w") as f:
    json.dump({"micro_f1": tx.micro_f1, "macro_f1": tx.macro_f1}, f, indent=2)
print("Saved TX artifacts.")

Training 6 fast calibrated classifiers…
  [1/6] toxic         fit=13.7s calib=0.5s
  [2/6] severe_toxic  fit=8.2s calib=0.5s
  [3/6] obscene       fit=9.8s calib=0.4s
  [4/6] threat        fit=8.7s calib=0.4s
  [5/6] insult        fit=7.9s calib=0.4s
  [6/6] identity_hate fit=8.3s calib=0.5s
[TX|test @F1-opt] micro-F1=0.172  macro-F1=0.224
Saved TX artifacts.


In [9]:
# TX precision-first mode: per-label precision floors tuned on dev, then frozen for test.
floors = {"toxic":0.80,"severe_toxic":0.80,"obscene":0.80,"threat":0.90,"insult":0.80,"identity_hate":0.90}

# Reuse tune_precision_floor instead of repeating its logic inline: it picks the
# max-recall threshold meeting the floor and falls back to the highest-precision
# point when the floor is unreachable.
tx.thr_floor = {
    label: tune_precision_floor(tx.Y["dev"][:, j], probs_dev[:, j], target_p=floors[label])
    for j, label in enumerate(tx.labels)
}

Yhat_te_floor = np.column_stack(
    [(probs_te[:, j] >= tx.thr_floor[label]).astype(int) for j, label in enumerate(tx.labels)]
)
micro = float(f1_score(tx.Y["test"], Yhat_te_floor, average="micro", zero_division=0))
macro = float(f1_score(tx.Y["test"], Yhat_te_floor, average="macro", zero_division=0))
print(f"[TX|test @per-label floors] micro-F1={micro:.3f}  macro-F1={macro:.3f}")

# Persist the precision-first variant under the file names the write-up refers to.
joblib.dump(
    {"models": tx.models, "label_order": tx.labels, "thresholds_dev": tx.thr_floor,
     "precision_floors": floors, "mode": "precision_floor",
     "estimator": "SGD(log_loss)+sigmoid prefit"},
    ARTIFACTS_DIR / "tx_ovr_sgd_calibrated_per_label_floors.joblib"
)
with open(ARTIFACTS_DIR / "tx_multilabel_test_metrics_per_label_floors.json", "w") as f:
    json.dump({"micro_f1": micro, "macro_f1": macro}, f, indent=2)
print("Saved TX per-label-floor artifacts.")

[TX|test @per-label floors] micro-F1=0.039  macro-F1=0.064


In [10]:
# Cross-domain cue transfer (CB → TX).
# The CB-trained model reads cue columns by position, so both datasets must
# expose the same cue columns in the same order; fail loudly otherwise rather
# than report metrics computed on misaligned features.
if list(cb.cue_names) != list(tx.cue_names):
    raise ValueError(
        "CB and TX cue columns differ; cross-domain cue transfer requires "
        "identical, identically ordered cue columns."
    )

# train calibrated LR on CB cues only
cb.cue_model = fit_calibrated_lr(cb.cues["train"], cb.y["train"], cv=3, max_iter=1000, rs=RANDOM_STATE)
scores_cb_dv = cb.cue_model.predict_proba(cb.cues["dev"])[:,1]
cb.cue_thr_f1, _ = tune_f1(cb.y["dev"], scores_cb_dv)

# rescale TX cues using CB scaler (no need to rebuild features):
# undo the TX scaling to recover raw cue values, then apply the CB scaling.
cb_scaler = joblib.load(ARTIFACTS_DIR / "cb_cues_scaler.joblib")
tx_scaler = joblib.load(ARTIFACTS_DIR / "tx_cues_scaler.joblib")
Xraw_tx_dv = tx_scaler.inverse_transform(tx.cues["dev"].toarray())
Xraw_tx_te = tx_scaler.inverse_transform(tx.cues["test"].toarray())
Xp_tx_dv_cb = csr_matrix(cb_scaler.transform(Xraw_tx_dv))
Xp_tx_te_cb = csr_matrix(cb_scaler.transform(Xraw_tx_te))

# zero-shot (CB threshold on TX test), scored against the "any label" target
probs_tx_te = cb.cue_model.predict_proba(Xp_tx_te_cb)[:,1]
zs = eval_binary(tx.y_any["test"], probs_tx_te, cb.cue_thr_f1)
print(f"[CB→TX | zero-shot] {zs}  (thr_cb={cb.cue_thr_f1:.3f})")

# minimal adaptation (TX-dev F1 threshold); the model itself is unchanged
probs_tx_dv = cb.cue_model.predict_proba(Xp_tx_dv_cb)[:,1]
thr_tx, _ = tune_f1(tx.y_any["dev"], probs_tx_dv)
ad = eval_binary(tx.y_any["test"], probs_tx_te, thr_tx)
print(f"[CB→TX | TX-dev F1-opt] {ad}  (thr_tx={thr_tx:.3f})")

# save small JSON
with open(ARTIFACTS_DIR / "crossdomain_cb_to_tx_cues.json", "w") as f:
    json.dump({"zero_shot": zs, "tx_dev_adapt": ad, "thr_cb": cb.cue_thr_f1, "thr_tx": thr_tx}, f, indent=2)
print("Saved cross-domain artifact.")

[fit] calibrated LR in 1.0s
[CB→TX | zero-shot] {'pr_auc': 0.12094088070116175, 'roc_auc': 0.5529305529206211, 'f1': 0.18448673094973975, 'precision': 0.10162321383805464, 'recall': 0.9993836671802774}  (thr_cb=0.217)
[CB→TX | TX-dev F1-opt] {'pr_auc': 0.12094088070116175, 'roc_auc': 0.5529305529206211, 'f1': 0.2251108920650567, 'precision': 0.14067914067914067, 'recall': 0.563020030816641}  (thr_tx=0.928)
Saved cross-domain artifact.


#### TX (Wikipedia) — baseline summary

**Goal** As with CB, the aim is to test whether simple, human-interpretable cues help separate harmful from non-harmful comments, and to compare them with standard text features, but here under a multi-label setup. TX is highly imbalanced (on train: toxic ≈ 9.6%, obscene ≈ 5.3%, insult ≈ 4.9%, severe_toxic ≈ 1.0%, identity_hate ≈ 0.9%, threat ≈ 0.3%), so threshold choice will strongly affect precision/recall.

**Model and protocol** I trained one-vs-rest linear logistics—one binary classifier per label—using a fast SGD logistic solver with a light elastic-net penalty. For calibrated probabilities (needed for PR-AUC and threshold tuning), I used train-only prefit calibration: each label’s model is fit on a large chunk of train, then a small train slice (≈10%) learns the sigmoid mapping from scores to probabilities. This avoids dev leakage and keeps training time practical (≈8–11 s fit + 0.4–0.5 s calibration per label on CPU). After calibration, I picked one threshold per label on the dev split. I report two operating modes: (i) F1-optimal per label (balanced), and (ii) precision-first per label (conservative), then evaluated once on test with those frozen thresholds.

**Test results (F1-optimal thresholds)** With dev-tuned F1 cuts, the frozen test performance is micro-F1 = 0.172 and macro-F1 = 0.224. This balanced setting favors recall on frequent labels and exposes the expected precision penalty under class imbalance. These numbers reflect decision-level trade-offs; the underlying ranking quality is better seen in per-label PR-AUCs (reported elsewhere), which are modest but above chance.

**Test results (precision-first thresholds)** Using per-label precision floors tuned on dev (targets set high to suppress false alarms), test moves to micro-F1 = 0.039 and macro-F1 = 0.064. This drop is by design: raising thresholds far to the right on the PR curve drastically reduces false positives but also true positives, so recall collapses. This operating mode is appropriate when mistakes on sensitive labels (e.g., identity_hate, threat) are especially costly. I saved both variants for reproducibility: tx_ovr_sgd_calibrated.joblib (F1-opt) and tx_multilabel_test_metrics_f1.json, plus tx_ovr_sgd_calibrated_per_label_floors.joblib and tx_multilabel_test_metrics_per_label_floors.json for the precision-first mode.

**Why this toolkit** Multi-label toxicity requires per-label control over the precision/recall trade-off. Linear one-vs-rest with elastic-net remains interpretable and fast on sparse TF-IDF + cue features, and calibrated probabilities let us (a) select thresholds cleanly on dev, (b) compare PR-AUC across labels, and (c) later do late-fusion without retraining.

**Cross-domain cue transfer (CB → TX)** To isolate interpretability, I also trained a cue-only calibrated logistic on CB, then applied it to TX. Zero-shot with the CB dev threshold gives PR-AUC ≈ 0.121, ROC-AUC ≈ 0.553, F1 ≈ 0.184 (P ≈ 0.102, R ≈ 0.999)—high recall, low precision. Retuning only the threshold on TX dev (model unchanged; thr ≈ 0.928) rebalances to F1 ≈ 0.225 (P ≈ 0.141, R ≈ 0.563) on TX test. The threshold-free metrics show the real ceiling: cues do transfer above chance but not strongly, underscoring domain shift and label-design differences between platforms.

**Takeaway** On TX, the calibrated linear OvR baseline gives a transparent, reproducible starting point. Because labels are rare and multi-label, operating-point choice dominates headline scores: F1-optimal is the balanced report; precision-first is the safer deployment option for sensitive labels. The cross-domain cue result documents partial transfer and motivates either better calibration/thresholding, lightweight fusion with embeddings, or (if needed) a small transformer for richer semantics.

In [11]:
# TX isotonic calibration (prefit on 20% of train) 
def fit_fast_calibrated_iso(Xtr, ytr, *, calib_frac=0.20, rs=RANDOM_STATE):
    """Fit an SGD logistic model on part of train and isotonic-calibrate it on the rest.

    ``Xtr``/``ytr`` are split into a fit partition and a calibration partition
    (``calib_frac`` of the rows), so calibration uses training data only.

    Returns
    -------
    ``(calibrated_model, fit_seconds, calibration_seconds)``
    """
    # Normalise to an ndarray first, as the sigmoid variant of this helper does.
    ytr = np.asarray(ytr)
    counts = np.bincount(ytr, minlength=2)
    # Stratify only when the rarer class is large enough to appear in the
    # calibration slice; otherwise fall back to a plain random split.
    strat = ytr if (counts.min() >= 2 and counts.min() * calib_frac >= 1) else None
    X_fit, X_cal, y_fit, y_cal = train_test_split(
        Xtr, ytr, test_size=calib_frac, stratify=strat, random_state=rs
    )
    base = SGDClassifier(loss="log_loss", penalty="elasticnet", alpha=1e-4, l1_ratio=0.15,
                         class_weight="balanced", max_iter=20, tol=1e-3,
                         early_stopping=True, n_iter_no_change=3, validation_fraction=0.1,
                         average=True, random_state=rs)
    t0 = time.time()
    base.fit(X_fit, y_fit)
    t_fit = time.time() - t0

    # Prefit calibration. FrozenEstimator lives in sklearn.frozen (scikit-learn
    # >= 1.6) and supersedes cv="prefit". Only a missing module triggers the
    # fallback, so a genuine constructor error is no longer silently swallowed.
    try:
        from sklearn.frozen import FrozenEstimator
    except ImportError:
        cal = CalibratedClassifierCV(base, method="isotonic", cv="prefit")
    else:
        cal = CalibratedClassifierCV(FrozenEstimator(base), method="isotonic")

    t1 = time.time()
    cal.fit(X_cal, y_cal)
    t_cal = time.time() - t1
    return cal, t_fit, t_cal

# One isotonic-calibrated binary classifier per TX label.
n_labels = len(tx.labels)
print(f"Training {n_labels} isotonic-calibrated classifiers…")
tx.iso_models = []
# Loop variable is `label`, not `lab`, so the CB label Series named `lab` is not overwritten.
for j, label in enumerate(tx.labels):
    m, t_fit, t_cal = fit_fast_calibrated_iso(tx.X["train"], tx.Y["train"][:, j])
    tx.iso_models.append(m)
    print(f"  [{j+1}/{n_labels}] {label:<13} fit={t_fit:.1f}s calib={t_cal:.1f}s")

# Dev thresholds (F1-opt per label)
probs_dev_iso = np.column_stack([m.predict_proba(tx.X["dev"])[:, 1] for m in tx.iso_models])
tx.iso_thr = {
    label: tune_f1(tx.Y["dev"][:, j], probs_dev_iso[:, j])[0]
    for j, label in enumerate(tx.labels)
}

# Test metrics at the frozen dev thresholds
probs_te_iso = np.column_stack([m.predict_proba(tx.X["test"])[:, 1] for m in tx.iso_models])
Yhat_te_iso = np.column_stack(
    [(probs_te_iso[:, j] >= tx.iso_thr[label]).astype(int) for j, label in enumerate(tx.labels)]
)
# Cast to plain floats, matching tx.micro_f1 / tx.macro_f1 (and keeping them JSON-serialisable).
tx.iso_micro = float(f1_score(tx.Y["test"], Yhat_te_iso, average="micro", zero_division=0))
tx.iso_macro = float(f1_score(tx.Y["test"], Yhat_te_iso, average="macro", zero_division=0))
print(f"[TX | Test | isotonic @ F1-opt] micro-F1={tx.iso_micro:.3f}  macro-F1={tx.iso_macro:.3f}")

Training 6 isotonic-calibrated classifiers…
  [1/6] toxic         fit=8.1s calib=1.2s
  [2/6] severe_toxic  fit=10.9s calib=1.0s
  [3/6] obscene       fit=8.0s calib=2.1s
  [4/6] threat        fit=10.9s calib=0.9s
  [5/6] insult        fit=7.3s calib=0.8s
  [6/6] identity_hate fit=7.6s calib=0.9s
[TX | Test | isotonic @ F1-opt] micro-F1=0.176  macro-F1=0.209


In [18]:
# Multiclass target: cyberbullying type as categorical codes, sliced by the frozen splits.
y_mc_all = cb_df["cyberbullying_type"].astype("category")
cb.mc_classes = list(y_mc_all.cat.categories)
y_mc = {
    split: y_mc_all.iloc[cb_splits[split]].cat.codes.values
    for split in ("train", "dev", "test")
}

In [19]:
# CB multiclass with LinearSVC.
# y_mc and cb.mc_classes come from the preceding cell; the encoding is not
# repeated here so there is a single definition to keep in sync.
class_ids = np.arange(len(cb.mc_classes))

t0 = time.time()
# random_state pins the solver's internal shuffling so reruns reproduce the same fit.
cb.mc_model = LinearSVC(C=1.0, class_weight="balanced", max_iter=5000, random_state=RANDOM_STATE)
cb.mc_model.fit(cb.X["train"], y_mc["train"])
print(f"[CB | Multiclass LinearSVC] fit in {time.time()-t0:.1f}s")

yhat_mc = cb.mc_model.predict(cb.X["test"])
macro = f1_score(y_mc["test"], yhat_mc, average="macro", zero_division=0)
print(f"[CB | Multiclass] macro-F1={macro:.3f}")

# Passing labels explicitly keeps target_names aligned with the class codes
# even if some class happens to be absent from the test split or predictions.
rep_mc = classification_report(
    y_mc["test"], yhat_mc, labels=class_ids, target_names=cb.mc_classes,
    output_dict=True, zero_division=0,
)
for c in cb.mc_classes:
    print(f"  {c:<20} F1={rep_mc[c]['f1-score']:.3f}  P={rep_mc[c]['precision']:.3f}  R={rep_mc[c]['recall']:.3f}")

cm = confusion_matrix(y_mc["test"], yhat_mc, labels=class_ids)
# Show the matrix with class names (rows = true class, columns = predicted class)
# instead of computing it and discarding it.
cm_df = pd.DataFrame(cm, index=cb.mc_classes, columns=cb.mc_classes)
cm_df

[CB | Multiclass LinearSVC] fit in 17.6s
[CB | Multiclass] macro-F1=0.818
  age                  F1=0.971  P=0.962  R=0.980
  ethnicity            F1=0.981  P=0.979  R=0.982
  gender               F1=0.869  P=0.893  R=0.846
  not_cyberbullying    F1=0.543  P=0.563  R=0.524
  other_cyberbullying  F1=0.589  P=0.563  R=0.619
  religion             F1=0.954  P=0.954  R=0.954


In [20]:
# --- TX hierarchical: gate by 'toxic' then apply per-label thresholds inside the gate ---
# Recompute TX probabilities here so this cell does not depend on whatever
# probs_dev / probs_te currently hold.
probs_dev = np.column_stack([clf.predict_proba(tx.X["dev"])[:, 1] for clf in tx.models])
probs_te = np.column_stack([clf.predict_proba(tx.X["test"])[:, 1] for clf in tx.models])

j_toxic = tx.labels.index("toxic")
# Gate threshold: F1-optimal cut of the 'toxic' head's dev scores against the any-label target.
thr_gate, _ = tune_f1(tx.y_any["dev"], probs_dev[:, j_toxic])
gate_mask = probs_te[:, j_toxic] >= thr_gate

Yhat_hier = np.zeros_like(tx.Y["test"], dtype=int)
for j, lab in enumerate(tx.labels):
    # A label fires only if it clears its own F1-opt threshold AND the row passed the gate.
    above_thr = probs_te[:, j] >= tx.thr_f1[lab]
    Yhat_hier[:, j] = (above_thr & gate_mask).astype(int)

tx.hier_micro = f1_score(tx.Y["test"], Yhat_hier, average="micro", zero_division=0)
tx.hier_macro = f1_score(tx.Y["test"], Yhat_hier, average="macro", zero_division=0)
print(f"[TX | Hierarchical] micro-F1={tx.hier_micro:.3f}  macro-F1={tx.hier_macro:.3f}  (gate thr={thr_gate:.3f})")

[TX | Hierarchical] micro-F1=0.142  macro-F1=0.063  (gate thr=0.110)


In [21]:
# Bootstrap CIs
def boot_ci_binary(y, scores, thr, metric_fn, n_boot=200, rs=42):
    """Bootstrap a thresholded binary metric and return ``(mean, lo, hi)``.

    Parameters
    ----------
    y : array-like of true labels.
    scores : array-like of scores, same length as ``y``.
    thr : float; rows with ``score >= thr`` are predicted positive.
    metric_fn : callable ``metric_fn(y_true, y_pred) -> float``.
    n_boot : number of bootstrap resamples (drawn with replacement).
    rs : seed for the resampling RNG.

    Returns
    -------
    Mean of the resampled metric and its 2.5th / 97.5th percentiles, as floats.

    Raises
    ------
    ValueError
        If ``y`` is empty or ``y`` and ``scores`` differ in length.
    """
    # Convert up front so the resampling below is positional even for a pandas
    # Series (whose integer index would otherwise be treated as labels)
    # and so plain lists work too.
    y = np.asarray(y)
    scores = np.asarray(scores)
    n = len(y)
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")
    if len(scores) != n:
        raise ValueError(f"y has {n} rows but scores has {len(scores)}")

    # Thresholding does not depend on the resample, so do it once.
    yhat = (scores >= thr).astype(int)

    rng = np.random.RandomState(rs)
    vals = []
    for _ in range(n_boot):
        idx = rng.randint(0, n, n)
        vals.append(metric_fn(y[idx], yhat[idx]))
    lo, hi = np.percentile(vals, [2.5, 97.5])
    return float(np.mean(vals)), float(lo), float(hi)

# CB: bootstrap the test-set F1 at the dev-tuned F1-opt threshold.
# Scores are recomputed from cb.model rather than read from a shared variable.
scores_cb_te = cb.model.predict_proba(cb.X["test"])[:, 1]
mean_f1, lo, hi = boot_ci_binary(
    cb.y["test"],
    scores_cb_te,
    cb.thr_f1,
    f1_score,
)
print(f"[CB | F1 95% CI] mean={mean_f1:.3f}  95%CI=({lo:.3f},{hi:.3f})")

# TX macro-F1 CI at your per-label F1 thresholds
def macro_f1_eval(Y_true, probs, thr_map, labels):
    """Macro-F1 after thresholding each label column at its own cut-off from ``thr_map``."""
    per_label = []
    for j in range(len(labels)):
        cutoff = thr_map[labels[j]]
        per_label.append((probs[:, j] >= cutoff).astype(int))
    Yhat = np.column_stack(per_label)
    return f1_score(Y_true, Yhat, average="macro", zero_division=0)

# TX: bootstrap macro-F1 over test rows (200 resamples with replacement).
# probs_te must hold the TX test probabilities at this point.
vals = []
rng = np.random.RandomState(42)
n_te = tx.Y["test"].shape[0]
for _ in range(200):
    idx = rng.randint(0, n_te, n_te)
    resampled_f1 = macro_f1_eval(tx.Y["test"][idx], probs_te[idx], tx.thr_f1, tx.labels)
    vals.append(resampled_f1)
lo, hi = np.percentile(vals, [2.5, 97.5])
print(f"[TX | macro-F1 95% CI] mean={np.mean(vals):.3f}  95%CI=({lo:.3f},{hi:.3f})")

[CB | F1 95% CI] mean=0.925  95%CI=(0.921,0.930)
[TX | macro-F1 95% CI] mean=0.224  95%CI=(0.206,0.244)


On Jigsaw (TX), isotonic calibration with a 20% train slice nudged micro-F1 from ~0.172 to 0.176, while macro-F1 landed at 0.209. That’s within the bootstrap uncertainty we estimated for the sigmoid-calibrated baseline (macro-F1 mean 0.224, 95% CI 0.206–0.244), so isotonic didn’t deliver a clear, across-the-board lift. This is typical under extreme imbalance: a better score→probability map mainly reshapes decisions on the frequent labels; the rare ones remain recall-limited unless ranking itself improves. I’ll keep the isotonic bundle as an alternative operating mode, but the headline remains the sigmoid-calibrated OvR baseline.

For the Twitter-like multiclass head, a fast LinearSVC gives macro-F1 = 0.818 and very strong per-class scores on age, ethnicity, religion, and gender (F1s ≈ 0.87–0.98). Performance is weaker on not_cyberbullying (F1 0.543) and other_cyberbullying (F1 0.589). That pattern makes sense for a bag-of-ngrams model: “other” is a heterogeneous catch-all and “not” can share vocabulary with toxic classes, so the margin is narrower there. This is an acceptable baseline for the multiclass slice; if we ever need calibrated class probabilities for error analysis or fusion, we can swap in an OvR logistic with L2, but the conclusions won’t change.

The hierarchical variant on TX (gate with a toxic-any head, then predict targets) underperformed the flat OvR: micro-F1 = 0.142, macro-F1 = 0.063. The gate threshold (≈0.11) pruned many true positives for rare labels, and the “toxic” head isn’t a perfect proxy for the presence of specific targets like identity_hate or threat. In other words, hierarchy reduced coverage without fixing the underlying ranking on the minority classes. I’ll report this as a negative result and stick with the flat OvR as the main TX baseline.

Confidence intervals tell the stability story. On CB binary, F1 is tight—mean 0.925 with 95% CI 0.921–0.930—so the binary baseline is robust. On TX, macro-F1’s CI (0.206–0.244) brackets both the sigmoid baseline and the isotonic variant, reinforcing that calibration choices alone aren’t moving the needle substantially. Taken together, these results say the interpretable linear stack is already close to its ceiling on this feature set: it’s excellent on CB and serviceable—but constrained by imbalance and semantics—on TX. For the write-up, I’ll present TX in two operating modes (F1-optimal vs. precision-first), note the isotonic variant as a sensitivity check, and call out the hierarchical approach as not beneficial here.