# 09 · Failure Analysis

Analyze where the logistic regression classifier struggles and inspect misclassified posts.

**Objectives**
- Load the production logistic regression bundle and recreate the evaluation splits.
- Quantify performance on calibration/test data.
- Highlight false positives/negatives and summarize their patterns.

In [None]:
# Install the packages listed in ../requirements.txt via the %pip magic.
# Remove -q to see installation logs.
%pip install -q -r ../requirements.txt

In [None]:
import os
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
)
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings so the notebook output stays readable.
# NOTE: this hides every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Single seed reused for NumPy and for the stratified splits further down.
SEED = 42
np.random.seed(SEED)

# Project-relative locations of the input data and the fitted artifacts.
DATA = Path('../data')
PROC = DATA / 'processed'
ART = Path('../artifacts')

# Column holding the text that gets tokenized.
TEXT_COL = 'text_all'

# Characters stripped from both ends of every token.
# Written as a regular escaped string: the former raw triple-quoted literal
# ended in `\"""`, where the backslash keeps the first quote from closing the
# string, leaving the literal unterminated (SyntaxError).
PUNCT = ".,!?:;()[]{}\"'/-\\"

# Placeholder tokens that tokenize() compares against.
TRASH = {"[text]", "[image]", "[removed]", "[deleted]"}

# Tokens exempt from tokenize()'s minimum-length rule.
KEEP_SHORT = {"ecg", "sad", "ptsd", "mom", "dad", "anx"}

def tokenize(text: str):
    """Split ``text`` on whitespace and return the cleaned, lower-cased tokens.

    Each whitespace-separated piece has the ``PUNCT`` characters stripped
    from both ends and is lower-cased. A token is kept when it is non-empty,
    is not in ``TRASH``, and is at least three characters long or listed in
    ``KEEP_SHORT``.

    NOTE(review): ``PUNCT`` contains ``[`` and ``]``, so a placeholder such
    as ``"[removed]"`` is already reduced to ``"removed"`` before the
    ``TRASH`` comparison and is therefore not filtered. Left unchanged so the
    tokens keep matching what the fitted vectorizer was presumably trained
    on -- confirm against the training notebook before altering.
    """
    cleaned = (raw.strip().strip(PUNCT).lower() for raw in str(text).split())
    return [
        tok
        for tok in cleaned
        if tok and tok not in TRASH and (len(tok) >= 3 or tok in KEEP_SHORT)
    ]

def identity(x):
    """Return ``x`` unchanged.

    NOTE(review): not called anywhere in the visible notebook; presumably the
    pickled vectorizer refers to it by name (e.g. as a pass-through
    analyzer), so it must stay importable under this name -- confirm.
    """
    return x

# Confirm the setup ran and show the absolute locations that will be read.
print('Setup complete')
for caption, folder in (('Data dir', PROC), ('Artifacts dir', ART)):
    print(f'{caption}: {folder.resolve()}')


In [None]:
# --- Load the post corpus and the fitted production artifacts ---------------
df_posts = pd.read_parquet(PROC / 'reddit_anxiety_v1.parquet')
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that come from a trusted location.
vec = joblib.load(ART / 'vec_final.joblib')
nmf = joblib.load(ART / 'nmf_final.joblib')
bundle = joblib.load(ART / 'triggerlens_logreg_calibrated_bundle.joblib')

df_hand = pd.read_csv(PROC / 'sample_human_labels.csv')
df_ai = pd.read_csv(PROC / 'simple_ai_labels.csv')

label_sets = {}
# Human annotations (rating >= 4).
# Non-numeric or missing ratings coerce to NaN, which compares False and so
# becomes label 0. The column is indexed directly (instead of .get) so that a
# missing 'anxiety_rating' column fails with a clear KeyError rather than
# being silently coerced.
df_h = df_hand[['post_id']].copy()
df_h['label'] = (pd.to_numeric(df_hand['anxiety_rating'], errors='coerce') >= 4).astype(int)
label_sets['hand'] = df_h

# AI labels (confidence >= 0.5, severity >= 4, anxiety/panic category).
# Unparseable confidence/severity values are treated as 0 and thus negative.
df_a = df_ai[['post_id']].copy()
cat = df_ai['ai_category'].astype(str).str.lower()
conf = pd.to_numeric(df_ai['ai_confidence'], errors='coerce').fillna(0)
sev = pd.to_numeric(df_ai['ai_severity'], errors='coerce').fillna(0)
df_a['label'] = (conf >= 0.5) & (sev >= 4) & (cat.str.contains('anx') | cat.str.contains('panic'))
df_a['label'] = df_a['label'].astype(int)
label_sets['ai'] = df_a

# Combined: prefer human labels, fill gaps with AI labels.
# 'ai' sorts before 'hand', so keep='last' retains the human row whenever a
# post_id appears in both sources.
df_h_tmp = df_h.copy(); df_h_tmp['source'] = 'hand'
df_a_tmp = df_a.copy(); df_a_tmp['source'] = 'ai'
df_comb = pd.concat([df_h_tmp, df_a_tmp], ignore_index=True)
df_comb = df_comb.sort_values('source').drop_duplicates('post_id', keep='last')
label_sets['combined'] = df_comb[['post_id', 'label']]

# max_features may be None (no cap configured); formatting None with ',' would
# raise TypeError, so fall back to 0 in that case as well.
vocab_cap = getattr(vec, 'max_features', None) or 0

print(f"Posts loaded: {len(df_posts):,}")
print(f"Vectorizer vocab: {vocab_cap:,}")
print(f"NMF topics: {getattr(nmf, 'n_components', 0)}")
print({k: len(v) for k, v in label_sets.items()})


In [None]:
# Pull the pieces of the production bundle that this analysis needs.
# Optional scaler for the metadata columns; None when the bundle has none.
meta_scaler = bundle.get('meta_scaler')
# Decision cut-off stored with the model, defaulting to 0.5 when absent.
threshold = float(bundle['threshold']) if 'threshold' in bundle else 0.5
# The calibrated classifier is required: a missing key raises KeyError.
clf = bundle['calibrated_model']

def build_feature_matrix(df_subset: pd.DataFrame):
    """Assemble the sparse model input for the rows of ``df_subset``.

    Three blocks are stacked column-wise, in this order: the vectorizer
    output for the tokenized ``TEXT_COL`` text, the NMF transform of that
    output, and three scaled metadata columns (``log1p`` of the token count,
    a flag for text containing ``http``, and ``anxiety_score``, which is
    taken as 0 when the column is absent or null).

    The metadata is scaled with the bundle's ``meta_scaler`` when one was
    loaded; otherwise a ``StandardScaler`` is fitted on the rows passed in.

    Returns:
        A tuple ``(X, tokens, features_df)``: the CSR feature matrix with
        NaNs replaced by 0, the per-row token lists, and a frame of the
        unscaled metadata values (``doc_length`` is the raw token count)
        indexed like ``df_subset``.
    """
    text = df_subset[TEXT_COL].fillna('')
    tokens = text.map(tokenize)

    # Text-derived blocks: vectorizer output, then its topic projection.
    X_tfidf = vec.transform(tokens)
    X_topics = nmf.transform(X_tfidf)

    # Metadata columns, each shaped (n_rows, 1) so they can be stacked.
    doc_len = np.array([len(t) for t in tokens], dtype=float)[:, None]
    url_flags = text.str.contains('http', case=False).astype(int)
    has_url = url_flags.values[:, None].astype(float)
    zero_scores = pd.Series(0, index=df_subset.index)
    anxiety = df_subset.get('anxiety_score', zero_scores).fillna(0)
    nrc = anxiety.values[:, None].astype(float)

    meta = np.hstack([np.log1p(doc_len), has_url, nrc])
    if meta_scaler is not None:
        scaler = meta_scaler
    else:
        scaler = StandardScaler().fit(meta)
    meta_scaled = scaler.transform(meta)

    X = hstack(
        [X_tfidf, csr_matrix(X_topics), csr_matrix(meta_scaled)],
        format='csr',
    )
    X.data = np.nan_to_num(X.data, nan=0.0)

    features_df = pd.DataFrame(
        {
            'doc_length': doc_len.ravel(),
            'has_url': has_url.ravel(),
            'anxiety_score': nrc.ravel(),
        },
        index=df_subset.index,
    )

    return X, tokens, features_df

# Align posts with combined labels used for production: the inner merge keeps
# only posts that have a label, then features are built for exactly those rows.
df_labeled = pd.merge(df_posts, label_sets['combined'], on='post_id', how='inner')
X_all, tokens_all, meta_features = build_feature_matrix(df_labeled)
y_all = df_labeled['label'].values
print('Feature matrix:', X_all.shape)


In [None]:
# Recreate the train / calibration / test partition (60 / 20 / 20) by
# splitting row positions, stratified on the label.
# NOTE(review): this reproduces the production splits only if training used
# the same seed, labels and row order -- confirm against the training notebook.
indices = np.arange(X_all.shape[0])
idx_train, idx_tmp, y_train, y_tmp = train_test_split(
    indices, y_all, test_size=0.4, stratify=y_all, random_state=SEED
)
idx_cal, idx_test, y_cal, y_test = train_test_split(
    idx_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=SEED
)

# Score every labeled post and apply the bundle's decision threshold.
proba_all = clf.predict_proba(X_all)[:, 1]
preds_all = (proba_all >= threshold).astype(int)

df_eval = df_labeled.copy()
# idx_cal / idx_test are row positions, so fill a positional array instead of
# using label-based .loc; this stays correct even if the frame's index is not
# a clean 0..n-1 range.
split_labels = np.full(len(df_eval), 'train', dtype=object)
split_labels[idx_cal] = 'calibration'
split_labels[idx_test] = 'test'
df_eval['split'] = split_labels
df_eval['proba'] = proba_all
df_eval['pred'] = preds_all
df_eval['error'] = np.where(df_eval['pred'] == df_eval['label'], 'correct', 'error')
df_eval['error_type'] = np.select(
    [
        (df_eval['label'] == 1) & (df_eval['pred'] == 0),
        (df_eval['label'] == 0) & (df_eval['pred'] == 1),
    ],
    ['false_negative', 'false_positive'],
    default='correct',
)

# The posts may already carry an 'anxiety_score' column (the feature builder
# reads it when present); a plain join would then raise on the overlapping
# name. Drop any clashing columns first so the values the model actually saw
# (nulls filled with 0) are the ones kept.
df_eval = df_eval.drop(columns=list(meta_features.columns), errors='ignore').join(meta_features)
df_eval['text_preview'] = df_eval[TEXT_COL].fillna('').str.slice(0, 280)

print(df_eval['split'].value_counts())


In [None]:
def describe_split(name, mask):
    """Print ranking and classification metrics for one slice of ``df_eval``.

    Args:
        name: Display name of the slice; upper-cased in the printed header.
        mask: Boolean indexer passed to ``df_eval.loc`` to select the rows.

    Prints a notice and returns early when the slice is empty. Otherwise
    prints ROC AUC, average precision, the confusion matrix and the
    classification report computed from the ``label``, ``pred`` and ``proba``
    columns.
    """
    part = df_eval.loc[mask]
    if part.empty:
        print(f'No rows for {name}')
        return
    y_true = part['label'].values
    y_hat = part['pred'].values
    y_score = part['proba'].values
    # The newline must be the escape sequence: a literal line break inside a
    # single-quoted f-string is a SyntaxError.
    print(f"\n=== {name.upper()} ({len(part)} rows) ===")
    if np.unique(y_true).size > 1:
        print(f"AUC: {roc_auc_score(y_true, y_score):.3f}")
    else:
        # roc_auc_score raises ValueError when only one class is present.
        print('AUC: n/a (only one class present)')
    print(f"Average precision: {average_precision_score(y_true, y_score):.3f}")
    print('Confusion matrix:')
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat, digits=3))

# Report metrics for the two held-out splits, calibration first.
for split_title in ('Calibration', 'Test'):
    describe_split(split_title, df_eval['split'] == split_title.lower())


In [None]:
# Summarize the held-out rows (calibration + test) per outcome type, with the
# most frequent outcome first.
held_out = df_eval[df_eval['split'].isin(['calibration', 'test'])]
summary_spec = {
    'count': ('post_id', 'size'),
    'mean_proba': ('proba', 'mean'),
    'mean_doc_length': ('doc_length', 'mean'),
    'url_rate': ('has_url', 'mean'),
    'mean_anxiety_score': ('anxiety_score', 'mean'),
}
agg = (
    held_out.groupby('error_type')
    .agg(**summary_spec)
    .sort_values('count', ascending=False)
)
agg


In [None]:
# Test-split false negatives, lowest predicted probability first (the misses
# the model was most confident about); show the first ten.
fn_columns = [
    'post_id', 'proba', 'label', 'doc_length', 'has_url', 'anxiety_score', 'text_preview'
]
test_false_negs = df_eval.query("split == 'test' and error_type == 'false_negative'")
false_negs = test_false_negs.sort_values('proba')[fn_columns].head(10)
false_negs


In [None]:
# Test-split false positives, highest predicted probability first (the false
# alarms the model was most confident about); show the first ten.
fp_columns = [
    'post_id', 'proba', 'label', 'doc_length', 'has_url', 'anxiety_score', 'text_preview'
]
test_false_pos = df_eval.query("split == 'test' and error_type == 'false_positive'")
false_pos = test_false_pos.sort_values('proba', ascending=False)[fp_columns].head(10)
false_pos


In [None]:
# Per-class distribution of predicted probabilities for each held-out split,
# with the decision threshold marked. Densities are normalized per class
# (common_norm=False) so the smaller class stays visible.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for i, split in enumerate(['calibration', 'test']):
    subset = df_eval[df_eval['split'] == split]
    if subset.empty:
        # Nothing to draw for this split; its axes are left blank.
        continue
    sns.histplot(
        subset['proba'],
        bins=30,
        hue=subset['label'],
        palette='Set2',
        element='step',
        ax=ax[i],
        stat='density',
        common_norm=False,
    )
    threshold_line = ax[i].axvline(
        threshold, color='red', linestyle='--', label=f'Threshold={threshold:.2f}'
    )
    ax[i].set_title(f'Probability distribution ({split})')
    ax[i].set_xlabel('Predicted probability')

    # A bare ax.legend() rebuilds the legend from labeled artists only, which
    # would keep the threshold line but drop seaborn's hue (class) entries.
    # Carry the existing hue entries over and append the threshold line.
    hue_legend = ax[i].get_legend()
    handles, labels, title = [], [], None
    if hue_legend is not None:
        # `legend_handles` is the current attribute name; older matplotlib
        # releases expose the same list as `legendHandles`.
        handles = list(
            getattr(hue_legend, 'legend_handles', None)
            or getattr(hue_legend, 'legendHandles', [])
        )
        labels = [text.get_text() for text in hue_legend.get_texts()]
        title = hue_legend.get_title().get_text()
    ax[i].legend(
        handles=handles + [threshold_line],
        labels=labels + [threshold_line.get_label()],
        title=title,
    )
plt.tight_layout()
plt.show()


**Next steps**
- Review the highest-confidence mistakes and annotate whether they stem from labeling noise or missing features.
- Consider adding more metadata features (e.g., subreddit, posting hour) for systematic false positives.