In [2]:
# === AI Privacy Policies Audit — ONE-CELL (2025-11) ===
# Inputs: Put at least one of HTML or TXT for each stem in working dir:
#   google_eu.(html|txt), google_us.(html|txt), meta_eu.(html|txt), meta_us.(html|txt),
#   openai_eu.(html|txt),  openai_us.(html|txt),  xai_eu.(html|txt),  xai_us.(html|txt)
# Output tree: ./outputs/{figures,tables,meta,supplement}
# Colab-safe: installs bs4/lxml/regex only. matplotlib only; one plot/figure; no custom colors.

# ---------- Minimal installs ----------
!pip -q install beautifulsoup4 lxml regex

# ---------- Imports & setup ----------
import os, sys, json, zipfile, hashlib, shutil, warnings, datetime
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import regex as re
from scipy.optimize import linear_sum_assignment
from scipy.stats import mannwhitneyu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold

SEED = 42
np.random.seed(SEED)

# ---------- Paths ----------
OUT_FIG = Path("outputs/figures"); OUT_TAB = Path("outputs/tables")
OUT_META = Path("outputs/meta"); OUT_SUPP = Path("outputs/supplement")
for d in (OUT_FIG, OUT_TAB, OUT_META, OUT_SUPP): d.mkdir(parents=True, exist_ok=True)

# ---------- Env capture ----------
env = {
    "python": sys.version,
    "numpy": np.__version__,
    "pandas": pd.__version__,
    "scikit_learn": __import__("sklearn").__version__,
    "matplotlib": matplotlib.__version__,
    "scipy": __import__("scipy").__version__,
    "bs4": __import__("bs4").__version__,
    "lxml": __import__("lxml").__version__,
    "regex": re.__version__,
}
(OUT_META/"env_versions.json").write_text(json.dumps(env, indent=2), encoding="utf-8")

# ---------- IO map ----------
PAIR_MAP = {
    "google_eu": ("google","eu"),
    "google_us": ("google","us"),
    "meta_eu":   ("meta","eu"),
    "meta_us":   ("meta","us"),
    "openai_eu": ("openai","eu"),
    "openai_us": ("openai","us"),
    "xai_eu":    ("xai","eu"),
    "xai_us":    ("xai","us"),
}
EXPECTED_STEMS = list(PAIR_MAP.keys())

# ---------- HTML -> TXT (keeping blank-line semantics downstream) ----------
def smart_soup(html_text: str) -> BeautifulSoup:
    head = html_text.lstrip()[:4096].lower()
    is_xml = (
        head.startswith("<?xml") or
        bool(re.search(r'<(rss|feed|sitemapindex|urlset)\b', head)) or
        bool(re.search(r'<html\b[^>]*xmlns=', head))
    )
    if not is_xml:
        warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
    return BeautifulSoup(html_text, "xml" if is_xml else "lxml")

NOISE_SELECTORS = [
    "header","footer","nav","aside","[role=navigation]","[role=dialog]",
    ".cookie",".cookies",".consent",".banner",".gdpr",".truste"
]

def html_to_txt(in_path: Path, out_path: Path, min_chars=30):
    html = in_path.read_text(encoding="utf-8", errors="ignore")
    soup = smart_soup(html)
    for t in soup(["script","style","noscript"]): t.decompose()
    for sel in NOISE_SELECTORS:
        for el in soup.select(sel): el.decompose()
    for br in soup.find_all("br"): br.replace_with("\n")
    blocks = []
    # Use structural blocks, then rebuild paragraphs with \n\n
    for tag in soup.find_all(["h1","h2","h3","h4","h5","h6","p","li","dt","dd"]):
        text = tag.get_text(" ", strip=True)
        if text:
            text = re.sub(r"\s+", " ", text).strip()
            if len(text) >= min_chars:
                blocks.append(text)
    txt = "\n\n".join(blocks)
    out_path.write_text(txt, encoding="utf-8")
    print(f"{in_path.name} -> {out_path.name} | paragraphs={len(blocks)}")

# Convert any available HTML → TXT (without overwriting existing .txt)
for stem in EXPECTED_STEMS:
    html_p, txt_p = Path(f"{stem}.html"), Path(f"{stem}.txt")
    if html_p.exists() and not txt_p.exists():
        html_to_txt(html_p, txt_p)
    elif not html_p.exists() and not txt_p.exists():
        print(f"WARNING: Missing both {stem}.html and {stem}.txt")

# Validate we have TXT for all stems
missing = [f"{s}.txt" for s in EXPECTED_STEMS if not Path(f"{s}.txt").exists()]
if missing:
    raise SystemExit("ERROR: Missing required TXT files: " + ", ".join(missing))

# ---------- Strict paragraphing on \n\n+, keep >=30 chars ----------
def split_paragraphs_strict(raw: str):
    # standardize newlines, then split on blank lines
    raw = raw.replace("\r\n","\n").replace("\r","\n")
    parts = re.split(r"\n\n+", raw)
    return [p.strip() for p in parts if len(p.strip()) >= 30]

rows = []
for stem, (company, region) in PAIR_MAP.items():
    raw = Path(f"{stem}.txt").read_text(encoding="utf-8", errors="ignore")
    paras = split_paragraphs_strict(raw)
    for p in paras:
        n_words = len(re.findall(r"\b\w+\b", p))
        rows.append({"file": f"{stem}.txt", "company": company, "region": region,
                     "paragraph": p, "n_words": n_words})
df = pd.DataFrame(rows)
if df.empty: raise SystemExit("ERROR: No paragraphs after strict split.")

meta = (df.groupby(["company","region","file"])
          .agg(n_paras=("paragraph","size"), n_words=("n_words","sum"))
          .reset_index())
meta.to_csv(OUT_META/"policy_metadata.csv", index=False)
print("\nPolicy metadata:\n", meta.to_string(index=False))

# ---------- Cleaning & brand ablation ----------
def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["text_clean"] = df["paragraph"].apply(clean_text)

brand_lexicon = [
  "google", "meta", "openai", "xai", "x.ai",
  "gemini", "bard", "chatgpt", "dall-e", "gpt", "gpt-4o", "gpt 4o", "gpt4o",
  "llama", "threads", "instagram", "facebook", "whatsapp",
  "youtube", "you tube", "deepmind", "vertex ai", "palm", "med-palm",
  "google llc", "meta platforms", "openai lp", "xai corp"
]
def build_brand_union_regex(terms):
    escaped = []
    for t in terms:
        t = t.lower()
        t = re.escape(t).replace("\\ ", r"\\s+")
        escaped.append(rf"\b{t}\b")
    return re.compile("|".join(escaped), flags=re.IGNORECASE)
brand_union_re = build_brand_union_regex(brand_lexicon)

df["text_ablated"] = df["text_clean"].apply(lambda s: brand_union_re.sub(" ", s))
df["text_ablated"] = df["text_ablated"].str.replace(r"\s+", " ", regex=True).str.strip()

# ---------- TF–IDF (paragraph level, baseline & ablated) ----------
TFIDF = dict(ngram_range=(1,3), stop_words="english", norm="l2", smooth_idf=True, min_df=2)
vec_base = TfidfVectorizer(**TFIDF)
X_base = vec_base.fit_transform(df["text_clean"].values)
vec_abla = TfidfVectorizer(**TFIDF)
X_abla = vec_abla.fit_transform(df["text_ablated"].values)

print("Vocab sizes — baseline:", len(vec_base.get_feature_names_out()),
      "| ablated:", len(vec_abla.get_feature_names_out()))

# ---------- Topics & stability (NMF; top-12 Jaccard; Hungarian) ----------
def top_terms(model, vocab, topn=12):
    tops = []
    for row in model.components_:
        idx = np.argsort(row)[::-1][:topn]
        tops.append([vocab[i] for i in idx])
    return tops

def jaccard(a, b):
    A, B = set(a), set(b)
    return len(A & B) / max(1, len(A | B))

K_GRID = [6,7,8,9]; SEEDS = [13,29,42]
rows_grid = []
for K in K_GRID:
    for sd in SEEDS:
        nmf_b = NMF(n_components=K, init="nndsvda", solver="cd", random_state=sd, max_iter=2000)
        Wb = nmf_b.fit_transform(X_base)
        Tb = top_terms(nmf_b, vec_base.get_feature_names_out(), topn=12)
        nmf_a = NMF(n_components=K, init="nndsvda", solver="cd", random_state=sd, max_iter=2000)
        Wa = nmf_a.fit_transform(X_abla)
        Ta = top_terms(nmf_a, vec_abla.get_feature_names_out(), topn=12)
        cost = np.zeros((K, K))
        for i in range(K):
            for j in range(K):
                cost[i,j] = 1.0 - jaccard(Tb[i], Ta[j])
        ri, cj = linear_sum_assignment(cost)
        matched = [1.0 - cost[i,j] for i,j in zip(ri,cj)]
        rows_grid.append({"K":K, "seed":sd, "mean_jaccard": float(np.mean(matched))})
grid_df = pd.DataFrame(rows_grid)
grid_df.to_csv(OUT_TAB/"S1_topic_stability_grid.csv", index=False)

# Primary (K=6, seed=42)
Kp, Sp = 6, 42
nmf_b_p = NMF(n_components=Kp, init="nndsvda", solver="cd", random_state=Sp, max_iter=2000)
Wb_p = nmf_b_p.fit_transform(X_base)
Tb_p = top_terms(nmf_b_p, vec_base.get_feature_names_out(), topn=12)
nmf_a_p = NMF(n_components=Kp, init="nndsvda", solver="cd", random_state=Sp, max_iter=2000)
Wa_p = nmf_a_p.fit_transform(X_abla)
Ta_p = top_terms(nmf_a_p, vec_abla.get_feature_names_out(), topn=12)
mj_primary = float(grid_df.query("K==6 and seed==42")["mean_jaccard"].iloc[0])

def company_topic_loads(W, labels):
    lab = labels.values
    uniq = sorted(np.unique(lab))
    rows = []
    for u in uniq:
        sel = (lab == u)
        mean_loads = W[sel].mean(axis=0)
        rows.append([u] + list(np.asarray(mean_loads).ravel()))
    cols = ["company"] + [f"topic_{i+1}" for i in range(W.shape[1])]
    return pd.DataFrame(rows, columns=cols)

A1 = company_topic_loads(Wb_p, df["company"])
A2 = company_topic_loads(Wa_p, df["company"])
A3 = pd.DataFrame({"topic":[f"topic_{i+1}" for i in range(Kp)], "top_terms":[", ".join(t) for t in Tb_p]})
A4 = pd.DataFrame({"topic":[f"topic_{i+1}" for i in range(Kp)], "top_terms":[", ".join(t) for t in Ta_p]})
A1.to_csv(OUT_TAB/"A1_topic_loads_baseline_K6.csv", index=False)
A2.to_csv(OUT_TAB/"A2_topic_loads_ablated_K6.csv", index=False)
A3.to_csv(OUT_TAB/"A3_top_terms_baseline_K6.csv", index=False)
A4.to_csv(OUT_TAB/"A4_top_terms_ablated_K6.csv", index=False)

# Figures 1A/1B
def plot_topic_bars(df_loads, title, outbase):
    companies = df_loads["company"].tolist()
    topics = [c for c in df_loads.columns if c.startswith("topic_")]
    K = len(topics)
    x = np.arange(len(companies)); width = 0.8 / max(K, 1)
    plt.figure(figsize=(8, 6))
    for i, t in enumerate(topics):
        plt.bar(x + i*width - (K-1)*width/2, df_loads[t].values, width=width, label=t)
    plt.xticks(x, companies, rotation=0); plt.legend(); plt.title(title)
    plt.tight_layout()
    plt.savefig(OUT_FIG/f"{outbase}.png", dpi=300)
    plt.savefig(OUT_FIG/f"{outbase}.svg"); plt.close()

plot_topic_bars(A1, "Topic profiles (K=6) by company — baseline TF–IDF", "fig01A_topics_baseline")
plot_topic_bars(A2, "Topic profiles (K=6) by company — brand ablation", "fig01B_topics_ablated")
(OUT_FIG/"fig01A_topics_baseline.txt").write_text(
    "Topic profiles (K=6) by company using baseline TF–IDF. Bars show per-company loads across six topics.", encoding="utf-8")
(OUT_FIG/"fig01B_topics_ablated.txt").write_text(
    "Topic profiles (K=6) after brand ablation. Persistence indicates ecosystem vocabularies beyond proper names.", encoding="utf-8")
for fp in ("fig01A_topics_baseline","fig01B_topics_ablated"):
    (OUT_FIG/f"{fp}.txt").write_text((OUT_FIG/f"{fp}.txt").read_text(encoding="utf-8")+f"\nPrimary stability: mean Jaccard (K=6, seed=42) = {mj_primary:.3f}", encoding="utf-8")

# ---------- Classifier (LOGO by document on ablated features) ----------
y = df["company"].values
groups = df["file"].values  # each held-out doc is one file
logo = LeaveOneGroupOut()
fold_rows = []; y_true_all=[]; y_pred_all=[]
for fold, (tr, te) in enumerate(logo.split(X_abla, y, groups=groups), start=1):
    clf = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=5000, random_state=SEED)
    clf.fit(X_abla[tr], y[tr])
    yhat = clf.predict(X_abla[te])
    acc = accuracy_score(y[te], yhat); f1m = f1_score(y[te], yhat, average="macro")
    fold_rows.append({"fold": fold, "heldout_file": df.iloc[te[0]]["file"], "accuracy": acc, "macro_f1": f1m})
    y_true_all.extend(y[te]); y_pred_all.extend(yhat)
df_folds = pd.DataFrame(fold_rows)
acc_mean, acc_sd = df_folds["accuracy"].mean(), df_folds["accuracy"].std(ddof=1)
f1_mean,  f1_sd  = df_folds["macro_f1"].mean(), df_folds["macro_f1"].std(ddof=1)

# Tidy classification report (robust to float 'accuracy')
rep = classification_report(y_true_all, y_pred_all, output_dict=True, zero_division=0)
rep_rows = []
for key, val in rep.items():
    row = {"class": key}
    if isinstance(val, dict):
        for k in ("precision","recall","f1-score","support"):
            if k in val: row[k] = val[k]
    elif isinstance(val, (float,int)):
        row["score"] = float(val)
    rep_rows.append(row)
pd.DataFrame(rep_rows).to_csv(OUT_TAB/"S2_classifier_report_LOGO.csv", index=False)

labels = sorted(np.unique(y))
cm = confusion_matrix(y_true_all, y_pred_all, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in labels], columns=[f"pred_{l}" for l in labels])
cm_df.to_csv(OUT_TAB/"S3_classifier_confusion_LOGO.csv")

# Optimistic sensitivity (5-fold, no grouping)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
accs_s, f1s_s = [], []
for tr, te in skf.split(X_abla, y):
    clf = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=5000, random_state=SEED)
    clf.fit(X_abla[tr], y[tr])
    yp = clf.predict(X_abla[te])
    accs_s.append(accuracy_score(y[te], yp))
    f1s_s.append(f1_score(y[te], yp, average="macro"))
sdf = pd.DataFrame({"accuracy":accs_s, "macro_f1":f1s_s})
sdf.to_csv(OUT_TAB/"S3b_classifier_summary_sensitivity.csv", index=False)

# Table 1 (stability + classifiers)
tab1 = pd.DataFrame([
    {"row":"Topic stability (Jaccard), K=6, seed=42", "mean_jaccard": mj_primary},
    {"row":"classifier_primary_LOGO", "accuracy_mean": acc_mean, "accuracy_sd": acc_sd,
     "macro_f1_mean": f1_mean, "macro_f1_sd": f1_sd},
    {"row":"classifier_sensitivity_5fold", "accuracy_mean": sdf["accuracy"].mean(), "accuracy_sd": sdf["accuracy"].std(ddof=1),
     "macro_f1_mean": sdf["macro_f1"].mean(), "macro_f1_sd": sdf["macro_f1"].std(ddof=1)},
])
tab1.to_csv(OUT_TAB/"Table_1_model_stability_and_classifier.csv", index=False)

# ---------- Rights & consent per 1,000 with bootstrap CIs (document level) ----------
RIGHTS_REGEX = r"""(?ix)
  \bright(s)?\s+of\s+(access|rectification|erasure)\b |
  \bright(s)?\s+to\s+(rectification|erasure|deletion|portability|restriction|object|objection|appeal)\b |
  \b(deletion|erase|erasure|portability|restriction|objection|appeal)\b |
  \b(supervisory\s+authority|data\s+protection\s+authority|dpa)\b
"""
CONSENT_REGEX = r"""(?ix)
  \bconsent(s|ed|ing)?\b |
  \bwithdraw\s+consent\b |
  \bopt[-\s]?(in|out)\b |
  \bagree(s|d|ment|ments)?\b |
  \bchoice(s)?\b |
  \bpreference(s)?\b |
  \bsetting(s)?\b
"""
rights_re  = re.compile(RIGHTS_REGEX,  flags=re.IGNORECASE | re.VERBOSE)
consent_re = re.compile(CONSENT_REGEX, flags=re.IGNORECASE | re.VERBOSE)

df["rights_hits"]  = df["paragraph"].apply(lambda s: len(list(rights_re.finditer(s))))
df["consent_hits"] = df["paragraph"].apply(lambda s: len(list(consent_re.finditer(s))))

agg = df.groupby(["company","region","file"]).agg(
    words=("n_words","sum"),
    rights=("rights_hits","sum"),
    consent=("consent_hits","sum")
).reset_index()
agg["rights_per_1000_RAW"] = agg["rights"] / agg["words"] * 1000.0
den = (agg["rights"] + agg["consent"]).replace(0, np.nan)
agg["rights_share"] = (agg["rights"] / den).fillna(0.0)

def bootstrap_doc(sub_df, B=2000, seed=SEED):
    rng = np.random.default_rng(seed)
    n = len(sub_df)
    rates, shares = [], []
    for _ in range(B):
        rs = int(rng.integers(0, 1 << 31))
        samp = sub_df.sample(n, replace=True, random_state=rs)
        w = samp["n_words"].sum(); r = samp["rights_hits"].sum(); c = samp["consent_hits"].sum()
        rates.append((r / w * 1000.0) if w>0 else 0.0)
        shares.append((r / (r+c)) if (r+c)>0 else 0.0)
    lo_rate, hi_rate = np.percentile(rates, [2.5, 97.5])
    lo_share, hi_share = np.percentile(shares, [2.5, 97.5])
    return lo_rate, hi_rate, lo_share, hi_share

cis = []
for (comp, reg, file), sub in df.groupby(["company","region","file"]):
    lo_r, hi_r, lo_s, hi_s = bootstrap_doc(sub)
    cis.append({"company":comp,"region":reg,"file":file,
                "rights_per_1000_RAW_lo": lo_r, "rights_per_1000_RAW_hi": hi_r,
                "rights_share_lo": lo_s, "rights_share_hi": hi_s})
cis = pd.DataFrame(cis)
tab2 = agg.merge(cis, on=["company","region","file"], how="left")
tab2.to_csv(OUT_TAB/"Table_2_rights_rates_RAW.csv", index=False)
tab2.to_csv(OUT_TAB/"S4_rights_share_per_doc.csv", index=False)

# Figure 2 (document rows with CI bars)
labels_lr = [f"{r.company}-{r.region}" for _, r in tab2.iterrows()]
y = tab2["rights_per_1000_RAW"].values
yerr = np.vstack([
    y - tab2["rights_per_1000_RAW_lo"].values,
    tab2["rights_per_1000_RAW_hi"].values - y
])
x = np.arange(len(labels_lr))
plt.figure(figsize=(8,6))
plt.bar(x, y)
plt.errorbar(x, y, yerr=yerr, fmt="none", capsize=3)
plt.xticks(x, labels_lr, rotation=45, ha="right")
plt.ylabel("Rights per 1,000 words (RAW)")
plt.title("Rights references per 1,000 words (with 95% bootstrap CIs)")
plt.tight_layout()
plt.savefig(OUT_FIG/"fig02_rights_per_1k_RAW.png", dpi=300)
plt.savefig(OUT_FIG/"fig02_rights_per_1k_RAW.svg"); plt.close()
(OUT_FIG/"fig02_rights_per_1k_RAW.txt").write_text(
    "Rights references per 1,000 words (RAW) with 95% bootstrap intervals; document-level. Company×region aggregates available in S4.",
    encoding="utf-8"
)

# ---------- Neutral US state-appendix identification (overrideable) ----------
DEFAULT_STATE_APP_REGEX = r"""(?ix)
  \b(ccpa|cpra|california\s+privacy\s+rights?\s+act|california\s+consumer\s+privacy\s+act)\b |
  \b(vcdpa|virginia\s+consumer\s+data\s+protection\s+act)\b |
  \b(cdpa|colorado\s+privacy\s+act)\b |
  \b(ctdpa|connecticut\s+data\s+privacy\s+act)\b |
  \b(ucpa|utah\s+consumer\s+privacy\s+act)\b |
  \b(iowa\s+consumer\s+data\s+protection\s+act)\b |
  \b(oregon\s+consumer\s+privacy\s+act)\b |
  \b(texas\s+data\s+privacy\s+and\s+security\s+act|tdpsa)\b |
  \b(montana|delaware|tennessee|indiana|nebraska|new\s+jersey|new\s+hampshire|florida)\s+(consumer\s+)?privacy\s+act\b |
  \b(state[-\s]?specific\s+(privacy\s+)?(notice|notices|disclosure|disclosures|rights))\b |
  \b(supplemental|additional)\s+(information|notice|disclosures?)\s+for\s+(u\.?\s*s\.?\s*residents|[a-z]+\s+residents)\b |
  \b(notice\s+to\s+(california|virginia|colorado|connecticut|utah|iowa|oregon|texas|montana|delaware|tennessee|indiana|nebraska|new\s+jersey|new\s+hampshire|florida)\s+residents)\b |
  \b(your\s+california\s+privacy\s+rights)\b |
  \b(do\s+not\s+sell|do\s+not\s+share|targeted\s+advertising|sale\s+of\s+personal\s+information)\b
"""
# Allow an override file next to the notebook; also save effective pattern to outputs
override_path = Path("S5_us_fairness_regex.txt")
pattern_text = override_path.read_text(encoding="utf-8") if override_path.exists() else DEFAULT_STATE_APP_REGEX
(OUT_TAB/"S5_us_fairness_regex.txt").write_text(pattern_text, encoding="utf-8")
state_app_re = re.compile(pattern_text, flags=re.IGNORECASE | re.VERBOSE | re.DOTALL)

df["us_doc"] = df["region"].eq("us")
df["state_appendix_flag"] = df.apply(lambda r: (r["us_doc"] and bool(state_app_re.search(r["paragraph"]))), axis=1)

def rights_per1k(sub):
    w = sub["n_words"].sum(); rr = sub["rights_hits"].sum()
    return (rr / w * 1000.0) if w>0 else 0.0

effects = []
for (comp, file), sub in df[df["us_doc"]].groupby(["company","file"]):
    raw_rate  = rights_per1k(sub)
    kept_rate = rights_per1k(sub[~sub["state_appendix_flag"]])
    flagged   = int(sub["state_appendix_flag"].any())
    effects.append({"company":comp,"file":file,"rights_raw":raw_rate,"rights_after":kept_rate,"flagged":flagged})
effects = pd.DataFrame(effects)
effects.to_csv(OUT_TAB/"S5_state_appendix_effects.csv", index=False)

# Diagnostics (flagged vs kept)
diag_rows = []
for (comp, file), sub in df[df["us_doc"]].groupby(["company","file"]):
    all_w = sub["n_words"].sum(); all_r = sub["rights_hits"].sum()
    flg = sub[sub["state_appendix_flag"]]; kept = sub[~sub["state_appendix_flag"]]
    fw, fr = flg["n_words"].sum(), flg["rights_hits"].sum()
    kw, kr = kept["n_words"].sum(), kept["rights_hits"].sum()
    def per1k(r,w): return (r/w*1000.0) if w>0 else 0.0
    diag_rows.append({
      "company":comp,"file":file,
      "paras_total": len(sub), "paras_flagged": len(flg),
      "rights_per_1k_RAW": per1k(all_r, all_w),
      "rights_per_1k_FLAGGED_ONLY": per1k(fr, fw),
      "rights_per_1k_KEPT_ONLY": per1k(kr, kw),
      "delta_after_minus_raw": per1k(kr, kw) - per1k(all_r, all_w)
    })
pd.DataFrame(diag_rows).to_csv(OUT_TAB/"S5c_state_appendix_diagnostics.csv", index=False)

# Figure 4 (US only, RAW vs adjusted)
if not effects.empty:
    x = np.arange(effects.shape[0])
    plt.figure(figsize=(8,6))
    plt.bar(x-0.15, effects["rights_raw"].values, width=0.3, label="RAW")
    plt.bar(x+0.15, effects["rights_after"].values, width=0.3, label="Adjusted")
    plt.xticks(x, effects["company"], rotation=0); plt.legend()
    plt.ylabel("Rights per 1,000 words")
    plt.title("US state-appendix diagnostic: RAW vs adjusted")
    plt.tight_layout()
    plt.savefig(OUT_FIG/"fig04_state_appendix_adjustment.png", dpi=300)
    plt.savefig(OUT_FIG/"fig04_state_appendix_adjustment.svg"); plt.close()
    (OUT_FIG/"fig04_state_appendix_adjustment.txt").write_text(
        "US documents: rights per 1,000 words before/after excluding state-law appendix sections identified by regex. EU values unaffected.",
        encoding="utf-8"
    )

# ---------- Modality markers (+ pronouns, doc level) ----------
ORG_DISCRETION = [
    "we may","we reserve the right","we will","at our discretion","in our sole discretion",
    "we are not responsible","we disclaim"
]
USER_AGENCY  = ["you can","you may","you have the right","you are able to"]
USER_CONSTR  = ["you may not","you cannot","you must not","prohibited"]

def count_phrase(text, phrase):
    return len(list(re.finditer(rf"\b{re.escape(phrase)}\b", text, flags=re.IGNORECASE)))
def per_1000(c, w): return (c/w*1000.0) if w>0 else 0.0

mod_rows = []
for (comp, reg, file), sub in df.groupby(["company","region","file"]):
    text = " ".join(sub["paragraph"].tolist())
    words = len(re.findall(r"\b\w+\b", text))
    org  = sum(count_phrase(text, p) for p in ORG_DISCRETION)
    ag   = sum(count_phrase(text, p) for p in USER_AGENCY)
    cons = sum(count_phrase(text, p) for p in USER_CONSTR)
    mod_rows.append({
        "company":comp,"region":reg,"file":file,
        "org_discretion_per_1k": per_1000(org, words),
        "user_agency_per_1k": per_1000(ag, words),
        "user_constraints_per_1k": per_1000(cons, words),
        "we_our": len(re.findall(r"\b(we|our)\b", text, flags=re.IGNORECASE)),
        "you_your": len(re.findall(r"\b(you|your)\b", text, flags=re.IGNORECASE)),
        "words": words
    })
mod = pd.DataFrame(mod_rows)
mod.to_csv(OUT_TAB/"S6_modality_per_doc.csv", index=False)
pd.DataFrame(mod[["company","region","file","we_our","you_your","words"]]).to_csv(OUT_TAB/"S7_pronouns_per_doc.csv", index=False)

reg = (mod.groupby("region")
         .agg(org_discretion_per_1k=("org_discretion_per_1k","mean"),
              user_agency_per_1k=("user_agency_per_1k","mean"),
              user_constraints_per_1k=("user_constraints_per_1k","mean"))
         .reset_index())
plt.figure(figsize=(8,6))
x = np.arange(reg.shape[0]); w = 0.25
plt.bar(x-w, reg["org_discretion_per_1k"], width=w, label="Organisational discretion")
plt.bar(x,   reg["user_agency_per_1k"],  width=w, label="User agency")
plt.bar(x+w, reg["user_constraints_per_1k"], width=w, label="User constraints")
plt.xticks(x, reg["region"]); plt.legend(); plt.title("Modality markers per 1,000 words (EU vs US)")
plt.tight_layout()
plt.savefig(OUT_FIG/"fig03_modality_markers.png", dpi=300)
plt.savefig(OUT_FIG/"fig03_modality_markers.svg"); plt.close()
(OUT_FIG/"fig03_modality_markers.txt").write_text(
    "Modality markers per 1,000 words aggregated by region. Organisational discretion vs user agency vs user constraints.",
    encoding="utf-8"
)

# ---------- Simple significance tests (doc-level EU vs US) ----------
def mwu_row(name, eu, us):
    if len(eu)==0 or len(us)==0: return {"metric":name, "stat": float("nan"), "p_value": float("nan")}
    stat, p = mannwhitneyu(eu, us, alternative="two-sided")
    return {"metric":name, "stat": float(stat), "p_value": float(p)}
rows_tests = []
rows_tests.append(mwu_row("rights_share",
                          tab2.loc[tab2["region"]=="eu","rights_share"],
                          tab2.loc[tab2["region"]=="us","rights_share"]))
for col in ["org_discretion_per_1k","user_agency_per_1k","user_constraints_per_1k"]:
    rows_tests.append(mwu_row(col,
                              mod.loc[mod["region"]=="eu", col],
                              mod.loc[mod["region"]=="us", col]))
pd.DataFrame(rows_tests).to_csv(OUT_TAB/"S1b_document_level_tests.csv", index=False)

# ---------- Class balance (Table 0 + fig00) ----------
bal = meta[["company","region","file","n_paras"]].copy()
bal.to_csv(OUT_TAB/"Table_0_class_balance.csv", index=False)
plt.figure(figsize=(8,6))
labels_cb = bal["company"] + "-" + bal["region"]
x = np.arange(len(bal)); plt.bar(x, bal["n_paras"].values)
plt.xticks(x, labels_cb, rotation=45, ha="right"); plt.title("Paragraph count per company–region")
plt.tight_layout()
plt.savefig(OUT_FIG/"fig00_class_balance.png", dpi=300)
plt.savefig(OUT_FIG/"fig00_class_balance.svg"); plt.close()
(OUT_FIG/"fig00_class_balance.txt").write_text(
    "Class balance: paragraph counts per company–region used in the classifier.", encoding="utf-8")

# ---------- Supplement README + bundle + manifest ----------
readme = (
  "Supplement bundle — AI Privacy Policies Audit (Nov 2025)\n"
  "Methods: strict \\n\\n+ paragraphing (>=30 chars); TF–IDF(1–3,min_df=2,L2,English), brand ablation via union regex;\n"
  "NMF topics (K=6 primary, seed=42); Hungarian match on top-12 Jaccard; LOGO classifier on ablated features; 5-fold sensitivity;\n"
  "Rights/consent per 1k with doc bootstrap CIs; neutral US state-appendix identification (overrideable regex);\n"
  "Modality markers (org discretion, user agency, user constraints). Figures are single-plot matplotlib PNG+SVG; captions as .txt.\n"
  f"Generated UTC: {datetime.datetime.now(datetime.timezone.utc).isoformat()}\n"
)
(OUT_SUPP/"README.txt").write_text(readme, encoding="utf-8")

# Manifest (sha256 + size for everything under outputs/)
manifest = []
for root, _, files in os.walk("outputs"):
    for fn in files:
        p = Path(root)/fn
        if p.is_file():
            h = hashlib.sha256(p.read_bytes()).hexdigest()
            manifest.append({"path": str(p.relative_to("outputs")), "sha256": h, "bytes": p.stat().st_size})
(OUT_META/"manifest.json").write_text(json.dumps({"generated_at": datetime.datetime.utcnow().isoformat()+"Z",
                                                  "manifest": manifest}, indent=2),
                                      encoding="utf-8")

# Bundle core supplement tables/figures/meta + README
bundle = OUT_SUPP/"supplement_bundle_tp_robustness.zip"
with zipfile.ZipFile(bundle, "w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(OUT_SUPP/"README.txt", arcname="README.txt")
    for sub in ("figures","tables","meta"):
        base = Path("outputs")/sub
        for root, _, files in os.walk(base):
            for fn in files:
                p = Path(root)/fn
                z.write(p, arcname=str(p.relative_to("outputs")))

sha = hashlib.sha256(bundle.read_bytes()).hexdigest()
(OUT_SUPP/"ZIP_SHA256.txt").write_text(sha+"\n", encoding="utf-8")

# ---------- FINAL SUMMARY ----------
print("\n==== FINAL SUMMARY ====")
print(meta[["company","region","n_paras","n_words"]].to_string(index=False))
print(f"\nPrimary topic stability (K=6, seed=42): mean Jaccard = {mj_primary:.3f}")
print(f"Classifier LOGO: accuracy = {acc_mean:.3f} ± {acc_sd:.3f} | macro-F1 = {f1_mean:.3f} ± {f1_sd:.3f}")
panel_out = (tab2.groupby(['company','region'])
             .agg(rights_per_1000_RAW=('rights_per_1000_RAW','mean'))
             .reset_index())
print("\nRights per 1k (document mean by company×region):")
print(panel_out.to_string(index=False))
if not effects.empty:
    print("\nUS state-appendix deltas (per 1k):")
    print(effects[["company","rights_raw","rights_after","flagged"]].to_string(index=False))
reg_print = (mod.groupby("region")
             .agg(org_discretion_per_1k=("org_discretion_per_1k","mean"),
                  user_agency_per_1k=("user_agency_per_1k","mean"),
                  user_constraints_per_1k=("user_constraints_per_1k","mean"))
             .reset_index())
print("\nModality aggregates (mean per region):")
print(reg_print.to_string(index=False))
print("\nSaved figures → outputs/figures; tables → outputs/tables; meta → outputs/meta")
print("Supplement bundle:", bundle)
print("Supplement SHA256:", sha)

# Convenience: zip everything for download, try Colab auto-download
ALL_ZIP = Path(f"ai_policy_audit_ALL_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.zip")
with zipfile.ZipFile(ALL_ZIP, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk("outputs"):
        for fn in files:
            p = Path(root)/fn
            z.write(p, arcname=str(p))
    # include source policy files
    for stem in EXPECTED_STEMS:
        for ext in (".txt",".html"):
            p = Path(f"{stem}{ext}")
            if p.exists(): z.write(p, arcname=str(p))
try:
    from google.colab import files
    files.download(str(ALL_ZIP))
except Exception:
    pass

google_eu.html -> google_eu.txt | paragraphs=469
google_us.html -> google_us.txt | paragraphs=260
meta_eu.html -> meta_eu.txt | paragraphs=1458
meta_us.html -> meta_us.txt | paragraphs=1020
openai_eu.html -> openai_eu.txt | paragraphs=86
openai_us.html -> openai_us.txt | paragraphs=88
xai_eu.html -> xai_eu.txt | paragraphs=50
xai_us.html -> xai_us.txt | paragraphs=113

Policy metadata:
 company region          file  n_paras  n_words
 google     eu google_eu.txt      469    15823
 google     us google_us.txt      260     8309
   meta     eu   meta_eu.txt     1458    27815
   meta     us   meta_us.txt     1020    19013
 openai     eu openai_eu.txt       86     2891
 openai     us openai_us.txt       88     3163
    xai     eu    xai_eu.txt       50     1938
    xai     us    xai_us.txt      113     5704
Vocab sizes — baseline: 21170 | ablated: 20530

==== FINAL SUMMARY ====
company region  n_paras  n_words
 google     eu      469    15823
 google     us      260     8309
   meta     eu  

  (OUT_META/"manifest.json").write_text(json.dumps({"generated_at": datetime.datetime.utcnow().isoformat()+"Z",


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>