In [2]:
# === Select top "V-only vs VAD" disagreement chunks and export ============
from pathlib import Path
import numpy as np
import pandas as pd
import textwrap

# Project layout, resolved relative to the notebook's working directory.
ROOT      = Path("..")  # run from emobook/notebooks/
SCORE_DIR = ROOT / "scored_v21"  # input folder: per-book "*.scored_v21.csv" files
OUT_DIR   = ROOT / "review"      # output folder: review CSVs and the Excel workbook
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder up front (no error if present)

def _odd(n): 
    n = int(round(n))
    return n if n % 2 == 1 else max(1, n-1)

def load_scored(book_stem: str):
    """Load ``<book_stem>.scored_v21.csv`` from SCORE_DIR.

    Rows are ordered by ``chunk_id`` and the index is rebuilt as 0..n-1.
    """
    csv_path = SCORE_DIR / f"{book_stem}.scored_v21.csv"
    scored = pd.read_csv(csv_path)
    scored = scored.sort_values("chunk_id")
    return scored.reset_index(drop=True)

def add_rolls(df: pd.DataFrame, win=None, frac=0.01):
    """Return ``(copy_of_df_with_roll_columns, window)``.

    Adds ``v_roll``, ``a_roll``, ``d_roll`` and ``coverage_roll``: centered
    rolling means over ``win`` rows (partial windows allowed at the edges).
    When ``win`` is None it is derived from ``frac`` of the row count,
    floored at 5 and forced odd via ``_odd``.
    """
    if win is None:
        win = _odd(max(5, int(len(df) * frac)))
    smoothed = {
        f"{name}_roll": df[name].rolling(win, center=True, min_periods=1).mean()
        for name in ("v", "a", "d", "coverage")
    }
    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(**smoothed), win

def zscore(x):
    """Standardize *x* with NaN-aware mean/std; 1e-9 is added to the std to avoid division by zero."""
    centered = x - np.nanmean(x)
    spread = np.nanstd(x) + 1e-9
    return centered / spread

def build_disagreement_table(book_stem: str, win=None, frac=0.01):
    """Build the per-chunk "V-only vs VAD" comparison table for one book.

    Loads the scored CSV, smooths v/a/d with ``add_rolls``, and returns a frame
    with the rolled values plus:
      - ``vad_fused``: Euclidean norm of (v, a, d) carrying the sign of v,
      - ``diff_signed`` / ``diff_abs``: ``vad_fused - v_roll`` and its magnitude,
      - ``dom_dim``: which of V/A/D has the largest absolute z-score at that row,
      - ``roll_win``: the rolling window that was used.
    """
    rolled, win = add_rolls(load_scored(book_stem), win=win, frac=frac)
    v, a, d = rolled["v_roll"], rolled["a_roll"], rolled["d_roll"]

    # Signed intensity of the smoothed VAD vector.
    magnitude = np.sqrt(v**2 + a**2 + d**2)
    vad_fused = np.sign(v) * magnitude
    diff = vad_fused - v

    # Row-wise dominant dimension, judged in z-score space.
    abs_z = np.vstack([np.abs(zscore(series)) for series in (v, a, d)])
    labels = np.array(["V", "A", "D"])
    dom = labels[np.argmax(abs_z, axis=0)]

    base = rolled[["book", "chunk_id", "text", "coverage", "v_roll", "a_roll", "d_roll"]].copy()
    return base.assign(
        vad_fused=vad_fused,
        diff_abs=np.abs(diff),
        diff_signed=diff,
        dom_dim=dom,
        roll_win=win,
    )

def pick_top_k_diverse(tbl: pd.DataFrame, k=10, min_gap=15, balanced=True):
    """
    Pick up to k rows with the largest ``diff_abs``, keeping chosen chunks apart.

    Args:
        tbl: table with at least ``chunk_id``, ``diff_abs`` and ``diff_signed``.
        k: maximum number of rows to return; ``k <= 0`` gives an empty selection.
        min_gap: a candidate is skipped when its chunk_id is closer than this
            to any chunk_id already chosen.
        balanced: if True, take up to ``k // 2`` rows with ``diff_signed > 0``
            first, then fill up to k from rows with ``diff_signed < 0``, and
            finally top up from the whole table if still short. If False, rank
            the whole table by ``diff_abs`` only.

    Returns:
        The chosen rows (all columns of ``tbl``), sorted by ``chunk_id`` with a
        fresh 0..n-1 index. Empty (but with ``tbl``'s columns) when nothing
        qualifies.
    """
    # A 0..n-1 index makes row labels usable as positions for the final iloc.
    work = tbl.reset_index(drop=True)
    ranked = work.sort_values("diff_abs", ascending=False)

    picked = []        # row positions, in the order they were chosen
    used_ids = set()   # chunk_ids already chosen, for the spacing test

    def _take(cand, target_total):
        """Add rows from *cand* until *target_total* rows are picked overall."""
        for position, raw_id in cand["chunk_id"].items():
            # Test the quota before taking a row, so a target of 0 (or one
            # that is already met) adds nothing.
            if len(picked) >= target_total:
                break
            chunk = int(raw_id)
            if position in picked:
                continue  # never return the same row twice
            if any(abs(chunk - other) < min_gap for other in used_ids):
                continue
            picked.append(position)
            used_ids.add(chunk)

    if balanced:
        _take(ranked[ranked["diff_signed"] > 0], k // 2)
        _take(ranked[ranked["diff_signed"] < 0], k)  # fill remaining from neg first
        if len(picked) < k:
            _take(ranked, k)
    else:
        _take(ranked, k)

    # Selecting rows (instead of rebuilding a frame from row Series) keeps the
    # columns and dtypes even when nothing was picked.
    return work.iloc[picked].sort_values("chunk_id").reset_index(drop=True)

def add_neighbor_context(df_all: pd.DataFrame, sel: pd.DataFrame, left=1, right=1, wrap=None):
    """
    Add neighbor-chunk context columns to each selected row.

    Args:
        df_all: every chunk of the book; needs ``chunk_id`` and ``text`` columns.
        sel: the selected rows; needs a ``chunk_id`` column.
        left: how many chunk_ids back the "prev" neighbor sits (0 disables it).
        right: how many chunk_ids ahead the "next" neighbor sits (0 disables it).
        wrap: if truthy, neighbor texts are shortened to this many characters
            with ``textwrap.shorten``; otherwise they are kept whole.

    Returns:
        A new frame with the columns of ``sel`` followed by ``prev_chunk_id``,
        ``next_chunk_id``, ``prev_text`` and ``next_text``. A neighbor that is
        disabled or does not exist in ``df_all`` gives id ``None`` and text "".
    """
    # Look neighbors up by chunk_id value rather than by index label, so the
    # result stays correct when ids do not start at 0 or have gaps.
    text_by_id = dict(zip(df_all["chunk_id"], df_all["text"]))

    def _neighbor(chunk, offset):
        """Return (neighbor_chunk_id, neighbor_text) for chunk + offset."""
        if not offset:
            return None, ""
        target = chunk + offset
        if target not in text_by_id:
            return None, ""
        text = str(text_by_id[target])
        if wrap:
            text = textwrap.shorten(text, width=wrap, placeholder="…")
        return target, text

    rows = []
    for _, selected in sel.iterrows():
        chunk = int(selected["chunk_id"])
        # The reported ids always name the chunk whose text is shown.
        prev_id, prev_text = _neighbor(chunk, -left)
        next_id, next_text = _neighbor(chunk, right)
        row = selected.to_dict()
        row["prev_chunk_id"] = prev_id
        row["next_chunk_id"] = next_id
        row["prev_text"] = prev_text
        row["next_text"] = next_text
        rows.append(row)

    if not rows:
        # Keep the expected columns so callers can still select them by name.
        extra = ["prev_chunk_id", "next_chunk_id", "prev_text", "next_text"]
        return pd.DataFrame(columns=[*sel.columns, *extra])
    return pd.DataFrame(rows)

def export_reviews(k=10, min_gap=15, win=None, frac=0.01, wrap=300):
    """
    Export the top-k "V-only vs VAD" disagreement chunks for every scored book.

    For each ``*.scored_v21.csv`` in SCORE_DIR this writes
    ``<stem>.review_top<k>.csv`` to OUT_DIR and adds one sheet to
    ``V_vs_VAD_review.xlsx``; a final ``INDEX`` sheet summarizes all books.

    Args:
        k: number of rows to select per book.
        min_gap: minimum chunk_id distance between selected rows.
        win: rolling window; None derives it from ``frac``.
        frac: fraction of the book length used to derive the window.
        wrap: character limit for neighbor texts (CSV and Excel) and for the
            main chunk text in the Excel sheet. The CSV keeps the main text whole.
    """
    excel_path = OUT_DIR / "V_vs_VAD_review.xlsx"
    index = []

    # The context manager closes the workbook even if one book fails midway.
    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
        for scored_csv in sorted(SCORE_DIR.glob("*.scored_v21.csv")):
            stem = scored_csv.stem.replace(".scored_v21", "")
            tbl = build_disagreement_table(stem, win=win, frac=frac)
            diverse = pick_top_k_diverse(tbl, k=k, min_gap=min_gap, balanced=True)

            # tbl already holds every chunk's id and text in chunk order, so it
            # serves as the neighbor lookup without re-reading the CSV.
            diverse_ctx = add_neighbor_context(tbl, diverse, left=1, right=1, wrap=wrap)

            # nicer column order
            cols = ["book","chunk_id","dom_dim","coverage","v_roll","a_roll","d_roll",
                    "vad_fused","diff_abs","diff_signed","text","prev_chunk_id","prev_text","next_chunk_id","next_text","roll_win"]
            diverse_ctx = diverse_ctx[cols]

            # Write per-book CSV and Excel sheet
            csv_out = OUT_DIR / f"{stem}.review_top{k}.csv"
            diverse_ctx.to_csv(csv_out, index=False)
            # For Excel, shorten main chunk text so the sheet is readable; CSV keeps full.
            to_xlsx = diverse_ctx.copy()
            to_xlsx["text"] = to_xlsx["text"].apply(lambda s: textwrap.shorten(str(s), width=wrap, placeholder="…"))
            # Excel limits sheet names to 31 characters.
            to_xlsx.to_excel(writer, sheet_name=stem[:31], index=False)

            index.append({"book": stem, "saved_csv": str(csv_out.name), "num_rows": len(diverse_ctx),
                          "avg_cov_selected": round(diverse_ctx["coverage"].mean(), 3)})

            print(f"✔ {stem}: wrote {csv_out.name} and Excel sheet")

        pd.DataFrame(index).to_excel(writer, sheet_name="INDEX", index=False)

    print(f"\nAll done → {excel_path}")

# ---- RUN ----
# Export the top 10 review rows per book: selected chunks at least 15 chunk_ids
# apart, rolling window derived from 1% of the book length (win=None), and
# review texts shortened to 320 characters.
export_reviews(k=10, min_gap=15, win=None, frac=0.01, wrap=320)


✔ Frankenstein.clean: wrote Frankenstein.clean.review_top10.csv and Excel sheet
✔ Mobi Dick.clean: wrote Mobi Dick.clean.review_top10.csv and Excel sheet
✔ Pride and Prejudice.clean: wrote Pride and Prejudice.clean.review_top10.csv and Excel sheet
✔ Romeo and Juliet.clean: wrote Romeo and Juliet.clean.review_top10.csv and Excel sheet
✔ The Adventures of Sherlock Holmes.clean: wrote The Adventures of Sherlock Holmes.clean.review_top10.csv and Excel sheet

All done → ../review/V_vs_VAD_review.xlsx


In [3]:
from pathlib import Path
import numpy as np, pandas as pd, textwrap

# Project layout, resolved relative to the notebook's working directory.
ROOT      = Path("..")
SCORE_DIR = ROOT / "scored_v21"  # input folder: per-book "*.scored_v21.csv" files
OUT_DIR   = ROOT / "review"      # output folder: review CSVs and the Excel workbooks
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder up front (no error if present)

def _odd(n): n=int(round(n)); return n if n%2==1 else max(1,n-1)

def load_scored(stem):
    """Read ``<stem>.scored_v21.csv`` from SCORE_DIR, ordered by ``chunk_id`` with a 0..n-1 index."""
    source = SCORE_DIR / f"{stem}.scored_v21.csv"
    frame = pd.read_csv(source)
    ordered = frame.sort_values("chunk_id")
    return ordered.reset_index(drop=True)

def add_rolls(df, frac=0.01, win=None):
    """Return ``(smoothed_copy, window)`` with ``*_roll`` columns for v, a, d and coverage.

    Each ``*_roll`` column is a centered rolling mean over ``window`` rows
    (partial windows allowed at the edges). When ``win`` is None the window is
    ``frac`` of the row count, floored at 5 and forced odd via ``_odd``.
    """
    window = win if win is not None else _odd(max(5, int(len(df) * frac)))
    smoothed = df.copy()  # leave the caller's frame untouched
    for name in ("v", "a", "d", "coverage"):
        series = smoothed[name]
        smoothed[name + "_roll"] = series.rolling(window, center=True, min_periods=1).mean()
    return smoothed, window

def zscore(x):
    """Standardize *x* using NaN-aware mean and std (std padded by 1e-9 to avoid dividing by zero)."""
    mean = np.nanmean(x)
    std = np.nanstd(x) + 1e-9
    return (x - mean) / std

def enrich_disagreement(stem, frac=0.01, win=None, z_thr=0.8):
    """Build the per-chunk V-vs-VAD table for one book, with directional metrics.

    On top of the rolled v/a/d values the result carries:
      - ``vad_fused``, ``diff_signed``, ``diff_abs``: signed VAD norm and its
        difference from ``v_roll``,
      - ``Vz``, ``Az``, ``Dz``: z-scores of the rolled series,
      - ``opp_A`` / ``opp_D``: z-score sign differs from Vz's and |z| >= ``z_thr``,
      - ``theta_deg``: angle in degrees between the VAD vector and the valence axis,
      - ``orth_mag``: norm of the (a, d) components,
      - ``roll_win``: the rolling window that was used.
    """
    rolled, win = add_rolls(load_scored(stem), frac=frac, win=win)
    v, a, d = rolled["v_roll"], rolled["a_roll"], rolled["d_roll"]

    # Signed intensity of the smoothed VAD vector and its gap to valence alone.
    norm = np.sqrt(v**2 + a**2 + d**2)
    vad_fused = np.sign(v) * norm
    diff = vad_fused - v

    # Opposition is judged in z-space: sign mismatch with V and a large enough |z|.
    Vz, Az, Dz = (zscore(series) for series in (v, a, d))
    opp_A = (np.sign(Az) != np.sign(Vz)) & (np.abs(Az) >= z_thr)
    opp_D = (np.sign(Dz) != np.sign(Vz)) & (np.abs(Dz) >= z_thr)

    # Angle from the valence axis; the 1e-9 pad guards against a zero-length vector.
    cos_theta = np.clip(np.abs(v) / (norm + 1e-9), 0.0, 1.0)
    theta_deg = np.degrees(np.arccos(cos_theta))
    orth = np.sqrt(a**2 + d**2)

    base = rolled[["book", "chunk_id", "text", "coverage", "v_roll", "a_roll", "d_roll"]].copy()
    return base.assign(
        vad_fused=vad_fused,
        diff_abs=np.abs(diff),      # intensity difference (non-directional)
        diff_signed=diff,           # carries the sign of v_roll by construction
        Vz=Vz,
        Az=Az,
        Dz=Dz,
        opp_A=opp_A,
        opp_D=opp_D,
        theta_deg=theta_deg,        # directional deviation from V
        orth_mag=orth,              # how much A/D energy is present
        roll_win=win,
    )

def pick_review_rows(tbl, k=10, min_gap=15, mode="mixed"):
    """
    Select up to k review rows from ``tbl``, keeping chosen chunk_ids apart.

    mode:
      - 'intensity': top by diff_abs (your old criterion)
      - 'angle': top by theta_deg (directional difference)
      - 'orth': top by orth_mag (A/D energy)
      - 'opp': prioritize chunks with opp_A or opp_D, then by theta
      - 'mixed': take ~k/2 from 'opp', rest from 'angle'
      Any other value is treated as 'mixed'.

    Args:
        tbl: table with ``chunk_id`` plus the columns the chosen mode ranks by.
        k: maximum number of rows to return; ``k <= 0`` gives an empty frame.
        min_gap: a candidate is skipped when its chunk_id is closer than this
            to any chunk_id already chosen (in 'mixed' mode this also holds
            between the two phases).
        mode: ranking strategy, see above.

    Returns:
        The chosen rows with all columns of ``tbl``. Single-criterion modes
        return them in rank order with their original index labels; 'mixed'
        returns them sorted by ``chunk_id`` with a fresh 0..n-1 index.
    """
    # A 0..n-1 index makes row labels usable as positions for the final iloc.
    work = tbl.reset_index(drop=True)

    def ranked(criterion):
        """Return *work* ordered best-first for one single-criterion mode."""
        if criterion == "intensity":
            return work.sort_values("diff_abs", ascending=False)
        if criterion == "orth":
            return work.sort_values("orth_mag", ascending=False)
        if criterion == "opp":
            is_opp = work["opp_A"] | work["opp_D"]
            # Row labels are kept (no ignore_index) so they still point into work.
            return pd.concat([
                work[is_opp].sort_values(["theta_deg", "orth_mag"], ascending=False),
                work[~is_opp].sort_values("theta_deg", ascending=False),
            ])
        return work.sort_values("theta_deg", ascending=False)

    picked = []        # row positions, in the order they were chosen
    used_ids = set()   # chunk_ids already chosen, for the spacing test

    def spaced(rows, target_total):
        """Add rows from *rows* until *target_total* rows are picked overall."""
        for position, raw_id in rows["chunk_id"].items():
            # Test the quota before taking a row so a target of 0 adds nothing.
            if len(picked) >= target_total:
                break
            chunk = int(raw_id)
            if position in picked:
                continue  # never return the same row twice
            if any(abs(chunk - other) < min_gap for other in used_ids):
                continue
            picked.append(position)
            used_ids.add(chunk)

    if mode in ("intensity", "angle", "orth", "opp"):
        spaced(ranked(mode), k)
        return tbl.iloc[picked]

    # mixed: about half from 'opp', then fill the rest of the quota from 'angle'.
    spaced(ranked("opp"), k // 2)
    spaced(ranked("angle"), k)
    return tbl.iloc[picked].sort_values("chunk_id").reset_index(drop=True)

def export_reviews_with_metrics(k=10, min_gap=15, frac=0.01, mode="mixed", wrap=320):
    """
    Export review rows with directional metrics for every scored book.

    For each ``*.scored_v21.csv`` in SCORE_DIR this writes
    ``<stem>.review_top<k>_<mode>.csv`` to OUT_DIR and adds one sheet to
    ``V_vs_VAD_review_enhanced_<mode>.xlsx``; a final ``INDEX`` sheet holds
    per-book summary statistics of the selected rows.

    Args:
        k: number of rows to select per book.
        min_gap: minimum chunk_id distance between selected rows.
        frac: fraction of the book length used to derive the rolling window.
        mode: selection strategy passed to ``pick_review_rows``.
        wrap: character limit for the chunk text in the Excel sheet; the CSV
            keeps the text whole.
    """
    xlsx = OUT_DIR / f"V_vs_VAD_review_enhanced_{mode}.xlsx"
    idx_rows = []

    # The context manager closes the workbook even if one book fails midway.
    with pd.ExcelWriter(xlsx, engine="xlsxwriter") as writer:
        for f in sorted(SCORE_DIR.glob("*.scored_v21.csv")):
            stem = f.stem.replace(".scored_v21", "")
            T = enrich_disagreement(stem, frac=frac)
            sel = pick_review_rows(T, k=k, min_gap=min_gap, mode=mode)

            # Compact sheet (shortened text), full CSV too
            cols = ["book","chunk_id","coverage",
                    "v_roll","a_roll","d_roll","vad_fused",
                    "diff_abs","diff_signed","theta_deg","orth_mag","opp_A","opp_D","text"]
            short = sel[cols].copy()
            short["text"] = short["text"].apply(lambda s: textwrap.shorten(str(s), width=wrap, placeholder="…"))
            # Excel limits sheet names to 31 characters.
            short.to_excel(writer, sheet_name=stem[:31], index=False)

            csv_out = OUT_DIR / f"{stem}.review_top{k}_{mode}.csv"
            sel[cols].to_csv(csv_out, index=False)

            idx_rows.append({
                "book": stem, "rows": len(sel),
                "opp_frac": float((sel["opp_A"]|sel["opp_D"]).mean()),
                "mean_theta": round(float(sel["theta_deg"].mean()),2),
                "mean_orth": round(float(sel["orth_mag"].mean()),3),
                "mean_diff_abs": round(float(sel["diff_abs"].mean()),3),
                "sheet": stem[:31]
            })
            print(f"✔ {stem}: saved {csv_out.name} & Excel sheet")

        pd.DataFrame(idx_rows).to_excel(writer, sheet_name="INDEX", index=False)

    print(f"\nAll done → {xlsx}")

# Pick 10 rows per book in "mixed" mode (about half ranked opposition-first,
# the rest ranked by angle from the valence axis), which emphasizes true
# V-vs-VAD disagreements. frac and wrap keep their defaults.
export_reviews_with_metrics(k=10, min_gap=15, mode="mixed")


✔ Frankenstein.clean: saved Frankenstein.clean.review_top10_mixed.csv & Excel sheet
✔ Mobi Dick.clean: saved Mobi Dick.clean.review_top10_mixed.csv & Excel sheet
✔ Pride and Prejudice.clean: saved Pride and Prejudice.clean.review_top10_mixed.csv & Excel sheet
✔ Romeo and Juliet.clean: saved Romeo and Juliet.clean.review_top10_mixed.csv & Excel sheet
✔ The Adventures of Sherlock Holmes.clean: saved The Adventures of Sherlock Holmes.clean.review_top10_mixed.csv & Excel sheet

All done → ../review/V_vs_VAD_review_enhanced_mixed.xlsx
