In [5]:
# --- Robust paths: works from repo root OR notebooks/ ----------------------
from pathlib import Path

BASE = Path.cwd()

def pick_dir(label, candidates, must_contain=None, pattern=None):
    """
    Return the first candidate directory that satisfies the given criteria.

    Each entry of `candidates` is joined onto BASE and resolved. A directory
    qualifies when it exists and, if any criterion is supplied, it either
    contains the file `must_contain` or has at least one entry matching the
    glob `pattern`. With no criterion, the first existing directory wins.

    Parameters
    ----------
    label : str
        Name used in the printed confirmation and in the error message.
    candidates : iterable of str or Path
        Paths to try in order (relative ones are taken relative to BASE).
    must_contain : str, optional
        File name that should exist directly inside the directory.
    pattern : str, optional
        Glob pattern; one match inside the directory is enough.

    Returns
    -------
    pathlib.Path
        The resolved directory (its path is also printed).

    Raises
    ------
    FileNotFoundError
        If no candidate qualifies.
    """
    for rel in candidates:
        d = (BASE / rel).resolve()
        if not d.is_dir():
            continue
        if must_contain or pattern:
            has_file = bool(must_contain) and (d / must_contain).exists()
            # The pattern must be honoured even when `must_contain` is not
            # given; any() stops at the first glob match.
            has_match = (not has_file) and bool(pattern) and any(d.glob(pattern))
            if not (has_file or has_match):
                continue
        print(f"{label}: {d}")
        return d
    raise FileNotFoundError(f"Couldn't find {label}. Tried: {candidates}")

# find chunks dir (has *.chunks.csv)
# Candidates are tried in order, relative to BASE (the working directory), so
# the lookup works from the repo root as well as one or two levels below it.
CHUNK_DIR = pick_dir(
    "CHUNK_DIR",
    ["chunks", "../chunks", "../../chunks"],
    pattern="*.chunks.csv",
)

# find scored dir (prefer scored_v21/, else scored/)
# Preference is expressed purely by list order: every scored_v21 location is
# tried before any scored location.
SCORED_DIR = pick_dir(
    "SCORED_DIR",
    ["scored_v21", "../scored_v21", "../../scored_v21", "scored", "../scored", "../../scored"],
    pattern="*.scored_v21.csv",
)

# where to write turning-point outputs
# NOTE(review): this is created under BASE (the current working directory),
# not the repo root - when the notebook runs from notebooks/ the outputs land
# in notebooks/turning_points. Confirm which location is intended.
OUT_DIR = (BASE / "turning_points")
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("OUT_DIR:", OUT_DIR.resolve())


CHUNK_DIR: /Users/nageshs/Desktop/college/emobook/emobook/chunks
SCORED_DIR: /Users/nageshs/Desktop/college/emobook/emobook/scored_v21
OUT_DIR: /Users/nageshs/Desktop/college/emobook/emobook/notebooks/turning_points


In [8]:
# TURNING-POINT TAGGER + V vs VAD COMPARISON
# ------------------------------------------
from pathlib import Path
import pandas as pd
import numpy as np
import re, textwrap



# ---- Helpers ----------------------------------------------------------------
def _read_csv_maybe(path):
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path}")
    return pd.read_csv(path)

import pandas as pd
import numpy as np

def load_book_frames(book_stem: str, roll_win_default=15):
    """
    Join chunk texts (from CHUNK_DIR) with scores (from SCORED_DIR).

    Reads `<book_stem>.chunks.csv` and `<book_stem>.scored_v21.csv`; if the
    latter is missing, the alphabetically first file matching
    `<book_stem>*.scored*.csv` is used instead. Rows are ordered by
    `chunk_id` *before* any rolling statistic is computed, so the smoothing
    window always runs over consecutive chunks.

    Parameters
    ----------
    book_stem : str
        File stem shared by the chunk and scored CSVs.
    roll_win_default : int
        Window of the centred rolling mean used to build a missing
        v_roll / a_roll / d_roll column from its raw v / a / d column.

    Returns
    -------
    pandas.DataFrame
        The scored rows sorted by `chunk_id` with a fresh 0..n-1 index and
        guaranteed columns `text`, `text_from_chunks`, `v_roll`, `a_roll`,
        `d_roll` and `vad_fused` (the last four numeric; a rolled column is
        all-NaN when neither it nor its raw source exists, which also makes
        a computed `vad_fused` NaN).

    Raises
    ------
    FileNotFoundError
        If the chunk CSV or every possible scored CSV is missing.
    KeyError
        If the chunk CSV has no `text` column.
    """
    chunks_path = CHUNK_DIR / f"{book_stem}.chunks.csv"
    scored_path = SCORED_DIR / f"{book_stem}.scored_v21.csv"

    if not chunks_path.exists():
        raise FileNotFoundError(f"Missing chunk CSV: {chunks_path}")
    if not scored_path.exists():
        # fallback: pick any scored file that matches the stem; sorted() makes
        # the choice deterministic (glob order is filesystem dependent)
        cand = sorted(SCORED_DIR.glob(f"{book_stem}*.scored*.csv"))
        if not cand:
            raise FileNotFoundError(f"Missing scored CSV: {scored_path}")
        scored_path = cand[0]

    df_chunks = pd.read_csv(chunks_path)
    df_scored = pd.read_csv(scored_path)

    # Files without an explicit id are assumed to be in chunk order.
    if "chunk_id" not in df_chunks:
        df_chunks["chunk_id"] = np.arange(len(df_chunks))
    if "chunk_id" not in df_scored:
        df_scored["chunk_id"] = np.arange(len(df_scored))
    if "text" not in df_chunks:
        raise KeyError(f"'text' column missing in chunk CSV: {chunks_path}")

    # Bring the chunk text in under its own name so it can never collide with
    # (or silently replace) a `text` column already present in the scores.
    chunk_text = (df_chunks[["chunk_id", "text"]]
                  .rename(columns={"text": "text_from_chunks"}))
    base = df_scored.merge(chunk_text, on="chunk_id", how="left")
    if "text" in base.columns:
        # scored text wins; chunk text only fills the gaps
        base["text"] = base["text"].fillna(base["text_from_chunks"])
    else:
        # scored file carries no text at all: take it from the chunks
        base["text"] = base["text_from_chunks"]

    # Order by chunk first: the rolling means below are position based.
    base = base.sort_values("chunk_id", kind="stable").reset_index(drop=True)

    # ensure rolled cols (numeric, so the fused formula cannot hit strings)
    for raw, roll in [("v", "v_roll"), ("a", "a_roll"), ("d", "d_roll")]:
        if roll not in base.columns:
            if raw in base.columns:
                base[roll] = (pd.to_numeric(base[raw], errors="coerce")
                              .rolling(roll_win_default, center=True, min_periods=1)
                              .mean())
            else:
                base[roll] = np.nan
        base[roll] = pd.to_numeric(base[roll], errors="coerce")

    # ensure fused (if your scored_v21 already has it, this is no-op):
    # magnitude of the (v, a, d) vector carrying the sign of valence
    if "vad_fused" not in base.columns:
        signed = np.sign(base["v_roll"])
        mag = np.sqrt(base["v_roll"]**2 + base["a_roll"]**2 + base["d_roll"]**2)
        base["vad_fused"] = signed * mag
    base["vad_fused"] = pd.to_numeric(base["vad_fused"], errors="coerce")

    return base


def zscore(x):
    """
    Standardise `x` to zero mean and unit spread, ignoring NaNs.

    The input is converted to a float array; NaN entries stay NaN in the
    result. A small constant (1e-8) is added to the standard deviation so a
    constant series yields zeros instead of a division by zero.
    """
    values = np.asarray(x, float)
    centre = np.nanmean(values)
    spread = np.nanstd(values) + 1e-8
    return (values - centre) / spread

def find_chunks_for_keywords(text_series, any_terms=None, all_terms=None, regex=None):
    """
    Locate chunks whose text satisfies at least one of three criteria.

    A chunk matches (case-insensitively) when
      - the regex pattern is found in it, OR
      - it contains any one of the strings in any_terms, OR
      - it contains every string in all_terms.
    Terms are compared as plain substrings of the lower-cased text. Missing
    texts are treated as empty strings. With no criterion given, nothing
    matches.

    Returns the list of positional indices (0-based, in series order).
    """
    wanted_any = [term.lower() for term in (any_terms or [])]
    wanted_all = [term.lower() for term in (all_terms or [])]
    compiled = re.compile(regex, flags=re.I) if regex else None

    def _matches(raw):
        # regex first, then the "any" list, then the "all" list
        if compiled is not None and compiled.search(raw):
            return True
        lowered = raw.lower()
        if wanted_any and any(term in lowered for term in wanted_any):
            return True
        return bool(wanted_all) and all(term in lowered for term in wanted_all)

    texts = text_series.fillna("").astype(str)
    return [pos for pos, raw in enumerate(texts) if _matches(raw)]

# -------- improved local-peak finder ----------
def local_peak(series_z, center_idx, half_window=8):
    """
    Find the largest-magnitude z value near `center_idx`.

    The search window spans `half_window` positions on either side of
    `center_idx`, clipped to the bounds of `series_z`; NaNs are ignored.

    Returns (peak_idx, peak_z, dist_to_center), where peak_z keeps its sign
    and dist_to_center is peak_idx - center_idx. When the window is empty or
    holds only NaNs the result is (None, nan, nan).
    """
    start = max(0, center_idx - half_window)
    stop = min(len(series_z), center_idx + half_window + 1)
    magnitudes = np.abs(series_z[start:stop])

    if magnitudes.size == 0 or np.all(np.isnan(magnitudes)):
        return None, np.nan, np.nan

    # nanargmax is relative to the window; shift back to series coordinates
    best = start + int(np.nanargmax(magnitudes))
    return best, float(series_z[best]), int(best - center_idx)

def compare_exact_then_peak(vz_hit, fz_hit, v_peak_z, f_peak_z,
                            tie_margin=0.30, z_thr=0.80,
                            v_peak_dist=None, f_peak_dist=None):
    """
    Judge V-only against fused VAD, preferring the exact chunk over the window.

    Stage EXACT  - if |vz_hit| or |fz_hit| reaches z_thr, the two magnitudes at
                   the matched chunk are compared.
    Stage PEAK   - otherwise, if |v_peak_z| or |f_peak_z| reaches z_thr, the
                   two window peaks are compared.
    Stage NEITHER- otherwise nothing was recognised.

    In a comparison the larger magnitude wins only when it leads by more than
    tie_margin; anything closer is reported as a tie.

    Returns a dict whose keys depend on the stage:
      EXACT   -> winner_exact, decision, v_exact_recognised, f_exact_recognised
      PEAK    -> winner_peak, decision, v_peak_recognised, f_peak_recognised,
                 v_peak_dist, f_peak_dist
      NEITHER -> decision plus all winner_* ("n/a"), *_recognised (False) and
                 *_peak_dist keys
    """
    def _winner(v_mag, f_mag):
        # shared verdict rule for both stages
        if (f_mag - v_mag) > tie_margin:
            return "VAD > V"
        if (v_mag - f_mag) > tie_margin:
            return "V-only > VAD"
        return f"Tie (±{tie_margin:.2f})"

    # Stage 1: the matched chunk itself
    hit_v, hit_f = abs(vz_hit), abs(fz_hit)
    v_hit_ok = hit_v >= z_thr
    f_hit_ok = hit_f >= z_thr
    if v_hit_ok or f_hit_ok:
        return {
            "winner_exact": _winner(hit_v, hit_f),
            "decision": "EXACT",
            "v_exact_recognised": v_hit_ok,
            "f_exact_recognised": f_hit_ok,
        }

    # Stage 2: strongest values found in the surrounding window
    top_v, top_f = abs(v_peak_z), abs(f_peak_z)
    v_top_ok = top_v >= z_thr
    f_top_ok = top_f >= z_thr
    if v_top_ok or f_top_ok:
        return {
            "winner_peak": _winner(top_v, top_f),
            "decision": "PEAK",
            "v_peak_recognised": v_top_ok,
            "f_peak_recognised": f_top_ok,
            "v_peak_dist": v_peak_dist,
            "f_peak_dist": f_peak_dist,
        }

    # Stage 3: the threshold was not reached anywhere relevant
    return {
        "decision": "NEITHER",
        "winner_exact": "n/a",
        "winner_peak": "n/a",
        "v_exact_recognised": False,
        "f_exact_recognised": False,
        "v_peak_recognised": False,
        "f_peak_recognised": False,
        "v_peak_dist": v_peak_dist,
        "f_peak_dist": f_peak_dist,
    }
def evaluate_turning_points_strict(book_stem,
                                   half_window=8,
                                   z_thr=0.80,
                                   tie_margin=0.30):
    """
    Score every catalogued turning point of one book, exact chunk first.

    The book's frame is loaded, `v_roll` and `vad_fused` are z-scored over the
    whole book, and for each event listed in TP[book_stem]:
      - the first chunk matching the event's any/all/regex criteria is taken
        as the hit (events without a match yield a "NOT FOUND" row of NaNs);
      - the z values at the hit and the strongest |z| within ±half_window of
        it are collected for both series;
      - compare_exact_then_peak decides the verdict (exact chunk has priority,
        window peak is the fallback).

    Returns a DataFrame with one row per event holding the match info, exact
    and peak metrics, decision/winner fields, recognition flags and the raw
    v/a/d/fused values at the hit.
    """
    frame = load_book_frames(book_stem)
    v_z = zscore(frame["v_roll"].values)
    fused_z = zscore(frame["vad_fused"].values)

    records = []
    for event in TP[book_stem]:
        matches = find_chunks_for_keywords(
            frame["text"],
            any_terms=event.get("any"),
            all_terms=event.get("all"),
            regex=event.get("regex"),
        )

        if not matches:
            # placeholder row so every catalogued event appears in the output
            records.append(dict(
                book=book_stem, event=event["id"], match_status="NOT FOUND",
                match_idx=np.nan, chunk_preview="",
                vz_hit=np.nan, fz_hit=np.nan,
                v_peak_idx=np.nan, v_peak_z=np.nan, v_peak_dist=np.nan,
                f_peak_idx=np.nan, f_peak_z=np.nan, f_peak_dist=np.nan,
                decision="n/a", winner_exact="n/a", winner_peak="n/a",
                v_exact_recognised=False, f_exact_recognised=False,
                v_peak_recognised=False, f_peak_recognised=False,
                v_roll_hit=np.nan, a_roll_hit=np.nan, d_roll_hit=np.nan, fused_hit=np.nan,
            ))
            continue

        # only the earliest matching chunk is evaluated
        where = matches[0]
        v_at_hit = float(v_z[where])
        f_at_hit = float(fused_z[where])

        # strongest neighbours, used as fallback and for reporting
        v_top_idx, v_top_z, v_top_dist = local_peak(v_z, where, half_window=half_window)
        f_top_idx, f_top_z, f_top_dist = local_peak(fused_z, where, half_window=half_window)

        verdict = compare_exact_then_peak(
            v_at_hit, f_at_hit, v_top_z, f_top_z,
            tie_margin=tie_margin, z_thr=z_thr,
            v_peak_dist=v_top_dist, f_peak_dist=f_top_dist,
        )

        records.append(dict(
            book=book_stem,
            event=event["id"],
            match_status="FOUND",
            match_idx=int(where),
            chunk_preview=preview(frame.loc[where, "text"]),
            # z at the matched chunk
            vz_hit=v_at_hit, fz_hit=f_at_hit,
            # strongest z inside the window
            v_peak_idx=v_top_idx, v_peak_z=v_top_z, v_peak_dist=v_top_dist,
            f_peak_idx=f_top_idx, f_peak_z=f_top_z, f_peak_dist=f_top_dist,
            # verdict; keys absent for the chosen stage fall back to defaults
            decision=verdict["decision"],
            winner_exact=verdict.get("winner_exact", "n/a"),
            winner_peak=verdict.get("winner_peak", "n/a"),
            v_exact_recognised=verdict.get("v_exact_recognised", False),
            f_exact_recognised=verdict.get("f_exact_recognised", False),
            v_peak_recognised=verdict.get("v_peak_recognised", False),
            f_peak_recognised=verdict.get("f_peak_recognised", False),
            # unstandardised values at the matched chunk, for reading context
            v_roll_hit=float(frame.loc[where, "v_roll"]),
            a_roll_hit=float(frame.loc[where, "a_roll"]),
            d_roll_hit=float(frame.loc[where, "d_roll"]),
            fused_hit=float(frame.loc[where, "vad_fused"]),
        ))

    return pd.DataFrame(records)


def verdict_from_z(z_v, z_fused, margin=0.30):
    """
    Name the series with the larger |z|, or a tie when they are close.

    Returns "VAD > V" when |z_fused| exceeds |z_v| by more than `margin`,
    "V-only > VAD" for the reverse, and "Tie (±<margin>)" otherwise.
    """
    v_mag = abs(z_v)
    fused_mag = abs(z_fused)
    if fused_mag - v_mag > margin:
        return "VAD > V"
    if v_mag - fused_mag > margin:
        return "V-only > VAD"
    return f"Tie (±{margin:.2f})"

def preview(text, n=220):
    """
    Collapse all whitespace in `text` to single spaces and cap the length.

    Results longer than `n` characters are cut to `n` and suffixed with " …".
    """
    flat = " ".join(str(text).split())
    if len(flat) > n:
        return flat[:n] + " …"
    return flat

# ---- Turning-point catalog ---------------------------------------------------
# You can edit/extend this dictionary freely. Each event can specify:
#   - 'any': any of these phrases triggers a hit, OR
#   - 'all': all of these tokens must be present (rough AND), OR
#   - 'regex': an explicit regex (case-insensitive)
#
# Top-level keys are book stems: load_book_frames builds the file names
# "<stem>.chunks.csv" and "<stem>.scored_v21.csv" from them, so each key must
# match the file stem on disk exactly. Every event also needs an 'id'.
#
# Matching notes (see find_chunks_for_keywords / evaluate_turning_points_strict):
#   - 'any'/'all' terms are compared as lower-cased *substrings* of the chunk
#     text, so a short term such as "ice" also matches inside longer words;
#   - only the FIRST matching chunk of the book is evaluated for an event, so
#     broad terms (character names, common words) pin the event to the first
#     place they occur.
TP = {
    "Frankenstein.clean": [
        {"id":"Animation of the Creature",
         "any":[
            "dreary night of november", "dull yellow eye", "spark of being"
         ]},
        {"id":"William’s murder announced", "all":["william","murder"]},
        {"id":"Justine’s trial / execution", "any":["justine","trial","confession","execut"]},
        {"id":"Creature demands a mate", "any":["create a female","female companion","mate for me","female for me"]},
        {"id":"Victor destroys the female", "any":["tore to pieces","destroyed the female","the work i had begun was undone"]},
        {"id":"Elizabeth murdered (wedding night)", "all":["elizabeth","wedding"]},
        {"id":"Arctic pursuit & endgame", "any":["sledge","ice","walton","north pole","arctic"]}
    ],
    "Mobi Dick.clean": [
        {"id":"Ahab nails the doubloon / vow",
         "any":["spanish ounce of gold","doubloon","whosoever of ye raises me","the quarter-deck"]},
        {"id":"Ahab declares vengeance on Moby Dick", "any":["it was moby dick that dismasted me","i'll chase him round good hope","moby dick—moby dick"]},
        {"id":"Typhoon / St. Elmo’s fire", "any":["typhoon","lightning","candles","st. elmo"]},
        {"id":"The Symphony (Ahab & Starbuck)", "any":["the symphony","wife and child","starbuck"]},
        {"id":"The Chase—First Day", "any":["the chase—first day","chase—first day"]},
        {"id":"The Chase—Third Day (Ahab’s death)", "any":["the chase.—third day","third time, moby dick","hemp","the harpoon"]},
    ],
    "Pride and Prejudice.clean": [
        {"id":"Meryton ball — Darcy’s slight", "any":["tolerable; but not handsome enough","meryton","assembly"]},
        {"id":"First proposal at Hunsford", "any":["in vain i have struggled","hunsford","proposal"]},
        {"id":"Darcy’s letter & revelation", "all":["letter","wickham"]},
        {"id":"Pemberley encounter", "any":["pemberley"]},
        {"id":"Lydia’s elopement", "all":["lydia","eloped"]},
        {"id":"Bingley & Jane engaged", "all":["bingley","engaged"]},
        {"id":"Second proposal (resolution)", "any":["if your feelings are still what they were last april",
                                                   "you are too generous to trifle with me","my feelings and wishes are unchanged"]}
    ],
    "Romeo and Juliet.clean": [
        {"id":"They meet at the Capulet feast", "any":["holy palmers' kiss","pilgrim","capulet's house"]},
        {"id":"Balcony scene", "any":["wherefore art thou romeo","balcony"]},
        {"id":"Secret marriage (Friar’s cell)", "all":["friar","cell","married"]},
        {"id":"Mercutio slain; Tybalt slain", "any":["a plague o' both your houses","tybalt","mercutio"]},
        {"id":"Romeo banished", "any":["banished"]},
        {"id":"Potion plan", "any":["vial","distilling liquor","slumber"]},
        {"id":"Tomb — deaths", "any":["tomb","thus with a kiss i die","poison","dagger"]}
    ],
    "The Adventures of Sherlock Holmes.clean": [
        # Focus on "A Scandal in Bohemia" (book-wide arcs still OK)
        {"id":"Masked King seeks help", "any":["count von kramm","bohemia","king of bohemia"]},
        {"id":"Godfrey Norton & sudden wedding", "any":["godfrey norton","st. monica"]},
        {"id":"Briony Lodge stakeout", "any":["briony lodge","serpentine avenue"]},
        {"id":"Smoke-rocket ruse / Fire!", "any":["smoke-rocket","cry of fire","fire!"]},
        {"id":"Adler’s farewell letter", "any":["good-night, mr. sherlock holmes","good night, mr. sherlock holmes"]},
        {"id":"'The woman' outwits Holmes", "any":["the woman","i admire her intellect","she has left the country"]}
    ],
}

# Book stems in catalog order; used as the default list of books to evaluate.
BOOKS = list(TP.keys())




In [9]:
# -------- runner: write per-book CSV + master Excel ----------
def run_tp_eval_exact_first(books=None, half_window=8, z_thr=0.80, tie_margin=0.30):
    """
    Evaluate turning points for several books and persist the results.

    For each book (all keys of TP when `books` is empty or None) the
    exact-first evaluation is run with the given window, threshold and tie
    margin. Each result is written to
    OUT_DIR/<book>.turning_points_eval_exact_first.csv and to its own sheet
    (name cut to 31 characters) of
    OUT_DIR/turning_points_eval_exact_first.xlsx.

    Finally a count table of (book, decision) by winner is displayed, where
    the winner is taken from the exact verdict for EXACT rows, the peak
    verdict for PEAK rows and "Neither" for everything else. Returns None.
    """
    books = books or list(TP.keys())
    xlsx_path = OUT_DIR / "turning_points_eval_exact_first.xlsx"

    per_book = []
    with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as writer:
        for stem in books:
            result = evaluate_turning_points_strict(
                stem, half_window=half_window, z_thr=z_thr, tie_margin=tie_margin
            )
            csv_target = OUT_DIR / f"{stem}.turning_points_eval_exact_first.csv"
            result.to_csv(csv_target, index=False)
            result.to_excel(writer, sheet_name=stem[:31], index=False)
            per_book.append(result)
            print(f"✔ {stem}: exact-first eval saved → {csv_target.name}")

    # small summary: one row per (book, decision), one column per winner label
    combined = pd.concat(per_book, ignore_index=True)
    stage = combined["decision"]
    combined["win"] = np.where(
        stage.eq("EXACT"), combined["winner_exact"],
        np.where(stage.eq("PEAK"), combined["winner_peak"], "Neither"),
    )
    summary = (combined.groupby(["book", "decision", "win"])
                       .size()
                       .unstack(fill_value=0)
                       .reset_index())
    display(summary)
    print(f"\n📁 Master workbook: {xlsx_path}")

# ---- RUN (use your book list) ----
# Evaluates every book in BOOKS with a ±8-chunk peak window, a |z| >= 0.80
# recognition threshold and a 0.30 tie margin; the per-book CSVs and the
# master workbook are written into OUT_DIR.
run_tp_eval_exact_first(books=BOOKS, half_window=8, z_thr=0.80, tie_margin=0.30)

✔ Frankenstein.clean: exact-first eval saved → Frankenstein.clean.turning_points_eval_exact_first.csv
✔ Mobi Dick.clean: exact-first eval saved → Mobi Dick.clean.turning_points_eval_exact_first.csv
✔ Pride and Prejudice.clean: exact-first eval saved → Pride and Prejudice.clean.turning_points_eval_exact_first.csv
✔ Romeo and Juliet.clean: exact-first eval saved → Romeo and Juliet.clean.turning_points_eval_exact_first.csv
✔ The Adventures of Sherlock Holmes.clean: exact-first eval saved → The Adventures of Sherlock Holmes.clean.turning_points_eval_exact_first.csv


win,book,decision,Neither,Tie (±0.30),V-only > VAD,VAD > V
0,Frankenstein.clean,EXACT,0,6,0,0
1,Frankenstein.clean,PEAK,0,0,1,0
2,Mobi Dick.clean,EXACT,0,0,0,6
3,Pride and Prejudice.clean,EXACT,0,1,1,0
4,Pride and Prejudice.clean,NEITHER,1,0,0,0
5,Pride and Prejudice.clean,PEAK,0,1,1,1
6,Pride and Prejudice.clean,,1,0,0,0
7,Romeo and Juliet.clean,EXACT,0,2,0,2
8,Romeo and Juliet.clean,NEITHER,1,0,0,0
9,Romeo and Juliet.clean,PEAK,0,2,0,0



📁 Master workbook: /Users/nageshs/Desktop/college/emobook/emobook/notebooks/turning_points/turning_points_eval_exact_first.xlsx


In [7]:

# ---- Run for all books & save ------------------------------------------------
# NOTE(review): `evaluate_turning_points` (and the `winner` column read below)
# is not defined in the cells visible in this notebook - only
# `evaluate_turning_points_strict` is, and its frames carry `winner_exact` /
# `winner_peak` instead. This cell's execution count (7) is lower than that of
# the definition cells (8, 9), so it presumably ran against an earlier
# definition still alive in the kernel. Confirm the function exists elsewhere;
# otherwise this cell raises NameError on Restart & Run All.
all_dfs = []
for b in BOOKS:
    dfb = evaluate_turning_points(b, half_window=8, margin=0.30)
    all_dfs.append(dfb)
    csv_path = OUT_DIR / f"{b}.turning_points_eval.csv"
    dfb.to_csv(csv_path, index=False)
    print(f"✔ {b}: saved {csv_path.name} ({dfb['match_status'].eq('FOUND').sum()}/{len(dfb)} found)")

# Excel with one sheet per book
xlsx_path = OUT_DIR / "turning_points_eval.xlsx"
with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as xlw:
    for dfb in all_dfs:
        # every row of a per-book frame carries the same book name
        sheet = dfb['book'].iloc[0]
        # Sheets can't be longer than 31 chars, so shorten safely
        sheet = sheet[:31]
        dfb.to_excel(xlw, sheet_name=sheet, index=False)
print(f"\n📁 Master workbook: {xlsx_path}")

# Small summary table to eyeball who wins more often
# (rows: books, columns: winner labels, with missing winners counted as "n/a")
summary = (pd.concat(all_dfs, ignore_index=True)
             .assign(win=lambda d: d['winner'].fillna("n/a"))
             .groupby(['book','win']).size().unstack(fill_value=0).reset_index())
summary


✔ Frankenstein.clean: saved Frankenstein.clean.turning_points_eval.csv (7/7 found)
✔ Mobi Dick.clean: saved Mobi Dick.clean.turning_points_eval.csv (6/6 found)
✔ Pride and Prejudice.clean: saved Pride and Prejudice.clean.turning_points_eval.csv (6/7 found)
✔ Romeo and Juliet.clean: saved Romeo and Juliet.clean.turning_points_eval.csv (7/7 found)
✔ The Adventures of Sherlock Holmes.clean: saved The Adventures of Sherlock Holmes.clean.turning_points_eval.csv (5/6 found)

📁 Master workbook: /Users/nageshs/Desktop/college/emobook/emobook/notebooks/turning_points/turning_points_eval.xlsx


win,book,Tie (±0.30),V-only > VAD,VAD > V,n/a
0,Frankenstein.clean,4,2,1,0
1,Mobi Dick.clean,0,0,6,0
2,Pride and Prejudice.clean,3,2,1,1
3,Romeo and Juliet.clean,4,0,3,0
4,The Adventures of Sherlock Holmes.clean,2,2,1,1
