In [1]:
# === Inspect VAD scoring on real chunks ===================================
from pathlib import Path
import pandas as pd, numpy as np, re, unicodedata, textwrap

# Project directories, all resolved relative to the notebook's working directory.
ROOT      = Path("..")  # run from emobook/notebooks/
CHUNK_DIR = ROOT / "chunks"  # not used by the code visible in this cell
SCORE_DIR = ROOT / "scored_v21"  # explain_chunk / peek_examples read "<book_stem>.scored_v21.csv" from here

# Lexicon tables: keep UNI / MWE / MAX_MWE when an earlier cell already built
# them; otherwise load them here (per the original note, this quick loader is
# meant to mirror the v2.1 scorer cell).
try:
    UNI, MWE, MAX_MWE
except NameError:
    RES_DIR = ROOT / "resources"

    def find_v21_path():
        """Return the first existing location of the lexicon file, else raise."""
        candidates = (
            RES_DIR / "NRC-VAD-Lexicon-v2.1" / "NRC-VAD-Lexicon-v2.1.txt",
            RES_DIR / "NRC-VAD-Lexicon-v2.1.txt",
        )
        for candidate in candidates:
            if candidate.exists():
                return candidate
        raise FileNotFoundError("Put NRC-VAD-Lexicon-v2.1.txt under ../resources/NRC-VAD-Lexicon-v2.1/")

    def _vad_table(rows, make_key=None):
        """Map each row's term (optionally transformed) to its (v, a, d) floats."""
        table = {}
        columns = ["term", "valence", "arousal", "dominance"]
        for term, val, aro, dom in rows[columns].itertuples(index=False):
            key = term if make_key is None else make_key(term)
            table[key] = (float(val), float(aro), float(dom))
        return table

    # NOTE(review): read_csv's default NA parsing may turn terms such as
    # "null" into NaN (and then the string "nan" after astype(str)) -- confirm
    # against the lexicon file and the scorer cell before changing anything.
    df = pd.read_csv(find_v21_path(), sep="\t")
    df["term"] = df["term"].astype(str).str.strip().str.lower()

    # A term containing any whitespace is a multi-word expression.
    is_mwe = df["term"].str.contains(r"\s")
    UNI = _vad_table(df.loc[~is_mwe])

    def tokseq(s):
        """Split a lexicon term into the token tuple used as an MWE key."""
        return tuple(re.findall(r"[a-z]+(?:'[a-z]+)?", s))

    MWE = _vad_table(df.loc[is_mwe], make_key=tokseq)
    # Longest MWE key length, or 1 when the lexicon has no MWEs.
    MAX_MWE = max(map(len, MWE), default=1)

# Tokens that start a negation window in match_and_explain.
NEGATORS = set(
    "not no never none nobody nothing neither nor n't cannot can't don't won't isn't wasn't aren't weren't".split()
)

# A token is a run of ASCII letters, optionally followed by one apostrophe
# suffix (e.g. "don't"); matching is case-insensitive.
TOKEN_RE = re.compile(r"[a-z]+(?:'[a-z]+)?", re.I)


def tokenize(text: str):
    """Return the lowercase word tokens of *text* after NFC normalisation."""
    normalized = unicodedata.normalize("NFC", text)
    # Match first, lowercase afterwards: the order matters for a few
    # non-ASCII letters whose lowercase form is not a single ASCII letter.
    return list(map(str.lower, TOKEN_RE.findall(normalized)))

def match_and_explain(text: str, handle_negation=True, window_after_neg=3):
    """
    Walk the tokens of `text`, match them against the MWE / unigram lexicons
    and report every hit together with summary statistics.

    Multi-word expressions are tried first (longest candidate first) and a
    matched MWE consumes all of its tokens. When `handle_negation` is on, a
    token from NEGATORS opens a window of `window_after_neg` following units;
    each unit inside the window uses one slot, and a lexicon hit inside it
    gets its valence sign flipped.

    Returns:
      matches: list of dicts {term,type,span,negated,v,a,d,v_used}
      agg: {v,a,d,n_tokens,n_hits,coverage,mwe_share}
           (v/a/d are None when nothing matched; n_tokens counts loop steps,
           so a matched MWE contributes one step)
    """
    tokens = tokenize(text)
    total = len(tokens)
    hits = []
    steps = 0          # reported as n_tokens
    pending_neg = 0    # remaining slots of the current negation window

    def _record(term, kind, start, stop, scores):
        # Store one lexicon hit, consuming a negation slot if one is open.
        nonlocal pending_neg
        v, a, d = scores
        negated = pending_neg > 0
        if negated:
            v = -v
            pending_neg -= 1
        hits.append({
            "term": term, "type": kind,
            "span": (start, stop), "negated": negated,
            "v": scores[0], "a": scores[1], "d": scores[2],
            "v_used": v
        })

    pos = 0
    while pos < total:
        word = tokens[pos]
        steps += 1

        # A negator is never scored itself; it only (re)opens the window.
        if handle_negation and word in NEGATORS:
            pending_neg = window_after_neg
            pos += 1
            continue

        # Longest multi-word expression starting at this position, if any.
        consumed = 0
        if MAX_MWE > 1:
            for width in range(min(MAX_MWE, total - pos), 1, -1):
                phrase = tuple(tokens[pos:pos + width])
                scores = MWE.get(phrase)
                if scores:
                    _record(" ".join(phrase), "mwe", pos, pos + width, scores)
                    consumed = width
                    break
        if consumed:
            pos += consumed
            continue

        scores = UNI.get(word)
        if scores:
            _record(word, "uni", pos, pos + 1, scores)
        elif pending_neg > 0:
            # Unmatched tokens still use up a slot of the negation window.
            pending_neg -= 1

        pos += 1

    if hits:
        v_mean, a_mean, d_mean = (
            np.array([h[field] for h in hits], float).mean()
            for field in ("v_used", "a", "d")
        )
        mwe_share = sum(h["type"] == "mwe" for h in hits) / len(hits)
    else:
        v_mean = a_mean = d_mean = None
        mwe_share = 0.0

    agg = dict(
        v=v_mean, a=a_mean, d=d_mean,
        n_tokens=steps, n_hits=len(hits),
        coverage=len(hits) / max(1, steps),
        mwe_share=mwe_share
    )
    return hits, agg

def _fmt_score(value, spec="+.3f"):
    """Format a score with `spec`, or return 'n/a' when the score is None."""
    return "n/a" if value is None else format(value, spec)


def explain_chunk(book_stem: str, chunk_id: int, wrap=100, per_sentence=False):
    """
    Print one scored chunk with a term-by-term breakdown of its lexicon hits.

    Reads SCORE_DIR/"<book_stem>.scored_v21.csv", takes the first row whose
    `chunk_id` column equals `chunk_id`, prints its `text` wrapped to `wrap`
    columns, then every match returned by match_and_explain and the aggregate
    statistics. With `per_sentence=True` each sentence of the chunk is scored
    and printed as well.

    Raises:
      IndexError: when no row carries the requested chunk_id.
    """
    csv_path = SCORE_DIR / f"{book_stem}.scored_v21.csv"
    df = pd.read_csv(csv_path)

    rows = df.loc[df["chunk_id"] == chunk_id]
    if rows.empty:
        # Same exception type as the bare .iloc[0] lookup, but with a usable message.
        raise IndexError(f"chunk_id {chunk_id} not found in {csv_path}")

    text = rows.iloc[0]["text"]
    # An empty text cell is read back as NaN; treat it as an empty chunk.
    text = "" if pd.isna(text) else str(text)

    print(f"\n=== {book_stem} | chunk {chunk_id} ===")
    print(textwrap.fill(text, width=wrap))
    print("\n-- matches --")
    matches, agg = match_and_explain(text, handle_negation=True)
    if not matches:
        print("(no lexicon hits)")
    else:
        for m in matches:
            tag = "MWE" if m["type"]=="mwe" else "UNI"
            neg = " [NEG]" if m["negated"] else ""
            print(f"{tag:<3} {m['term']:<25} v={m['v']:+.3f} a={m['a']:+.3f} d={m['d']:+.3f}  -> v_used={m['v_used']:+.3f}{neg}")

    # v/a/d are None when nothing matched; formatting None with "+.3f" would
    # raise TypeError, so the means go through _fmt_score.
    print(f"\nAggregate: v={_fmt_score(agg['v'])} a={_fmt_score(agg['a'])} d={_fmt_score(agg['d'])} | "
          f"hits={agg['n_hits']} / tokens={agg['n_tokens']} "
          f"coverage={agg['coverage']:.3f} | MWE%={agg['mwe_share']:.0%}")

    if per_sentence:
        # optional: show sentence-level scoring inside this chunk
        print("\n-- per sentence --")
        fallback_pattern = r'(?<=[.!?])\s+'
        try:
            # use the pluggable sentence splitter if one is defined in this kernel
            sents = split_sentences(text)
        except NameError:
            # no splitter available: minimal regex fallback
            sents = re.split(fallback_pattern, text)
        except Exception as exc:
            # stay best-effort, but do not hide that the splitter itself failed
            print(f"(split_sentences failed: {exc!r}; using regex fallback)")
            sents = re.split(fallback_pattern, text)
        for i, s in enumerate(sents, 1):
            m2, a2 = match_and_explain(s, handle_negation=True)
            print(f"[s{i:02d}] v={None if a2['v'] is None else round(a2['v'],3)} "
                  f"a={None if a2['a'] is None else round(a2['a'],3)} "
                  f"d={None if a2['d'] is None else round(a2['d'],3)} "
                  f"(cov={a2['coverage']:.2f}) :: {s[:80]}{'...' if len(s)>80 else ''}")

def peek_examples(book_stem: str, mode="high_v", k=3, per_sentence=False):
    """
    Pick `k` chunks from a book's scored CSV, show their scores as a table and
    run explain_chunk on each of them.

    mode: 'high_v' | 'low_v' | 'high_a' | 'low_cov' | 'random'
    """
    scored = pd.read_csv(SCORE_DIR / f"{book_stem}.scored_v21.csv")

    # (mode, column to sort by, ascending?) for the sort-based modes
    sort_rules = (
        ("high_v", "v", False),
        ("low_v", "v", True),
        ("high_a", "a", False),
        ("low_cov", "coverage", True),
    )
    rule = next((r for r in sort_rules if mode == r[0]), None)

    if rule is not None:
        _, column, ascending = rule
        picked = scored.sort_values(column, ascending=ascending).head(k)
    elif mode == "random":
        # fixed seed so repeated calls show the same sample
        picked = scored.sample(n=min(k, len(scored)), random_state=42)
    else:
        raise ValueError("Unknown mode")

    display(picked[["chunk_id","v","a","d","coverage"]])
    for chunk_id in picked["chunk_id"].tolist():
        explain_chunk(book_stem, int(chunk_id), per_sentence=per_sentence)


In [2]:
# Example runs: show a few selected chunks per book with their breakdowns.
peek_examples(book_stem="Frankenstein.clean", mode="high_v", k=2)
peek_examples(book_stem="Mobi Dick.clean", mode="low_v", k=2, per_sentence=False)
peek_examples(book_stem="Romeo and Juliet.clean", mode="low_cov", k=3, per_sentence=True)

# A single chunk can also be inspected directly by its id:
explain_chunk(book_stem="Pride and Prejudice.clean", chunk_id=123, per_sentence=True)


Unnamed: 0,chunk_id,v,a,d,coverage
1006,1006,0.289813,-0.009933,0.10788,0.675676
153,153,0.285983,-0.049633,0.117517,0.674157



=== Frankenstein.clean | chunk 1006 ===
You, perhaps, regard her as your sister, without any wish that she might become your wife. Nay, you
may have met with another whom you may love; and considering yourself as bound in honour to
Elizabeth, this struggle may occasion the poignant misery which you appear to feel." "My dear
father, reassure yourself. I love my cousin tenderly and sincerely. I never saw any woman who
excited, as Elizabeth does, my warmest admiration and affection. My future hopes and prospects are
entirely bound up in the expectation of our union." "The expression of your sentiments of this
subject, my dear Victor, gives me more pleasure than I have for some time experienced.

-- matches --
UNI perhaps                   v=+0.000 a=-0.333 d=-0.333  -> v_used=+0.000
UNI regard                    v=+0.604 a=-0.328 d=+0.618  -> v_used=+0.604
UNI as                        v=+0.000 a=+0.000 d=+0.000  -> v_used=+0.000
UNI your                      v=+0.000 a=+0.000 d=+0.259  

Unnamed: 0,chunk_id,v,a,d,coverage
2666,2666,-0.121089,0.068244,-0.018244,0.714286
3958,3958,-0.103301,0.017452,-0.072425,0.634783



=== Mobi Dick.clean | chunk 2666 ===
Starbuck was too late. At the instant of the dart an ulcerous jet shot from this cruel wound, and
goaded by it into more than sufferable anguish, the whale now spouting thick blood, with swift fury
blindly darted at the craft, bespattering them and their glorying crews all over with showers of
gore, capsizing Flask's boat and marring the bows. It was his death stroke.

-- matches --
UNI was                       v=+0.000 a=-0.083 d=+0.000  -> v_used=+0.000
UNI too                       v=+0.000 a=+0.000 d=+0.000  -> v_used=+0.000
UNI late                      v=-0.510 a=-0.284 d=-0.370  -> v_used=-0.510
UNI at                        v=+0.000 a=+0.000 d=+0.000  -> v_used=+0.000
UNI instant                   v=+0.532 a=-0.092 d=+0.108  -> v_used=+0.532
UNI of                        v=+0.000 a=+0.000 d=+0.000  -> v_used=+0.000
UNI dart                      v=-0.166 a=+0.216 d=+0.000  -> v_used=-0.166
UNI ulcerous                  v=-1.000 a=+0.375 d=-

Unnamed: 0,chunk_id,v,a,d,coverage
1,1,0.186459,-0.256525,0.11018,0.521368
2,2,0.195841,-0.127683,0.045127,0.538462
40,40,0.113,-0.091792,-0.019271,0.539326



=== Romeo and Juliet.clean | chunk 1 ===
VI. Friar Lawrence's Cell. ACT III Scene I. A public Place. Scene II. A Room in Capulet's House.
Scene III. Friar Lawrence's cell. Scene IV. A Room in Capulet's House. Scene V. An open Gallery to
Juliet's Chamber, overlooking the Garden. ACT IV Scene I. Friar Lawrence's Cell. Scene II. Hall in
Capulet's House. Scene III. Juliet's Chamber. Scene IV. Hall in Capulet's House. Scene V. Juliet's
Chamber; Juliet on the bed. ACT V Scene I. Mantua. A Street. Scene II. Friar Lawrence's Cell. Scene
III. A churchyard; in it a Monument belonging to the Capulets. Dramatis Personæ ESCALUS, Prince of
Verona. MERCUTIO, kinsman to the Prince, and friend to Romeo. PARIS, a young Nobleman, kinsman to
the Prince.

-- matches --
UNI friar                     v=+0.020 a=-0.480 d=+0.062  -> v_used=+0.020
UNI cell                      v=+0.082 a=-0.250 d=-0.070  -> v_used=+0.082
UNI act                       v=+0.312 a=+0.408 d=+0.470  -> v_used=+0.312
UNI scene      