In [2]:
from pathlib import Path
import pandas as pd, numpy as np, unicodedata, re

ROOT       = Path("..")            # this notebook runs from emobook/notebooks/
RES_DIR    = ROOT / "resources"
CHUNK_DIR  = ROOT / "chunks"
OUT_DIR    = ROOT / "scored_v21"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- locate + load NRC VAD v2.1 (centered in [-1,1]) ----------
def find_v21_path():
    cands = [
        RES_DIR / "NRC-VAD-Lexicon-v2.1" / "NRC-VAD-Lexicon-v2.1.txt",
        RES_DIR / "NRC-VAD-Lexicon-v2.1.txt",
    ]
    for p in cands:
        if p.exists():
            return p
    raise FileNotFoundError("Put NRC-VAD-Lexicon-v2.1.txt under ../resources/NRC-VAD-Lexicon-v2.1/")

def load_v21(path: Path):
    df = pd.read_csv(path, sep="\t")
    df["term"] = df["term"].astype(str).str.strip().str.lower()
    # Build unigram and MWE dicts
    is_mwe = df["term"].str.contains(r"\s")
    uni = {t:(float(v),float(a),float(d)) for t,v,a,d in df.loc[~is_mwe,["term","valence","arousal","dominance"]].itertuples(index=False)}
    # For MWEs, store as tuple of tokens for greedy matching
    def tokseq(s): return tuple(re.findall(r"[a-z]+(?:'[a-z]+)?", s))
    mwe_rows = df.loc[is_mwe, ["term","valence","arousal","dominance"]]
    mwe = {tokseq(t):(float(v),float(a),float(d)) for t,v,a,d in mwe_rows.itertuples(index=False)}
    max_mwe_len = max((len(k) for k in mwe.keys()), default=1)
    return uni, mwe, max_mwe_len

UNI, MWE, MAX_MWE = load_v21(find_v21_path())
print(f"Loaded VAD v2.1 → {len(UNI):,} unigrams, {len(MWE):,} MWEs (max len {MAX_MWE})")

# ---------- tokenizer (simple, fast, lowercasing; keeps contractions) ----------
TOKEN_RE = re.compile(r"[a-z]+(?:'[a-z]+)?", re.I)
def tokenize(text: str):
    t = unicodedata.normalize("NFC", text)
    return [tok.lower() for tok in TOKEN_RE.findall(t)]

# ---------- negation handling (very light, optional) ----------
NEGATORS = set("not no never none nobody nothing neither nor n't cannot can't don't won't isn't wasn't aren't weren't".split())

# ---------- greedy longest-match VAD scorer (prefers MWE over unigram) ----------
def score_chunk_v21(text: str, handle_negation=True, window_after_neg=3):
    toks = tokenize(text)
    i, hits, n_tokens = 0, [], 0
    flip = 0  # count down after negator

    while i < len(toks):
        tok = toks[i]
        n_tokens += 1

        # negate scope trigger
        if handle_negation and tok in NEGATORS:
            flip = window_after_neg
            i += 1
            continue

        # try longest MWE first
        matched = False
        if MAX_MWE > 1:
            L = min(MAX_MWE, len(toks)-i)
            for n in range(L, 1, -1):
                key = tuple(toks[i:i+n])
                trip = MWE.get(key)
                if trip:
                    v,a,d = trip
                    if flip > 0: v = -v; flip -= 1
                    hits.append((v,a,d))
                    i += n
                    matched = True
                    break
        if matched:
            continue

        # fallback to unigram
        trip = UNI.get(tok)
        if trip:
            v,a,d = trip
            if flip > 0: v = -v; flip -= 1
            hits.append((v,a,d))

        else:
            if flip > 0: flip -= 1

        i += 1

    if not hits:
        return dict(v=None, a=None, d=None, n_tokens=n_tokens, n_hits=0, coverage=0.0)

    arr = np.array(hits, float)
    v_mean, a_mean, d_mean = arr.mean(axis=0).tolist()
    return dict(
        v=round(v_mean, 6), a=round(a_mean, 6), d=round(d_mean, 6),
        n_tokens=n_tokens, n_hits=len(hits),
        coverage=round(len(hits)/max(1,n_tokens), 6)
    )

# ---------- score all chunk CSVs (created earlier) ----------
def score_all_chunks_v21():
    rows = []
    for csv in sorted(CHUNK_DIR.glob("*.chunks.csv")):
        df = pd.read_csv(csv)
        out = []
        for txt in df["text"].astype(str):
            out.append(score_chunk_v21(txt, handle_negation=True))
        sdf = pd.DataFrame(out)
        out_df = pd.concat([df, sdf], axis=1)
        # convenience columns
        out_df["valence_only"] = out_df["v"]               # same v dimension ([-1,1])
        out_df["v01"] = (out_df["v"] + 1.0) / 2.0          # optional 0..1 view for plotting
        out_df["a01"] = (out_df["a"] + 1.0) / 2.0
        out_df["d01"] = (out_df["d"] + 1.0) / 2.0
        out_df.to_csv(OUT_DIR / f"{csv.stem.replace('.chunks','')}.scored_v21.csv", index=False)
        rows.append({
            "book": df["book"].iloc[0],
            "chunks": len(df),
            "avg_cov": round(out_df["coverage"].mean(), 4),
            "mwe_hit_rate": round(np.mean([1 if x>1 else 0 for x in [len(re.findall(r'\\s', t)) for t in df['text'].head(1)]]),4)  # placeholder
        })
        print(f"✔ {csv.name} → {csv.stem.replace('.chunks','')}.scored_v21.csv | avg coverage {rows[-1]['avg_cov']}")
    return pd.DataFrame(rows)

summary = score_all_chunks_v21()
display(summary)
print(f"Saved to: {OUT_DIR}")


Loaded VAD v2.1 → 44,728 unigrams, 10,073 MWEs (max len 3)
✔ Frankenstein.clean.chunks.csv → Frankenstein.clean.scored_v21.csv | avg coverage 0.687
✔ Mobi Dick.clean.chunks.csv → Mobi Dick.clean.scored_v21.csv | avg coverage 0.6937
✔ Pride and Prejudice.clean.chunks.csv → Pride and Prejudice.clean.scored_v21.csv | avg coverage 0.6987
✔ Romeo and Juliet.clean.chunks.csv → Romeo and Juliet.clean.scored_v21.csv | avg coverage 0.6788
✔ The Adventures of Sherlock Holmes.clean.chunks.csv → The Adventures of Sherlock Holmes.clean.scored_v21.csv | avg coverage 0.6973


Unnamed: 0,book,chunks,avg_cov,mwe_hit_rate
0,Frankenstein.clean,1559,0.687,0.0
1,Mobi Dick.clean,4194,0.6937,0.0
2,Pride and Prejudice.clean,2598,0.6987,0.0
3,Romeo and Juliet.clean,499,0.6788,0.0
4,The Adventures of Sherlock Holmes.clean,2168,0.6973,0.0


Saved to: ../scored_v21
