## **Build "results" table**

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 172.3/172.3 kB 3.4 MB/s eta 0:00:00
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.5


In [None]:
# ==========================================================
# Results Table Builder (Pack A -> results.xlsx) — No output_paste column
# - Expands over models/versions/shots
# - User pastes directly into `output_raw`
# - `n_predicted_events` auto-counts "<EVENTSEP>" via Excel formula
# - Optional normalizer to bake static values
# ==========================================================

import os
import re
import pandas as pd
from pathlib import Path
from typing import List

In [None]:
# ------------------ Configuration ------------------
# Input: Pack A text chunks; the loader requires `chunk_id` and `text` columns.
PACKA_PATH = "/content/PackA_TextChunks.csv"
# Outputs: the workbook carries the counting formulas, the CSV is a formula-free mirror.
OUT_XLSX = "/content/results.xlsx"
OUT_CSV = "/content/results.csv"

# Expansion grid: every chunk is repeated for each model / version / shot count.
MODEL_NAMES = ["ChatGPT", "Gemini", "DeepSeek", "Grok"]
CHATGPT_VERSIONS = ["GPT5", "GPT-4o"]  # versions used for ChatGPT rows
OTHER_VERSION = ""                     # version value for non-ChatGPT rows
SHOT_SET = [0, 1, 3, 5]

# Excel column widths, keyed by column letter.
COL_WIDTHS = dict(
    A=12,   # chunk_id
    B=60,   # text
    C=12,   # model_name
    D=14,   # model_version
    E=16,   # condition_shots
    F=120,  # output_raw (user pastes here; newlines OK)
    G=18,   # n_predicted_events (formula)
)

In [None]:
# ------------------ Core Builders ------------------
def require_packa(path: str) -> pd.DataFrame:
    """Read Pack A from `path` and return only its `chunk_id` and `text` columns.

    Every column is read as a string and missing cells become "".

    Raises:
        FileNotFoundError: if `path` does not exist.
        ValueError: if either required column is absent.
    """
    if os.path.exists(path):
        table = pd.read_csv(path, dtype=str).fillna("")
    else:
        raise FileNotFoundError(f"Pack A not found: {path}")

    required = {"chunk_id", "text"}
    if required - set(table.columns):
        raise ValueError(f"Pack A must have columns: {required}")

    # Copy so callers never mutate a view of the loaded frame.
    wanted = table[["chunk_id", "text"]]
    return wanted.copy()

def expand_results_rows(packA_df: pd.DataFrame) -> pd.DataFrame:
    """Expand each Pack A chunk into one row per (model, version, shot count).

    ChatGPT rows use every entry of CHATGPT_VERSIONS; all other models use
    OTHER_VERSION. `output_raw` and `n_predicted_events` start out empty.
    The result is sorted by chunk, model, version and shots, with a fresh index.
    """
    column_order = [
        "chunk_id", "text", "model_name", "model_version", "condition_shots",
        "output_raw", "n_predicted_events",
    ]

    # The (model, version) pairs are the same for every chunk: build them once.
    model_versions = [
        (model, version)
        for model in MODEL_NAMES
        for version in (CHATGPT_VERSIONS if model == "ChatGPT" else [OTHER_VERSION])
    ]

    records: List[dict] = []
    for _, source_row in packA_df.iterrows():
        chunk_id, chunk_text = source_row["chunk_id"], source_row["text"]
        records.extend(
            {
                "chunk_id": chunk_id,
                "text": chunk_text,
                "model_name": model,
                "model_version": version,
                "condition_shots": shots,
                "output_raw": "",          # filled in by the user (multi-line OK)
                "n_predicted_events": "",  # replaced by an Excel formula in the workbook
            }
            for model, version in model_versions
            for shots in SHOT_SET
        )

    table = pd.DataFrame(records, columns=column_order)
    sort_keys = ["chunk_id", "model_name", "model_version", "condition_shots"]
    table = table.sort_values(sort_keys)
    return table.reset_index(drop=True)

def save_csv(df: pd.DataFrame, out_path: str) -> None:
    """Write `df` to `out_path` as UTF-8 CSV, leaving out the index column."""
    csv_options = {"index": False, "encoding": "utf-8"}
    df.to_csv(out_path, **csv_options)

def _excel_col_letter(idx: int) -> str:
    """Convert a 0-based column index to its Excel letter name (0 -> 'A', 26 -> 'AA')."""
    letters = ""
    idx += 1
    while idx > 0:
        idx, rem = divmod(idx - 1, 26)
        letters = chr(ord("A") + rem) + letters
    return letters


def save_xlsx_with_formula(df: pd.DataFrame, out_path: str) -> None:
    """Write `df` to sheet "results" of `out_path` with a live event-count formula.

    Each cell of `n_predicted_events` receives an Excel formula that counts the
    "<EVENTSEP>" substrings in the same row's `output_raw` cell (blank while
    `output_raw` is empty). Column widths come from COL_WIDTHS and the header
    row is frozen.

    xlsxwriter is used when it can be imported, otherwise openpyxl. The two
    engines expose different worksheet APIs, so each one is driven through its
    own calls.

    Raises:
        KeyError: if `df` lacks `output_raw` or `n_predicted_events`.
    """
    try:
        import xlsxwriter  # noqa: F401  (imported only to probe availability)
        engine = "xlsxwriter"
    except ImportError:
        engine = "openpyxl"  # fallback; formula text is still stored for Excel to evaluate

    # Locate the columns by name instead of assuming output_raw lives in column F.
    col_idx = {name: i for i, name in enumerate(df.columns)}
    cnt_col = col_idx["n_predicted_events"]
    raw_letter = _excel_col_letter(col_idx["output_raw"])
    use_xlsxwriter = engine == "xlsxwriter"

    with pd.ExcelWriter(out_path, engine=engine) as writer:
        df.to_excel(writer, sheet_name="results", index=False)
        ws = writer.sheets["results"]

        # Set column widths and freeze the header row.
        if use_xlsxwriter:
            for col, w in COL_WIDTHS.items():
                ws.set_column(f"{col}:{col}", w)
            ws.freeze_panes(1, 0)
        else:
            for col, w in COL_WIDTHS.items():
                ws.column_dimensions[col].width = w
            ws.freeze_panes = "A2"

        # Insert the counting formula for each data row (Excel row 1 is the header).
        for excel_row in range(2, len(df) + 2):
            raw_cell = f"{raw_letter}{excel_row}"

            # Count "<EVENTSEP>" occurrences: removed characters divided by the
            # marker length. Line breaks inside the cell are ordinary characters.
            count_formula = (
                f'=IF({raw_cell}="","",'
                f'(LEN({raw_cell})-LEN(SUBSTITUTE({raw_cell},"<EVENTSEP>","")))/LEN("<EVENTSEP>"))'
            )
            if use_xlsxwriter:
                # xlsxwriter addresses cells with 0-based (row, col).
                ws.write_formula(excel_row - 1, cnt_col, count_formula)
            else:
                # openpyxl addresses cells 1-based and stores "=..." strings as formulas.
                ws.cell(row=excel_row, column=cnt_col + 1, value=count_formula)


In [None]:
# ------------------ Optional Normalizer ------------------
def _flatten_whitespace(s: str) -> str:
    """Return `s` on one line with single spaces and one space before each <EVENTSEP>.

    Non-string or blank input yields "".
    """
    is_blank = not isinstance(s, str) or s.strip() == ""
    if is_blank:
        return ""
    collapsed = re.sub(r'\s+', ' ', s).strip()
    spaced = re.sub(r'\s*<EVENTSEP>', ' <EVENTSEP>', collapsed).strip()
    return re.sub(r' {2,}', ' ', spaced)

def _count_eventsep(s: str) -> int:
    """Count occurrences of the literal "<EVENTSEP>" in `s`; 0 for non-strings or ""."""
    return s.count("<EVENTSEP>") if isinstance(s, str) and s else 0

def normalize_results(in_path: str, out_path: str) -> None:
    """Write a static (formula-free) copy of a results file.

    Reads `in_path` (.xlsx sheet "results", or .csv) as strings, recomputes
    `n_predicted_events` as the integer count of "<EVENTSEP>" in `output_raw`,
    and writes the table to `out_path` (.xlsx via xlsxwriter, anything else as CSV).

    Raises:
        ValueError: if `in_path` is neither .xlsx nor .csv, or if the table
            has no `output_raw` column.
    """
    suffix = Path(in_path).suffix.lower()
    if suffix not in (".xlsx", ".csv"):
        raise ValueError("in_path must be .xlsx or .csv")
    if suffix == ".xlsx":
        table = pd.read_excel(in_path, sheet_name="results", dtype=str)
    else:
        table = pd.read_csv(in_path, dtype=str)
    table = table.fillna("")

    if "output_raw" not in table.columns:
        raise ValueError("The input file must contain 'output_raw' column.")

    # Optional: flatten multi-line outputs for portability (uncomment if desired)
    # table["output_raw"] = table["output_raw"].apply(_flatten_whitespace)

    event_counts = table["output_raw"].apply(_count_eventsep)
    table["n_predicted_events"] = event_counts.astype(int)

    # Save the static copy in the format implied by the output extension.
    wants_xlsx = out_path.lower().endswith(".xlsx")
    if not wants_xlsx:
        table.to_csv(out_path, index=False, encoding="utf-8")
    else:
        with pd.ExcelWriter(out_path, engine="xlsxwriter") as writer:
            table.to_excel(writer, sheet_name="results", index=False)
            writer.sheets["results"].freeze_panes(1, 0)

    print(f"Normalized (static) results written to: {out_path}")

In [None]:
# ------------------ Main ------------------
def main():
    """Build the results table from Pack A, write CSV/XLSX, then bake a static copy."""
    results_df = expand_results_rows(require_packa(PACKA_PATH))

    # CSV has no formulas; the workbook gets the n_predicted_events formula.
    save_csv(results_df, OUT_CSV)
    save_xlsx_with_formula(results_df, OUT_XLSX)

    report = (
        "Created files:",
        f"- {OUT_XLSX}  (paste into 'output_raw'; 'n_predicted_events' auto-counts)",
        f"- {OUT_CSV}   (no formulas; for reference)",
    )
    for line in report:
        print(line)

    # Also write a static (formula-free) copy of the workbook.
    normalize_results(OUT_XLSX, "/content/results_normalized.xlsx")


if __name__ == "__main__":
    main()

Created files:
- /content/results.xlsx  (paste into 'output_raw'; 'n_predicted_events' auto-counts)
- /content/results.csv   (no formulas; for reference)
Normalized (static) results written to: /content/results_normalized.xlsx


## **Post-Processing & Mapping**

In [None]:
# ================================
# Evaluation-ready version (per model/version/shots)
# ================================
import os, re, json
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------- Paths (update to your actual files) --------
# Multi-model results table (Excel or CSV).
RESULTS_PATH = "/content/results.xlsx"
# Gold annotations.
PACKB_GOLD = "/content/PackB_Gold.csv"
# Optional alias table; it is only read if the file exists.
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"
# Folder receiving the evaluation CSVs; created on the spot if missing.
OUT_DIR = "/content/out_eval"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# -------- Regex for parsing model outputs --------
# Matches "Event type: <etype>. Trigger: <trig>", case-insensitively.
EVENT_LINE_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?'                            # optional leading separator
    r'Event\s*type\s*:\s*(?P<etype>[^.\n\r:]+)\.'    # type runs up to the first period
    r'\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)',        # trigger stops at '.', newline or '<'
    re.IGNORECASE,
)

def normalize_type(s: str) -> str:
    """Lowercase a type label and turn runs of '_', '-' or whitespace into one space.

    None becomes ""; any other value is converted with str() first.
    """
    if s is None:
        return ""
    label = str(s).strip()
    label = re.sub(r'[_\-\s]+', ' ', label)
    return label.lower()

def normalize_trigger(s: str) -> str:
    """Normalize a trigger string for pair matching.

    Strips whitespace and the punctuation characters . ; , : - from both ends,
    lowercases, and collapses internal whitespace runs to a single space.
    None becomes "".

    Whitespace and punctuation are stripped together, so a value such as
    "broke ;" normalizes to "broke" rather than "broke " (a stray space would
    make otherwise identical triggers compare unequal).
    """
    if s is None:
        return ""
    trimmed = re.sub(r'^[\s.;,:\-]+|[\s.;,:\-]+$', '', str(s))
    return re.sub(r'\s+', ' ', trimmed.lower())

def parse_event_lines(output_raw: str):
    """Extract normalized (event type, trigger) pairs from one model output.

    The text is split on "<EVENTSEP>"; if no segment has content the whole
    text is scanned instead. Pairs whose type or trigger normalizes to ""
    are dropped. Non-string or blank input yields [].
    """
    if not (isinstance(output_raw, str) and output_raw.strip()):
        return []

    segments = [seg for seg in output_raw.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        segments = [output_raw]

    candidates = (
        (normalize_type(hit.group("etype")), normalize_trigger(hit.group("trig")))
        for seg in segments
        for hit in EVENT_LINE_RE.finditer(seg)
    )
    return [(etype, trig) for etype, trig in candidates if etype and trig]

# -------- Load results (xlsx or csv) and filter non-empty --------
ext = Path(RESULTS_PATH).suffix.lower()
load_table = pd.read_excel if ext == ".xlsx" else pd.read_csv
res = load_table(RESULTS_PATH, dtype=str).fillna("")

required_cols = {"chunk_id","output_raw","model_name","model_version","condition_shots"}
if required_cols - set(res.columns):
    raise ValueError(f"RESULTS must contain: {required_cols}")

# Only rows with a pasted output are evaluated.
has_output = res["output_raw"].astype(str).str.strip().ne("")
res = res.loc[has_output].copy()

# -------- Load gold & build allowed schema set --------
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if {"chunk_id","events_norm"} - set(gold.columns):
    raise ValueError("PackB_Gold.csv must have columns: chunk_id, events_norm")

# Schema = every non-empty type that appears left of a "|" in a ";"-separated events_norm cell.
schema_types = {
    entry.split("|", 1)[0].strip()
    for cell in gold["events_norm"] if isinstance(cell, str)
    for entry in cell.split(";")
    if "|" in entry and entry.split("|", 1)[0].strip()
}
allowed_schema = sorted(schema_types)
allowed_norm = [normalize_type(label) for label in allowed_schema]
norm2schema = dict(zip(allowed_norm, allowed_schema))

# (optional) alias table: normalized alias -> schema type as written in the file
alias_map = {}
if os.path.exists(SCHEMA_ALIASES):
    alias_df = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"} <= set(alias_df.columns):
        for alias_raw, target_raw in zip(alias_df["alias"], alias_df["schema_type"]):
            alias_key = normalize_type(alias_raw)
            target = str(target_raw).strip()
            if alias_key and target:
                alias_map[alias_key] = target

# TF-IDF over labels
def build_label_corpus(labels):
    """Return, for each label, the label followed by its '_', '-', whitespace-split tokens."""
    return [" ".join([lab, *re.split(r'[_\-\s]+', lab)]) for lab in labels]

# Character 3-5 gram TF-IDF index fitted on the expanded, normalized schema labels.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
X_schema = tfidf.fit_transform(build_label_corpus(allowed_norm))


def tfidf_best_match(open_label_norm: str, threshold=0.35):
    """Find the schema label most similar to `open_label_norm` by TF-IDF cosine.

    Returns (schema_label, score) when the best score reaches `threshold`,
    otherwise (None, score).
    """
    query_vec = tfidf.transform(build_label_corpus([open_label_norm]))
    scores = cosine_similarity(query_vec, X_schema).ravel()
    best = int(scores.argmax())
    best_score = float(scores[best])
    if best_score >= threshold:
        return norm2schema[allowed_norm[best]], best_score
    return None, best_score

def map_open_to_schema(open_type: str, trigger: str):
    """Map a free-form event type onto a schema type.

    Stages, first hit wins:
      1) alias table (target canonicalized through the schema when possible)
      2) exact match on the normalized label
      3) light morphology: the label with a trailing ings/ing/ed/s removed,
         and both forms with "ing" / "ed" appended
      4) TF-IDF best match at threshold 0.35
      5) "UNMAPPED"

    `trigger` is accepted for interface compatibility but not used here.

    Returns:
        (schema_type, method, score) with method one of
        "alias", "exact", "norm", "tfidf", "unmapped".
    """
    o_norm = normalize_type(open_type)

    # 1) alias
    if o_norm in alias_map:
        tgt = alias_map[o_norm]
        return norm2schema.get(normalize_type(tgt), tgt), "alias", 1.0

    # 2) exact
    if o_norm in norm2schema:
        return norm2schema[o_norm], "exact", 1.0

    # 3) light normalization. The candidates are tried in a fixed order: a set
    #    would make the winner depend on hash ordering whenever more than one
    #    variant exists in the schema, so results could differ between runs.
    stem = re.sub(r'(ings|ing|ed|s)$', '', o_norm)
    candidates = dict.fromkeys(
        [stem, stem + "ing", stem + "ed", o_norm + "ing", o_norm + "ed"]
    )
    for c in candidates:
        if c in norm2schema:
            return norm2schema[c], "norm", 1.0

    # 4) tf-idf
    best, sc = tfidf_best_match(o_norm, threshold=0.35)
    if best:
        return best, "tfidf", sc

    # 5) fallback
    return "UNMAPPED", "unmapped", 0.0

def normalize_gold_pairs(s: str):
    """Turn "Type|trigger;Type|trigger" into a set of normalized (type, trigger) pairs.

    Items without a "|" are ignored; the split is on the first "|" only.
    """
    split_items = (item.strip().partition("|") for item in str(s).split(";"))
    return {
        (normalize_type(tp), normalize_trigger(tr))
        for tp, sep, tr in split_items
        if sep
    }

# -------- Evaluate per (model_name, model_version, condition_shots) --------
group_cols = ["model_name","model_version","condition_shots"]
metrics_rows = []
all_mapped_frames = []   # per-group mapped events, concatenated when outputs are written

# Gold pair-sets are the same for every group, so build them once up front.
gold_by_chunk = {cid: normalize_gold_pairs(ev) for cid, ev in zip(gold["chunk_id"], gold["events_norm"])}

for gkeys, gdf in res.groupby(group_cols):
    gname = dict(zip(group_cols, gkeys))

    # Parse every output of the group and map each open type onto the schema.
    mapped = []
    for cid, raw in zip(gdf["chunk_id"], gdf["output_raw"]):
        for ot, tr in parse_event_lines(raw):
            sch, mth, sc = map_open_to_schema(ot, tr)
            mapped.append({"chunk_id": cid, "open_type": ot, "trigger": tr,
                           "schema_type": sch, "method": mth, "score": sc})

    # Predicted (type, trigger) pair-sets per chunk.
    pred_by_chunk = {}
    for event in mapped:
        pair = (normalize_type(event["schema_type"]), normalize_trigger(event["trigger"]))
        pred_by_chunk.setdefault(event["chunk_id"], set()).add(pair)

    # Micro counts over the gold chunks. A group with no parsable predictions
    # goes through the same path, so its missed gold events are counted as FN
    # instead of being reported as 0.
    # NOTE: predictions for chunk_ids absent from gold are not counted at all.
    TP = FP = FN = 0
    for cid, gset in gold_by_chunk.items():
        pset = pred_by_chunk.get(cid, set())
        TP += len(gset & pset)
        FP += len(pset - gset)
        FN += len(gset - pset)

    prec = TP/(TP+FP) if (TP+FP) else 0.0
    rec  = TP/(TP+FN) if (TP+FN) else 0.0
    f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0

    metrics_rows.append({**gname, "TP":TP,"FP":FP,"FN":FN,
                         "precision":round(prec,4),"recall":round(rec,4),"f1":round(f1,4),
                         "pred_events":len(mapped)})

    # Keep the per-group mapped events, tagged with the group they belong to.
    if mapped:
        mapped_df = pd.DataFrame(mapped)
        mapped_df["model_name"]=gname["model_name"]
        mapped_df["model_version"]=gname["model_version"]
        mapped_df["condition_shots"]=gname["condition_shots"]
        all_mapped_frames.append(mapped_df)

# -------- Write outputs --------
# Name the columns explicitly so that an empty metrics list (no group had any
# output) still produces a frame that can be sorted and written, instead of
# failing with KeyError on the sort keys.
metric_columns = ["model_name","model_version","condition_shots",
                  "TP","FP","FN","precision","recall","f1","pred_events"]
metrics = pd.DataFrame(metrics_rows, columns=metric_columns).sort_values(
    ["model_name","model_version","condition_shots"]
)
metrics_path = f"{OUT_DIR}/Metrics_by_group.csv"
metrics.to_csv(metrics_path, index=False, encoding="utf-8")

# The mapped-events file is written only when at least one group produced events.
if all_mapped_frames:
    mapped_all = pd.concat(all_mapped_frames, ignore_index=True)
    mapped_path = f"{OUT_DIR}/Mapped_Results_by_group.csv"
    mapped_all.to_csv(mapped_path, index=False, encoding="utf-8")
else:
    mapped_path = None

print(f"[OK] Wrote metrics: {metrics_path}")
if mapped_path:
    print(f"[OK] Wrote mapped events: {mapped_path}")

[OK] Wrote metrics: /content/out_eval/Metrics_by_group.csv
[OK] Wrote mapped events: /content/out_eval/Mapped_Results_by_group.csv


In [None]:
# ==========================
# Mapping without aliases: Hybrid similarity (string + TF-IDF + optional embeddings) + trigger boost
# ==========================
import os, re, math
import pandas as pd
import numpy as np
from pathlib import Path

# --- install helpers (Colab-friendly) ---
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio, partial_ratio
except Exception:
    import sys
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio, partial_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# (optional) sentence embeddings
USE_EMBEDDINGS = False
if USE_EMBEDDINGS:
    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        !pip -q install sentence-transformers
        from sentence_transformers import SentenceTransformer

# ---------------- Paths ----------------
# Gold annotations; expected columns: chunk_id, events_norm
PACKB_GOLD = "/content/PackB_Gold.csv"
# Model outputs; expected columns: chunk_id, model_name, model_version, condition_shots, output_raw
RESULTS = "/content/results.xlsx"

# ------------- Normalizers -------------
def norm_type(s: str) -> str:
    """Lowercase a type label and unify '_', '-', whitespace runs into one space.

    Anything that is not a str yields "".
    """
    if isinstance(s, str):
        return re.sub(r'[_\-\s]+', ' ', s.strip().lower())
    return ""

def norm_trigger(s: str) -> str:
    """Normalize a trigger string for pair matching.

    Trims surrounding whitespace, lowercases, removes trailing punctuation
    (. ; : , -) together with any whitespace mixed into it, and collapses
    internal whitespace runs. Anything that is not a str yields "".

    Trailing whitespace is removed along with the punctuation, so "broke ;"
    becomes "broke" rather than "broke " (a stray space would make otherwise
    identical triggers compare unequal).
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = re.sub(r'[\s.;:,\-]+$', '', s)
    return re.sub(r'\s+', ' ', s)

# ------------- Parse EVENTSEP -----------
# Matches "Event type: <etype>. Trigger: <trig>", case-insensitively.
EVENT_LINE_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?'                            # optional leading separator
    r'Event\s*type\s*:\s*(?P<etype>[^.\n\r:]+)\.'    # type runs up to the first period
    r'\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)',        # trigger stops at '.', newline or '<'
    re.IGNORECASE,
)

def parse_output(text: str):
    """Extract normalized (event type, trigger) pairs from one model output.

    The text is split on "<EVENTSEP>"; if no segment has content the whole
    text is scanned instead. Pairs with an empty type or trigger are dropped.
    Non-string or blank input yields [].
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    segments = [seg for seg in text.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        segments = [text]

    found = (
        (norm_type(hit.group("etype")), norm_trigger(hit.group("trig")))
        for seg in segments
        for hit in EVENT_LINE_RE.finditer(seg)
    )
    return [(etype, trig) for etype, trig in found if etype and trig]

# -------- Load schema from GOLD ---------
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if {"chunk_id","events_norm"} - set(gold.columns):
    raise ValueError("PackB_Gold.csv must contain: chunk_id, events_norm")

# Walk every "Type|trigger" item of every events_norm cell and collect:
#   schema_types     - the distinct type labels as written in gold
#   triggers_by_type - type -> {normalized trigger: occurrence count} (for trigger boosting)
schema_types = set()
triggers_by_type = {}
for events_cell in gold["events_norm"]:
    if not isinstance(events_cell, str):
        continue
    for entry in (piece.strip() for piece in events_cell.split(";")):
        canon, sep, raw_trigger = entry.partition("|")
        if not sep:
            continue
        canon = canon.strip()
        trig_key = norm_trigger(raw_trigger)
        schema_types.add(canon)
        per_type = triggers_by_type.setdefault(canon, {})
        per_type[trig_key] = per_type.get(trig_key, 0) + 1

schema_types = sorted(schema_types)
schema_norm = [norm_type(label) for label in schema_types]
norm2canon = dict(zip(schema_norm, schema_types))

# -------- Build similarity index --------
def expand_label(s: str) -> str:
    """Return the label followed by its '_', '-', whitespace-split tokens, space-joined."""
    return " ".join((s, *re.split(r'[_\-\s]+', s)))

# TF-IDF (char 3–5 grams) over schema labels
# Character 3-5 gram TF-IDF index fitted on the expanded, normalized schema labels.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
X_tfidf = tfidf.fit_transform(list(map(expand_label, schema_norm)))

# Embedding index, built only when the embeddings switch is on.
if USE_EMBEDDINGS:
    emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    X_emb = emb_model.encode(schema_norm, normalize_embeddings=True)

# -------- Similarity scorers ----------
def jaccard_tokens(a: str, b: str) -> float:
    """Jaccard similarity of the whitespace-token sets of `a` and `b` (0.0 if either is empty)."""
    left, right = set(a.split()), set(b.split())
    if left and right:
        return len(left & right) / len(left | right)
    return 0.0

def tfidf_sim(q: str) -> float:
    """Return the highest TF-IDF cosine similarity between `q` and any schema label."""
    query_vec = tfidf.transform([expand_label(q)])
    return float(cosine_similarity(query_vec, X_tfidf).max())

def tfidf_best(q: str):
    """Return (index, score) of the schema label with the highest TF-IDF cosine to `q`."""
    scores = cosine_similarity(tfidf.transform([expand_label(q)]), X_tfidf).ravel()
    best = int(scores.argmax())
    return best, float(scores[best])

def emb_best(q: str):
    """Return (index, score) of the best embedding match, or (None, 0.0) when embeddings are off."""
    if USE_EMBEDDINGS:
        query_vec = emb_model.encode([q], normalize_embeddings=True)[0]
        scores = (X_emb @ query_vec).ravel()
        best = int(scores.argmax())
        return best, float(scores[best])
    return None, 0.0

# -------- Hybrid mapper (no alias) -----
def map_open_type(open_type: str, trigger: str, tfidf_threshold=0.33, hybrid_threshold=0.38):
    """Map a free-form event type onto a canonical schema type (no alias table).

    Stages, first hit wins:
      1) exact match on the normalized label
      2) light morphology: the label with a trailing ings/ing/ed/s removed,
         and both forms with "ing" / "ed" appended
      3) hybrid score per schema label:
         0.45*fuzzy ratio + 0.40*TF-IDF cosine + 0.10*token Jaccard + trigger boost,
         optionally overridden by a stronger embedding match

    The hybrid candidate is accepted when its score reaches `hybrid_threshold`
    or when the best TF-IDF cosine reaches `tfidf_threshold`; in both cases the
    label returned is the hybrid (or embedding) winner.

    Returns:
        (canonical_type or "UNMAPPED", method, score) with method one of
        "exact", "norm", "hybrid", "embed", "unmapped".
    """
    q = norm_type(open_type)

    # 1) exact normalized
    if q in norm2canon:
        return norm2canon[q], "exact", 1.0

    # 2) light morphology. Variants are tried in a fixed order: iterating a set
    #    would make the winner depend on hash ordering when several variants
    #    exist in the schema.
    stem = re.sub(r'(ings|ing|ed|s)$', '', q)
    variants = dict.fromkeys([stem, stem + "ing", stem + "ed", q + "ing", q + "ed"])
    for v in variants:
        if v in norm2canon:
            return norm2canon[v], "norm", 1.0

    # 3) fuzzy + jaccard + tf-idf, one score per schema label.
    #    Fuzzy ratio is scaled from 0-100 to 0-1. TF-IDF is computed in a single pass.
    fuzzy_scores = np.array([fuzz_ratio(q, t) / 100 for t in schema_norm])
    jacc_scores = np.array([jaccard_tokens(q, t) for t in schema_norm])
    sims_all = cosine_similarity(tfidf.transform([expand_label(q)]), X_tfidf).ravel()
    sc_tfidf = float(np.max(sims_all))

    # Trigger boost: a small bonus (capped at 0.15) for schema types that gold
    # pairs with this exact trigger, growing with the log of the pair count.
    trig = norm_trigger(trigger)
    boost = np.zeros(len(schema_norm))
    if trig:
        for i, canon in enumerate(schema_types):
            freq = triggers_by_type.get(canon, {}).get(trig, 0)
            if freq > 0:
                boost[i] = min(0.15, 0.05 + 0.02 * math.log1p(freq))

    # Weighted fusion: fuzzy and tf-idf carry most weight; jaccard and boost are helpers.
    fused = 0.45 * fuzzy_scores + 0.40 * sims_all + 0.10 * jacc_scores + boost
    best_idx = int(np.argmax(fused))
    best_sc = float(fused[best_idx])

    # Optional embeddings: take over when the embedding score beats the fused one.
    method = "hybrid"
    if USE_EMBEDDINGS:
        e_idx, e_sc = emb_best(q)
        if e_sc > best_sc:
            best_idx, best_sc = e_idx, e_sc
            method = "embed"

    if best_sc >= hybrid_threshold or sc_tfidf >= tfidf_threshold:
        return schema_types[best_idx], method, best_sc

    return "UNMAPPED", "unmapped", best_sc

# ---------- Example: run on a few open types ----------
# (open type, trigger) pairs used to eyeball the mapper.
examples = [
    ("flood", "flooding"),
    ("death", "death"),
    ("dam break", "broke"),
    ("venture", "venture"),
    ("trapping", "trapping"),
    ("plebiscite", "plebiscite"),
]

for open_label, trig_word in examples:
    target, how, score = map_open_type(open_label, trig_word)
    print(f"{open_label:12s} -> {target:24s}  [{how}  {score:.3f}]")


flood        -> UNMAPPED                  [unmapped  0.247]
death        -> Death                     [exact  1.000]
dam break    -> UNMAPPED                  [unmapped  0.286]
venture      -> UNMAPPED                  [unmapped  0.301]
trapping     -> Traveling                 [hybrid  0.438]
plebiscite   -> Rite                      [hybrid  0.367]


In [None]:
# ==========================================================
# Post-Processing & Evaluation (per model/version/shots)
# - Parses EVENTSEP outputs -> (open_type, trigger)
# - Builds allowed schema set from PackB_Gold
# - Maps open types -> canonical schema (alias/exact/morph/TF-IDF/[optional: embeddings])
# - Evaluates micro Precision/Recall/F1 per (model_name, model_version, condition_shots)
# - Writes: Metrics_by_group.csv, Mapped_Results_by_group.csv
# ==========================================================

import os, re, math
import pandas as pd
import numpy as np
from pathlib import Path

# --------------------- CONFIG ---------------------
# Adjust these paths for your environment
# Multi-model result table (Excel or CSV).
RESULTS_PATH = "/content/results.xlsx"
# Gold annotations with an events_norm column.
PACKB_GOLD = "/content/PackB_Gold.csv"
# Optional alias table (alias, schema_type); it is only read if the file exists.
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"
# Folder receiving the evaluation outputs; created on the spot if missing.
OUT_DIR = "/content/out_eval"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# Optional: semantic embeddings (off by default)
USE_EMBEDDINGS = False

# Acceptance thresholds; tune for your data.
TFIDF_THRESHOLD = 0.35    # on the best TF-IDF cosine
HYBRID_THRESHOLD = 0.38   # on the fused hybrid score

# ------------------ DEPENDENCIES ------------------
# Install missing packages only if needed (Colab-safe)
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

if USE_EMBEDDINGS:
    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        !pip -q install sentence-transformers
        from sentence_transformers import SentenceTransformer

# ------------------- NORMALIZERS -------------------
def norm_type(s: str) -> str:
    """Lowercase a type label and unify '_', '-', whitespace runs into one space.

    Anything that is not a str yields "".
    """
    if isinstance(s, str):
        return re.sub(r'[_\-\s]+', ' ', s.strip().lower())
    return ""

def canon_space_to_underscore(s: str) -> str:
    """Join whitespace-separated words with '_' and capitalize the first one.

    Example: 'process start' -> 'Process_start'. Non-string input is returned
    unchanged.
    """
    if not isinstance(s, str):
        return s
    # re.split always yields at least one element, so the unpacking is safe.
    head, *tail = re.split(r'\s+', s.strip())
    return "_".join([head.capitalize(), *tail])

def norm_trigger(s: str) -> str:
    """Normalize triggers for pair-matching with gold.

    Trims surrounding whitespace, lowercases, removes trailing punctuation
    (. ; : , -) together with any whitespace mixed into it, and collapses
    internal whitespace runs. Anything that is not a str yields "".

    Trailing whitespace is removed along with the punctuation, so "broke ;"
    becomes "broke" rather than "broke " (a stray space would make otherwise
    identical triggers compare unequal).
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = re.sub(r'[\s.;:,\-]+$', '', s)
    return re.sub(r'\s+', ' ', s)

# ------------------- PARSING -------------------
# Match: Event type: <etype>. Trigger: <trig>
EVENT_LINE_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?'                            # optional leading separator
    r'Event\s*type\s*:\s*(?P<etype>[^.\n\r:]+)\.'    # type runs up to the first period
    r'\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)',        # trigger stops at '.', newline or '<'
    re.IGNORECASE,
)

def parse_eventsep_output(text: str):
    """Extract normalized (open_type, trigger) pairs from a model output string.

    The text is split on "<EVENTSEP>"; if no segment has content the whole
    text is scanned instead. Pairs with an empty type or trigger are dropped.
    Non-string or blank input yields [].
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    segments = [seg for seg in text.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        segments = [text]

    found = (
        (norm_type(hit.group("etype")), norm_trigger(hit.group("trig")))
        for seg in segments
        for hit in EVENT_LINE_RE.finditer(seg)
    )
    return [(etype, trig) for etype, trig in found if etype and trig]

# --------------- LOAD INPUT TABLES ---------------
# Results: Excel or CSV; must contain chunk_id, model_name, model_version, condition_shots, output_raw
ext = Path(RESULTS_PATH).suffix.lower()
load_table = pd.read_excel if ext == ".xlsx" else pd.read_csv
results = load_table(RESULTS_PATH, dtype=str).fillna("")

required_cols = {"chunk_id","output_raw","model_name","model_version","condition_shots"}
if required_cols - set(results.columns):
    raise ValueError(f"[Results] required columns missing: {required_cols}")

# Keep only rows where output_raw has content
has_output = results["output_raw"].astype(str).str.strip().ne("")
results = results.loc[has_output].copy()

# Gold: must contain chunk_id, events_norm with format "Type|trigger ; Type|trigger ; ..."
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if {"chunk_id","events_norm"} - set(gold.columns):
    raise ValueError("[Gold] must contain columns: chunk_id, events_norm")

# --------------- BUILD SCHEMA FROM GOLD ---------------
# Walk every "Type|trigger" item of every events_norm cell and collect:
#   schema_types     - the distinct type labels as written in gold
#   triggers_by_type - type -> {normalized trigger: occurrence count} (for boosting)
schema_types = set()
triggers_by_type = {}
for events_cell in gold["events_norm"]:
    if not isinstance(events_cell, str):
        continue
    for entry in (piece.strip() for piece in events_cell.split(";")):
        canon, sep, raw_trigger = entry.partition("|")
        if not sep:
            continue
        canon = canon.strip()
        trig_key = norm_trigger(raw_trigger)
        schema_types.add(canon)
        per_type = triggers_by_type.setdefault(canon, {})
        per_type[trig_key] = per_type.get(trig_key, 0) + 1

schema_types = sorted(schema_types)         # canonical labels
schema_norm = [norm_type(label) for label in schema_types]
norm2canon = dict(zip(schema_norm, schema_types))

# --------------- OPTIONAL: ALIAS TABLE ---------------
# normalized alias -> schema type exactly as written in the alias file
alias_map = {}
if os.path.exists(SCHEMA_ALIASES):
    alias_df = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"} <= set(alias_df.columns):
        for alias_raw, target_raw in zip(alias_df["alias"], alias_df["schema_type"]):
            alias_key = norm_type(alias_raw)
            target = str(target_raw).strip()
            if alias_key and target:
                alias_map[alias_key] = target

# --------------- INDEXES FOR SIMILARITY ---------------
def expand_label(s: str) -> str:
    """Append the label's '_', '-', whitespace-split tokens to the label itself.

    Example: 'process start' -> 'process start process start'.
    """
    return " ".join((s, *re.split(r'[_\-\s]+', s)))

# TF-IDF over normalized schema labels (char-ngrams)
# Character 3-5 gram TF-IDF index fitted on the expanded, normalized schema labels.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 5))
X_tfidf = tfidf.fit_transform(list(map(expand_label, schema_norm)))

# Embedding index, built only when the embeddings switch is on.
if USE_EMBEDDINGS:
    emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    X_emb = emb_model.encode(schema_norm, normalize_embeddings=True)

# --------------- SIMILARITY FUNCTIONS ---------------
def jaccard_tokens(a: str, b: str) -> float:
    """Jaccard similarity over whitespace tokens (0.0 if either side has none)."""
    left, right = set(a.split()), set(b.split())
    if left and right:
        return len(left & right) / len(left | right)
    return 0.0

def tfidf_all_sims(q: str) -> np.ndarray:
    """Return the TF-IDF cosine similarity of `q` to every schema label, as a flat vector."""
    query_vec = tfidf.transform([expand_label(q)])
    return cosine_similarity(query_vec, X_tfidf).ravel()

def embeddings_best(q: str):
    """Return (index, score) of the best embedding match, or (None, 0.0) when embeddings are off."""
    if USE_EMBEDDINGS:
        query_vec = emb_model.encode([q], normalize_embeddings=True)[0]
        scores = (X_emb @ query_vec).ravel()
        best = int(scores.argmax())
        return best, float(scores[best])
    return None, 0.0

# --------------- MAPPING (no LLM, pure post-process) ---------------
def map_open_type(open_type: str, trigger: str):
    """
    Map a free-form open_type to a canonical schema type using, in order:
    1) manual alias (if provided); the alias target is canonicalized through the
       schema when its normalized form is a known label, otherwise returned as written
    2) exact normalized label
    3) light morphology: the label with a trailing ings/ing/ed/s removed, and both
       forms with "ing" / "ed" appended
    4) hybrid similarity per schema label:
       0.45*RapidFuzz ratio + 0.40*TF-IDF cosine + 0.10*token Jaccard + trigger boost
    5) (optional) embeddings, which take over when their score beats the hybrid one

    The hybrid candidate is accepted when its score reaches HYBRID_THRESHOLD or when
    the best TF-IDF cosine reaches TFIDF_THRESHOLD; in both cases the label returned
    is the hybrid (or embedding) winner.

    Returns: (canonical_type or 'UNMAPPED', method, score)
    """
    q = norm_type(open_type)

    # 1) alias. Canonicalize the target so that an alias file written with a
    #    different case or separator still reports the schema's own label.
    if q in alias_map:
        target = alias_map[q]
        return norm2canon.get(norm_type(target), target), "alias", 1.0

    # 2) exact normalized label
    if q in norm2canon:
        return norm2canon[q], "exact", 1.0

    # 3) light morphology. Variants are tried in a fixed order: iterating a set
    #    would make the winner depend on hash ordering when several variants
    #    exist in the schema.
    stem = re.sub(r'(ings|ing|ed|s)$', '', q)
    variants = dict.fromkeys([stem, stem + "ing", stem + "ed", q + "ing", q + "ed"])
    for v in variants:
        if v in norm2canon:
            return norm2canon[v], "norm", 1.0

    # 4) hybrid similarity, one score per schema label.
    #    Fuzzy ratio is scaled from 0-100 to 0-1.
    fuzzy_scores = np.array([fuzz_ratio(q, t) / 100 for t in schema_norm])
    jacc_scores = np.array([jaccard_tokens(q, t) for t in schema_norm])
    tfidf_scores = np.asarray(tfidf_all_sims(q))

    # Trigger boost: a small bonus (capped at 0.15) for schema types that gold
    # pairs with this exact trigger, growing with the log of the pair count.
    trig = norm_trigger(trigger)
    boost = np.zeros(len(schema_norm))
    if trig:
        for i, canon in enumerate(schema_types):
            freq = triggers_by_type.get(canon, {}).get(trig, 0)
            if freq > 0:
                boost[i] = min(0.15, 0.05 + 0.02 * math.log1p(freq))

    # weighted fusion (tune weights if needed)
    fused = 0.45 * fuzzy_scores + 0.40 * tfidf_scores + 0.10 * jacc_scores + boost
    best_idx = int(np.argmax(fused))
    best_sc = float(fused[best_idx])

    # Optional embeddings: override if stronger
    method = "hybrid"
    if USE_EMBEDDINGS:
        e_idx, e_sc = embeddings_best(q)
        if e_sc > best_sc:
            best_idx, best_sc = e_idx, e_sc
            method = "embed"

    # Accept if score is strong enough
    if best_sc >= HYBRID_THRESHOLD or np.max(tfidf_scores) >= TFIDF_THRESHOLD:
        return schema_types[best_idx], method, best_sc

    # 5) fallback
    return "UNMAPPED", "unmapped", best_sc

# --------------- GOLD / PRED PAIR SETS ---------------
def gold_pairs_for_chunk(events_norm_str: str):
    """Parse one gold ``events_norm`` cell into a set of comparable pairs.

    The cell is a ';'-separated list of ``Type|trigger`` items. Items without
    a '|' are ignored and a non-string cell yields an empty set. Every kept
    item contributes ``(norm_type(type), norm_trigger(trigger))``.
    """
    if not isinstance(events_norm_str, str):
        return set()
    fields = (chunk.strip() for chunk in events_norm_str.split(";"))
    # Split on the first '|' only, so a trigger may itself contain '|'.
    split_items = (field.split("|", 1) for field in fields if "|" in field)
    return {(norm_type(label), norm_trigger(trigger)) for label, trigger in split_items}

# Prebuild gold per chunk: {chunk_id: set of (norm_type, norm_trigger)}
gold_by_chunk = {cid: gold_pairs_for_chunk(ev) for cid, ev in zip(gold["chunk_id"], gold["events_norm"])}
# Total gold pairs; this is the FN count of a group that predicts nothing.
gold_pair_total = sum(len(gset) for gset in gold_by_chunk.values())

# --------------- EVALUATION (per model/version/shots) ---------------
# For every (model_name, model_version, condition_shots) group:
#   1) parse each row's output_raw into (open_type, trigger) events,
#   2) map each open_type onto the schema,
#   3) compare predicted (type, trigger) sets with gold, chunk by chunk,
#   4) record micro precision / recall / F1 and keep the mapped rows for audit.
group_cols = ["model_name","model_version","condition_shots"]
metrics_rows = []
mapped_frames = []

for gkeys, gdf in results.groupby(group_cols):
    group_info = dict(zip(group_cols, gkeys))

    # Parse all events in this group
    parsed = []
    for _, r in gdf.iterrows():
        cid = r["chunk_id"]
        events = parse_eventsep_output(r["output_raw"])
        for (ot, tr) in events:
            parsed.append({"chunk_id": cid, "open_type": ot, "trigger": tr})

    if not parsed:
        # Nothing parseable was predicted, so every gold pair is a miss.
        # Recording FN=0 here would understate the misses in any sum over groups.
        metrics_rows.append({**group_info, "TP":0,"FP":0,"FN":gold_pair_total,"precision":0.0,"recall":0.0,"f1":0.0,"pred_events":0})
        continue

    mapped = []
    for pr in parsed:
        sch, method, score = map_open_type(pr["open_type"], pr["trigger"])
        mapped.append({**pr, "schema_type": sch, "method": method, "score": score})

    mapped_df = pd.DataFrame(mapped)

    # Build predicted pair-sets per chunk (sets: duplicate predictions collapse)
    pred_by_chunk = {}
    for cid, sub in mapped_df.groupby("chunk_id"):
        pairs = set((norm_type(t), norm_trigger(tr)) for t, tr in zip(sub["schema_type"], sub["trigger"]))
        pred_by_chunk[cid] = pairs

    # Compute micro TP/FP/FN.
    # Only chunk ids present in gold are visited: predictions for a chunk id
    # that is absent from gold_by_chunk are not counted as FP.
    # NOTE(review): 'UNMAPPED' predictions stay in the pair sets and so
    # presumably count as FP; confirm this is the intended penalty.
    TP = FP = FN = 0
    for cid in gold_by_chunk:
        gset = gold_by_chunk.get(cid, set())
        pset = pred_by_chunk.get(cid, set())
        TP += len(gset & pset)
        FP += len(pset - gset)
        FN += len(gset - pset)

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall    = TP / (TP + FN) if (TP + FN) else 0.0
    f1        = 2*precision*recall / (precision+recall) if (precision+recall) else 0.0

    metrics_rows.append({
        **group_info,
        "TP": TP, "FP": FP, "FN": FN,
        "precision": round(precision, 4),
        "recall":    round(recall, 4),
        "f1":        round(f1, 4),
        # Row count before set de-duplication, so it can exceed TP + FP.
        "pred_events": len(mapped_df)
    })

    # Keep mapped rows for auditing
    mapped_df["model_name"]       = group_info["model_name"]
    mapped_df["model_version"]    = group_info["model_version"]
    mapped_df["condition_shots"]  = group_info["condition_shots"]
    # Optional: beautify schema_type for readability
    mapped_df["schema_type"] = mapped_df["schema_type"].apply(lambda x: canon_space_to_underscore(x) if x != "UNMAPPED" else x)
    mapped_frames.append(mapped_df)

# --------------- WRITE OUTPUTS ---------------
# Per-group metrics, ordered by model / version / shots.
metrics_path = f"{OUT_DIR}/Metrics_by_group.csv"
metrics = pd.DataFrame(metrics_rows).sort_values(
    ["model_name","model_version","condition_shots"]
)
metrics.to_csv(metrics_path, index=False, encoding="utf-8")

# Audit table of every mapped event; written only if some group produced events.
mapped_path = None
if mapped_frames:
    mapped_path = f"{OUT_DIR}/Mapped_Results_by_group.csv"
    mapped_all = pd.concat(mapped_frames, ignore_index=True)
    mapped_all.to_csv(mapped_path, index=False, encoding="utf-8")

print(f"[OK] Wrote metrics -> {metrics_path}")
if mapped_path is not None:
    print(f"[OK] Wrote mapped events -> {mapped_path}")

# --------------- QUICK PREVIEW ---------------
print("\nPreview: metrics (top 10):")
display(metrics.head(10))

if mapped_path is not None:
    print("\nPreview: mapped events (top 10):")
    display(mapped_all.head(10))


[OK] Wrote metrics -> /content/out_eval/Metrics_by_group.csv
[OK] Wrote mapped events -> /content/out_eval/Mapped_Results_by_group.csv

Preview: metrics (top 10):


Unnamed: 0,model_name,model_version,condition_shots,TP,FP,FN,precision,recall,f1,pred_events
0,ChatGPT,GPT-4o,0,33,167,252,0.165,0.1158,0.1361,206
1,ChatGPT,GPT-4o,1,43,200,242,0.177,0.1509,0.1629,246
2,ChatGPT,GPT-4o,3,47,193,238,0.1958,0.1649,0.179,242
3,ChatGPT,GPT-4o,5,44,205,241,0.1767,0.1544,0.1648,252
4,ChatGPT,GPT5,0,29,179,256,0.1394,0.1018,0.1176,213
5,ChatGPT,GPT5,1,41,202,244,0.1687,0.1439,0.1553,250
6,ChatGPT,GPT5,3,44,206,241,0.176,0.1544,0.1645,256
7,ChatGPT,GPT5,5,46,213,239,0.1776,0.1614,0.1691,262
8,DeepSeek,,0,39,145,246,0.212,0.1368,0.1663,192
9,DeepSeek,,1,50,197,235,0.2024,0.1754,0.188,254



Preview: mapped events (top 10):


Unnamed: 0,chunk_id,open_type,trigger,schema_type,method,score,model_name,model_version,condition_shots
0,TST_0001,dam break,broke,UNMAPPED,unmapped,0.286334,ChatGPT,GPT-4o,0
1,TST_0001,flood,flooding,UNMAPPED,unmapped,0.247092,ChatGPT,GPT-4o,0
2,TST_0001,trapping,trapping,Traveling,hybrid,0.438268,ChatGPT,GPT-4o,0
3,TST_0001,death,death,Death,exact,1.0,ChatGPT,GPT-4o,0
4,TST_0002,toleration,tolerated,Military_operation,hybrid,0.447596,ChatGPT,GPT-4o,0
5,TST_0002,aid,provide,UNMAPPED,unmapped,0.272547,ChatGPT,GPT-4o,0
6,TST_0002,alarm,alarmed,UNMAPPED,unmapped,0.373215,ChatGPT,GPT-4o,0
7,TST_0002,movement/travel,go,Body_movement,hybrid,0.47075,ChatGPT,GPT-4o,0
8,TST_0002,request,request,Request,exact,1.0,ChatGPT,GPT-4o,0
9,TST_0003,detection,detected,Action,hybrid,0.48726,ChatGPT,GPT-4o,0


In [None]:
# ==========================================================
# Smart Post-Processing & Evaluation (v2)
# ----------------------------------------------------------
# Key upgrades vs v1:
# 1) Strong trigger-driven mapping:
#    - Build a trigger->type prior from GOLD (lemma-level)
#    - High-confidence prior (prop >= 0.55 & count >= 3) wins immediately
# 2) Hybrid scoring:
#    final_score = 0.35*TypeLabelSimilarity + 0.55*TriggerEvidence + 0.10*TokenJaccard
#    where TypeLabelSimilarity = 0.65*TF-IDF(open_type, schema_label) + 0.35*fuzzy_ratio
#    and   TriggerEvidence     = 0.6*TF-IDF(trigger_lemma, triggers_of_type) + 0.4*trigger_prior
# 3) Robust normalization (light lemmatizer, punctuation cleanup)
# 4) Still supports manual aliases (optional), exact, and light-morph matches
# 5) Per-(model,version,shots) micro P/R/F1 + full mapped audit
# ==========================================================

import os, re, math
import pandas as pd
import numpy as np
from pathlib import Path

# --------------------- CONFIG ---------------------
# Input/output locations. OUT_DIR is created on the spot if it is missing.
RESULTS_PATH   = "/content/results.xlsx"              # your results table (.xlsx is read with read_excel, anything else as CSV)
PACKB_GOLD     = "/content/PackB_Gold.csv"            # gold with 'chunk_id','events_norm'
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"  # optional alias,schema_type (skipped when the file does not exist)
OUT_DIR        = "/content/out_eval_v2"
os.makedirs(OUT_DIR, exist_ok=True)

# Thresholds (tune if needed)
PRIOR_PROP_MIN   = 0.55   # min share of a trigger lemma's gold occurrences held by its top type
PRIOR_COUNT_MIN  = 3      # min gold count of that trigger lemma under its top type
TFIDF_THRESHOLD  = 0.33   # accept if the highest type-label tf-idf similarity (over all types) is >= this
FUSED_THRESHOLD  = 0.40   # accept if final fused score is >= this

# Optional semantic embeddings (off by default)
USE_EMBEDDINGS = False

# ------------------ DEPENDENCIES ------------------
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

if USE_EMBEDDINGS:
    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        !pip -q install sentence-transformers
        from sentence_transformers import SentenceTransformer

# ---------------- NORMALIZATION -------------------
def norm_type(s: str) -> str:
    """Normalize a type label for matching.

    Lower-cases the label and collapses every run of underscores, hyphens
    and whitespace into one space, so 'Process_start', 'process-start' and
    ' process  start ' all become 'process start'. Non-string input yields ''.
    """
    if not isinstance(s, str):
        return ""
    # Strip *after* collapsing: a label that starts or ends with '_' or '-'
    # would otherwise keep a stray edge space and miss exact-label lookups.
    return re.sub(r'[_\-\s]+', ' ', s.lower()).strip()

def norm_trigger(s: str) -> str:
    """Normalize a trigger string for pair matching.

    Lower-cases the text, removes any trailing mix of whitespace and
    '.', ';', ',', ':', '-' characters, and collapses internal whitespace
    runs to one space. Non-string input yields ''.
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    # Whitespace is part of the trailing class so that 'broke ;' becomes
    # 'broke' rather than 'broke ' (which would never equal a gold trigger).
    s = re.sub(r'[\s.;,:\-]+$', '', s)
    return re.sub(r'\s+', ' ', s)

# very light lemmatizer (no external corpora)
# Irregular forms that suffix stripping alone would not reduce correctly.
IRREGULAR = {
    "died": "die",
    "was": "be",
    "were": "be",
    "began": "begin",
    "took": "take",
    "torn": "tear",
    "led": "lead",
    "left": "leave",
    "broke": "break",
    "ruled": "rule",
    "saw": "see",
    "made": "make",
}


def simple_lemma(token: str) -> str:
    """Return a heuristic lemma for a trigger.

    The token is first passed through ``norm_trigger``. Known irregular
    forms are looked up in ``IRREGULAR``; otherwise the first matching
    suffix among 'ing', 'ed', 'es', 's' is removed, provided at least three
    characters remain. Anything else is returned normalized but unchanged.
    """
    word = norm_trigger(token)
    irregular = IRREGULAR.get(word)
    if irregular is not None:
        return irregular
    for suffix in ("ing", "ed", "es", "s"):
        stem_len = len(word) - len(suffix)
        # Require a stem longer than two characters before stripping.
        if stem_len > 2 and word.endswith(suffix):
            return word[:stem_len]
    return word

def canon_spaces_to_underscore(s: str) -> str:
    """Format a spaced label for reporting, e.g. 'process start' -> 'Process_start'.

    The first whitespace-separated word is capitalized (``str.capitalize``,
    which also lower-cases the rest of that word); remaining words are kept
    as they are and joined with underscores. Non-string input is returned
    untouched.
    """
    if not isinstance(s, str):
        return s
    head, *tail = re.split(r'\s+', s.strip())
    return "_".join([head.capitalize(), *tail])

# ---------------- PARSING -------------------------
# Matches "Event type: <type>. Trigger: <trigger>" (case-insensitive).
# The type stops at the first '.', ':' or line break; the trigger stops at
# the first '.', '<' or line break.
EVENT_LINE_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?Event\s*type\s*:\s*(?P<etype>[^.\n\r:]+)\.\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)',
    flags=re.IGNORECASE
)


def parse_eventsep_output(text: str):
    """Extract events from one raw model output.

    Returns a list of ``(open_type_norm, trigger_norm, trigger_lemma)``
    tuples in order of appearance. Blank or non-string input gives ``[]``.
    Matches whose normalized type or trigger is empty are dropped.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    segments = [seg for seg in text.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        # Only separators and whitespace: fall back to scanning the raw text.
        segments = [text]

    events = []
    for seg in segments:
        for match in EVENT_LINE_RE.finditer(seg):
            etype = norm_type(match.group("etype"))
            trigger = norm_trigger(match.group("trig"))
            if not (etype and trigger):
                continue
            events.append((etype, trigger, simple_lemma(trigger)))
    return events

# ---------------- LOAD TABLES ---------------------
# Results table: read as strings with missing cells turned into "", so the
# grouping columns never contain NaN. The reader is chosen from the file suffix.
ext = Path(RESULTS_PATH).suffix.lower()
if ext == ".xlsx":
    results = pd.read_excel(RESULTS_PATH, dtype=str).fillna("")
else:
    results = pd.read_csv(RESULTS_PATH, dtype=str).fillna("")
required_cols = {"chunk_id","output_raw","model_name","model_version","condition_shots"}
if not required_cols.issubset(results.columns):
    raise ValueError(f"[Results] required columns: {required_cols}")
# Rows whose output_raw is blank are dropped here and are not evaluated.
results = results[results["output_raw"].astype(str).str.strip() != ""].copy()

# Gold table: one 'events_norm' cell per chunk_id.
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if not {"chunk_id","events_norm"}.issubset(gold.columns):
    raise ValueError("[Gold] must contain: chunk_id, events_norm")

# --------- SCHEMA + TRIGGER PRIORS FROM GOLD ----------
# One pass over the gold 'events_norm' cells ("Type|trigger ; Type|trigger")
# collects how often each trigger lemma occurs under each type and overall.
trig_counts_by_type = {}     # {type_canon: {trig_lemma: count}}
trig_global_counts  = {}     # {trig_lemma: total_count}
for events_cell in gold["events_norm"]:
    if not isinstance(events_cell, str):
        continue
    for entry in (part.strip() for part in events_cell.split(";")):
        if "|" not in entry:
            continue
        raw_type, raw_trigger = entry.split("|", 1)
        canon = raw_type.strip()
        lemma = simple_lemma(raw_trigger)
        per_type = trig_counts_by_type.setdefault(canon, {})
        per_type[lemma] = per_type.get(lemma, 0) + 1
        trig_global_counts[lemma] = trig_global_counts.get(lemma, 0) + 1

# Every type seen above owns a key in trig_counts_by_type, so its keys are
# exactly the schema.
schema_types = sorted(trig_counts_by_type)       # canonical labels (as in gold)
schema_norm  = [norm_type(label) for label in schema_types]
norm2canon   = dict(zip(schema_norm, schema_types))

# Precompute trigger prior ratios per type
# prior[type][trig] = count(type,trig) / sum_over_types count(.,trig)
trig_prior = {
    canon: {
        lemma: count / trig_global_counts[lemma]
        for lemma, count in per_type.items()
    }
    for canon, per_type in trig_counts_by_type.items()
}

# --------------- OPTIONAL: ALIASES ---------------------
# alias_map: normalized alias -> schema type exactly as written in the file.
# It stays empty when the file is missing or lacks the two expected columns.
alias_map = {}
if os.path.exists(SCHEMA_ALIASES):
    alias_df = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"}.issubset(alias_df.columns):
        for raw_alias, raw_target in zip(alias_df["alias"], alias_df["schema_type"]):
            alias_key = norm_type(raw_alias)
            target = str(raw_target).strip()
            # Later rows override earlier ones for the same alias.
            if alias_key and target:
                alias_map[alias_key] = target  # canonical expected

# --------------- SIMILARITY INDEXES --------------------
def expand_label(s: str) -> str:
    """Return the label followed by its own tokens, space-joined.

    Tokens are obtained by splitting on runs of '_', '-' and whitespace, so
    'a_b-c' becomes 'a_b-c a b c'. The repetition gives TF-IDF more weight
    on the individual words.
    """
    pieces = [s]
    pieces.extend(re.split(r'[_\-\s]+', s))
    return " ".join(pieces)

# TF-IDF over normalized schema labels (type names)
# Character n-grams of length 3-5; row i of X_types belongs to schema_types[i].
tfidf_type = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_types = tfidf_type.fit_transform([expand_label(x) for x in schema_norm])

# TF-IDF over triggers-of-each-type (join all triggers for that type)
# One document per type, in schema_types order, so row i of X_trigs also
# belongs to schema_types[i].
type_trigger_corpus = []
for tp in schema_types:
    bag = []
    for tri, cnt in trig_counts_by_type.get(tp, {}).items():
        bag += [tri] * cnt  # repeat by frequency
    type_trigger_corpus.append(" ".join(bag) if bag else "")
tfidf_trig = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_trigs = tfidf_trig.fit_transform(type_trigger_corpus)

# Optional sentence embeddings of the normalized labels (same row order).
if USE_EMBEDDINGS:
    from sentence_transformers import SentenceTransformer
    emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    X_types_emb = emb_model.encode(schema_norm, normalize_embeddings=True)

def tfidf_sim_type_label(q_norm: str) -> np.ndarray:
    """Cosine similarity of a normalized open type to every schema label.

    The query is expanded with ``expand_label`` exactly as the schema labels
    were when ``tfidf_type`` was fitted. Entry i of the returned 1-D array
    refers to ``schema_types[i]``.
    """
    query_vec = tfidf_type.transform([expand_label(q_norm)])
    sims = cosine_similarity(query_vec, X_types)
    return sims.ravel()

def tfidf_sim_trigger_to_type(trig_lemma: str) -> np.ndarray:
    """Cosine similarity of one trigger lemma to each type's bag of gold triggers.

    Entry i of the returned 1-D array refers to ``schema_types[i]``.
    """
    lemma_vec = tfidf_trig.transform([trig_lemma])
    sims = cosine_similarity(lemma_vec, X_trigs)
    return sims.ravel()

# ----------------- MAPPING FUNCTION --------------------
def map_open_to_schema(open_type: str, trig: str, trig_lemma: str):
    """
    Map open_type to canonical schema using:
    1) Alias (if provided)
    2) Exact normalized label
    3) Light morphology (strip/add ing/ed/s)
    4) High-confidence trigger prior (prop>=PRIOR_PROP_MIN & count>=PRIOR_COUNT_MIN)
    5) Hybrid score (type-label similarity + trigger evidence)

    Args:
        open_type: free-form type label produced by the model.
        trig: normalized trigger text (accepted for interface symmetry; the
            body below does not read it).
        trig_lemma: lemma of the trigger, used for prior lookups and for
            trigger TF-IDF similarity.

    Returns:
        (schema_type, method, score). schema_type is a canonical label or
        'UNMAPPED'; method is one of 'alias', 'exact', 'norm',
        'trigger_prior', 'hybrid', 'hybrid+embed', 'unmapped'. score is 1.0
        for alias/exact/norm, the prior proportion for 'trigger_prior', and
        the best fused score otherwise.
    """
    q = norm_type(open_type)

    # 1) alias
    if q in alias_map:
        return alias_map[q], "alias", 1.0

    # 2) exact match
    if q in norm2canon:
        return norm2canon[q], "exact", 1.0

    # 3) morphology: try the label with one trailing ings/ing/ed/s removed,
    #    and that base with 'ing' / 'ed' appended
    base = re.sub(r'(ings|ing|ed|s)$', '', q)
    for cand in {q, base, base+"ing", base+"ed"}:
        if cand in norm2canon:
            return norm2canon[cand], "norm", 1.0

    # 4) Trigger PRIOR (high-confidence)
    #    pick top type for this trigger lemma; accept if dominance & support are high
    top_tp, top_prop, top_cnt = None, 0.0, 0
    total = trig_global_counts.get(trig_lemma, 0)
    if total > 0:
        for tp in schema_types:
            prop = trig_prior.get(tp, {}).get(trig_lemma, 0.0)
            # strict '>' keeps the first type (in schema_types order) on ties
            if prop > top_prop:
                top_tp, top_prop = tp, prop
        top_cnt = trig_counts_by_type.get(top_tp, {}).get(trig_lemma, 0) if top_tp else 0
        if top_tp and top_prop >= PRIOR_PROP_MIN and top_cnt >= PRIOR_COUNT_MIN:
            return top_tp, "trigger_prior", float(top_prop)

    # 5) Hybrid scoring
    # 5.1 Type label similarity (open_type -> schema label)
    sim_type_label = tfidf_sim_type_label(q)                         # vector over types
    # add a small fuzzy component (scaled to 0..1)
    fuzzy = np.array([fuzz_ratio(q, t)/100 for t in schema_norm])
    type_label_score = 0.65*sim_type_label + 0.35*fuzzy

    # 5.2 Trigger evidence (trigger lemma vs. each type's trigger-bag + prior weight)
    sim_trigger_vec = tfidf_sim_trigger_to_type(trig_lemma)
    prior_vec = np.array([trig_prior.get(tp, {}).get(trig_lemma, 0.0) for tp in schema_types])
    trigger_evidence = 0.6*sim_trigger_vec + 0.4*prior_vec

    # 5.3 Token Jaccard (minor)
    def jaccard(a, b):
        # overlap of whitespace tokens; 0.0 when either side has no tokens
        A, B = set(a.split()), set(b.split())
        return len(A & B)/len(A | B) if A and B else 0.0
    jacc = np.array([jaccard(q, t) for t in schema_norm])

    fused = 0.35*type_label_score + 0.55*trigger_evidence + 0.10*jacc

    # Optional embeddings: blended into the fused vector (0.7 fused / 0.3 embedding)
    method = "hybrid"
    if USE_EMBEDDINGS:
        v = emb_model.encode([q], normalize_embeddings=True)[0]
        sims = (X_types_emb @ v).ravel()
        # soft fuse
        fused = 0.7*fused + 0.3*sims
        method = "hybrid+embed"

    best_idx = int(np.argmax(fused))
    best_score = float(fused[best_idx])

    # Accept if fused score or type-label tf-idf is strong enough
    # NOTE(review): the tf-idf guard looks at the maximum over ALL types, not
    # at sim_type_label[best_idx], so a strong label match to one type can
    # admit a different, low-scoring fused winner. Confirm this is intended.
    if best_score >= FUSED_THRESHOLD or float(np.max(sim_type_label)) >= TFIDF_THRESHOLD:
        return schema_types[best_idx], method, best_score

    return "UNMAPPED", "unmapped", best_score

# --------------- GOLD PAIRSETS --------------------
def gold_pairs_for_chunk(events_norm_str: str):
    """Turn a gold cell 'Type|trigger ; Type|trigger' into a set of pairs.

    Each pair is ``(norm_type(type), norm_trigger(trigger))``. Items lacking
    a '|' are skipped; a non-string cell yields an empty set.
    """
    if not isinstance(events_norm_str, str):
        return set()
    fields = (chunk.strip() for chunk in events_norm_str.split(";"))
    # Split on the first '|' only, so a trigger may itself contain '|'.
    split_items = (field.split("|", 1) for field in fields if "|" in field)
    return {(norm_type(label), norm_trigger(trigger)) for label, trigger in split_items}

# Gold pair-sets keyed by chunk_id, built once and reused for every group.
gold_by_chunk = {cid: gold_pairs_for_chunk(ev) for cid, ev in zip(gold["chunk_id"], gold["events_norm"])}
# Total gold pairs; this is the FN count of a group that predicts nothing.
gold_pair_total = sum(len(gset) for gset in gold_by_chunk.values())

# --------------- EVALUATION (per group) -------------
# For every (model_name, model_version, condition_shots) group: parse the raw
# outputs, map each open type onto the schema, then score the predicted
# (type, trigger) sets against gold with micro precision / recall / F1.
group_cols = ["model_name","model_version","condition_shots"]
metrics_rows, mapped_frames = [], []

for gkeys, gdf in results.groupby(group_cols):
    group = dict(zip(group_cols, gkeys))

    parsed = []
    for _, r in gdf.iterrows():
        cid = r["chunk_id"]
        for (ot, tr, tri_lemma) in parse_eventsep_output(r["output_raw"]):
            parsed.append({"chunk_id": cid, "open_type": ot, "trigger": tr, "trig_lemma": tri_lemma})

    if not parsed:
        # Nothing parseable was predicted, so every gold pair is a miss.
        # Recording FN=0 here would understate FN in the overall micro totals,
        # which sum this column across groups.
        metrics_rows.append({**group, "TP":0,"FP":0,"FN":gold_pair_total,"precision":0.0,"recall":0.0,"f1":0.0,"pred_events":0})
        continue

    mapped = []
    for pr in parsed:
        sch, how, sc = map_open_to_schema(pr["open_type"], pr["trigger"], pr["trig_lemma"])
        mapped.append({**pr, "schema_type": sch, "method": how, "score": sc})
    mapped_df = pd.DataFrame(mapped)

    # build predicted pairs per chunk (sets: duplicate predictions collapse)
    pred_by_chunk = {}
    for cid, sub in mapped_df.groupby("chunk_id"):
        pairs = set((norm_type(t), norm_trigger(tr)) for t, tr in zip(sub["schema_type"], sub["trigger"]))
        pred_by_chunk[cid] = pairs

    # micro TP/FP/FN
    # Only chunk ids present in gold are visited: predictions for a chunk id
    # that is absent from gold_by_chunk are not counted as FP.
    # 'UNMAPPED' predictions stay in the pair sets (as type 'unmapped') and
    # therefore count as FP unless gold contains that exact pair.
    TP = FP = FN = 0
    for cid in gold_by_chunk:
        gset = gold_by_chunk.get(cid, set())
        pset = pred_by_chunk.get(cid, set())
        TP += len(gset & pset)
        FP += len(pset - gset)
        FN += len(gset - pset)

    P = TP/(TP+FP) if (TP+FP) else 0.0
    R = TP/(TP+FN) if (TP+FN) else 0.0
    F1 = 2*P*R/(P+R) if (P+R) else 0.0

    # pred_events is the row count before set de-duplication, so it can
    # exceed TP + FP.
    metrics_rows.append({**group, "TP":TP,"FP":FP,"FN":FN,
                         "precision":round(P,4),"recall":round(R,4),"f1":round(F1,4),
                         "pred_events":len(mapped_df)})

    # Keep the mapped rows (tagged with their group) for the audit file.
    mapped_df["model_name"]      = group["model_name"]
    mapped_df["model_version"]   = group["model_version"]
    mapped_df["condition_shots"] = group["condition_shots"]
    mapped_df["schema_type"]     = mapped_df["schema_type"].apply(lambda x: canon_spaces_to_underscore(x) if x!="UNMAPPED" else x)
    mapped_frames.append(mapped_df)

# --------------- WRITE OUTPUTS -------------------
# Per-group metrics, ordered by the grouping columns.
metrics_path = f"{OUT_DIR}/Metrics_by_group.csv"
metrics = pd.DataFrame(metrics_rows).sort_values(group_cols)
metrics.to_csv(metrics_path, index=False, encoding="utf-8")

# Audit table of every mapped event; written only if some group produced events.
mapped_path = None
if mapped_frames:
    mapped_path = f"{OUT_DIR}/Mapped_Results_by_group.csv"
    mapped_all = pd.concat(mapped_frames, ignore_index=True)
    mapped_all.to_csv(mapped_path, index=False, encoding="utf-8")

print(f"[OK] Wrote metrics -> {metrics_path}")
if mapped_path is not None:
    print(f"[OK] Wrote mapped events -> {mapped_path}")

# Optional: overall micro (counts pooled over all groups)
TP, FP, FN = (metrics[col].sum() for col in ("TP", "FP", "FN"))
P  = TP/(TP+FP) if (TP+FP) else 0.0
R  = TP/(TP+FN) if (TP+FN) else 0.0
F1 = 2*P*R/(P+R) if (P+R) else 0.0
overall_report = f"Micro Precision: {P:.4f}\nMicro Recall: {R:.4f}\nMicro F1: {F1:.4f}\nTP={TP}, FP={FP}, FN={FN}\n"
with open(f"{OUT_DIR}/Metrics_overall_micro.txt","w",encoding="utf-8") as f:
    f.write(overall_report)
print(f"[OK] Overall micro -> P={P:.4f} R={R:.4f} F1={F1:.4f}")


[OK] Wrote metrics -> /content/out_eval_v2/Metrics_by_group.csv
[OK] Wrote mapped events -> /content/out_eval_v2/Mapped_Results_by_group.csv
[OK] Overall micro -> P=0.4150 R=0.3333 F1=0.3697


In [None]:
# ==========================================================
# Ultra-Smart Post-Processing & Evaluation (Ensembled + Tuner)
# ----------------------------------------------------------
# Signals used for mapping open_type -> canonical schema:
#  1) Manual aliases (optional)
#  2) Exact / light-morph match (death ~ dying ~ died)
#  3) Type-Label similarity (TF-IDF char n-grams + fuzzy + token Jaccard)
#  4) Trigger priors learned from GOLD (lemma-level dominance + frequency)
#  5) Trigger → Type similarity (TF-IDF: trigger vs bag-of-triggers per type)
#  6) Context similarity (TF-IDF: CHUNK TEXT vs bag-of-texts per type)  [requires PackA]
#  7) Optional embeddings for type labels (SentenceTransformers)
#
# Fusion:
#  final_score = w1*TypeLabel + w2*TriggerEvidence + w3*Context + w4*Jaccard
#  where TriggerEvidence mixes (trigger prior + trigger TF-IDF).
#
# A lightweight tuner searches weights to maximize micro F1 on a dev split.
# Then evaluates per (model_name, model_version, condition_shots).
# ==========================================================

import os, re, math, random
import numpy as np
import pandas as pd
from pathlib import Path

# -------------------- CONFIG --------------------
# Input/output locations. OUT_DIR is created on the spot if it is missing.
RESULTS_PATH   = "/content/results.xlsx"
PACKB_GOLD     = "/content/PackB_Gold.csv"
# NOTE(review): if this file does not exist the context signal is silently
# all zeros (packA stays None); verify the path matches where PackA lives.
PACKA_PATH     = "/content/packs/PackA_TextChunks.csv"      # optional
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"        # optional
OUT_DIR        = "/content/out_eval_ultra"
os.makedirs(OUT_DIR, exist_ok=True)

# Tuning and thresholds
USE_TUNER        = True     # set False to skip weight search
DEV_FRACTION     = 0.25     # fraction of chunk ids held out for tuning (the split is a seeded random shuffle of chunk ids)
N_TRIALS         = 60       # random search trials
FUSED_MIN_ACCEPT = 0.38     # minimum fused score to accept a mapping
TFIDF_MIN_ACCEPT = 0.33     # minimum type-label tfidf (max over all types) to accept (guard)

# Optional semantic embeddings for type label similarity (off by default)
USE_EMBEDDINGS   = False

# Fixed seeds so the chunk shuffle and the sampled weights are repeatable.
random.seed(42)
np.random.seed(42)

# -------------- Dependencies (install-once) --------------
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

if USE_EMBEDDINGS:
    try:
        from sentence_transformers import SentenceTransformer
    except Exception:
        !pip -q install sentence-transformers
        from sentence_transformers import SentenceTransformer

# ----------------- Normalizers -----------------
def norm_type(s: str) -> str:
    """Normalize a type label for matching.

    Lower-cases the label and collapses every run of underscores, hyphens
    and whitespace into one space, so 'Process_start', 'process-start' and
    ' process  start ' all become 'process start'. Non-string input yields ''.
    """
    if not isinstance(s, str):
        return ""
    # Strip *after* collapsing: a label that starts or ends with '_' or '-'
    # would otherwise keep a stray edge space and miss exact-label lookups.
    return re.sub(r'[_\-\s]+', ' ', s.lower()).strip()

def norm_trigger(s: str) -> str:
    """Normalize a trigger string for pair matching.

    Lower-cases the text, removes any trailing mix of whitespace and
    '.', ';', ',', ':', '-' characters, and collapses internal whitespace
    runs to one space. Non-string input yields ''.
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    # Whitespace is part of the trailing class so that 'broke ;' becomes
    # 'broke' rather than 'broke ' (which would never equal a gold trigger).
    s = re.sub(r'[\s.;,:\-]+$', '', s)
    return re.sub(r'\s+', ' ', s)

# Irregular forms that suffix stripping alone would not reduce correctly.
IRREGULAR = {
    "died": "die",
    "was": "be",
    "were": "be",
    "began": "begin",
    "took": "take",
    "torn": "tear",
    "led": "lead",
    "left": "leave",
    "broke": "break",
    "ruled": "rule",
    "saw": "see",
    "made": "make",
}


def simple_lemma(token: str) -> str:
    """Return a heuristic lemma for a trigger.

    The token is first passed through ``norm_trigger``. Known irregular
    forms are looked up in ``IRREGULAR``; otherwise the first matching
    suffix among 'ing', 'ed', 'es', 's' is removed, provided at least three
    characters remain. Anything else is returned normalized but unchanged.
    """
    word = norm_trigger(token)
    irregular = IRREGULAR.get(word)
    if irregular is not None:
        return irregular
    for suffix in ("ing", "ed", "es", "s"):
        stem_len = len(word) - len(suffix)
        # Require a stem longer than two characters before stripping.
        if stem_len > 2 and word.endswith(suffix):
            return word[:stem_len]
    return word

def canon_spaces_to_underscore(s: str) -> str:
    """Format a spaced label for reporting, e.g. 'process start' -> 'Process_start'.

    The first whitespace-separated word is capitalized (``str.capitalize``,
    which also lower-cases the rest of that word); remaining words are kept
    as they are and joined with underscores. Non-string input is returned
    untouched.
    """
    if not isinstance(s, str):
        return s
    head, *tail = re.split(r'\s+', s.strip())
    return "_".join([head.capitalize(), *tail])

# ----------------- Parsing ---------------------
# Matches "Event type: <type>. Trigger: <trigger>" (case-insensitive).
# The type stops at the first '.', ':' or line break; the trigger stops at
# the first '.', '<' or line break.
EVENT_LINE_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?Event\s*type\s*:\s*(?P<etype>[^.\n\r:]+)\.\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)',
    flags=re.IGNORECASE
)


def parse_eventsep_output(text: str):
    """Extract events from one raw model output.

    Returns a list of ``(open_type_norm, trigger_norm, trigger_lemma)``
    tuples in order of appearance. Blank or non-string input gives ``[]``.
    Matches whose normalized type or trigger is empty are dropped.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    segments = [seg for seg in text.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        # Only separators and whitespace: fall back to scanning the raw text.
        segments = [text]

    events = []
    for seg in segments:
        for match in EVENT_LINE_RE.finditer(seg):
            etype = norm_type(match.group("etype"))
            trigger = norm_trigger(match.group("trig"))
            if not (etype and trigger):
                continue
            events.append((etype, trigger, simple_lemma(trigger)))
    return events

# --------------- Load tables -------------------
# Results table: read as strings with missing cells turned into "", so the
# grouping columns never contain NaN. The reader is chosen from the file suffix.
ext = Path(RESULTS_PATH).suffix.lower()
if ext == ".xlsx":
    results = pd.read_excel(RESULTS_PATH, dtype=str).fillna("")
else:
    results = pd.read_csv(RESULTS_PATH, dtype=str).fillna("")

req = {"chunk_id","output_raw","model_name","model_version","condition_shots"}
if not req.issubset(results.columns):
    raise ValueError(f"[Results] must contain: {req}")
# Rows whose output_raw is blank are dropped here and are not evaluated.
results = results[results["output_raw"].astype(str).str.strip() != ""].copy()

# Gold table: one 'events_norm' cell per chunk_id.
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if not {"chunk_id","events_norm"}.issubset(gold.columns):
    raise ValueError("[Gold] must contain: chunk_id, events_norm")

# Optional PackA for context text
# packA stays None when the file is missing OR lacks chunk_id/text columns;
# in both cases no error is raised and the context signal is simply disabled.
packA = None
if os.path.exists(PACKA_PATH):
    packA = pd.read_csv(PACKA_PATH, dtype=str).fillna("")
    if not {"chunk_id","text"}.issubset(packA.columns):
        packA = None

# ------------- Schema + priors from GOLD -------------
# One pass over gold collects, per type, the trigger-lemma counts and the
# chunk ids in which the type occurs, plus overall trigger-lemma counts.
trig_counts_by_type = {}   # {type_canon: {trig_lemma: count}}
trig_global_counts  = {}   # {trig_lemma: total_count}
chunks_by_type      = {}   # for context (list of chunk_ids)

for cid, events_cell in zip(gold["chunk_id"], gold["events_norm"]):
    if not isinstance(events_cell, str):
        continue
    types_in_chunk = set()
    for entry in (part.strip() for part in events_cell.split(";")):
        if "|" not in entry:
            continue
        raw_type, raw_trigger = entry.split("|", 1)
        canon = raw_type.strip()
        lemma = simple_lemma(raw_trigger)

        per_type = trig_counts_by_type.setdefault(canon, {})
        per_type[lemma] = per_type.get(lemma, 0) + 1
        trig_global_counts[lemma] = trig_global_counts.get(lemma, 0) + 1

        types_in_chunk.add(canon)
    # A chunk is listed once per type, however many events of that type it has.
    for canon in types_in_chunk:
        chunks_by_type.setdefault(canon, []).append(cid)

# Every type seen above owns a key in trig_counts_by_type, so its keys are
# exactly the schema.
schema_types = sorted(trig_counts_by_type)
schema_norm  = [norm_type(label) for label in schema_types]
norm2canon   = dict(zip(schema_norm, schema_types))

# Trigger prior ratios: share of a lemma's gold occurrences that fall under each type
trig_prior = {
    canon: {
        lemma: count / trig_global_counts[lemma]
        for lemma, count in per_type.items()
    }
    for canon, per_type in trig_counts_by_type.items()
}

# ------------- Alias table (optional) -------------
# alias_map: normalized alias -> schema type exactly as written in the file.
# It stays empty when the file is missing or lacks the two expected columns.
alias_map = {}
if os.path.exists(SCHEMA_ALIASES):
    alias_df = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"}.issubset(alias_df.columns):
        for raw_alias, raw_target in zip(alias_df["alias"], alias_df["schema_type"]):
            alias_key = norm_type(raw_alias)
            target = str(raw_target).strip()
            # Later rows override earlier ones for the same alias.
            if alias_key and target:
                alias_map[alias_key] = target

# ------------- Vector spaces (TF-IDF) -------------
def expand_label(s: str) -> str:
    """Return the label followed by its own tokens, space-joined.

    Tokens are obtained by splitting on runs of '_', '-' and whitespace, so
    'a_b-c' becomes 'a_b-c a b c'.
    """
    pieces = [s]
    pieces.extend(re.split(r'[_\-\s]+', s))
    return " ".join(pieces)

# Type label TF-IDF
# Character n-grams of length 3-5; row i of X_types belongs to schema_types[i].
tfidf_type = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_types = tfidf_type.fit_transform([expand_label(x) for x in schema_norm])

# Trigger bag TF-IDF (per type)
# One document per type (its gold trigger lemmas, each repeated by its count),
# in schema_types order, so row i of X_trigs also belongs to schema_types[i].
type_trigger_docs = []
for tp in schema_types:
    bag = []
    for tri, cnt in trig_counts_by_type.get(tp, {}).items():
        bag += [tri] * cnt
    type_trigger_docs.append(" ".join(bag))
tfidf_trig = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_trigs = tfidf_trig.fit_transform(type_trigger_docs)

# Context TF-IDF (per type) using chunk texts (if PackA available)
# Each type's document is the concatenated text of the gold chunks that
# contain it; chunk ids missing from PackA are skipped. X_ctx stays None
# (context disabled) when PackA was not loaded.
# NOTE(review): if no gold chunk id is found in PackA every document is "",
# and fitting presumably fails with an empty-vocabulary error - confirm.
X_ctx = None
if packA is not None:
    chunk_text = dict(zip(packA["chunk_id"], packA["text"]))
    type_ctx_docs = []
    for tp in schema_types:
        texts = [chunk_text[cid] for cid in chunks_by_type.get(tp, []) if cid in chunk_text]
        type_ctx_docs.append(" ".join(texts) if texts else "")
    tfidf_ctx = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
    X_ctx = tfidf_ctx.fit_transform(type_ctx_docs)

# Optional embeddings for type label (same row order as schema_norm)
if USE_EMBEDDINGS:
    emb_model = SentenceTransformer("all-MiniLM-L6-v2")
    X_types_emb = emb_model.encode(schema_norm, normalize_embeddings=True)

# ------------- Similarity helpers -------------
def tfidf_type_sim(q_norm: str) -> np.ndarray:
    """Cosine similarity of a normalized open type to every schema label.

    The query is expanded with ``expand_label`` exactly as the schema labels
    were when ``tfidf_type`` was fitted. Entry i of the returned 1-D array
    refers to ``schema_types[i]``.
    """
    query_vec = tfidf_type.transform([expand_label(q_norm)])
    sims = cosine_similarity(query_vec, X_types)
    return sims.ravel()

def tfidf_trigger_sim(tri: str) -> np.ndarray:
    """Cosine similarity of one trigger lemma to each type's bag of gold triggers.

    Entry i of the returned 1-D array refers to ``schema_types[i]``.
    """
    lemma_vec = tfidf_trig.transform([tri])
    sims = cosine_similarity(lemma_vec, X_trigs)
    return sims.ravel()

def tfidf_context_sim(text: str) -> np.ndarray:
    """Cosine similarity of a chunk's text to each type's context document.

    Returns a zero vector (one entry per schema type) when no context index
    was built, i.e. when ``X_ctx`` is None.
    """
    if X_ctx is not None:
        text_vec = tfidf_ctx.transform([text])
        return cosine_similarity(text_vec, X_ctx).ravel()
    return np.zeros(len(schema_types))

def token_jaccard(a: str, b: str) -> float:
    """Jaccard overlap of the whitespace-separated tokens of two strings.

    Returns 0.0 when either string contains no tokens.
    """
    left = set(a.split())
    right = set(b.split())
    if not left or not right:
        return 0.0
    return len(left & right) / len(left | right)

# ------------- Mapping core (parametric) -------------
def map_one(open_type, trig, tri_lemma, chunk_text, weights):
    """
    Map one predicted open type onto the schema with a weighted signal fusion.

    Args:
        open_type: free-form type label produced by the model.
        trig: normalized trigger text (the body below does not read it).
        tri_lemma: trigger lemma, used for prior lookups and trigger TF-IDF.
        chunk_text: text of the chunk; a non-string disables the context signal.
        weights: dict with keys {'w_type','w_trigger','w_ctx','w_jacc'} summing ~1

    Returns:
        (schema_type, method, score). schema_type is a canonical label or
        'UNMAPPED'; method is 'alias', 'exact', 'norm', 'fused' or
        'unmapped'; score is 1.0 for alias/exact/norm and the best fused
        score otherwise.
    """
    q = norm_type(open_type)

    # 0) alias
    if q in alias_map:
        return alias_map[q], "alias", 1.0

    # 1) exact / morph
    if q in norm2canon:
        return norm2canon[q], "exact", 1.0
    # morph: label with one trailing ings/ing/ed/s removed, and that base + ing/ed
    base = re.sub(r'(ings|ing|ed|s)$', '', q)
    for cand in {q, base, base+"ing", base+"ed"}:
        if cand in norm2canon:
            return norm2canon[cand], "norm", 1.0

    # 2) type-label similarity
    sims_type = tfidf_type_sim(q)
    fuzzy = np.array([fuzz_ratio(q, t)/100 for t in schema_norm])
    jacc  = np.array([token_jaccard(q, t) for t in schema_norm])
    type_signal = 0.70*sims_type + 0.30*fuzzy

    # 3) trigger evidence
    sims_trig = tfidf_trigger_sim(tri_lemma)
    prior_vec = np.array([trig_prior.get(tp, {}).get(tri_lemma, 0.0) for tp in schema_types])
    trigger_signal = 0.60*sims_trig + 0.40*prior_vec

    # 4) context (if available)
    ctx_signal = tfidf_context_sim(chunk_text) if isinstance(chunk_text, str) else np.zeros(len(schema_types))

    # 5) optional embeddings (small blend into type signal)
    if USE_EMBEDDINGS:
        v = emb_model.encode([q], normalize_embeddings=True)[0]
        sims_emb = (X_types_emb @ v).ravel()
        type_signal = 0.85*type_signal + 0.15*sims_emb

    # 6) fuse
    fused = (weights["w_type"]*type_signal +
             weights["w_trigger"]*trigger_signal +
             weights["w_ctx"]*ctx_signal +
             weights["w_jacc"]*jacc)

    best_idx = int(np.argmax(fused))
    best_sc  = float(fused[best_idx])

    # guard: accept if fused strong or type tfidf strong
    # NOTE(review): the tf-idf guard looks at the maximum over ALL types, not
    # at sims_type[best_idx], so a strong label match to one type can admit a
    # different, low-scoring fused winner. Confirm this is intended.
    if best_sc >= FUSED_MIN_ACCEPT or float(np.max(sims_type)) >= TFIDF_MIN_ACCEPT:
        return schema_types[best_idx], "fused", best_sc
    return "UNMAPPED", "unmapped", best_sc

# ------------- Utility: score a set (micro) -------------
def micro_scores(gold_pairs_by_chunk, mapped_rows):
    """Micro precision / recall / F1 of mapped predictions against gold.

    ``mapped_rows`` is a DataFrame with 'chunk_id', 'schema_type' and
    'trigger' columns; ``gold_pairs_by_chunk`` maps chunk_id to a set of
    (norm_type, norm_trigger) pairs. Only chunk ids present in the gold
    mapping are scored. Returns ``(P, R, F1, TP, FP, FN)``.
    """
    # Predicted pair-set per chunk; duplicates collapse inside the set.
    predicted = {
        cid: {
            (norm_type(label), norm_trigger(trigger))
            for label, trigger in zip(rows["schema_type"], rows["trigger"])
        }
        for cid, rows in mapped_rows.groupby("chunk_id")
    }

    tp = fp = fn = 0
    nothing = set()
    for cid, gold_set in gold_pairs_by_chunk.items():
        pred_set = predicted.get(cid, nothing)
        hits = len(gold_set & pred_set)
        tp += hits
        fp += len(pred_set) - hits
        fn += len(gold_set) - hits

    precision = tp/(tp+fp) if (tp+fp) else 0.0
    recall = tp/(tp+fn) if (tp+fn) else 0.0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
    return precision, recall, f1, tp, fp, fn

def gold_pairs_for_chunk(s: str):
    """Parse a ``type|trigger;type|trigger;...`` string into a set of pairs.

    The input is coerced with ``str``; entries without a ``|`` are skipped.
    Each pair is normalized with ``norm_type`` / ``norm_trigger``.
    """
    entries = (piece.strip() for piece in str(s).split(";"))
    split_entries = (entry.split("|", 1) for entry in entries if "|" in entry)
    return {(norm_type(tp), norm_trigger(tr)) for tp, tr in split_entries}

# Gold (type, trigger) pair sets keyed by chunk_id.
gold_by_chunk = dict(zip(gold["chunk_id"], map(gold_pairs_for_chunk, gold["events_norm"])))

# ------------- Build a unified parsed frame -------------
# Chunk text lookup: prefer Pack A, otherwise fall back to a "text" column in results.
if packA is not None:
    text_map = dict(zip(packA["chunk_id"], packA["text"]))
elif "text" in results.columns:
    text_map = dict(zip(results["chunk_id"], results["text"]))
else:
    text_map = {}

# One record per event parsed out of each result row's raw output.
parsed_all = [
    {
        "chunk_id": r["chunk_id"],
        "model_name": r["model_name"],
        "model_version": r["model_version"],
        "condition_shots": r["condition_shots"],
        "open_type": ot,
        "trigger": tr,
        "tri_lemma": tri_lemma,
        "chunk_text": text_map.get(r["chunk_id"], ""),
    }
    for _, r in results.iterrows()
    for (ot, tr, tri_lemma) in parse_eventsep_output(r["output_raw"])
]
parsed_df = pd.DataFrame(parsed_all)
if parsed_df.empty:
    raise RuntimeError("No events parsed from outputs. Check 'output_raw' formatting.")

# ------------- Split dev/test for tuning -------------
# Chunks are shuffled once and the first DEV_FRACTION of them become the dev set;
# with the tuner disabled there is no dev set and every chunk is "test".
all_cids = parsed_df["chunk_id"].unique().tolist()
random.shuffle(all_cids)
n_dev = int(len(all_cids) * DEV_FRACTION) if USE_TUNER else 0
dev_ids, test_ids = set(all_cids[:n_dev]), set(all_cids[n_dev:])

if USE_TUNER:
    dev_df = parsed_df[parsed_df["chunk_id"].isin(dev_ids)].copy()
    test_df = parsed_df[parsed_df["chunk_id"].isin(test_ids)].copy()
else:
    dev_df = None
    test_df = parsed_df.copy()

# ------------- Random weight tuner -------------
def random_weights():
    """Draw a random set of fusion weights that sum to 1.

    Four values are sampled with ``np.random.rand`` and divided by their sum,
    then returned under the keys ``w_type``, ``w_trigger``, ``w_ctx`` and
    ``w_jacc`` (in that order).
    """
    names = ("w_type", "w_trigger", "w_ctx", "w_jacc")
    draws = np.random.rand(len(names))
    shares = draws / draws.sum()
    return dict(zip(names, shares))

# Fallback fusion weights; kept as-is when tuning is skipped.
best_w = {"w_type":0.40, "w_trigger":0.45, "w_ctx":0.10, "w_jacc":0.05}  # sensible default

run_tuner = bool(USE_TUNER) and not dev_df.empty
if run_tuner:
    # Gold restricted to the dev chunks, so each trial is scored on dev only.
    dev_gold = {cid: gold_by_chunk.get(cid, set()) for cid in dev_ids}
    best_f1 = -1.0
    for _trial in range(N_TRIALS):
        w = random_weights()
        # Re-map every dev event with the trial weights.
        trial_rows = pd.DataFrame([
            {
                "chunk_id": row["chunk_id"],
                "schema_type": map_one(row["open_type"], row["trigger"],
                                       row["tri_lemma"], row["chunk_text"], w)[0],
                "trigger": row["trigger"],
            }
            for _, row in dev_df.iterrows()
        ])
        trial_f1 = micro_scores(dev_gold, trial_rows)[2]
        # Strict ">" keeps the earliest trial among ties.
        if trial_f1 > best_f1:
            best_f1, best_w = trial_f1, w
    print(f"[TUNER] Best weights on dev: {best_w}  (F1={best_f1:.4f})")
else:
    print(f"[TUNER] Skipped. Using default weights: {best_w}")

# ------------- Evaluate per (model/version/shots) -------------
group_cols = ["model_name","model_version","condition_shots"]
metrics_rows = []
mapped_frames = []


def _canon_unless_unmapped(label):
    """Canonicalize a schema label for output, leaving the UNMAPPED marker untouched."""
    if label == "UNMAPPED":
        return label
    return canon_spaces_to_underscore(label)


for gkeys, gsub in parsed_df.groupby(group_cols):
    ginfo = dict(zip(group_cols, gkeys))

    # Map every parsed event of this group with the selected weights.
    records = []
    for _, row in gsub.iterrows():
        schema, method, score = map_one(row["open_type"], row["trigger"],
                                        row["tri_lemma"], row["chunk_text"], best_w)
        records.append({
            "chunk_id": row["chunk_id"],
            "schema_type": schema,
            "trigger": row["trigger"],
            "open_type": row["open_type"],
            "method": method,
            "score": score,
        })
    mapped = pd.DataFrame(records)

    # Score before canonicalizing labels for output.
    P, R, F1, TP, FP, FN = micro_scores(gold_by_chunk, mapped)
    metrics_rows.append({
        **ginfo,
        "TP": TP, "FP": FP, "FN": FN,
        "precision": round(P, 4), "recall": round(R, 4), "f1": round(F1, 4),
        "pred_events": len(mapped),
    })

    for col in group_cols:
        mapped[col] = ginfo[col]
    mapped["schema_type"] = mapped["schema_type"].apply(_canon_unless_unmapped)
    mapped_frames.append(mapped)

# ------------- Write outputs --------------------
# Per-group metrics table.
metrics = pd.DataFrame(metrics_rows).sort_values(group_cols)
metrics_path = f"{OUT_DIR}/Metrics_by_group.csv"
metrics.to_csv(metrics_path, index=False, encoding="utf-8")

# Mapped events of all groups; only written when there is something to write.
mapped_path = None
mapped_all = pd.concat(mapped_frames, ignore_index=True) if mapped_frames else pd.DataFrame()
if not mapped_all.empty:
    mapped_path = f"{OUT_DIR}/Mapped_Results_by_group.csv"
    mapped_all.to_csv(mapped_path, index=False, encoding="utf-8")

print(f"[OK] Wrote metrics -> {metrics_path}")
if mapped_path:
    print(f"[OK] Wrote mapped events -> {mapped_path}")

# Overall micro: pool TP/FP/FN over all groups, then derive P/R/F1.
TP, FP, FN = (metrics[col].sum() for col in ("TP", "FP", "FN"))
n_predicted = TP + FP
n_gold = TP + FN
P = TP / n_predicted if n_predicted else 0.0
R = TP / n_gold if n_gold else 0.0
F1 = 2 * P * R / (P + R) if (P + R) else 0.0
overall_report = f"Micro Precision: {P:.4f}\nMicro Recall: {R:.4f}\nMicro F1: {F1:.4f}\nTP={TP}, FP={FP}, FN={FN}\n"
with open(f"{OUT_DIR}/Metrics_overall_micro.txt","w",encoding="utf-8") as f:
    f.write(overall_report)
print(f"[OK] Overall micro -> P={P:.4f} R={R:.4f} F1={F1:.4f}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m3.2/3.3 MB[0m [31m96.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[?25h[TUNER] Best weights on dev: {'w_type': np.float64(0.06301231571166711), 'w_trigger': np.float64(0.5829491484483261), 'w_ctx': np.float64(0.22386915722376707), 'w_jacc': np.float64(0.13016937861623953)}  (F1=0.2428)
[OK] Wrote metrics -> /content/out_eval_ultra/Metrics_by_group.csv
[OK] Wrote mapped events -> /content/out_eval_ultra/Mapped_Results_by_group.csv
[OK] Overall micro -> P=0.4060 R=0.3261 F1=0.3617


In [None]:
# ==========================================================
# Cross-Model Self-Consistency Ensemble (CMSE)
#   - Maps each raw output to schema types (smart mapper)
#   - Learns reliability weights per (model,version,shots)
#   - Aggregates predictions across all systems per chunk
#   - Selects a consensus set (duplicate-aware, trigger-checked)
#   - Evaluates micro P/R/F1
# ==========================================================

import os, re, math, random
import numpy as np
import pandas as pd
from pathlib import Path

# -------------------- CONFIG --------------------
# Input files. Results may be .xlsx or .csv (chosen by file extension when loaded).
RESULTS_PATH   = "/content/results.xlsx"              # must have chunk_id, model_name, model_version, condition_shots, output_raw
PACKB_GOLD     = ("/content/PackB_Gold.csv")          # must have chunk_id, events_norm
PACKA_PATH     = "/content/packs/PackA_TextChunks.csv" # optional (for trigger validation)
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"   # optional (alias,schema_type)

# Output directory; created immediately if it does not exist.
OUT_DIR        = "/content/out_cmse"
os.makedirs(OUT_DIR, exist_ok=True)

# Dev split for tuning reliabilities
DEV_FRAC        = 0.25   # fraction of (shuffled) chunk ids assigned to the dev set
N_TRIALS_WEIGHTS= 60     # number of random reliability-weight draws tried by the tuner

# Thresholds
FUSED_MIN_ACCEPT = 0.38   # mapping accept guard (same as ultra-smart)
TYPE_TFIDF_GUARD = 0.33   # alternative accept guard: best type-label TF-IDF similarity
CONS_TAU_BASE    = 0.75   # base consensus threshold (scaled)
NEAR_DUP_FUZZ    = 90     # triggers >= this fuzzy similarity are treated as duplicates

# Fixed seeds so the shuffle and the random weight search are repeatable.
random.seed(42)
np.random.seed(42)

# ---------------- Dependencies ----------------
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# ---------------- Normalizers -----------------
def ntype(s):
    """Normalize an event-type label for comparison.

    Strips surrounding whitespace, lowercases, and replaces every run of
    underscores, hyphens or whitespace with one space. Non-strings give "".
    """
    if isinstance(s, str):
        pieces = re.split(r'[_\-\s]+', s.strip().lower())
        return ' '.join(pieces)
    return ""

def ntrig(s):
    """Normalize a trigger string for comparison.

    Strips surrounding whitespace, lowercases, removes a trailing run of
    ``. ; , : -`` characters, then collapses each whitespace run to one space.
    Non-strings give "".
    """
    if not isinstance(s, str):
        return ""
    lowered = s.strip().lower()
    without_tail = re.sub(r'[.;,:\-]+$', '', lowered)
    return re.sub(r'\s+', ' ', without_tail)

# Irregular forms that the suffix stripping in lemma() cannot handle.
IRREG = {
    "died": "die", "was": "be", "were": "be", "began": "begin", "took": "take",
    "torn": "tear", "led": "lead", "left": "leave", "broke": "break", "ruled": "rule",
    "saw": "see", "made": "make",
}


def lemma(tok):
    """Reduce a trigger to a crude base form.

    The trigger is normalized with ``ntrig``; irregular forms are looked up in
    ``IRREG``. Otherwise the first of the suffixes ``ing``, ``ed``, ``es``,
    ``s`` that matches is removed, provided more than two characters remain.
    """
    word = ntrig(tok)
    if word in IRREG:
        return IRREG[word]
    stems = (
        word[:-len(suffix)]
        for suffix in ("ing", "ed", "es", "s")
        if word.endswith(suffix) and len(word) - len(suffix) > 2
    )
    return next(stems, word)

def canon_unders(s):
    """Join whitespace-separated words with underscores, capitalizing the first.

    ``str.capitalize`` is applied to the first word only (so the rest of that
    word is lowercased); the remaining words are appended unchanged.
    """
    head, *tail = re.split(r'\s+', s.strip())
    return head.capitalize() + ''.join(f'_{word}' for word in tail)

# --------------- Load data --------------------
# results: Excel or CSV depending on the file extension; all cells read as strings.
ext = Path(RESULTS_PATH).suffix.lower()
read_results = pd.read_excel if ext == ".xlsx" else pd.read_csv
res = read_results(RESULTS_PATH, dtype=str).fillna("")
need = {"chunk_id","model_name","model_version","condition_shots","output_raw"}
if not need <= set(res.columns):
    raise ValueError(f"results must have {need}")
# Drop rows whose raw output is blank.
has_output = res["output_raw"].astype(str).str.strip().ne("")
res = res[has_output].copy()

# gold
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if not {"chunk_id","events_norm"} <= set(gold.columns):
    raise ValueError("PackB_Gold.csv must have chunk_id, events_norm")

# optional PackA text (for trigger validation)
text_map = {}
if os.path.exists(PACKA_PATH):
    packA = pd.read_csv(PACKA_PATH, dtype=str).fillna("")
    if {"chunk_id","text"} <= set(packA.columns):
        text_map = dict(zip(packA["chunk_id"], packA["text"]))

# ---------------- Schema & priors from gold ----------------
schema_types = set()
trig_counts_by_type = {}   # canonical type -> {trigger lemma: count}
trig_global_counts  = {}   # trigger lemma -> count over all types

for events in gold["events_norm"]:
    if not isinstance(events, str):
        continue
    for entry in map(str.strip, events.split(";")):
        if "|" not in entry:
            continue
        raw_type, raw_trig = entry.split("|", 1)
        tp_c, tri = raw_type.strip(), lemma(raw_trig)
        schema_types.add(tp_c)
        per_type = trig_counts_by_type.setdefault(tp_c, {})
        per_type[tri] = per_type.get(tri, 0) + 1
        trig_global_counts[tri] = trig_global_counts.get(tri, 0) + 1

# Sorted canonical labels, their normalized forms, and the reverse lookup.
schema_types = sorted(schema_types)
schema_norm  = list(map(ntype, schema_types))
norm2canon   = dict(zip(schema_norm, schema_types))

# priors: share of a trigger lemma's gold occurrences that carry each type
trig_prior = {}
for tri, total in trig_global_counts.items():
    for tp in schema_types:
        count = trig_counts_by_type.get(tp, {}).get(tri, 0)
        if count <= 0:
            continue
        trig_prior.setdefault(tp, {})[tri] = count / total

# optional alias table: normalized alias -> schema type
alias = {}
if os.path.exists(SCHEMA_ALIASES):
    aldf = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"} <= set(aldf.columns):
        alias = {
            ntype(name): str(target).strip()
            for name, target in zip(aldf["alias"], aldf["schema_type"])
        }

# ----------------- TF-IDF index -----------------
def expand_label(s):
    """Return the label followed by its parts (split on ``_``, ``-``, whitespace), space-joined."""
    tokens = re.split(r'[_\-\s]+', s)
    return " ".join([s, *tokens])

# Character n-gram TF-IDF over the expanded schema type labels.
tfidf_type = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_types = tfidf_type.fit_transform(list(map(expand_label, schema_norm)))

# One "document" per schema type: its gold trigger lemmas, each repeated by its count.
type_trigger_docs = [
    " ".join(
        tri
        for tri, cnt in trig_counts_by_type.get(tp, {}).items()
        for _ in range(cnt)
    )
    for tp in schema_types
]
tfidf_trig = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_trigs = tfidf_trig.fit_transform(type_trigger_docs)

def sim_type_label(qn):
    """Cosine similarity of a normalized type label to every schema type label.

    Returns a flat array with one entry per row of ``X_types``.
    """
    query = tfidf_type.transform([expand_label(qn)])
    scores = cosine_similarity(query, X_types)
    return scores.ravel()

def sim_trig_to_type(tri):
    """Cosine similarity of a trigger to each schema type's trigger document.

    Returns a flat array with one entry per row of ``X_trigs``.
    """
    query = tfidf_trig.transform([tri])
    scores = cosine_similarity(query, X_trigs)
    return scores.ravel()

# ----------------- Parse outputs ----------------
# Matches "Event type: <type>. Trigger: <trigger>" (case-insensitive).
EVENT_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?Event\s*type\s*:\s*([^.\n\r:]+)\.\s*Trigger\s*:\s*([^.\n\r<]+)',
    re.IGNORECASE,
)


def parse_lines(s):
    """Extract events from one raw model output.

    The text is split on ``<EVENTSEP>`` and every ``EVENT_RE`` match in each
    non-blank segment yields ``(type_norm, trigger_norm, trigger_lemma)``.
    Matches whose type or trigger normalizes to "" are dropped. Non-string or
    blank input gives an empty list.
    """
    if not isinstance(s, str) or not s.strip():
        return []
    segments = [seg for seg in s.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        segments = [s]
    found = (
        (ntype(m.group(1)), ntrig(m.group(2)))
        for seg in segments
        for m in EVENT_RE.finditer(seg)
    )
    return [(ot, tr, lemma(tr)) for ot, tr in found if ot and tr]

# ----------------- Mapper (same spirit as ultra-smart) ----------------
def map_one(open_type_norm, trig_str, trig_lemma):
    """Map one open-vocabulary event type onto a schema type.

    Args:
        open_type_norm: model-produced type label, normalized with ``ntype``.
        trig_str: normalized trigger text (accepted for callers; not used here).
        trig_lemma: lemmatized trigger, used for the trigger-based evidence.

    Returns:
        ``(schema_type, method, score)``. ``method`` is ``"alias"``,
        ``"exact"``, ``"norm"`` (all with score 1.0), ``"fused"``, or
        ``"unmapped"`` (in which case ``schema_type`` is ``"UNMAPPED"``).
    """
    # alias
    if open_type_norm in alias:
        return alias[open_type_norm], "alias", 1.0

    # exact/morph
    if open_type_norm in norm2canon:
        return norm2canon[open_type_norm], "exact", 1.0
    base = re.sub(r'(ings|ing|ed|s)$','', open_type_norm)
    # Ordered tuple, not a set: set iteration order of strings depends on hash
    # randomization, so with several matching variants the chosen schema type
    # could change from run to run. The label itself was already checked above.
    for cand in (base, base+"ing", base+"ed"):
        if cand in norm2canon:
            return norm2canon[cand], "norm", 1.0

    # fused (type-label evidence + trigger evidence)
    sims_type = sim_type_label(open_type_norm)
    fuzzy = np.array([fuzz_ratio(open_type_norm, t)/100 for t in schema_norm])
    type_signal = 0.65*sims_type + 0.35*fuzzy

    sims_trig = sim_trig_to_type(trig_lemma)
    prior_vec = np.array([trig_prior.get(tp,{}).get(trig_lemma,0.0) for tp in schema_types])
    trigger_signal = 0.6*sims_trig + 0.4*prior_vec

    fused = 0.45*type_signal + 0.55*trigger_signal
    idx = int(np.argmax(fused))
    best = float(fused[idx])
    # Accept the fused argmax if it is strong enough, or if some schema label is
    # a close TF-IDF match to the open type.
    if best>=FUSED_MIN_ACCEPT or float(np.max(sims_type))>=TYPE_TFIDF_GUARD:
        return schema_types[idx], "fused", best
    return "UNMAPPED","unmapped",best

# -------------- Build parsed+mapped frame --------------
# One record per parsed event, with its schema mapping attached.
_source_cols = ["chunk_id", "model_name", "model_version", "condition_shots", "output_raw"]
parsed_rows = []
for cid, model, version, shots, raw in zip(*(res[col] for col in _source_cols)):
    for open_type, trig, trig_lem in parse_lines(raw):
        schema, method, score = map_one(open_type, trig, trig_lem)
        parsed_rows.append({
            "chunk_id": cid,
            "model_name": model,
            "model_version": version,
            "condition_shots": shots,
            "open_type": open_type,
            "trigger": trig,
            "tri_lemma": trig_lem,
            "schema_type": schema,
            "method": method,
            "map_score": score,
        })

mapped_df = pd.DataFrame(parsed_rows)
if mapped_df.empty:
    raise RuntimeError("No events parsed/mapped. Check outputs formatting.")

# -------------- Gold pair sets -------------------
def gold_pairs(s):
    """Parse a ``type|trigger;...`` gold string into a set of normalized pairs.

    Entries without ``|`` are skipped; non-string input gives an empty set.
    Types are normalized with ``ntype`` and triggers with ``ntrig``.
    """
    if not isinstance(s, str):
        return set()
    split_entries = (
        entry.strip().split("|", 1)
        for entry in s.split(";")
        if "|" in entry
    )
    return {(ntype(tp), ntrig(tr)) for tp, tr in split_entries}

# Gold (type_norm, trigger_norm) pair sets keyed by chunk_id.
gold_by_chunk = dict(zip(gold["chunk_id"], map(gold_pairs, gold["events_norm"])))

# -------------- Reliability tuning --------------
# we learn alpha weights for each (model,version,shots) to maximize dev F1
groups = mapped_df.groupby(["model_name","model_version","condition_shots"])
keys   = list(groups.groups)

# dev/test split by chunk_id: the first DEV_FRAC of the shuffled ids are dev
all_cids = mapped_df["chunk_id"].unique().tolist()
random.shuffle(all_cids)
_dev_cut = int(len(all_cids) * DEV_FRAC)
dev_ids, test_ids = set(all_cids[:_dev_cut]), set(all_cids[_dev_cut:])

def score_micro(pred_pairs_by_chunk):
    """Micro-averaged precision, recall and F1 against ``gold_by_chunk``.

    Iterates over every chunk in ``gold_by_chunk``; a chunk missing from
    ``pred_pairs_by_chunk`` is treated as having no predictions, and
    predictions for chunks without gold are ignored.

    Returns:
        Tuple ``(P, R, F)``.
    """
    tp = fp = fn = 0
    for cid, gold_set in gold_by_chunk.items():
        pred_set = pred_pairs_by_chunk.get(cid, set())
        overlap = len(gold_set & pred_set)
        tp += overlap
        fp += len(pred_set) - overlap
        fn += len(gold_set) - overlap
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f_score

def build_pred_pairs(weights, take_ids):
    """Aggregate all systems' mapped events into one consensus set per chunk.

    Args:
        weights: dict mapping (model_name, model_version, condition_shots) to a
            reliability multiplier; groups missing from it use 1.0.
        take_ids: chunk ids to process (anything accepted by ``Series.isin``).

    Returns:
        dict mapping chunk_id -> set of (type_norm, trigger_norm) pairs.

    Votes are pooled on (type_norm, trigger lemma), so inflected variants of a
    trigger support each other. Each selected candidate is then emitted with
    its most frequent normalized surface trigger: gold pairs are built with
    ``ntrig`` (surface form), so emitting the lemma itself could only match
    gold when lemma and surface form happened to coincide.
    """
    pred_by_chunk={}
    for cid, sub in mapped_df[mapped_df["chunk_id"].isin(take_ids)].groupby("chunk_id"):
        cand = {}     # key=(type_norm, trig_lemma) -> aggregated score
        freq = {}     # key -> number of votes (for the consensus bonus)
        surface = {}  # key -> {normalized surface trigger: votes}
        for _,row in sub.iterrows():
            key = (ntype(row["schema_type"]), row["tri_lemma"])
            # reliability of its source group
            gk = (row["model_name"], row["model_version"], row["condition_shots"])
            alpha = weights.get(gk, 1.0)
            # base score = alpha * map_score
            sc = alpha * float(row["map_score"])
            cand[key] = cand.get(key,0.0) + sc
            freq[key] = freq.get(key,0)+1
            # remember which surface forms voted for this key
            forms = surface.setdefault(key, {})
            trig_norm = ntrig(row["trigger"])
            forms[trig_norm] = forms.get(trig_norm, 0) + 1

        # add bonuses: consensus + prior (priors are keyed by trigger lemma)
        for k in cand:
            tp_norm, tri = k
            tp_canon = norm2canon.get(tp_norm, tp_norm)
            prior = trig_prior.get(tp_canon, {}).get(tri, 0.0)
            cand[k] += 0.10*freq[k] + 0.15*prior

        # select with threshold relative to max
        if not cand:
            pred_by_chunk[cid]=set()
            continue
        m = max(cand.values())
        tau = CONS_TAU_BASE * m
        chosen = [k for k,v in cand.items() if v>=tau]

        # de-duplicate near triggers; if multiple types share same trigger, keep best
        final=[]
        used=[]
        for k in sorted(chosen, key=lambda x: cand[x], reverse=True):
            _,tri = k
            ok=True
            for _,tri2 in used:
                if fuzz_ratio(tri, tri2) >= NEAR_DUP_FUZZ:
                    ok=False; break
            if ok:
                final.append(k)
                used.append(k)

        # Convert to pair set (type_norm, trigger_norm): emit the most voted
        # surface form of each key (first seen wins ties), not the lemma.
        pred_by_chunk[cid] = set(
            (k[0], max(surface[k], key=surface[k].get)) for k in final
        )
    return pred_by_chunk

# random search for reliability weights
# Each trial is scored against the gold chunks of the dev split only.
# score_micro() iterates every gold chunk, so using it here would count all
# gold events of non-dev chunks as false negatives, which deflates dev F1 and
# can change which trial ranks best.
dev_gold_ids = [cid for cid in gold_by_chunk if cid in dev_ids]


def _dev_f1(pred_pairs):
    """Micro F1 of ``pred_pairs`` over the gold chunks that belong to the dev split."""
    tp = fp = fn = 0
    for cid in dev_gold_ids:
        g = gold_by_chunk[cid]
        p = pred_pairs.get(cid, set())
        tp += len(g & p)
        fp += len(p - g)
        fn += len(g - p)
    prec = tp/(tp+fp) if (tp+fp) else 0.0
    rec = tp/(tp+fn) if (tp+fn) else 0.0
    return 2*prec*rec/(prec+rec) if (prec+rec) else 0.0


bestW={k:1.0 for k in keys}; bestF=-1.0
for _ in range(N_TRIALS_WEIGHTS):
    trial={k: float(np.random.uniform(0.6,1.6)) for k in keys}  # 0.6..1.6
    F = _dev_f1(build_pred_pairs(trial, dev_ids))
    # strict ">" keeps the earliest trial among ties
    if F>bestF:
        bestF, bestW = F, trial
print(f"[tuner] best dev F1={bestF:.4f} with {len(bestW)} weights")

# -------------- Evaluate on all chunks --------------
# Consensus predictions for every chunk (dev chunks included), scored against all gold.
pred_all = build_pred_pairs(bestW, mapped_df["chunk_id"].unique())
P, R, F = score_micro(pred_all)
overall_report = f"Micro Precision: {P:.4f}\nMicro Recall: {R:.4f}\nMicro F1: {F:.4f}\n"
with open(f"{OUT_DIR}/Metrics_overall_micro.txt","w",encoding="utf-8") as f:
    f.write(overall_report)
print(f"[overall] P={P:.4f} R={R:.4f} F1={F:.4f}")

# -------------- Save ensembled predictions ----------
# One row per selected pair; types are written in canonical underscore form.
rows = [
    {
        "chunk_id": cid,
        "schema_type": canon_unders(norm2canon.get(tp_norm, tp_norm)),
        "trigger": tri,
    }
    for cid, pairs in pred_all.items()
    for (tp_norm, tri) in pairs
]
ens_df = pd.DataFrame(rows).sort_values(["chunk_id","schema_type","trigger"])
ens_df.to_csv(f"{OUT_DIR}/Ensembled_Predictions.csv", index=False, encoding="utf-8")
print(f"[ok] wrote {OUT_DIR}/Ensembled_Predictions.csv")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m2.8/3.3 MB[0m [31m84.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25h[tuner] best dev F1=0.0261 with 16 weights
[overall] P=0.1964 R=0.0772 F1=0.1108
[ok] wrote /content/out_cmse/Ensembled_Predictions.csv


In [None]:
# ==========================================================
# CMSE v2 — Tuned Consensus + Correct Trigger Keying
#   - Robust mapping (type-TFIDF + trigger priors + fuzzy)
#   - Tunes BOTH reliability weights and consensus params on dev
#   - Aggregates on (type_norm, trigger_norm)  [not lemma!]
#   - Optional top-K per density_bin from PackA
#   - Evaluates micro P/R/F1
# ==========================================================

import os, re, random, numpy as np, pandas as pd
from pathlib import Path

# -------------------- CONFIG --------------------
# Input files. Results may be .xlsx or .csv (chosen by file extension when loaded).
RESULTS_PATH   = "/content/results.xlsx"
PACKB_GOLD     = "/content/PackB_Gold.csv"
PACKA_PATH     = "/content/packs/PackA_TextChunks.csv"   # optional (needs: chunk_id,text,density_bin)
SCHEMA_ALIASES = "/content/packs/Schema_Aliases.csv"     # optional

# Output directory; created immediately if it does not exist.
OUT_DIR        = "/content/out_cmse_v2"
os.makedirs(OUT_DIR, exist_ok=True)

# Dev split + search budgets
DEV_FRAC            = 0.25   # fraction of (shuffled) chunk ids assigned to the dev set
N_TRIALS_WEIGHTS    = 60     # reliability weights tuning
# small grid for consensus params
# NOTE(review): these look like the candidate values for build_pred_pairs'
# params tuple (tau_base, freq_bonus, prior_bonus, dedup_fuzz, key_by_lemma);
# confirm against the search loop.
TAU_BASE_SET        = [0.45, 0.55, 0.65]
FREQ_BONUS_SET      = [0.05, 0.10, 0.15]
PRIOR_BONUS_SET     = [0.05, 0.10, 0.15]
DEDUP_FUZZ_SET      = [85, 90, 95]
KEY_BY_LEMMA_FLAGS  = [False, True]  # expect False to win

# Optional top-K by density; requires PackA density_bin
USE_TOPK_BY_DENSITY = True
TOPK_CFG = {"low": 3, "med": 6, "high": 10}  # adjust if needed

# Mapping thresholds (kept reasonable)
FUSED_MIN_ACCEPT = 0.38   # accept a fused mapping when its score reaches this value
TYPE_TFIDF_GUARD = 0.33   # alternative accept guard: best type-label TF-IDF similarity

# Fixed seeds so the shuffle and the random searches are repeatable.
random.seed(42); np.random.seed(42)

# ---------------- Dependencies ----------------
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    !pip -q install rapidfuzz
    from rapidfuzz.fuzz import ratio as fuzz_ratio

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# ---------------- Normalizers -----------------
def ntype(s):
    """Normalize an event-type label for comparison.

    Strips surrounding whitespace, lowercases, and replaces every run of
    underscores, hyphens or whitespace with one space. Non-strings give "".
    """
    if isinstance(s, str):
        pieces = re.split(r'[_\-\s]+', s.strip().lower())
        return ' '.join(pieces)
    return ""

def ntrig(s):
    """Normalize a trigger string for comparison.

    Strips surrounding whitespace, lowercases, removes a trailing run of
    ``. ; , : -`` characters, then collapses each whitespace run to one space.
    Non-strings give "".
    """
    if not isinstance(s, str):
        return ""
    lowered = s.strip().lower()
    without_tail = re.sub(r'[.;,:\-]+$', '', lowered)
    return re.sub(r'\s+', ' ', without_tail)

# Irregular forms that the suffix stripping in lemma() cannot handle.
IRREG = {
    "died": "die", "was": "be", "were": "be", "began": "begin", "took": "take",
    "torn": "tear", "led": "lead", "left": "leave", "broke": "break", "ruled": "rule",
    "saw": "see", "made": "make",
}


def lemma(tok):
    """Reduce a trigger to a crude base form.

    The trigger is normalized with ``ntrig``; irregular forms are looked up in
    ``IRREG``. Otherwise the first of the suffixes ``ing``, ``ed``, ``es``,
    ``s`` that matches is removed, provided more than two characters remain.
    """
    word = ntrig(tok)
    if word in IRREG:
        return IRREG[word]
    stems = (
        word[:-len(suffix)]
        for suffix in ("ing", "ed", "es", "s")
        if word.endswith(suffix) and len(word) - len(suffix) > 2
    )
    return next(stems, word)

def canon_unders(s):
    """Join whitespace-separated words with underscores, capitalizing the first.

    ``str.capitalize`` is applied to the first word only (so the rest of that
    word is lowercased); the remaining words are appended unchanged.
    """
    head, *tail = re.split(r'\s+', s.strip())
    return head.capitalize() + ''.join(f'_{word}' for word in tail)

# ---------------- Load data -------------------
# results: Excel or CSV depending on the file extension; all cells read as strings.
ext = Path(RESULTS_PATH).suffix.lower()
read_results = pd.read_excel if ext == ".xlsx" else pd.read_csv
res = read_results(RESULTS_PATH, dtype=str).fillna("")
need = {"chunk_id","model_name","model_version","condition_shots","output_raw"}
if not need <= set(res.columns):
    raise ValueError(f"results must have {need}")
# Drop rows whose raw output is blank.
has_output = res["output_raw"].astype(str).str.strip().ne("")
res = res[has_output].copy()

# gold
gold = pd.read_csv(PACKB_GOLD, dtype=str).fillna("")
if not {"chunk_id","events_norm"} <= set(gold.columns):
    raise ValueError("PackB_Gold.csv must have chunk_id, events_norm")

# optional PackA: text + density_bin
packA = None
density_map = {}
if os.path.exists(PACKA_PATH):
    tmp = pd.read_csv(PACKA_PATH, dtype=str).fillna("")
    packa_cols = set(tmp.columns)
    if {"chunk_id","text"} <= packa_cols:
        packA = tmp
    if "density_bin" in packa_cols:
        density_map = dict(zip(tmp["chunk_id"], tmp["density_bin"]))

# ---------------- Schema & priors from gold ----------------
schema_types = set()
trig_counts_by_type = {}   # canonical type -> {trigger lemma: count}
trig_global_counts  = {}   # trigger lemma -> count over all types

for events in gold["events_norm"]:
    if not isinstance(events, str):
        continue
    for entry in map(str.strip, events.split(";")):
        if "|" not in entry:
            continue
        raw_type, raw_trig = entry.split("|", 1)
        tp_c, tri = raw_type.strip(), lemma(raw_trig)
        schema_types.add(tp_c)
        per_type = trig_counts_by_type.setdefault(tp_c, {})
        per_type[tri] = per_type.get(tri, 0) + 1
        trig_global_counts[tri] = trig_global_counts.get(tri, 0) + 1

# Sorted canonical labels, their normalized forms, and the reverse lookup.
schema_types = sorted(schema_types)
schema_norm  = list(map(ntype, schema_types))
norm2canon   = dict(zip(schema_norm, schema_types))

# priors: share of a trigger lemma's gold occurrences that carry each type
trig_prior = {}
for tri, total in trig_global_counts.items():
    for tp in schema_types:
        count = trig_counts_by_type.get(tp, {}).get(tri, 0)
        if count <= 0:
            continue
        trig_prior.setdefault(tp, {})[tri] = count / total

# optional alias table: normalized alias -> schema type
alias = {}
if os.path.exists(SCHEMA_ALIASES):
    aldf = pd.read_csv(SCHEMA_ALIASES, dtype=str).fillna("")
    if {"alias","schema_type"} <= set(aldf.columns):
        alias = {
            ntype(name): str(target).strip()
            for name, target in zip(aldf["alias"], aldf["schema_type"])
        }

# ----------------- TF-IDF index -----------------
def expand_label(s):
    """Return the label followed by its parts (split on ``_``, ``-``, whitespace), space-joined."""
    tokens = re.split(r'[_\-\s]+', s)
    return " ".join([s, *tokens])

# Character n-gram TF-IDF over the expanded schema type labels.
tfidf_type = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_types = tfidf_type.fit_transform(list(map(expand_label, schema_norm)))

# One "document" per schema type: its gold trigger lemmas, each repeated by its count.
type_trigger_docs = [
    " ".join(
        tri
        for tri, cnt in trig_counts_by_type.get(tp, {}).items()
        for _ in range(cnt)
    )
    for tp in schema_types
]
tfidf_trig = TfidfVectorizer(analyzer="char", ngram_range=(3,5))
X_trigs = tfidf_trig.fit_transform(type_trigger_docs)

def sim_type_label(qn):
    """Cosine similarity of a normalized type label to every schema type label.

    Returns a flat array with one entry per row of ``X_types``.
    """
    query = tfidf_type.transform([expand_label(qn)])
    scores = cosine_similarity(query, X_types)
    return scores.ravel()

def sim_trig_to_type(tri):
    """Cosine similarity of a trigger to each schema type's trigger document.

    Returns a flat array with one entry per row of ``X_trigs``.
    """
    query = tfidf_trig.transform([tri])
    scores = cosine_similarity(query, X_trigs)
    return scores.ravel()

# ----------------- Parse outputs ----------------
# Matches "Event type: <type>. Trigger: <trigger>" (case-insensitive).
EVENT_RE = re.compile(
    r'(?:<EVENTSEP>\s*)?Event\s*type\s*:\s*([^.\n\r:]+)\.\s*Trigger\s*:\s*([^.\n\r<]+)',
    re.IGNORECASE,
)


def parse_lines(s):
    """Extract events from one raw model output.

    The text is split on ``<EVENTSEP>`` and every ``EVENT_RE`` match in each
    non-blank segment yields ``(type_norm, trigger_norm, trigger_lemma)``.
    Matches whose type or trigger normalizes to "" are dropped. Non-string or
    blank input gives an empty list.
    """
    if not isinstance(s, str) or not s.strip():
        return []
    segments = [seg for seg in s.split("<EVENTSEP>") if seg.strip()]
    if not segments:
        segments = [s]
    found = (
        (ntype(m.group(1)), ntrig(m.group(2)))
        for seg in segments
        for m in EVENT_RE.finditer(seg)
    )
    return [(ot, tr, lemma(tr)) for ot, tr in found if ot and tr]

# ----------------- Mapper -----------------------
def map_one(open_type_norm, trig_str, trig_lemma):
    """Map one open-vocabulary event type onto a schema type.

    Args:
        open_type_norm: model-produced type label, normalized with ``ntype``.
        trig_str: normalized trigger text (accepted for callers; not used here).
        trig_lemma: lemmatized trigger, used for the trigger-based evidence.

    Returns:
        ``(schema_type, method, score)``. ``method`` is ``"alias"``,
        ``"exact"``, ``"norm"`` (all with score 1.0), ``"fused"``, or
        ``"unmapped"`` (in which case ``schema_type`` is ``"UNMAPPED"``).
    """
    # alias
    if open_type_norm in alias:
        return alias[open_type_norm], "alias", 1.0

    # exact/morph
    if open_type_norm in norm2canon:
        return norm2canon[open_type_norm], "exact", 1.0
    base = re.sub(r'(ings|ing|ed|s)$','', open_type_norm)
    # Ordered tuple, not a set: set iteration order of strings depends on hash
    # randomization, so with several matching variants the chosen schema type
    # could change from run to run. The label itself was already checked above.
    for cand in (base, base+"ing", base+"ed"):
        if cand in norm2canon:
            return norm2canon[cand], "norm", 1.0

    # fused (type + trigger evidence)
    sims_type = sim_type_label(open_type_norm)
    fuzzy = np.array([fuzz_ratio(open_type_norm, t)/100 for t in schema_norm])
    type_signal = 0.65*sims_type + 0.35*fuzzy

    sims_trig = sim_trig_to_type(trig_lemma)
    prior_vec = np.array([trig_prior.get(tp,{}).get(trig_lemma,0.0) for tp in schema_types])
    trigger_signal = 0.6*sims_trig + 0.4*prior_vec

    fused = 0.45*type_signal + 0.55*trigger_signal
    idx = int(np.argmax(fused))
    best = float(fused[idx])
    # Accept the fused argmax if it is strong enough, or if some schema label is
    # a close TF-IDF match to the open type.
    if best>=FUSED_MIN_ACCEPT or float(np.max(sims_type))>=TYPE_TFIDF_GUARD:
        return schema_types[idx], "fused", best
    return "UNMAPPED","unmapped",best

# -------------- Build parsed+mapped frame --------------
# One record per parsed event, with its schema mapping attached.
_source_cols = ["chunk_id", "model_name", "model_version", "condition_shots", "output_raw"]
parsed_rows = []
for cid, model, version, shots, raw in zip(*(res[col] for col in _source_cols)):
    for open_type, trig, trig_lem in parse_lines(raw):
        schema, method, score = map_one(open_type, trig, trig_lem)
        parsed_rows.append({
            "chunk_id": cid,
            "model_name": model,
            "model_version": version,
            "condition_shots": shots,
            "open_type": open_type,
            "trigger": trig,
            "tri_lemma": trig_lem,
            "schema_type": schema,
            "method": method,
            "map_score": score,
        })

mapped_df = pd.DataFrame(parsed_rows)
if mapped_df.empty:
    raise RuntimeError("No events parsed/mapped. Check outputs formatting.")

# -------------- Gold pairs ----------------------
def gold_pairs(s):
    """Parse a ``type|trigger;...`` gold string into a set of normalized pairs.

    Entries without ``|`` are skipped; non-string input gives an empty set.
    Types are normalized with ``ntype`` and triggers with ``ntrig``.
    """
    if not isinstance(s, str):
        return set()
    split_entries = (
        entry.strip().split("|", 1)
        for entry in s.split(";")
        if "|" in entry
    )
    return {(ntype(tp), ntrig(tr)) for tp, tr in split_entries}

# Gold (type_norm, trigger_norm) pair sets keyed by chunk_id.
gold_by_chunk = dict(zip(gold["chunk_id"], map(gold_pairs, gold["events_norm"])))

# -------------- Dev/Test split by chunk ----------
# Shuffle the chunk ids once; the dev set always gets at least one chunk.
all_cids = mapped_df["chunk_id"].unique().tolist()
random.shuffle(all_cids)
n_dev = max(1, int(len(all_cids) * DEV_FRAC))
dev_ids, test_ids = set(all_cids[:n_dev]), set(all_cids[n_dev:])

# -------------- Reliability weights tuning -------
# One (model_name, model_version, condition_shots) key per source system.
_system_groups = mapped_df.groupby(["model_name","model_version","condition_shots"])
group_keys = list(_system_groups.groups)

def score_micro(pred_pairs_by_chunk):
    """Micro-averaged precision, recall and F1 against ``gold_by_chunk``.

    Iterates over every chunk in ``gold_by_chunk``; a chunk missing from
    ``pred_pairs_by_chunk`` is treated as having no predictions, and
    predictions for chunks without gold are ignored.

    Returns:
        Tuple ``(P, R, F)``.
    """
    tp = fp = fn = 0
    for cid, gold_set in gold_by_chunk.items():
        pred_set = pred_pairs_by_chunk.get(cid, set())
        overlap = len(gold_set & pred_set)
        tp += overlap
        fp += len(pred_set) - overlap
        fn += len(gold_set) - overlap
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f_score

def build_pred_pairs(weights, params, take_ids):
    """Ensemble the mapped events of each chunk into a set of predicted pairs.

    For every chunk in ``take_ids`` that has rows in the module-level
    ``mapped_df``, the rows are grouped by a (normalized schema type, trigger
    key) pair. Each pair accumulates ``weight * map_score`` over its rows,
    then receives a bonus proportional to its vote count and to its entry in
    ``trig_prior``. Pairs scoring at least ``tau_base`` times the chunk's best
    score are kept, optionally capped to a top-K taken from ``TOPK_CFG`` via
    ``density_map``, and finally de-duplicated by fuzzy trigger similarity.

    Args:
        weights: Mapping (model_name, model_version, condition_shots) ->
            multiplier applied to that run's map_score. Missing keys count
            as 1.0.
        params: 5-tuple ``(tau_base, freq_bonus, prior_bonus, dedup_fuzz,
            key_by_lemma)``. ``key_by_lemma`` selects ``tri_lemma`` instead of
            ``ntrig(trigger)`` as the trigger key.
        take_ids: Chunk ids to build predictions for (passed to ``isin``).

    Returns:
        Dict chunk_id -> set of (normalized type, trigger key) pairs. Chunks
        from ``take_ids`` that have no rows in ``mapped_df`` are absent.
    """
    tau_base, freq_bonus, prior_bonus, dedup_fuzz, key_by_lemma = params

    from rapidfuzz.fuzz import ratio as fuzz_ratio

    def pair_key(row):
        # `row` is an itertuples() namedtuple, hence attribute access.
        type_norm = ntype(row.schema_type)
        trig = row.tri_lemma if key_by_lemma else ntrig(row.trigger)
        return (type_norm, trig)

    selected = mapped_df[mapped_df["chunk_id"].isin(take_ids)]
    pred_by_chunk = {}

    for cid, sub in selected.groupby("chunk_id"):
        scores = {}  # pair -> aggregated weighted score
        votes = {}   # pair -> number of rows that proposed it

        for row in sub.itertuples(index=False):
            key = pair_key(row)
            run = (row.model_name, row.model_version, row.condition_shots)
            scores[key] = scores.get(key, 0.0) + weights.get(run, 1.0) * float(row.map_score)
            votes[key] = votes.get(key, 0) + 1

        # Consensus bonus (vote count) plus trigger-prior bonus.
        for key, n_votes in votes.items():
            type_norm, trig = key
            canon = norm2canon.get(type_norm, type_norm)
            prior = trig_prior.get(canon, {}).get(trig, 0.0)
            scores[key] += freq_bonus * n_votes + prior_bonus * prior

        if not scores:
            pred_by_chunk[cid] = set()
            continue

        # Threshold is relative to the best-scoring pair of this chunk.
        cutoff = tau_base * max(scores.values())
        chosen = [key for key, total in scores.items() if total >= cutoff]

        # Optional cap on the number of pairs, looked up by the chunk's density bin.
        if USE_TOPK_BY_DENSITY and density_map:
            limit = TOPK_CFG.get(density_map.get(cid, ""), None)
            if limit is not None and len(chosen) > limit:
                chosen = sorted(chosen, key=scores.__getitem__, reverse=True)[:limit]

        # Greedy de-duplication, best score first. Only the trigger strings are
        # compared, so a near-identical trigger is dropped whatever its type.
        kept = []
        for key in sorted(chosen, key=scores.__getitem__, reverse=True):
            trig = key[1]
            if all(fuzz_ratio(trig, other[1]) < dedup_fuzz for other in kept):
                kept.append(key)

        pred_by_chunk[cid] = set(kept)

    return pred_by_chunk


# tune reliability weights on dev with a fixed reasonable consensus (we'll tune consensus next)
from itertools import product

# Gold restricted to the dev chunks. score_micro() walks every gold chunk, so
# scoring dev-only predictions with it counts the gold events of all non-dev
# chunks as false negatives; that deflates dev recall/F1 and shifts which
# weights/params win. Tuning therefore scores against dev gold only.
dev_gold = {cid: pairs for cid, pairs in gold_by_chunk.items() if cid in dev_ids}


def _dev_micro_f1(pred_pairs_by_chunk):
    """Return micro F1 of the predictions over dev chunks that have gold.

    Uses the same TP/FP/FN counting and P/R/F formulas as score_micro(), but
    only over the chunks in ``dev_gold``.
    """
    TP = FP = FN = 0
    for cid, g in dev_gold.items():
        p = pred_pairs_by_chunk.get(cid, set())
        TP += len(g & p)
        FP += len(p - g)
        FN += len(g - p)
    P = TP / (TP + FP) if (TP + FP) else 0.0
    R = TP / (TP + FN) if (TP + FN) else 0.0
    return 2 * P * R / (P + R) if (P + R) else 0.0


# Random search: draw one weight per (model, version, shots) group and keep
# the draw with the best dev F1.
base_params = (0.55, 0.10, 0.10, 90, False)  # tau_base, freq_bonus, prior_bonus, fuzz, key_by_lemma
best_w = {k: 1.0 for k in group_keys}
bestF = -1.0
for _ in range(N_TRIALS_WEIGHTS):
    trial = {k: float(np.random.uniform(0.6, 1.6)) for k in group_keys}
    pred_pairs = build_pred_pairs(trial, base_params, dev_ids)
    F = _dev_micro_f1(pred_pairs)
    if F > bestF:
        bestF, best_w = F, trial
print(f"[tuner] dev F1 (weights only)={bestF:.4f}")

# tune consensus params on dev (grid over small sets)
# product() iterates with the rightmost set varying fastest, i.e. the same
# order as nesting the five loops.
best_params = base_params
bestF2 = -1.0
for params in product(TAU_BASE_SET, FREQ_BONUS_SET, PRIOR_BONUS_SET,
                      DEDUP_FUZZ_SET, KEY_BY_LEMMA_FLAGS):
    pred_pairs = build_pred_pairs(best_w, params, dev_ids)
    F = _dev_micro_f1(pred_pairs)
    if F > bestF2:
        bestF2, best_params = F, params
print(f"[tuner] dev F1 (weights+consensus)={bestF2:.4f} with params={best_params}")

# -------------- Evaluate on ALL chunks --------------
# Rebuild predictions with the tuned weights/params for every chunk that has
# mapped events, score them against the full gold set, and persist the result.
# NOTE(review): this score includes the dev chunks the weights/params were
# tuned on, so it is not a held-out estimate.
pred_all = build_pred_pairs(
    best_w,
    best_params,
    mapped_df["chunk_id"].unique(),
)
P, R, F = score_micro(pred_all)

with open(f"{OUT_DIR}/Metrics_overall_micro.txt", "w", encoding="utf-8") as f:
    f.write(
        f"Micro Precision: {P:.4f}\nMicro Recall: {R:.4f}\nMicro F1: {F:.4f}\n"
    )

print(f"[overall] P={P:.4f} R={R:.4f} F1={F:.4f}")

# -------------- Save ensembled predictions ----------
# One CSV row per predicted pair. The normalized type is mapped back through
# norm2canon (falling back to the normalized form) and passed to canon_unders.
ens_cols = ["chunk_id", "schema_type", "trigger"]
rows = [
    {
        "chunk_id": cid,
        "schema_type": canon_unders(norm2canon.get(tp_norm, tp_norm)),
        "trigger": tri,
    }
    for cid, pairs in pred_all.items()
    for (tp_norm, tri) in pairs
]
# Explicit columns keep the frame well-formed when nothing was predicted:
# without them an empty `rows` yields a column-less frame and sort_values
# raises KeyError instead of writing a header-only CSV.
ens_df = pd.DataFrame(rows, columns=ens_cols).sort_values(ens_cols)
ens_df.to_csv(f"{OUT_DIR}/Ensembled_Predictions.csv", index=False, encoding="utf-8")
print(f"[ok] wrote {OUT_DIR}/Ensembled_Predictions.csv")


[tuner] dev F1 (weights only)=0.1677
[tuner] dev F1 (weights+consensus)=0.1723 with params=(0.45, 0.05, 0.05, 85, False)
[overall] P=0.6319 R=0.3614 F1=0.4598
[ok] wrote /content/out_cmse_v2/Ensembled_Predictions.csv
