## **Pack A — Text chunks**

In [None]:
# ============================
# Build Pack A — Text Chunks (Unicode-robust + Advanced Lead Cleaning)
# ============================
# - FTFY-based mojibake repair (SeelÃ¶we -> Seelöwe, â€“ -> –)
# - Robust quote extraction across ", “ ”, « »
# - Advanced leading cleanup:
#     * strip leading punctuation/parenthetical/comma-only leads
#     * drop leading connectives (and/but/then/so/or/yet/nor)
#     * remove dangling appositives at start
# - Strict sentence validity (end mark, verb presence, min length, alpha ratio)
# - Attempt repair by extending to next sentence end; else exclude
# - Trim to 1–3 sentences AFTER validation
# - Preserve stratified sampling by (split x density_bin)

import os, re, json, random
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import pandas as pd

# Ensure ftfy is available
try:
    import ftfy
except ImportError:
    import sys
    !pip -q install ftfy
    import ftfy

random.seed(42)

In [None]:
# --------- Paths ----------
# Input JSONL files, one per split. split_name_from_path() later derives the
# split label from the file name ("valid" / "test" substrings, else "train").
TRAIN_PATH = "/content/llm_train_gen.jsonl"
VALID_PATH = "/content/llm_valid_gen.jsonl"
TEST_PATH  = "/content/llm_test_gen.jsonl"

# Output directory is created if missing; Pack A is written there as one CSV.
OUT_DIR = "/content/packs"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "PackA_TextChunks.csv")

In [None]:
# --------- Helpers ----------

# One regex per supported double-quote style. Each captures, non-greedily,
# the text between an opening and a closing mark.
QUOTE_PATTERNS = [
    r'"(.*?)"',     # straight
    r'“(.*?)”',     # smart
    r'«(.*?)»',     # guillemets
]

# Sentence boundary: a whitespace run directly preceded by '.', '!' or '?'.
SPLIT_SENT_RE = re.compile(r'(?<=[\.!?])\s+')

# Regex fragments whose presence marks a text as still containing mojibake.
# These are regexes, not literals: the '.' in 'Ã.' matches any character.
MOJIBAKE_RESIDUAL = [
    r'Ã.', r'â€', r'â€“', r'â€”', r'â€™', r'â€œ', r'â€\x9d', r'Â', r'â„¢'
]

# A text whose last word is one of these is treated as a truncated fragment.
FRAG_TAIL = set(["the","a","an","of","to","for","and","or","but","if","when","as","in","on","at","by","with","that"])

# Leading connective word (case-insensitive) plus any commas/spaces after it.
CONNECTIVE_LEADS = re.compile(r'^(?:and|but|then|so|or|yet|nor)\b[, ]*', flags=re.IGNORECASE)  # NEW

# Heuristic "contains a verb" detector: a fixed list of auxiliaries and a few
# specific verbs, plus a catch-all alternative matching any word that ends in
# "ed" or "ing" (so it also fires on non-verbs with those endings).
VERB_PAT = re.compile(
    r'\b('
    r'is|are|was|were|be|been|being|has|have|had|does|do|did|'
    r'will|would|can|could|should|may|might|must|'
    r'occurs?|occurred|took|takes|taking|torn|drifted|began|begins|started|'
    r'conquer(ed|ing)?|ruled?|land(ed|ing)?|sailed?|'
    r'\w+(ed|ing)\b'
    r')\b',
    flags=re.IGNORECASE
)

def read_jsonl(path: str) -> List[Dict]:
    """Load a JSONL file into a list of dicts, tagging each with its line number.

    Args:
        path: Location of the JSONL file.

    Returns:
        One dict per usable line, holding the record's own keys plus
        ``"line_no"`` (1-based physical line number, blank lines included in
        the count). An empty list is returned when the file does not exist.

    Lines that are blank, fail to parse as JSON, or parse to something other
    than a JSON object are skipped. Undecodable bytes are replaced rather than
    raising, so a single bad byte does not abort the whole load.
    """
    rows: List[Dict] = []
    if not os.path.exists(path):
        return rows
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A bare list/string/number is valid JSON but not a record;
            # unpacking it with ** would raise TypeError.
            if not isinstance(obj, dict):
                continue
            # The physical line number must win over any "line_no" key in the
            # payload: it is the value later used to re-locate the record.
            rows.append({**obj, "line_no": i})
    return rows

def fix_unicode(s: str) -> str:
    """Run ftfy's text repair on ``s``; any non-string input yields ``""``."""
    return ftfy.fix_text(s) if isinstance(s, str) else ""

def normalize_quotes(s: str) -> str:
    """Replace curly double quotes and guillemets with the ASCII double quote."""
    straightened = s
    for fancy_mark in ("“", "”", "«", "»"):
        straightened = straightened.replace(fancy_mark, '"')
    return straightened

def find_longest_quoted_span(text: str) -> Optional[str]:
    """Return the longest quoted passage in ``text``, or ``None`` if none exists.

    Every pattern in ``QUOTE_PATTERNS`` is tried (matching across newlines);
    captured spans are stripped before their lengths are compared.
    """
    candidates = [
        hit.group(1).strip()
        for pattern in QUOTE_PATTERNS
        for hit in re.finditer(pattern, text, flags=re.DOTALL)
    ]
    if not candidates:
        return None
    # max() returns the first of several equally long spans, which is the same
    # element a stable descending sort would place first.
    return max(candidates, key=len)

def strip_guidance(text: str) -> str:
    """Drop everything from a "Use <EVENTSEP>" instruction onward, then collapse whitespace."""
    without_hint = re.sub(r'Use\s*<EVENTSEP>.*', '', text, flags=re.IGNORECASE | re.DOTALL)
    single_spaced = re.sub(r'\s+', ' ', without_hint)
    return single_spaced.strip()

def clean_leading_punct(s: str) -> str:
    """Strip leading spaces and stray closing punctuation from ``s``.

    Characters removed from the left edge: space , ; : ) \\ ] } > - —

    The original literal contained ``\\]`` inside a non-raw string, which is an
    invalid escape sequence (a SyntaxWarning on recent Python versions). The
    backslash is now written explicitly so the effective character set, which
    has always included the backslash, is unchanged.
    """
    return s.lstrip(' ,;:)\\]}>-—')

def clean_trailing_noise(s: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', s)
    return single_spaced.strip()

def alphabetic_ratio(s: str) -> float:
    """Fraction of characters in ``s`` that are alphabetic (0.0 for an empty string)."""
    total_chars = len(s)
    denominator = total_chars if total_chars > 0 else 1
    alpha_chars = len([ch for ch in s if ch.isalpha()])
    return alpha_chars / denominator

def looks_like_fragment_strict(text: str) -> bool:
    """Return True when ``text`` fails any of the strict full-sentence checks.

    A text is considered a fragment if it:
      * is empty,
      * starts with stray punctuation (, ; : ) ] } > - —),
      * does not end with '.', '!' or '?' (optionally followed by closing
        quotes/brackets and whitespace),
      * has fewer than 8 whitespace-separated words or fewer than 12 characters,
      * ends on a word listed in ``FRAG_TAIL``,
      * contains nothing matched by ``VERB_PAT``,
      * is less than 60% alphabetic characters, or
      * matches any ``MOJIBAKE_RESIDUAL`` pattern.

    Fix: the leading-punctuation character class used to contain an unescaped
    ``>-—``, which the regex engine reads as the range U+003E..U+2014. That
    range covers every ASCII letter, so any text starting with a letter was
    reported as a fragment. The hyphen is now escaped.
    """
    if not text:
        return True
    t = text.strip()

    # Must not start with stray closing punctuation. The hyphen is escaped so
    # it is a literal member of the class, not a range operator.
    if re.match(r'^[,;:)\]}>\-—]', t):
        return True

    # Require end punctuation
    if not re.search(r'[\.!?]["’»)]*\s*$', t):
        return True

    # Minimal length/words
    words = t.split()
    if len(words) < 8 or len(t) < 12:
        return True

    # No dangling tail: compare the last word without trailing non-letters
    tail = re.sub(r'[^a-zA-Z]+$', '', words[-1].lower())
    if tail in FRAG_TAIL:
        return True

    # Verb-ish token required
    if not VERB_PAT.search(t):
        return True

    # Alphabetic ratio
    if alphabetic_ratio(t) < 0.6:
        return True

    # Any leftover mojibake marker disqualifies the text
    if any(re.search(pat, t) for pat in MOJIBAKE_RESIDUAL):
        return True

    return False

def remove_leading_connectives_and_appositives(s: str) -> str:
    """Clean the start of a candidate sentence.

    Steps, in order:
      1. Trim whitespace, then strip leading spaces and stray punctuation
         (, ; : ) \\ ] } > - —).
      2. Remove one leading connective matched by ``CONNECTIVE_LEADS``
         (and/but/then/so/or/yet/nor) together with the commas/spaces after it.
      3. If the text still starts with stray punctuation, cut forward to the
         first ASCII letter, digit, double quote or opening parenthesis.
      4. Upper-case the first character when it is a lowercase letter.

    Examples:
      ", a French Navy ..."             -> "A French Navy ..."
      "then shortened sail ..."         -> "Shortened sail ..."
      "), was a Japanese operation ..." -> "Was a Japanese operation ..."

    Fixes relative to the previous version:
      * The step-3 character class contained an unescaped ``>-—``, i.e. the
        range U+003E..U+2014. Texts starting with a non-ASCII letter such as
        "É" therefore entered step 3 and lost their leading character(s).
        The hyphen is now escaped.
      * ``\\]`` in the non-raw lstrip literal was an invalid escape sequence;
        the backslash is now written explicitly (same effective character set).
      * The repeat-until-stable loop around ``lstrip`` was redundant, because a
        single ``lstrip`` call already removes the whole leading run.
    """
    # 1) strip the leading punctuation run
    t = s.strip().lstrip(' ,;:)\\]}>-—')

    # 2) drop leading connective (and/but/then/so/or/yet/nor) + optional comma
    t = CONNECTIVE_LEADS.sub('', t).strip()

    # 3) if punctuation is exposed again (e.g. "and) foo"), cut to the first
    #    plausible start character
    if re.match(r'^[,)\]}>\-—]', t):
        m = re.search(r'[A-Za-z0-9"“(]', t)
        if m:
            t = t[m.start():].strip()

    # 4) soft capitalisation; opening quotes/parens are left untouched
    if t and t[0].islower():
        t = t[0].upper() + t[1:]

    return t

def minimal_safe_corrections(s: str) -> str:
    """Apply two fixed, exact-match text corrections and nothing else.

    * "had of snow fall" (any casing) becomes "had snowfall".
    * A hyphen with a space on each side becomes an em dash with spaces.
    """
    snow_fixed = re.sub(r'\bhad of snow fall\b', 'had snowfall', s, flags=re.IGNORECASE)
    dash_fixed = snow_fixed.replace(' - ', ' — ')
    return dash_fixed

def extract_candidate_text(input_field: str) -> str:
    """Pull the passage to annotate out of a prompt-style ``input`` field.

    The field is unicode-repaired and its quotes are normalised. The longest
    quoted span is preferred; without one, the text after the first colon is
    used (or the whole field if there is no colon). The pick is then passed
    through the cleaning helpers in a fixed order. Non-string input gives "".
    """
    if not isinstance(input_field, str):
        return ""

    raw = normalize_quotes(fix_unicode(input_field))

    quoted = find_longest_quoted_span(raw)
    if quoted:
        candidate = quoted
    else:
        _, colon, after_colon = raw.partition(":")
        candidate = after_colon if colon else raw

    cleaning_steps = (
        fix_unicode,
        strip_guidance,
        remove_leading_connectives_and_appositives,
        clean_trailing_noise,
        minimal_safe_corrections,
    )
    for step in cleaning_steps:
        candidate = step(candidate)
    return candidate

def try_repair_fragment(original_input: str, current_text: str) -> Optional[str]:
    """Try to complete a fragment by extending it to the next sentence end.

    ``current_text`` is located inside the unicode-repaired, quote-normalised
    ``original_input`` (first verbatim, then with whitespace collapsed on both
    sides). From the end of that match the text is extended up to and
    including the next '.', '!' or '?', and the result is run through the
    usual cleaning helpers.

    Args:
        original_input: The raw ``input`` field the fragment came from.
        current_text: The fragment to extend.

    Returns:
        The extended, cleaned text, or ``None`` when the input is not a
        string, the fragment is empty, the fragment cannot be found, or no
        sentence terminator follows it.
    """
    # Guard: a non-string input would crash in normalize_quotes, and an empty
    # fragment "matches" at offset 0, which would return the start of the
    # prompt instead of a repaired passage.
    if not isinstance(original_input, str) or not current_text:
        return None

    raw = fix_unicode(normalize_quotes(original_input))
    cur = fix_unicode(current_text)
    if not cur:
        return None

    pos = raw.find(cur)
    if pos < 0:
        # Retry with whitespace collapsed on both sides
        compact_raw = re.sub(r'\s+', ' ', raw)
        compact_cur = re.sub(r'\s+', ' ', cur)
        pos = compact_raw.find(compact_cur)
        if pos < 0:
            return None
        raw = compact_raw
        cur = compact_cur

    end_start = pos + len(cur)
    m = re.search(r'[\.!?]', raw[end_start:])
    if m is None:
        return None

    end_idx = end_start + m.end()
    extended = raw[pos:end_idx]
    extended = strip_guidance(extended)
    extended = remove_leading_connectives_and_appositives(extended)
    extended = clean_trailing_noise(extended)
    extended = minimal_safe_corrections(extended)
    return extended

def simple_sent_tokenize(text: str) -> List[str]:
    """Split ``text`` into sentences at ``SPLIT_SENT_RE`` boundaries.

    Pieces are stripped and empty ones dropped. When nothing remains, the
    result is a one-element list holding the stripped input.
    """
    sentences: List[str] = []
    for raw_piece in SPLIT_SENT_RE.split(text):
        piece = raw_piece.strip()
        if piece:
            sentences.append(piece)
    if sentences:
        return sentences
    return [text.strip()]

def trim_to_max_sentences(text: str, max_sents: int = 3) -> Tuple[str, int]:
    """Keep at most ``max_sents`` leading sentences of ``text``.

    Returns ``(text, sentence_count)``. A text already within the limit is
    returned unchanged; otherwise the first ``max_sents`` sentences are joined
    with single spaces and the count is ``max_sents``.
    """
    sentences = simple_sent_tokenize(text)
    total = len(sentences)
    if total > max_sents:
        kept = sentences[:max_sents]
        return " ".join(kept).strip(), max_sents
    return text, total

def count_events_from_gold(gold_output: str) -> int:
    """Count events in a gold output string.

    The number of "<EVENTSEP>" markers is used when there is at least one;
    otherwise the number of "Event type:" occurrences (case-insensitive).
    Empty or non-string input counts as 0.
    """
    if not isinstance(gold_output, str) or not gold_output:
        return 0
    separator_count = gold_output.count("<EVENTSEP>")
    if separator_count != 0:
        return separator_count
    return len(re.findall(r'Event\s*type\s*:', gold_output, flags=re.IGNORECASE))

def density_bin(n_events: int) -> str:
    """Bucket an event count: up to 2 is "low", up to 5 is "med", above that "high"."""
    for upper_bound, label in ((2, "low"), (5, "med")):
        if n_events <= upper_bound:
            return label
    return "high"

def split_name_from_path(path: str) -> str:
    """Infer the split label from a file name.

    The lower-cased base name is checked for "valid" first, then "test";
    anything else is labelled "train".
    """
    base_name = Path(path).name.lower()
    for marker in ("valid", "test"):
        if marker in base_name:
            return marker
    return "train"

def assign_chunk_id(split: str, running_index: int) -> str:
    """Build a chunk id such as "VAL_0007": split prefix, underscore, index zero-padded to 4 digits."""
    prefix = {"valid": "VAL", "test": "TST"}.get(split, "TRN")
    return f"{prefix}_{running_index:04d}"

  return s.lstrip(' ,;:)\]}>-—')
  t = t.lstrip(' ,;:)\]}>-—')


In [None]:
# --------- Build candidates ----------
# For every record in every available split file: extract one cleaned passage,
# validate it, try two repair strategies if it fails, and keep it only if it
# finally passes. TRAIN rows are collected here too but dropped further down.
all_rows = []
for p in [TRAIN_PATH, VALID_PATH, TEST_PATH]:
    if not os.path.exists(p):
        print(f"WARNING: File not found: {p}")
        continue
    split = split_name_from_path(p)
    data = read_jsonl(p)

    for row in data:
        input_text  = row.get("input", "")
        gold_output = row.get("output", "")

        candidate = extract_candidate_text(input_text)

        # Strict validity & repair
        if looks_like_fragment_strict(candidate):
            # Repair 1: extend the fragment to the next sentence terminator
            repaired = try_repair_fragment(input_text, candidate)
            if repaired and not looks_like_fragment_strict(repaired):
                candidate = repaired
            else:
                # Repair 2 (fallback): everything after the first colon, fully cleaned
                parts = normalize_quotes(fix_unicode(input_text)).split(":", 1)
                if len(parts) == 2:
                    alt = strip_guidance(parts[1])
                    alt = remove_leading_connectives_and_appositives(alt)  # NEW
                    alt = clean_trailing_noise(alt)
                    alt = minimal_safe_corrections(alt)                    # NEW
                    if alt and not looks_like_fragment_strict(alt):
                        candidate = alt

        # Still invalid? skip
        if looks_like_fragment_strict(candidate):
            continue

        # Final pass: reject if any residual mojibake appears
        # (looks_like_fragment_strict runs the same pattern list, so this is a
        # second line of defence rather than a new criterion)
        bad_moji = any(re.search(pat, candidate) for pat in MOJIBAKE_RESIDUAL)
        if bad_moji:
            continue

        # Now safe to trim to 1–3 sentences
        candidate, n_sent = trim_to_max_sentences(candidate, max_sents=3)
        n_events = count_events_from_gold(gold_output)
        dens = density_bin(n_events)

        # orig_file + orig_line are what Pack B later uses to find the gold output
        all_rows.append({
            "source_split": split,
            "orig_file": Path(p).name,
            "orig_line": row["line_no"],
            "text": candidate,
            "n_sent": n_sent,
            "n_events": n_events,
            "density_bin": dens,
            "gold_has_eventsep": int("<EVENTSEP>" in (gold_output or "")),
        })

df = pd.DataFrame(all_rows)
if df.empty:
    raise RuntimeError("No usable data after strict filtering. Consider relaxing thresholds slightly.")

# Keep only VALID/TEST, then drop rows whose final text is an exact duplicate
df_packA = df[df["source_split"].isin(["valid", "test"])].copy()
df_packA = df_packA.drop_duplicates(subset=["text"]).reset_index(drop=True)


In [None]:
# --------- Stratified sampling ----------
# Draw TARGET_N rows from df_packA, allocating slots to each
# (source_split, density_bin) stratum in proportion to its size.
TARGET_N = 80  # adjust if needed
strata = df_packA.groupby(["source_split", "density_bin"])
counts = strata.size().reset_index(name="count")
print("Available per stratum:\n", counts)

total_available = len(df_packA)
if TARGET_N > total_available:
    print(f"TARGET_N={TARGET_N} > available={total_available}. Using all available.")
    TARGET_N = total_available

# One random key per row, drawn from the `random` module seeded in the imports
# cell. NOTE: this adds a column to df_packA in place.
df_packA["_rand"] = [random.random() for _ in range(len(df_packA))]

def stratified_take(group, n_group):
    """Return the ``n_group`` rows of ``group`` with the smallest ``_rand`` values.

    The request is raised to at least 1 and capped at the group size.
    """
    n = min(len(group), max(1, n_group))
    return group.sort_values("_rand").head(n)

# Share of the whole pool held by each stratum
desired = []
for (split, dens), gcount in strata.size().items():
    prop = gcount / total_available
    desired.append(((split, dens), prop))

# Rounded proportional allocation; rounding can leave the total off by a few
alloc = {k: int(round(prop * TARGET_N)) for k, prop in desired}
diff = TARGET_N - sum(alloc.values())
if diff != 0:
    # Settle the difference one slot at a time, cycling through the strata
    # from largest to smallest (the index wraps around via the modulo below)
    size_map = {k: strata.size()[k] for k in strata.size().index}
    ordered = sorted(size_map.items(), key=lambda kv: kv[1], reverse=True)
    i = 0
    while diff != 0 and i < len(ordered):
        key = ordered[i][0]
        if diff > 0:
            alloc[key] += 1; diff -= 1
        else:
            if alloc[key] > 0:
                alloc[key] -= 1; diff += 1
        i = (i + 1) % len(ordered)

# Draw each stratum's share; strata with a zero allocation are skipped
sampled_frames = []
for key, n_target in alloc.items():
    sub = df_packA[(df_packA["source_split"] == key[0]) &
                   (df_packA["density_bin"]  == key[1])]
    if sub.empty or n_target == 0:
        continue
    sampled_frames.append(stratified_take(sub, n_target))

df_sample = pd.concat(sampled_frames, ignore_index=True).drop(columns=["_rand"])

Available per stratum:
   source_split density_bin  count
0         test        high   2636
1         test         low   1791
2         test         med   4326
3        valid        high    254
4        valid         low   4694
5        valid         med   2554


In [None]:
# --------- Assign chunk_ids ----------
# Sort by split, file and line so ids follow source order, then number the
# rows independently per split starting at 1.
df_sample = df_sample.sort_values(["source_split", "orig_file", "orig_line"]).reset_index(drop=True)
chunk_ids, counters = [], {"valid": 1, "test": 1}
for _, r in df_sample.iterrows():
    sp = r["source_split"]; idx = counters.get(sp, 1)
    chunk_ids.append(assign_chunk_id(sp, idx)); counters[sp] = idx + 1
df_sample.insert(0, "chunk_id", chunk_ids)
df_sample["notes"] = ""  # empty column, left blank by this notebook

# Exported columns; n_events and gold_has_eventsep are not written to Pack A
cols = ["chunk_id","source_split","orig_file","orig_line","text","n_sent","density_bin","notes"]
df_final = df_sample[cols].copy()

In [None]:
# --------- Save ----------
# Write Pack A to disk, then print a per-stratum count and show the first rows.
df_final.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"\nSaved Pack A to: {OUT_CSV}\n")
print("Stratified breakdown:")
print(df_final.groupby(["source_split", "density_bin"]).size())
print("\nPreview:")
display(df_final.head(10))



Saved Pack A to: /content/packs/PackA_TextChunks.csv

Stratified breakdown:
source_split  density_bin
test          high           13
              low             9
              med            21
valid         high            1
              low            23
              med            13
dtype: int64

Preview:


Unnamed: 0,chunk_id,source_split,orig_file,orig_line,text,n_sent,density_bin,notes
0,TST_0001,test,llm_test_gen.jsonl,38,"""A dam broke along the river, flooding the tow...",1,high,
1,TST_0002,test,llm_test_gen.jsonl,202,"""Their presence was at first tolerated by Prin...",1,high,
2,TST_0003,test,llm_test_gen.jsonl,249,"""The developing cyclone was first detected on ...",1,med,
3,TST_0004,test,llm_test_gen.jsonl,260,"""The storm's large fetch generated enormous sw...",1,high,
4,TST_0005,test,llm_test_gen.jsonl,475,"""The origins to the conflict can be traced bac...",1,med,
5,TST_0006,test,llm_test_gen.jsonl,1034,"""At the onset of the coup, the rebels seized V...",1,low,
6,TST_0007,test,llm_test_gen.jsonl,1227,"""The 2008 Kabul Serena Hotel attack was an att...",1,med,
7,TST_0008,test,llm_test_gen.jsonl,1420,"""After a crowd gathered in the area, a teen-ag...",1,med,
8,TST_0009,test,llm_test_gen.jsonl,2426,"""Southern state legislatures had passed and ma...",1,med,
9,TST_0010,test,llm_test_gen.jsonl,2493,"""Placed throughout Boston, Massachusetts, and ...",1,high,


## **Pack B — Gold outputs**

In [None]:
# ============================
# Build Pack B — Gold Outputs
# ============================
# Inputs:
#   - /content/packs/PackA_TextChunks.csv  (created previously)
#   - llm_train_gen.jsonl, llm_valid_gen.jsonl, llm_test_gen.jsonl
#
# Output:
#   - /content/packs/PackB_Gold.csv
#     columns: chunk_id, gold_raw, n_events, events_norm
#   - /content/packs/PackB_ParsingReport.csv  (optional QC)
#
# Notes:
# - We align using Pack A columns: orig_file + orig_line
# - Robust parsing supports classic "<EVENTSEP> Event type: X. Trigger: y."
#   and falls back to JSON-like structures if present.

import os, re, json, unicodedata
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import pandas as pd

In [None]:
# ------------------ 1) Configure paths (EDIT if needed) ------------------
# Pack A is the input here; it must have been written by the Pack A section.
PACKS_DIR = "/content/packs"
PACKA_CSV = f"{PACKS_DIR}/PackA_TextChunks.csv"

TRAIN_PATH = "/content/llm_train_gen.jsonl"
VALID_PATH = "/content/llm_valid_gen.jsonl"
TEST_PATH  = "/content/llm_test_gen.jsonl"

# Map by filename (so Pack A's orig_file can be resolved to actual full path)
JSONL_PATHS_BY_NAME = {
    Path(TRAIN_PATH).name: TRAIN_PATH,
    Path(VALID_PATH).name: VALID_PATH,
    Path(TEST_PATH).name:  TEST_PATH,
}

# NOTE: OUT_CSV is reassigned here; after this cell it points at Pack B,
# not at the Pack A file.
os.makedirs(PACKS_DIR, exist_ok=True)
OUT_CSV = f"{PACKS_DIR}/PackB_Gold.csv"
OUT_QC  = f"{PACKS_DIR}/PackB_ParsingReport.csv"

In [None]:
# ------------------ 2) Utils ------------------

def read_jsonl_line(path: str, target_line: int) -> Optional[Dict]:
    """Read one physical line (1-based) of a JSONL file and parse it.

    Args:
        path: Location of the JSONL file.
        target_line: 1-based physical line number, counting blank lines.

    Returns:
        The parsed JSON object with an added ``"_line_no"`` key, or ``None``
        when the file is missing, the line does not exist, the line is not
        valid JSON, or it parses to something other than a JSON object.

    Undecodable bytes are replaced instead of raising, matching the reader
    that produced the line numbers stored in Pack A.
    """
    if target_line < 1 or not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, start=1):
            if i < target_line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                return None
            # A JSON list/string/number cannot take the "_line_no" key and is
            # not a usable record; report it like any other bad line.
            if not isinstance(obj, dict):
                return None
            obj["_line_no"] = i
            return obj
    return None

def normalize_text(s: str) -> str:
    """NFKC-normalise ``s``, trim it, and collapse whitespace runs to one space.

    ``None`` is mapped to the empty string. Punctuation is left untouched.
    """
    if s is None:
        return ""
    nfkc_form = unicodedata.normalize("NFKC", s)
    return re.sub(r"\s+", " ", nfkc_form.strip())

def normalize_trigger(trig: str) -> str:
    """Normalise a trigger: text-normalise it, trim ``. ; , : -`` from both ends, lower-case it.

    No stemming or inflection handling is performed.
    """
    cleaned = normalize_text(trig)
    trimmed = cleaned.strip().strip(".;,:-")
    return trimmed.lower()

def normalize_type(tp: str) -> str:
    """Normalise an event-type label; casing is preserved (delegates to ``normalize_text``)."""
    normalized_label = normalize_text(tp)
    return normalized_label

# Regex for classic DEGREE2-style lines:
# <EVENTSEP> Event type: X. Trigger: y.
# The type runs up to the first '.', and the trigger stops at the first '.',
# '<' or line break, so neither captured value can contain a period
# (a trigger such as "U.S." would be cut at its first dot).
TYPE_TRIG_RE = re.compile(
    r"Event\s*type\s*:\s*(?P<etype>[^.\n\r]+)\.\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)",
    flags=re.IGNORECASE
)

def parse_events_from_gold(gold_raw: str) -> List[Tuple[str, str]]:
    """Extract ``(event_type, trigger)`` pairs from a gold output string.

    Two formats are supported:
      * JSON of the form ``{"events": [{"type": ..., "trigger": ...}, ...]}``.
        It is tried first and used only if it yields at least one pair.
      * Free text containing ``Event type: X. Trigger: y.`` items, optionally
        separated by ``<EVENTSEP>``.

    Args:
        gold_raw: The gold output text.

    Returns:
        Normalised pairs in order of appearance; an empty list for empty or
        non-string input, or when nothing could be parsed.

    Fix: the JSON branch used to sit inside ``try/except Exception: pass``
    while appending to the shared result list. A malformed entry (for example
    a non-dict item or a numeric ``type``) aborted the branch midway and the
    pairs collected so far were then mixed into the regex results. JSON pairs
    are now collected separately, malformed entries are skipped one by one,
    and only JSON decoding errors are caught.
    """
    if not gold_raw or not isinstance(gold_raw, str):
        return []

    txt = gold_raw.strip()

    # Try JSON parse first (some users may store JSON gold)
    try:
        obj = json.loads(txt)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; not JSON means regex parsing
        obj = None

    if isinstance(obj, dict) and isinstance(obj.get("events"), list):
        json_events: List[Tuple[str, str]] = []
        for e in obj["events"]:
            if not isinstance(e, dict):
                continue
            raw_type = e.get("type", "")
            raw_trigger = e.get("trigger", "")
            if not isinstance(raw_type, str) or not isinstance(raw_trigger, str):
                continue
            et = normalize_type(raw_type)
            tr = normalize_trigger(raw_trigger)
            if et and tr:
                json_events.append((et, tr))
        if json_events:
            return json_events

    # Non-JSON path: split on <EVENTSEP> to reduce noise, then extract pairs
    events: List[Tuple[str, str]] = []
    parts = [p for p in txt.split("<EVENTSEP>") if p.strip()]
    if not parts:
        # Nothing but separators/whitespace: search the whole text instead
        parts = [txt]

    for p in parts:
        for m in TYPE_TRIG_RE.finditer(p):
            et = normalize_type(m.group("etype"))
            tr = normalize_trigger(m.group("trig"))
            if et and tr:
                events.append((et, tr))

    return events

def count_events_heuristic(gold_raw: str) -> int:
    """Estimate the number of events in a gold string.

    Counts "<EVENTSEP>" markers; if there are none, counts case-insensitive
    "Event type:" occurrences instead. Falsy input counts as 0.
    """
    if not gold_raw:
        return 0
    by_separator = gold_raw.count("<EVENTSEP>")
    if by_separator != 0:
        return by_separator
    return len(re.findall(r'Event\s*type\s*:', gold_raw, flags=re.IGNORECASE))

In [None]:
# ------------------ 3) Load Pack A ------------------
# Fail fast when Pack A is missing or lacks the columns needed for alignment.
if not os.path.exists(PACKA_CSV):
    raise FileNotFoundError(f"Pack A not found at: {PACKA_CSV}")

# chunk_id is read as text; orig_line as int because it is used as a line index
packA = pd.read_csv(PACKA_CSV, dtype={"chunk_id": str, "orig_line": int})
required_cols = {"chunk_id", "orig_file", "orig_line"}
missing = required_cols - set(packA.columns)
if missing:
    raise ValueError(f"Pack A is missing required columns: {missing}")

In [None]:
# ------------------ 4) Build Pack B ------------------
# For each Pack A row, re-read its source JSONL line, take the "output" field
# as gold, parse it into (type, trigger) pairs, and log a QC status.
rows_B = []
qc_rows = []

for idx, r in packA.iterrows():
    chunk_id = r["chunk_id"]
    orig_file = str(r["orig_file"])
    orig_line = int(r["orig_line"])

    # Resolve full path for the JSONL referenced by Pack A
    jsonl_path = JSONL_PATHS_BY_NAME.get(orig_file)
    if not jsonl_path or not os.path.exists(jsonl_path):
        qc_rows.append({
            "chunk_id": chunk_id,
            "status": "ERROR_NO_FILE",
            "detail": f"File not found for orig_file={orig_file}",
        })
        continue

    # NOTE: read_jsonl_line opens the file and scans from the top on every
    # call, so this loop re-reads the source file once per Pack A row.
    obj = read_jsonl_line(jsonl_path, orig_line)
    if obj is None:
        qc_rows.append({
            "chunk_id": chunk_id,
            "status": "ERROR_BAD_LINE",
            "detail": f"Could not read JSONL line {orig_line} in {orig_file}",
        })
        continue

    # Non-string gold (e.g. a dict or list) is serialised to JSON text
    gold_raw = obj.get("output", "")
    gold_raw_str = gold_raw if isinstance(gold_raw, str) else json.dumps(gold_raw, ensure_ascii=False)

    # Parse events
    ev_pairs = parse_events_from_gold(gold_raw_str)
    # Build normalized summary string: "Type|trigger ; Type|trigger ; ..."
    events_norm = " ; ".join(f"{et}|{tr}" for et, tr in ev_pairs)

    # Count events (heuristic), helpful for sanity checks
    n_events_sep = count_events_heuristic(gold_raw_str)
    # Take the larger of the heuristic count and the number of parsed pairs
    n_events = max(n_events_sep, len(ev_pairs))

    rows_B.append({
        "chunk_id": chunk_id,
        "gold_raw": gold_raw_str,
        "n_events": int(n_events),
        "events_norm": events_norm,
    })

    # Status: OK when pairs were parsed; WARN_NO_PARSED_EVENTS when the
    # heuristic saw events but parsing found none; OK_EMPTY when both are zero
    qc_rows.append({
        "chunk_id": chunk_id,
        "status": "OK" if ev_pairs else ("WARN_NO_PARSED_EVENTS" if n_events > 0 else "OK_EMPTY"),
        "parsed_events": len(ev_pairs),
        "heuristic_events": n_events_sep,
        "file": orig_file,
        "line": orig_line,
    })

df_B = pd.DataFrame(rows_B, columns=["chunk_id", "gold_raw", "n_events", "events_norm"]).sort_values("chunk_id")
df_qc = pd.DataFrame(qc_rows).sort_values(["status", "chunk_id"])

# ------------------ 5) Save ------------------
df_B.to_csv(OUT_CSV, index=False, encoding="utf-8")
df_qc.to_csv(OUT_QC, index=False, encoding="utf-8")

print(f"Saved Pack B to: {OUT_CSV}")
print(f"Saved QC report to: {OUT_QC}")

print("\nPack B preview:")
display(df_B.head(10))

print("\nQC breakdown:")
print(df_qc["status"].value_counts())

Saved Pack B to: /content/packs/PackB_Gold.csv
Saved QC report to: /content/packs/PackB_ParsingReport.csv

Pack B preview:


Unnamed: 0,chunk_id,gold_raw,n_events,events_norm
0,TST_0001,<EVENTSEP> Event type: Catastrophe. Trigger: f...,8,Catastrophe|flooding ; Revenge|town ; Destroyi...
1,TST_0002,<EVENTSEP> Event type: Cause_to_amalgamate. Tr...,12,Cause_to_amalgamate|combined ; Request|request...
2,TST_0003,<EVENTSEP> Event type: Placing. Trigger: situa...,3,Placing|situated ; Know|detected ; Cause_to_ma...
3,TST_0004,<EVENTSEP> Event type: Reforming_a_system. Tri...,9,Reforming_a_system|advantage ; Preventing_or_l...
4,TST_0005,<EVENTSEP> Event type: Supporting. Trigger: ba...,3,Supporting|back ; Hostile_encounter|conflict ;...
5,TST_0006,<EVENTSEP> Event type: Supporting. Trigger: ag...,2,Supporting|aguinaldo ; Hold|seized
6,TST_0007,<EVENTSEP> Event type: Statement. Trigger: cla...,4,Statement|claimed ; Placing|taliban ; Attack|a...
7,TST_0008,<EVENTSEP> Event type: Come_together. Trigger:...,4,Come_together|gathered ; Telling|suicide ; Add...
8,TST_0009,<EVENTSEP> Event type: Preserving. Trigger: ma...,3,Preserving|maintained ; Action|century ; Chang...
9,TST_0010,<EVENTSEP> Event type: Removing. Trigger: thea...,8,Removing|theaters ; Suspicion|aqua ; Exchange|...



QC breakdown:
status
OK                       78
WARN_NO_PARSED_EVENTS     2
Name: count, dtype: int64


## **Pack C (Prompt Bank)**

In [None]:
# ===========================================
# Build Pack C — Prompt Bank (Colab script)
# ===========================================
# Outputs:
#   /content/packs/PackC_PromptBank.md
#   /content/packs/PackC_PromptBank.json
#   /content/packs/PackC_Exemplars.csv
#
# Requires:
#   /content/packs/PackA_TextChunks.csv       (built earlier)
#   /content/llm_train_gen.jsonl              (train split; used ONLY for exemplars)
#
# What we create:
#   - System Preface (stable, paste once per chat)
#   - Two shells:
#       A) Plain   -> one line per event: "Event type: X. Trigger: y."
#       B) EVENTSEP-> "<EVENTSEP> Event type: X. Trigger: y."
#   - Shot variants: zero-shot, 1-shot, 3-shot, 5-shot (for both shells)
#   - Exemplar sets are short, diverse, and not overlapping with Pack A texts
#
# Selection policy for exemplars (deterministic):
#   - Use only TRAIN jsonl
#   - 1-2 sentences (prefer concise)
#   - Density: prefer 2-5 events (avoids trivial/overloaded)
#   - Diversity: greedily add rows that introduce new event types (from output)

import os, re, json, random, unicodedata
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set
import pandas as pd

random.seed(42)

In [None]:
# ------------------ 1) Configure paths ------------------
PACKS_DIR = "/content/packs"
PACKA_CSV = f"{PACKS_DIR}/PackA_TextChunks.csv"   # Pack A (for exclusion)
TRAIN_PATH = "/content/llm_train_gen.jsonl"       # TRAIN jsonl for exemplars

os.makedirs(PACKS_DIR, exist_ok=True)

# Three artefacts: prompt bank as Markdown and as JSON, plus the exemplar pool
OUT_MD   = f"{PACKS_DIR}/PackC_PromptBank.md"
OUT_JSON = f"{PACKS_DIR}/PackC_PromptBank.json"
OUT_EX   = f"{PACKS_DIR}/PackC_Exemplars.csv"

# Shots we want to generate
SHOT_SIZES = [0, 1, 3, 5]  # 0 = zero-shot

In [None]:
# ------------------ 2) Small utils ------------------

# First straight-double-quoted span, matched non-greedily across newlines.
# Curly quotes and guillemets are not recognised by this pattern.
QUOTE_RE = re.compile(r'\"(.*?)\"', re.DOTALL)
# Sentence boundary: whitespace directly preceded by '.', '!' or '?'.
SPLIT_SENT_RE = re.compile(r'(?<=[\.!?])\s+')
# "Event type: X. Trigger: y" pair; neither captured value can contain a period.
TYPE_TRIG_RE = re.compile(
    r"Event\s*type\s*:\s*(?P<etype>[^.\n\r]+)\.\s*Trigger\s*:\s*(?P<trig>[^.\n\r<]+)",
    flags=re.IGNORECASE
)

def read_jsonl(path: str) -> List[Dict]:
    """Load a JSONL file into a list of dicts, adding ``"_line_no"`` to each.

    Args:
        path: Location of the JSONL file. The file must exist; ``open`` raises
            otherwise.

    Returns:
        One dict per usable line. ``"_line_no"`` is the 1-based physical line
        number, counting blank lines.

    Blank lines, lines that are not valid JSON, and lines that parse to
    something other than a JSON object are skipped. Undecodable bytes are
    replaced instead of raising.
    """
    rows: List[Dict] = []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # ignore malformed lines
                continue
            # A JSON list/string/number cannot take the "_line_no" key;
            # assigning to it would raise TypeError.
            if not isinstance(obj, dict):
                continue
            obj["_line_no"] = i
            rows.append(obj)
    return rows

def extract_quoted_text(input_field: str) -> str:
    """Return the first straight-quoted span of the input, stripped.

    Without a quoted span, the stripped text after the first colon is
    returned, or the whole stripped input when there is no colon. The input is
    converted with ``str()`` first.
    """
    raw = str(input_field)
    hit = QUOTE_RE.search(raw)
    if hit is not None:
        return hit.group(1).strip()
    _, colon, after_colon = raw.partition(":")
    return after_colon.strip() if colon else raw.strip()

def simple_sent_tokenize(text: str) -> List[str]:
    """Split ``text`` at ``SPLIT_SENT_RE`` boundaries into stripped, non-empty sentences.

    When no sentence survives, a single-element list with the stripped input
    is returned.
    """
    stripped_pieces = (piece.strip() for piece in SPLIT_SENT_RE.split(text))
    sentences = list(filter(None, stripped_pieces))
    if not sentences:
        return [text.strip()]
    return sentences

def count_events_heuristic(gold_raw: str) -> int:
    """Count "<EVENTSEP>" markers, falling back to "Event type:" occurrences; 0 for falsy input."""
    if not gold_raw:
        return 0
    marker_total = gold_raw.count("<EVENTSEP>")
    if marker_total == 0:
        return len(re.findall(r'Event\s*type\s*:', gold_raw, flags=re.IGNORECASE))
    return marker_total

def density_bin(n_events: int) -> str:
    """Map an event count to "low" (at most 2), "med" (at most 5) or "high"."""
    if n_events > 5:
        return "high"
    return "low" if n_events <= 2 else "med"

def normalize_text(s: str) -> str:
    """Apply NFKC normalisation, trim the ends, and squeeze whitespace runs; ``None`` becomes ""."""
    if s is None:
        return ""
    trimmed = unicodedata.normalize("NFKC", s).strip()
    return re.sub(r"\s+", " ", trimmed)

def normalize_trigger(trig: str) -> str:
    """Text-normalise a trigger, trim ``. ; , : -`` from both ends, and lower-case it."""
    base = normalize_text(trig).strip()
    return base.strip(".;,:-").lower()

def parse_type_trigger_pairs(gold_raw: str) -> List[Tuple[str, str]]:
    """Parse ``(type, trigger)`` pairs from a DEGREE2-style output block.

    The text is cut at "<EVENTSEP>" markers; blank segments are dropped (if
    nothing is left the whole text is searched). Every ``TYPE_TRIG_RE`` match
    contributes one pair, provided both normalised values are non-empty.
    Non-string input yields an empty list.
    """
    if not isinstance(gold_raw, str):
        return []
    segments = [seg for seg in gold_raw.split("<EVENTSEP>") if seg.strip()] or [gold_raw]
    found: List[Tuple[str, str]] = []
    for segment in segments:
        for hit in TYPE_TRIG_RE.finditer(segment):
            event_type = normalize_text(hit.group("etype"))
            trigger = normalize_trigger(hit.group("trig"))
            if event_type and trigger:
                found.append((event_type, trigger))
    return found

def to_plain_lines(pairs: List[Tuple[str, str]]) -> List[str]:
    """Render each ``(type, trigger)`` pair as one plain-format output line."""
    rendered: List[str] = []
    for t, tr in pairs:
        rendered.append(f"Event type: {t}. Trigger: {tr}.")
    return rendered

def to_eventsep_lines(pairs: List[Tuple[str, str]]) -> List[str]:
    """Render each ``(type, trigger)`` pair as one line prefixed with the <EVENTSEP> marker."""
    rendered: List[str] = []
    for t, tr in pairs:
        rendered.append(f"<EVENTSEP> Event type: {t}. Trigger: {tr}.")
    return rendered

In [None]:
# ------------------ 3) Load Pack A to avoid overlap ------------------
if not os.path.exists(PACKA_CSV):
    raise FileNotFoundError(f"Pack A not found at: {PACKA_CSV}")

# Stripped Pack A texts, used below for an exact-string overlap check
packA = pd.read_csv(PACKA_CSV)
texts_packA: Set[str] = set(t.strip() for t in packA["text"].astype(str))

In [None]:
# ------------------ 4) Load TRAIN JSONL and build candidate exemplar pool ------------------
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(f"TRAIN jsonl not found at: {TRAIN_PATH}")

train_rows = read_jsonl(TRAIN_PATH)

candidates = []
for row in train_rows:
    src_input  = row.get("input", "")
    src_output = row.get("output", "")
    text = extract_quoted_text(src_input)
    if not text:
        continue

    # Exclude anything that appears in Pack A (no leakage).
    # This is an exact match on the stripped string only.
    if text.strip() in texts_packA:
        continue

    # Sentence & density filters (prefer concise, clear exemplars)
    sents = simple_sent_tokenize(text)
    n_sent = len(sents)

    # Keep 1–2 sentences (best for exemplars)
    if n_sent > 2:
        continue

    n_events = count_events_heuristic(src_output)
    dens = density_bin(n_events)

    # Hard filter: keep rows with 1 to 10 events. The preference for medium
    # density is applied later through the sort order, not here.
    if n_events == 0:
        continue
    if n_events > 10:  # avoid overly dense/long outputs as exemplars
        continue

    pairs = parse_type_trigger_pairs(src_output)
    if not pairs:
        continue

    # Build a set of event types to help diversity
    type_set = {t for (t, _) in pairs}

    candidates.append({
        "text": text,
        "plain_lines": to_plain_lines(pairs),
        "eventsep_lines": to_eventsep_lines(pairs),
        "n_sent": n_sent,
        "n_events": n_events,
        "density_bin": dens,
        "event_types": sorted(list(type_set)),
        "line_no": row.get("_line_no", -1),
    })

cand_df = pd.DataFrame(candidates)
if cand_df.empty:
    raise RuntimeError("No exemplar candidates found. Loosen filters or check TRAIN file formatting.")

In [None]:
# ------------------ 5) Greedy diversity selection for exemplar pool ------------------
# Build a diverse pool (up to TARGET_POOL items) favouring rows that add new event types.
TARGET_POOL = 20
selected = []
seen_types: Set[str] = set()

# Sort candidates: med density first, then low, then high; ties broken by
# fewer sentences, then fewer events
density_rank = {"med": 0, "low": 1, "high": 2}
cand_df["_rank"] = cand_df["density_bin"].map(density_rank).fillna(3)
cand_df = cand_df.sort_values(by=["_rank", "n_sent", "n_events"]).reset_index(drop=True)

for _, r in cand_df.iterrows():
    types = set(r["event_types"])
    # gain = how many not-yet-seen event types this row contributes
    gain = len(types - seen_types)
    # Rows are accepted unconditionally until the pool holds
    # max(8, TARGET_POOL // 3) items; after that only rows with gain > 0
    if gain > 0 or len(selected) < max(8, TARGET_POOL // 3):
        selected.append(r)
        seen_types |= types
    if len(selected) >= TARGET_POOL:
        break

ex_pool = pd.DataFrame(selected).drop(columns=["_rank"])
ex_pool.reset_index(drop=True, inplace=True)

# Save the exemplar pool to CSV (for auditing); list columns are flattened to
# newline-joined strings for the export copy only
ex_pool_out = ex_pool.copy()
ex_pool_out["plain_output"] = ex_pool_out["plain_lines"].apply(lambda L: "\n".join(L))
ex_pool_out["eventsep_output"] = ex_pool_out["eventsep_lines"].apply(lambda L: "\n".join(L))
ex_pool_out = ex_pool_out[["text","plain_output","eventsep_output","n_sent","n_events","density_bin","event_types","line_no"]]
ex_pool_out.to_csv(OUT_EX, index=False, encoding="utf-8")

In [None]:
# ------------------ 6) Assemble prompt shells and shot variants ------------------

# Fixed preamble placed at the top of every generated prompt.
SYSTEM_PREFACE = (
    "You are an information-extraction assistant.\n"
    "Extract every event mentioned in the text.\n"
    "Return exactly the requested format. No explanations."
)

# Task shell for the plain output format. "{CHUNK}" is a literal placeholder
# left in the string; nothing in this cell substitutes it.
SHELL_PLAIN = (
    "TASK: Extract every event in the text. For each event, write a line:\n"
    "Event type: <TYPE>. Trigger: <TRIGGER>.\n\n"
    "Text:\n"
    "\"{CHUNK}\"\n\n"
    "Rules:\n"
    "- Include ALL events mentioned.\n"
    "- One line per event; no extra commentary."
)

# Task shell for the <EVENTSEP>-prefixed output format; same placeholder.
SHELL_EVENTSEP = (
    "TASK: Extract every event in the text. For each event, write a line:\n"
    "<EVENTSEP> Event type: <TYPE>. Trigger: <TRIGGER>.\n\n"
    "Text:\n"
    "\"{CHUNK}\"\n\n"
    "Rules:\n"
    "- Use <EVENTSEP> exactly as the separator.\n"
    "- Include ALL events mentioned.\n"
    "- One line per event; no extra commentary."
)

def exemplar_block_plain(text: str, plain_lines: List[str]) -> str:
    """
    Render one worked example for the plain output format.

    The block is an "Example" heading, the quoted input text, an "Output:"
    heading, and then the given output lines joined with newlines.
    """
    heading = "\n".join(["Example", f'Text: "{text}"', "Output:"]) + "\n"
    rendered_lines = "\n".join(plain_lines)
    return heading + rendered_lines

def exemplar_block_eventsep(text: str, eventsep_lines: List[str]) -> str:
    """
    Render one worked example for the <EVENTSEP> output format.

    The block is an "Example" heading, the quoted input text, an "Output:"
    heading, and then the given output lines joined with newlines. The lines
    are used as passed in; no separator token is added here.
    """
    heading = "\n".join(["Example", f'Text: "{text}"', "Output:"]) + "\n"
    rendered_lines = "\n".join(eventsep_lines)
    return heading + rendered_lines

def build_k_shot_prompt(k: int, shell_text: str, variant: str) -> Tuple[str, List[int]]:
    """
    Assemble a k-shot prompt from the system preface, exemplars and a task shell.

    Args:
        k: requested number of exemplars; clamped to the range [0, len(ex_pool)].
        shell_text: task shell appended after the exemplars (placeholder left as-is).
        variant: 'plain' or 'eventsep'; selects which exemplar rendering is used.

    Returns:
        (prompt_text_without_chunk, exemplar_indices_used)
    """
    assert variant in {"plain", "eventsep"}

    # Deterministic selection: always the leading rows of the pool, so the
    # 3-shot set is a prefix of the 5-shot set, and so on.
    n_shots = min(max(0, k), len(ex_pool))
    used_idxs = list(range(n_shots))

    # Pick the renderer and the matching lines column once, up front.
    if variant == "plain":
        render, lines_col = exemplar_block_plain, "plain_lines"
    else:
        render, lines_col = exemplar_block_eventsep, "eventsep_lines"

    blocks = []
    for idx in used_idxs:
        exemplar = ex_pool.iloc[idx]
        blocks.append(render(exemplar["text"], exemplar[lines_col]))

    parts = [SYSTEM_PREFACE, "\n\n"]
    if n_shots > 0:
        # Exemplars are separated by rule lines and followed by a hand-off line.
        parts.append("\n---\n".join(blocks))
        parts.append("\n\n--- Now do the same for:\n\n")
    parts.append(shell_text)

    return "".join(parts), used_idxs

# Build all prompts
# Catalog of prompt pieces; "shots" starts empty and is filled per (variant, k).
prompts_catalog = {
    "system_preface": SYSTEM_PREFACE,
    "shells": {"plain": SHELL_PLAIN, "eventsep": SHELL_EVENTSEP},
    "shots": {},
}

# For markdown output: title, system preface and both shells, each in a code fence.
md_lines = [
    "# Pack C — Prompt Bank\n",
    "## System Preface\n",
    f"```\n{SYSTEM_PREFACE}\n```\n",
    "## Shells\n",
    "### Plain shell\n",
    f"```\n{SHELL_PLAIN}\n```\n",
    "### <EVENTSEP> shell\n",
    f"```\n{SHELL_EVENTSEP}\n```\n",
]

# Generate every (variant, k) prompt, register it in the catalog and add a
# matching section to the markdown document.
for variant_name, shell in [("plain", SHELL_PLAIN), ("eventsep", SHELL_EVENTSEP)]:
    md_lines.append(f"## {variant_name.upper()} — Shot Variants\n")
    for k in SHOT_SIZES:
        prompt_text, used = build_k_shot_prompt(k, shell, variant_name)
        key = f"{variant_name}_{k}shot"
        prompts_catalog["shots"][key] = {
            "k": k,
            "variant": variant_name,
            "prompt": prompt_text,
            "exemplar_indices": used
        }
        md_lines.append(f"### {variant_name} — {k}-shot\n")
        # This is a plain string (not an f-string), so braces are emitted verbatim.
        # Use single braces so the hint names the exact placeholder found in the
        # shells ("{CHUNK}"); the previous "{{CHUNK}}" never appears in a prompt.
        md_lines.append("> Paste this in a fresh chat. Replace `{CHUNK}` with your text.\n")
        md_lines.append("```\n" + prompt_text + "\n```\n")

In [None]:
# ------------------ 7) Save outputs ------------------
# Write the markdown prompt bank as a single newline-joined document.
md_document = "\n".join(md_lines)
with open(OUT_MD, "w", encoding="utf-8") as md_file:
    md_file.write(md_document)

# Write the prompt catalog as indented JSON, keeping non-ASCII characters unescaped.
with open(OUT_JSON, "w", encoding="utf-8") as json_file:
    json.dump(prompts_catalog, json_file, ensure_ascii=False, indent=2)

# Report where each artefact was written (labels padded so the paths line up).
for label, out_path in (
    ("Saved Prompt Bank markdown to: ", OUT_MD),
    ("Saved Prompt Bank json to:     ", OUT_JSON),
    ("Saved Exemplar pool to:        ", OUT_EX),
):
    print(f"{label}{out_path}")

# Quick preview: read the exemplar CSV back from disk and show its first rows.
exemplar_preview = pd.read_csv(OUT_EX).head(5)
display(exemplar_preview)

Saved Prompt Bank markdown to: /content/packs/PackC_PromptBank.md
Saved Prompt Bank json to:     /content/packs/PackC_PromptBank.json
Saved Exemplar pool to:        /content/packs/PackC_Exemplars.csv


Unnamed: 0,text,plain_output,eventsep_output,n_sent,n_events,density_bin,event_types,line_no
0,The project was an ambitious and risky venture...,Event type: Self_motion. Trigger: venture.\nEv...,<EVENTSEP> Event type: Self_motion. Trigger: v...,1,3,med,"['Aiming', 'Conquering', 'Self_motion']",3
1,The sea venture was the only desired action th...,Event type: Self_motion. Trigger: venture.\nEv...,<EVENTSEP> Event type: Self_motion. Trigger: v...,1,3,med,"['Deciding', 'Self_motion']",5
2,"The murder of Leigh Leigh, born Leigh Rennea M...",Event type: Coming_to_be. Trigger: occurred.\n...,<EVENTSEP> Event type: Coming_to_be. Trigger: ...,1,3,med,"['Coming_to_be', 'Committing_crime', 'Particip...",8
3,Her naked body was found in the sand dunes nea...,Event type: Bodily_harm. Trigger: crushed.\nEv...,<EVENTSEP> Event type: Bodily_harm. Trigger: c...,1,3,med,"['Bodily_harm', 'Damaging', 'Know']",11
4,"Matthew Grant Webster, an 18-year-old who acte...",Event type: Legal_rulings. Trigger: sentenced....,<EVENTSEP> Event type: Legal_rulings. Trigger:...,1,3,med,"['Committing_crime', 'Legal_rulings', 'Request']",12
