In [1]:
pip install datasets accelerate evaluate sentencepiece

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: C:\Users\rezag\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: C:\Users\rezag\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# -----------------------------
# 1) Load data
# -----------------------------
def load_event_dataset(jsonl_path):
    return load_dataset('json', data_files={'data': jsonl_path})['data']

train_ds = load_event_dataset('/content/output/llm_train_gen.jsonl')
valid_ds = load_event_dataset('/content/output/llm_valid_gen.jsonl')

In [None]:
# -----------------------------
# 2) Tokenizer & preprocess
# -----------------------------
model_ckpt = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

max_input_length = 512
max_target_length = 256

def preprocess_fn(examples):
    model_inputs = tokenizer(
        examples["input"], max_length=max_input_length,
        truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"], max_length=max_target_length,
            truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tok = train_ds.map(preprocess_fn, batched=True, remove_columns=train_ds.column_names)
valid_tok = valid_ds.map(preprocess_fn, batched=True, remove_columns=valid_ds.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/32431 [00:00<?, ? examples/s]



Map:   0%|          | 0/8042 [00:00<?, ? examples/s]

In [None]:
# -----------------------------
# 3) Model & collator
# -----------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# -----------------------------
# 4) Training args (early stop + best ckpt)
# -----------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_mavenere",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    report_to="none",
    save_safetensors=True
)

In [None]:
# -----------------------------
# 5) Early stopping callback
# -----------------------------
early_stop = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

In [None]:
# -----------------------------
# 6) Trainer
# -----------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stop],
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
500,0.0821,0.057334
1000,0.0583,0.047122
1500,0.052,0.041703
2000,0.0465,0.038767
2500,0.0459,0.036426
3000,0.0422,0.034634
3500,0.0409,0.033408
4000,0.0399,0.032543
4500,0.04,0.033203
5000,0.0407,0.033245


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5500, training_loss=0.07936556339263916, metrics={'train_runtime': 6995.9822, 'train_samples_per_second': 13.907, 'train_steps_per_second': 1.738, 'total_flos': 2.679353822674944e+16, 'train_loss': 0.07936556339263916, 'epoch': 1.3566847557967439})

In [None]:
trainer.save_model("./t5_mavenere_final")
tokenizer.save_pretrained("./t5_mavenere_final")

('./t5_mavenere_final/tokenizer_config.json',
 './t5_mavenere_final/special_tokens_map.json',
 './t5_mavenere_final/spiece.model',
 './t5_mavenere_final/added_tokens.json',
 './t5_mavenere_final/tokenizer.json')

In [None]:
import re
import torch
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

EVENT_PATTERN = re.compile(r"Event type:\s*([^\.]+)\.\s*Trigger:\s*([^\.]+)\.", re.IGNORECASE)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def postprocess_events(generated_text):
    events = EVENT_PATTERN.findall(generated_text)
    cleaned = set()
    for etype, trigger in events:
        etype_norm = etype.strip().lower()
        trigger_norm = lemmatizer.lemmatize(trigger.strip().lower())
        cleaned.add((etype_norm, trigger_norm))

    sorted_events = sorted(cleaned)
    return " <EVENTSEP> ".join([f"Event type: {etype}. Trigger: {trigger}." for etype, trigger in sorted_events])

In [None]:
def generate_events(prompt, model, tokenizer, max_len=256, beams=5, device=None):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model.eval().to(device)
    with torch.no_grad():
        ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        out = model.generate(ids, max_length=max_len, num_beams=beams, num_return_sequences=1)
        decoded = tokenizer.decode(out[0], skip_special_tokens=True)
        return postprocess_events(decoded)

In [None]:
# -----------------------------
# 7) Quick sanity check (inference)
# -----------------------------
test_prompt = (
    'Extract all events from the following sentence:\n'
    '"A corps of volunteers led by Giuseppe Garibaldi sailed from Quarto, near Genoa (now Quarto dei Mille) '
    'and landed in Marsala, Sicily, in order to conquer the Kingdom of the Two Sicilies, ruled by the '
    'House of Bourbon-Two Sicilies."\n'
    'Use <EVENTSEP> to separate events.'
)

print(generate_events(test_prompt, trainer.model, tokenizer))

Event type: arriving. Trigger: landed. <EVENTSEP> Event type: conquering. Trigger: conquer. <EVENTSEP> Event type: control. Trigger: ruled. <EVENTSEP> Event type: motion. Trigger: sailed.


Eval

In [None]:
import json
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from collections import Counter
import csv
import nltk
from nltk.stem import WordNetLemmatizer
import re
import math

nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# ---------- Normalizers ----------
EVENT_SPLIT = re.compile(r'\s*<EVENTSEP>\s*', flags=re.IGNORECASE)
TYPE_RE = re.compile(r'event\s*type\s*:\s*(.+?)\.', re.IGNORECASE)
TRIG_RE = re.compile(r'trigger\s*:\s*(.+?)\.', re.IGNORECASE)
ARGS_RE = re.compile(r'arguments\s*:\s*(.+)$', re.IGNORECASE)
TOKEN_RE = re.compile(r"\w+", re.UNICODE)

def _norm(s):
    return re.sub(r'\s+', ' ', s.strip().lower()) if isinstance(s, str) else s

def token_set(s: str):
    return set(TOKEN_RE.findall(s.lower())) if s else set()

In [None]:
# ---------- Robust parser: always returns list of (type, trigger, args_str) ----------
def parse_event_structure(text):
    """
    Support both training-style:
      '<EVENTSEP> Event type: X. Trigger: Y. Arguments: ...'
    and key-value style:
      'EventType: X; Trigger: Y; Arguments: ...'
    Always returns: [(etype, trigger, args_str), ...] with normalized lowercase.
    """
    if not text or "no events" in text.lower():
        return []

    events = []
    # split on <EVENTSEP>
    chunks = [c for c in EVENT_SPLIT.split(text) if c.strip()]
    for ch in chunks:
        ch_norm = ch.strip()

        # Try training-style regex
        t = TYPE_RE.search(ch_norm)
        g = TRIG_RE.search(ch_norm)
        a = ARGS_RE.search(ch_norm)

        if t or g:
            etype = _norm(t.group(1)) if t else None
            trig  = _norm(g.group(1)) if g else None
            args  = _norm(a.group(1)) if a else ""
            events.append((etype, trig, args))
            continue

        # Try key-value fallback: 'EventType: ...; Trigger: ...; Arguments: ...'
        parts = {}
        for seg in ch_norm.split(';'):
            if ':' in seg:
                k, v = seg.split(':', 1)
                parts[_norm(k)] = _norm(v)
        etype = parts.get('eventtype') or parts.get('event type')
        trig  = parts.get('trigger')
        args  = parts.get('arguments', '')
        if etype or trig:
            events.append((etype, trig, args))

    return events

In [None]:
# ---------- Matching helpers ----------
def ensure_triple(e):
    """Convert any event item to (etype, trigger, args_str)."""
    if isinstance(e, tuple):
        if len(e) == 3: return e
        if len(e) == 2: return (e[0], e[1], "")
        if len(e) == 1: return (e[0], None, "")
    if isinstance(e, dict):
        return (_norm(e.get('event_type')), _norm(e.get('trigger')), _norm(e.get('arguments','')))
    return (None, None, "")

def trigger_overlap(pred_trig: str, gold_trig: str) -> float:
    if not pred_trig or not gold_trig:
        return 0.0
    if _norm(pred_trig) == _norm(gold_trig):
        return 1.0
    ps, gs = token_set(pred_trig), token_set(gold_trig)
    return 0.5 if (ps and gs and (ps & gs)) else 0.0

def match_events_partial(preds, golds, partial_weight=0.5):
    """Type must match exactly (after norm). Trigger exact=1.0, overlap=0.5."""
    preds3 = [ensure_triple(p) for p in preds]
    golds3 = [ensure_triple(g) for g in golds]

    used = set()
    exact = partial = 0

    for (pt, ptrig, _ ) in preds3:
        if not pt:
            continue
        best = (0.0, -1)  # (score, gold_idx)
        for j, (gt, gtrig, _) in enumerate(golds3):
            if j in used:
                continue
            if not gt or pt != gt:
                continue
            ov = trigger_overlap(ptrig, gtrig)  # 1.0 / 0.5 / 0.0
            if ov > best[0]:
                best = (ov, j)
        if best[1] != -1:
            used.add(best[1])
            if math.isclose(best[0], 1.0):
                exact += 1
            elif best[0] >= 0.5:
                partial += 1

    return exact, partial, len(preds3), len(golds3)

def prf(correct_exact, correct_partial, pred_total, gold_total, partial_weight=0.5):
    weighted = correct_exact + partial_weight * correct_partial
    precision = weighted / pred_total if pred_total else 0.0
    recall    = weighted / gold_total if gold_total else 0.0
    f1 = (2*precision*recall)/(precision+recall) if (precision+recall) else 0.0
    return precision, recall, f1

In [None]:
# ---------- Evaluation (safe) ----------
def evaluate_dataset_and_save(dataset, output_csv="evaluation_results_cleaned.csv", partial_weight=0.5, show_samples=3):
    strict_tp = strict_pred = strict_gold = 0
    part_exact = part_partial = part_pred = part_gold = 0

    examples = []

    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Gold Events", "Predicted Events", "Strict_TP", "Exact_Partial", "Partial_Partial"])

        for row in dataset:
            gold_raw = parse_event_structure(row["output"])
            pred_raw = parse_event_structure(row["pred"])

            gold = [ensure_triple(e) for e in gold_raw]
            pred = [ensure_triple(e) for e in pred_raw]

            # strict (type+trigger exact)
            gold_pairs = {(et, tr) for et, tr, _ in gold if et and tr}
            pred_pairs = {(et, tr) for et, tr, _ in pred if et and tr}
            tp = len(gold_pairs & pred_pairs)
            strict_tp   += tp
            strict_pred += len(pred_pairs)
            strict_gold += len(gold_pairs)

            # partial-aware
            ce, cp, pt, gt = match_events_partial(pred, gold, partial_weight=partial_weight)
            part_exact   += ce
            part_partial += cp
            part_pred    += pt
            part_gold    += gt

            writer.writerow([gold, pred, tp, ce, cp])

            if len(examples) < show_samples:
                examples.append({"gold": gold, "pred": pred, "strict_tp": tp, "partial_match": (ce, cp)})

    # strict metrics
    strict_precision = strict_tp / strict_pred if strict_pred else 0.0
    strict_recall    = strict_tp / strict_gold if strict_gold else 0.0
    strict_f1        = (2*strict_precision*strict_recall)/(strict_precision+strict_recall) if (strict_precision+strict_recall) else 0.0

    # partial metrics
    p, r, f1 = prf(part_exact, part_partial, part_pred, part_gold, partial_weight=partial_weight)

    print("===== STRICT (type+trigger exact) =====")
    print(f"Precision: {strict_precision:.4f}")
    print(f"Recall   : {strict_recall:.4f}")
    print(f"F1       : {strict_f1:.4f}")

    print("\n===== PARTIAL-AWARE (type exact + trigger overlap) =====")
    print(f"Exact matches         : {part_exact}")
    print(f"Partial matches(0.5)  : {part_partial}")
    print(f"Pred / Gold           : {part_pred} / {part_gold}")
    print(f"Precision (weighted)  : {p:.4f}")
    print(f"Recall (weighted)     : {r:.4f}")
    print(f"F1 (weighted)         : {f1:.4f}")

    print(f"\n📄 Results saved to: {output_csv}")

    print("\n--- Debug samples ---")
    for i, ex in enumerate(examples, 1):
        print(f"\nSample {i}:")
        print("GOLD:", ex["gold"])
        print("PRED:", ex["pred"])
        print("strict_tp:", ex["strict_tp"], " | partial:", ex["partial_match"])

In [None]:
# --------------------
# Load model & tokenizer
# --------------------
model_dir = "/content/t5_mavenere_final"
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
# --------------------
# Load test dataset
# --------------------
test_path = "/content/output/llm_test_gen.jsonl"  # update path
dataset = load_dataset("json", data_files={"test": test_path})["test"]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# --------------------
# Generate outputs
# --------------------
def generate_output(text, max_length=256):
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length, num_beams=5)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# --------------------
# Run generation + cleaning + evaluation
# --------------------
dataset = dataset.map(lambda x: {"pred": generate_output(x["input"])})

Map:   0%|          | 0/9400 [00:00<?, ? examples/s]

In [None]:
evaluate_dataset_and_save(dataset, partial_weight=0.5, show_samples=3)

===== STRICT (type+trigger exact) =====
Precision: 0.5802
Recall   : 0.1243
F1       : 0.2047

===== PARTIAL-AWARE (type exact + trigger overlap) =====
Exact matches         : 5102
Partial matches(0.5)  : 21
Pred / Gold           : 8794 / 41486
Precision (weighted)  : 0.5814
Recall (weighted)     : 0.1232
F1 (weighted)         : 0.2034

📄 Results saved to: evaluation_results_cleaned.csv

--- Debug samples ---

Sample 1:
GOLD: [('terrorism', 'terrorism', ''), ('rescuing', 'lufthansa', ''), ('change_sentiment', 'flight', ''), ('change_of_leadership', 'liberation', ''), ('giving', 'perpetrators', ''), ('aiming', 'aimed', ''), ('having_or_lacking_access', 'act', ''), ('kidnapping', 'hijacking', ''), ('testing', 'surviving', ''), ('coming_to_be', 'occurred', '')]
PRED: [('coming_to_be', 'occurred', '')]
strict_tp: 1  | partial: (1, 0)

Sample 2:
GOLD: [('containing', 'black', ''), ('commerce_pay', 'german', ''), ('request', 'demand', ''), ('agree_or_refuse_to_act', 'complied', ''), ('cause_

In [4]:
# ===============================================
# DEGREE2-style Event Extraction: T5 / FLAN-T5
# ===============================================
import os, re, json, math, random, difflib
from collections import Counter, defaultdict
from typing import List, Tuple

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)

In [9]:
# -----------------------------
# 0) Config
# -----------------------------
SEED = 42
random.seed(SEED); torch.manual_seed(SEED)

DATA_DIR   = "/output"
TRAIN_FILE = f"{DATA_DIR}/llm_train_gen.jsonl"
VALID_FILE = f"{DATA_DIR}/llm_valid_gen.jsonl"
TEST_FILE  = f"{DATA_DIR}/llm_test_gen.jsonl"

# Pick base model
BASE_MODEL = "t5-base"          # or: "google/flan-t5-base" | "google/flan-t5-large"
OUT_DIR    = "/content/t5_degree2_ckpt"
FINAL_DIR  = "/content/t5_degree2_final"

CHUNK_N_SENT = None

MAX_IN_LEN  = 512
MAX_OUT_LEN = 256
EVENT_TOKEN = "<EVENTSEP>"

In [6]:
# -----------------------------
# 1) Regex & utils
# -----------------------------
EVENT_SEP = re.compile(r"\s*<EVENTSEP>\s*", re.IGNORECASE)
TYPE_RE   = re.compile(r'event\s*type\s*:\s*(.+?)\.', re.IGNORECASE)
TRIG_RE   = re.compile(r'trigger\s*:\s*(.+?)\.', re.IGNORECASE)
TOK_RE    = re.compile(r"\w+", re.UNICODE)

def norm(s: str) -> str:
    return re.sub(r"\s+"," ", s.strip().lower()) if s else s

def parse_pairs(text: str) -> List[Tuple[str,str]]:
    if not text or "no events" in text.lower(): return []
    out=[]
    for ch in [c for c in EVENT_SEP.split(text) if c.strip()]:
        t = TYPE_RE.search(ch); g = TRIG_RE.search(ch)
        et = norm(t.group(1)) if t else None
        tr = norm(g.group(1)) if g else None
        if et and tr: out.append((et,tr))
    return out

def dedup_events_str(text: str) -> str:
    if not text: return ""
    seen=set(); kept=[]
    for p in [p.strip() for p in EVENT_SEP.split(text) if p.strip()]:
        key = norm(p)
        if key and key not in seen:
            seen.add(key); kept.append(p)
    return f" {EVENT_TOKEN} ".join(kept)

def token_set(s): return set(TOK_RE.findall(s.lower())) if s else set()

def trigger_overlap(p,g):
    if not p or not g: return 0.0
    if norm(p)==norm(g): return 1.0
    ps,gs = token_set(p), token_set(g)
    return 0.5 if (ps and gs and ps&gs) else 0.0

In [8]:
# -----------------------------
# 2) Load plain sets (to build few-shot & ontology)
# -----------------------------
def load_plain(path): return load_dataset("json", data_files={"data": path})["data"]

train_plain = load_plain(TRAIN_FILE)
valid_plain = load_plain(VALID_FILE)
test_plain  = load_plain(TEST_FILE)

TYPE_FREQ = Counter()
for ex in train_plain:
    for et,tr in parse_pairs(ex["output"]):
        TYPE_FREQ[et]+=1
ONTOLOGY = sorted(TYPE_FREQ.keys())

def nearest_type(t: str) -> str:
    if not t or not ONTOLOGY: return t
    cand = difflib.get_close_matches(t, ONTOLOGY, n=1, cutoff=0.8)
    return cand[0] if cand else t

# Few-shot 
def build_fewshot_bank(ds, k=60):
    bank=[]
    for ex in ds:
        out = ex["output"]
        if (EVENT_TOKEN in out) and (len(out)<600):
            src = ex["input"]
            m = re.search(r'\"(.+?)\"\s*\nUse <EVENTSEP>', src, re.DOTALL)
            sent = m.group(1) if m else src
            bank.append((sent.strip(), out.strip()))
    random.shuffle(bank)
    return bank[:k]

FEWSHOT = build_fewshot_bank(train_plain, k=60)

def sample_k_shots(k=3):
    k = min(k, len(FEWSHOT))
    return random.sample(FEWSHOT, k) if k>0 else []

FileNotFoundError: Unable to find '/content/output/llm_train_gen.jsonl'

In [7]:
# -----------------------------
# 3) Prompt builder (DEGREE2-style)
# -----------------------------
PROMPT_HEADER = (
    "Extract ALL events from the sentence below.\n"
    f"Output only lines like: {EVENT_TOKEN} Event type: <TYPE>. Trigger: <TRIGGER>.\n"
    "If no events, output exactly: No events.\n\n"
)

def build_prompt(sentence: str, k=3) -> str:
    head = PROMPT_HEADER
    shots = sample_k_shots(k)
    if shots:
        head += "### Examples\n"
        for s,o in shots:
            head += f'Sentence: "{s}"\n{o}\n\n'
    head += "### Now extract\n"
    head += f'Sentence: "{sentence}"\nOutput:\n'
    return head

In [8]:
# -----------------------------
# 4) Chunk text to sentences
# -----------------------------
def chunk_text_to_sentences(text: str) -> list:

    return re.split(r'(?<=[\.\!\?])\s+', text.strip())

def chunk_doc(sentence_or_doc: str, n=3):
    sents = chunk_text_to_sentences(sentence_or_doc)
    if len(sents)<=n: return [sentence_or_doc]
    chunks=[]
    for i in range(0,len(sents),n):
        chunks.append(" ".join(sents[i:i+n]))
    return chunks

In [9]:
# -----------------------------
# 5) Supervised datasets => prompted IO
# -----------------------------
def to_prompted(ds, fewshot_k=3):
    def _map(ex):
        txt = ex["input"]
        if CHUNK_N_SENT:
            pieces = chunk_doc(txt, n=CHUNK_N_SENT)
            txt = pieces[0]
        return {"input": build_prompt(txt, k=fewshot_k), "output": ex["output"]}
    return ds.map(_map, remove_columns=[c for c in ds.column_names if c not in ("input","output")])

train_ds = to_prompted(train_plain, fewshot_k=3)
valid_ds = to_prompted(valid_plain, fewshot_k=3)
test_ds  = to_prompted(test_plain,  fewshot_k=3)

Map:   0%|          | 0/32431 [00:00<?, ? examples/s]

Map:   0%|          | 0/8042 [00:00<?, ? examples/s]

Map:   0%|          | 0/9400 [00:00<?, ? examples/s]

In [10]:
# -----------------------------
# 6) Tokenizer & model
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
added = tokenizer.add_special_tokens({"additional_special_tokens":[EVENT_TOKEN]})

def preprocess(batch):
    enc = tokenizer(batch["input"], max_length=MAX_IN_LEN, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        lab = tokenizer(batch["output"], max_length=MAX_OUT_LEN, truncation=True, padding="max_length")
    enc["labels"] = lab["input_ids"]
    return enc

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
valid_tok = valid_ds.map(preprocess, batched=True, remove_columns=valid_ds.column_names)

model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
if added>0: model.resize_token_embeddings(len(tokenizer))
model.gradient_checkpointing_enable()

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/32431 [00:00<?, ? examples/s]



Map:   0%|          | 0/8042 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# -----------------------------
# 7) Training (Adafactor, 10 epochs per paper)
# -----------------------------
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=800,
    save_steps=800,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=10,
    learning_rate=1e-4,
    weight_decay=1e-5,
    lr_scheduler_type="constant",
    optim="adafactor",

    label_smoothing_factor=0.1,
    max_grad_norm=1.0,
    predict_with_generate=False,

    fp16=False,
    bf16=use_bf16,
    report_to="none",
    logging_steps=100,
    save_safetensors=True,
    seed=SEED, data_seed=SEED,
    remove_unused_columns=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()
trainer.save_model(FINAL_DIR); tokenizer.save_pretrained(FINAL_DIR)
print("Saved best model to:", FINAL_DIR)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
800,1.5001,1.46787
1600,1.4686,1.438593
2400,1.4557,1.426441
3200,1.4475,1.411881
4000,1.4401,1.412003
4800,1.4308,1.404725
5600,1.4307,1.40936
6400,1.4281,1.403879
7200,1.4252,1.405518
8000,1.4245,1.404045


In [None]:
# --- Zip & Download a model folder in Colab ---

import os, shutil, time
from google.colab import files

FINAL_DIR = "/content/t5_mavenere_final"

stamp = time.strftime("%Y%m%d-%H%M%S")
zip_base = f"/content/{os.path.basename(FINAL_DIR)}_{stamp}"

for ext in (".zip",):
    if os.path.exists(zip_base + ext):
        os.remove(zip_base + ext)

print("Zipping ... این کار ممکنه چند دقیقه طول بکشه.")
archive_path = shutil.make_archive(zip_base, 'zip', FINAL_DIR)
size_gb = os.path.getsize(archive_path) / (1024**3)
print(f"Done: {archive_path}  (~{size_gb:.2f} GB)")

files.download(archive_path)

In [None]:
# -----------------------------
# 8) Inference helpers (recall-friendly decoding)
# -----------------------------
def clean_and_canon(text: str) -> str:
    text = dedup_events_str(text)
    parts=[]
    for ch in [c for c in EVENT_SEP.split(text) if c.strip()]:
        t = TYPE_RE.search(ch); g = TRIG_RE.search(ch)
        et = norm(t.group(1)) if t else None
        tr = norm(g.group(1)) if g else None
        if et:
            et = nearest_type(et)
        if et and tr:
            parts.append(f"Event type: {et}. Trigger: {tr}.")
    return f" {EVENT_TOKEN} ".join(parts) if parts else "No events."

def generate_batch(prompts: List[str], mdl, tok, bs=8, device=None):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    mdl.eval().to(device)
    gen_kwargs = dict(
        max_new_tokens=160, min_new_tokens=16,
        num_beams=6, num_beam_groups=3, diversity_penalty=0.2,
        no_repeat_ngram_size=3, length_penalty=0.9, repetition_penalty=1.05,
        early_stopping=False,
    )
    outs=[]
    with torch.no_grad():
        for i in range(0,len(prompts),bs):
            enc = tok(prompts[i:i+bs], return_tensors="pt", padding=True, truncation=True, max_length=MAX_IN_LEN).to(device)
            gen = mdl.generate(**enc, **gen_kwargs)
            dec = tok.batch_decode(gen, skip_special_tokens=True)
            outs += [clean_and_canon(t) for t in dec]
    return outs

In [None]:
# -----------------------------
# 9) Evaluation (Strict / Partial / Relaxed)
# -----------------------------
def match_partial(preds, golds):
    used=set(); exact=partial=0
    for pt,ptr in preds:
        best=(0.0,-1)
        for j,(gt,gtr) in enumerate(golds):
            if j in used: continue
            if pt!=gt:   continue
            ov = trigger_overlap(ptr,gtr)
            if ov>best[0]: best=(ov,j)
        if best[1]!=-1:
            used.add(best[1])
            if math.isclose(best[0],1.0): exact+=1
            elif best[0]>=0.5: partial+=1
    return exact, partial, len(preds), len(golds)

def prf(e,p,pt,gt, w=0.5):
    wtp = e + w*p
    P = wtp/pt if pt else 0.0
    R = wtp/gt if gt else 0.0
    F = (2*P*R)/(P+R) if (P+R) else 0.0
    return P,R,F

def relaxed_recall_by_chunks(pairs_pred, pairs_gold, chunk_size=1):
    correct=partial=extra=possible=impossible=0
    for pp,gg in zip(pairs_pred, pairs_gold):
        # per-chunk
        ce,cp,pt,gt = match_partial(pp,gg)
        matched = ce + 0.5*cp
        possible += gt
        impossible += max(0, gt-1)
        extra += max(0.0, matched-1.0)
        correct += ce; partial += cp
    denom = max(1, possible - impossible)
    num   = max(0.0, (correct + 0.5*partial) - extra)
    return num/denom

def evaluate(ds, mdl, tok):
    prompts = [ex["input"] for ex in ds]
    gtexts  = [ex["output"] for ex in ds]
    preds   = generate_batch(prompts, mdl, tok, bs=8)

    strict_tp=strict_pred=strict_gold=0
    part_e=part_p=part_pt=part_gt=0
    chunks_pred=[]; chunks_gold=[]

    for ptxt,gtxt in zip(preds, gtexts):
        pp = parse_pairs(ptxt)
        gg = parse_pairs(gtxt)
        # strict
        sp,sg = set(pp), set(gg)
        tp = len(sp & sg)
        strict_tp += tp; strict_pred += len(sp); strict_gold += len(sg)
        # partial
        ce,cp,pt,gt = match_partial(pp,gg)
        part_e += ce; part_p += cp; part_pt += pt; part_gt += gt
        chunks_pred.append(pp); chunks_gold.append(gg)

    sP = strict_tp/strict_pred if strict_pred else 0.0
    sR = strict_tp/strict_gold if strict_gold else 0.0
    sF = (2*sP*sR)/(sP+sR) if (sP+sR) else 0.0

    pP,pR,pF = prf(part_e,part_p,part_pt,part_gt, w=0.5)
    r_rel = relaxed_recall_by_chunks(chunks_pred, chunks_gold)

    print("\n===== STRICT =====")
    print(f"P={sP:.4f} R={sR:.4f} F1={sF:.4f}")
    print("===== PARTIAL (MUC 0.5) =====")
    print(f"P={pP:.4f} R={pR:.4f} F1={pF:.4f}")
    print("===== RELAXED (DEGREE2) =====")
    rF = (2*pP*r_rel)/(pP+r_rel) if (pP+r_rel)>0 else 0.0
    print(f"Relaxed-Recall={r_rel:.4f} | Relaxed-F1≈{rF:.4f}")
    return dict(strict_f1=sF, partial_f1=pF, relaxed_recall=r_rel, relaxed_f1=rF)

In [None]:
# -----------------------------
# 10) Quick sanity + Eval
# -----------------------------
best_tok   = AutoTokenizer.from_pretrained(FINAL_DIR)
best_model = AutoModelForSeq2SeqLM.from_pretrained(FINAL_DIR)

print("\n--- VALID ---")
evaluate(valid_ds, best_model, best_tok)
print("\n--- TEST ---")
evaluate(test_ds, best_model, best_tok)

# نمونه‌ی sanity
demo_sent = 'A corps of volunteers ... Two Sicilies.'
demo_prompt = build_prompt(demo_sent, k=3)
print("\nDEMO:\n", generate_batch([demo_prompt], best_model, best_tok)[0])