In [None]:
!pip install transformers datasets stanza

In [None]:
!unzip lessons_final.zip -d lessons_final

# Pretraining

In [None]:
import os
import pickle
import random
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5Config, T5ForConditionalGeneration,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)

# ──────── Device ────────
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ──────── Simple span-masking ────────
def simple_t5_mask(text):
    """Replace one random run of 1-3 words in *text* with ``<extra_id_0>``.

    Returns a ``(masked_input, target)`` pair, where the target is the
    sentinel followed by the words that were removed. Texts with fewer than
    four whitespace-separated words are returned unchanged as both elements.
    """
    tokens = text.strip().split()
    n_tokens = len(tokens)
    if n_tokens < 4:
        return text, text

    # Draw the span width first and its start second; the order of the two
    # draws matters for reproducibility under a fixed random seed.
    width = random.randint(1, min(3, n_tokens - 1))
    begin = random.randint(0, n_tokens - width)
    end = begin + width

    removed = " ".join(tokens[begin:end])
    source = " ".join([*tokens[:begin], "<extra_id_0>", *tokens[end:]])
    return source, "<extra_id_0> " + removed

# ──────── Preprocess ────────
def preprocess(example, tokenizer):
    """Turn one ``{"input", "output"}`` example into fixed-length features.

    The input text is padded/truncated to 256 tokens and the output text to
    64 tokens. The returned mapping is the tokenizer's encoding of the input
    with an extra ``"labels"`` entry: the output token ids, where every
    padding id has been replaced by -100.
    """
    pad_id = tokenizer.pad_token_id

    features = tokenizer(
        example["input"],
        padding="max_length", truncation=True, max_length=256
    )
    target_ids = tokenizer(
        example["output"],
        padding="max_length", truncation=True, max_length=64
    )["input_ids"]

    labels = []
    for token_id in target_ids:
        # -100 marks positions that should not contribute to the loss.
        labels.append(-100 if token_id == pad_id else token_id)

    features["labels"] = labels
    return features

# ──────── Prediction Cleaning ────────
def clean_prediction(raw, tokenizer):
    """Remove the tokenizer's pad and EOS marker strings from decoded text.

    Args:
        raw: Text produced by decoding with special tokens kept.
        tokenizer: Object exposing ``pad_token`` and ``eos_token`` strings.

    Returns:
        ``raw`` with every occurrence of both markers deleted and the outer
        whitespace stripped.

    A marker that is unset (``None`` or empty) is skipped; passing ``None``
    to ``str.replace`` would otherwise raise ``TypeError``.
    """
    cleaned = raw
    for marker in (tokenizer.pad_token, tokenizer.eos_token):
        if marker:
            cleaned = cleaned.replace(marker, "")
    return cleaned.strip()

# ──────── Evaluation ────────
def evaluate(model, tokenizer, dataset):
    """Greedy-decode every example and report exact-match accuracy.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used both to encode inputs and decode outputs.
        dataset: Sized iterable of ``{"input": str, "output": str}`` rows.

    Returns:
        Fraction of rows whose cleaned prediction equals the stripped
        reference, as a float in ``[0, 1]``. An empty dataset yields ``0.0``
        instead of raising ``ZeroDivisionError``.

    The model is switched to eval mode and left there.
    """
    model.eval()
    correct = 0
    for ex in dataset:
        enc = tokenizer(
            ex["input"], return_tensors="pt",
            padding=True, truncation=True, max_length=256
        ).to(device)
        with torch.no_grad():
            ids = model.generate(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                max_new_tokens=20, do_sample=False
            )
        # Special tokens are kept while decoding because the references
        # start with the <extra_id_0> sentinel; only pad/EOS are removed.
        pred = clean_prediction(
            tokenizer.decode(ids[0], skip_special_tokens=False), tokenizer
        )
        if pred == ex["output"].strip():
            correct += 1

    total = len(dataset)
    acc = correct / total if total else 0.0
    print(f"✅ Eval Accuracy: {correct}/{len(dataset)} = {acc:.2f}")
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return acc

# ──────── Training per‐level ────────
def train_level(dataset, tokenizer, model, args, level):
    """Train *model* in place on a single curriculum lesson.

    The lesson is tokenized with ``preprocess``, split 95/5 into train and
    validation parts (fixed seed 42), and handed to a ``Trainer`` that stops
    early after three evaluations without improvement. ``level`` is used only
    in the progress message. Nothing is returned.
    """
    print(f"\n🔁 Training curriculum level {level}")

    def _encode(ex):
        return preprocess(ex, tokenizer)

    encoded = dataset.map(_encode, remove_columns=["input", "output"])
    parts = encoded.train_test_split(test_size=0.05, seed=42)
    print(f"{len(parts['train'])} train | {len(parts['test'])} val")

    stop_early = EarlyStoppingCallback(early_stopping_patience=3)
    collator = DataCollatorWithPadding(tokenizer)
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=parts['train'],
        eval_dataset=parts['test'],
        tokenizer=tokenizer,
        callbacks=[stop_early],
        data_collator=collator
    )
    trainer.train()

    # Drop the trainer before clearing the CUDA cache so the memory it holds
    # can be released between lessons.
    del trainer
    torch.cuda.empty_cache()

# ──────── Setup tokenizer + randomly initialised model ────────
# The vocabulary is reused from the published t5-base tokenizer.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Only the t5-base *configuration* is fetched here; building the model from a
# config (rather than via from_pretrained) does not load pretrained weights.
config = T5Config.from_pretrained("t5-base")
model  = T5ForConditionalGeneration(config).to(device)

# One TrainingArguments object is shared by every curriculum lesson, so all
# lessons write their checkpoints under the same output_dir.
args = TrainingArguments(
    output_dir="./t5_scratch_ablation",
    per_device_train_batch_size=16,
    num_train_epochs=10,
    logging_strategy="epoch",
    report_to="none",

    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Reload the checkpoint with the lowest validation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    remove_unused_columns=False
)


# ──────── Build curriculum ────────
# Directory that holds the per-lesson pickle files.
# NOTE(review): presumably nested twice because the archive was unzipped with
# `-d lessons_final` and itself contains a `lessons_final/` folder — confirm.
pkl_folder = "./lessons_final/lessons_final"
# Filled below with (filename, Dataset) pairs in training order.
curriculum = []
import re

def extract_vol_lesson(fn):
    """Return ``(volume, lesson)`` parsed from a ``v<vol>_l<lesson>.pkl`` name.

    Names that do not start with that pattern get ``(999, 999)`` so they sort
    after every well-formed lesson file.
    """
    found = re.match(r'v(\d+)_l(\d+)\.pkl', fn)
    if not found:
        return (999, 999)
    volume, lesson = found.groups()
    return (int(volume), int(lesson))

# Collect the lesson pickles and order them numerically by (volume, lesson).
files = [name for name in os.listdir(pkl_folder) if name.endswith(".pkl")]
files.sort(key=extract_vol_lesson)

for fname in files:
    lesson_path = os.path.join(pkl_folder, fname)
    # NOTE: unpickling can execute arbitrary code — only load lesson files
    # that come from a trusted source.
    with open(lesson_path, "rb") as f:
        docs = pickle.load(f)

    examples = []
    for d in docs:
        # Parsed documents expose their raw string as `.text`; anything else
        # is converted with str().
        if hasattr(d, "text"):
            text = d.text
        else:
            text = str(d)
        masked, target = simple_t5_mask(text)
        examples.append({"input": masked, "output": target})

    lesson_ds = Dataset.from_list(examples)
    curriculum.append((fname, lesson_ds))
    print(f"✅ Loaded {fname} ({len(examples)} examples)")

# ──────── Run curriculum ────────
# Train on the lessons in order. The same model object is passed every time,
# so each lesson continues from the weights left by the previous one.
for i, (lesson_name, ds) in enumerate(curriculum, start=1):
    print(f"▶︎ Lesson {lesson_name}")
    train_level(ds, tokenizer, model, args, i)

print("\n🧪 Final Test Generation:")
test_text = "They are <extra_id_0> the car at the <extra_id_1>."
print("INPUT :", test_text)

# Run the trained model on the prompt; previously only the prompt itself was
# printed. Decoding mirrors evaluate(): greedy, special tokens kept so the
# sentinels stay visible, pad/EOS markers stripped afterwards.
model.eval()
test_enc = tokenizer(test_text, return_tensors="pt").to(device)
with torch.no_grad():
    test_ids = model.generate(
        input_ids=test_enc["input_ids"],
        attention_mask=test_enc["attention_mask"],
        max_new_tokens=20, do_sample=False
    )
test_out = tokenizer.decode(test_ids[0], skip_special_tokens=False)
print("OUTPUT:", clean_prediction(test_out, tokenizer))


In [None]:
# Directory that receives the curriculum-pretrained checkpoint.
save_dir = "my_syntax_t5_model"

# Store the model and the tokenizer side by side so both can later be
# restored from this one directory with from_pretrained(save_dir).
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('my_syntax_t5_model/tokenizer_config.json',
 'my_syntax_t5_model/special_tokens_map.json',
 'my_syntax_t5_model/spiece.model',
 'my_syntax_t5_model/added_tokens.json')

In [None]:
save_dir = "my_syntax_t5_model"

# Restore the curriculum-pretrained model and its tokenizer from the same
# directory, so the path is defined in exactly one place.
model = T5ForConditionalGeneration.from_pretrained(save_dir).to(device)
tokenizer = T5Tokenizer.from_pretrained(save_dir)


In [None]:
import os
import pickle
import torch
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, recall_score, f1_score

# ─── Device ───────────────────────────────────────────────────────────────────
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ─── Tokenizer & Model ─────────────────────────────────────────────────────────
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# One dedicated vocabulary entry per class, registered as regular added tokens.
# NOTE(review): the name order is assumed to match the dataset's integer
# `coarse_label` ids — verify against the dataset card.
label_names  = ["ABBR", "ENTY", "DESC", "HUM", "LOC", "NUM"]
label_tokens = [f"<LABEL_{i}>" for i, _ in enumerate(label_names)]
tokenizer.add_tokens(label_tokens)

# Start from the curriculum-pretrained checkpoint saved earlier (not the
# published t5-base weights), then grow the embedding matrix to cover the
# newly added label tokens.
model = T5ForConditionalGeneration.from_pretrained("my_syntax_t5_model")
model.resize_token_embeddings(len(tokenizer))
model.to(device)


# Lookup tables: class index -> label token, and label token -> class name.
label_map     = dict(enumerate(label_tokens))
inv_label_map = dict(zip(label_tokens, label_names))

# ─── Load Dataset & Preparsed .pkl ─────────────────────────────────────────────
trec = load_dataset("CogComp/trec")


def _load_docs(path):
    """Unpickle and return the pre-parsed documents stored at *path*."""
    # NOTE: unpickling can execute arbitrary code — trusted files only.
    with open(path, "rb") as f:
        return pickle.load(f)


def _label_docs(docs, split):
    """Pair each document's text with the label of the same-index split row."""
    rows = []
    for idx, doc in enumerate(docs):
        rows.append({"text": doc.text, "coarse_label": split[idx]["coarse_label"]})
    return rows


train_docs = _load_docs("trec_train_docs.pkl")
test_docs  = _load_docs("trec_test_docs.pkl")

# Documents are matched to labels purely by position, so each pickle is
# expected to follow the row order of the corresponding dataset split.
train_exs = _label_docs(train_docs, trec["train"])
test_exs = _label_docs(test_docs, trec["test"])

train_ds = Dataset.from_list(train_exs)
test_ds  = Dataset.from_list(test_exs)

# ─── Preprocessing ─────────────────────────────────────────────────────────────
def preprocess_clf(examples):
    """Tokenize a batch of questions and attach one label-token id per row.

    Args:
        examples: Batch mapping with ``"text"`` (list of strings) and
            ``"coarse_label"`` (list of integer keys of ``label_map``).

    Returns:
        The tokenizer's encoding of the texts (padded/truncated to 64
        tokens) with an added ``"labels"`` entry: for every row a
        one-element list holding the id of its label token.

    Raises:
        KeyError: If a label index is not a key of ``label_map``.
        ValueError: If a label token is missing from the tokenizer vocabulary.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=64,
        padding="max_length",
        truncation=True
    )

    # Look the ids up directly instead of tokenizing the label string and
    # taking its first piece: depending on tokenizer settings a prefix-space
    # piece can come first, which would make every class share one label id.
    label_ids = {}
    for idx, tok in label_map.items():
        tok_id = tokenizer.convert_tokens_to_ids(tok)
        if tok_id is None or tok_id == tokenizer.unk_token_id:
            raise ValueError(f"Label token {tok!r} is not in the tokenizer vocabulary")
        label_ids[idx] = tok_id

    # The target sequence is exactly one position long: the label token.
    model_inputs["labels"] = [[label_ids[l]] for l in examples["coarse_label"]]
    return model_inputs

def _tokenize_split(split):
    """Apply preprocess_clf in batches and drop the raw text/label columns."""
    return split.map(
        preprocess_clf,
        batched=True,
        remove_columns=["text", "coarse_label"]
    )


train_tok = _tokenize_split(train_ds)
test_tok = _tokenize_split(test_ds)

# Have both splits return torch tensors when indexed.
for _tok_split in (train_tok, test_tok):
    _tok_split.set_format("torch")

# ─── Torch Dataset Wrapper ────────────────────────────────────────────────────
class ClfDataset(TorchDataset):
    """Map-style dataset that exposes only the model input fields of a row."""

    # Keys copied from each underlying row, in this order.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, hf_ds):
        # Any sized, indexable collection of dict-like rows is accepted.
        self.ds = hf_ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, i):
        row = self.ds[i]
        return {field: row[field] for field in self._FIELDS}

# Wrap the tokenized splits so each item carries only input_ids,
# attention_mask and labels.
train_torch = ClfDataset(train_tok)
test_torch  = ClfDataset(test_tok)

# ─── Metrics ───────────────────────────────────────────────────────────────────
def compute_metrics(eval_pred):
    """Score the first decoder position against the gold label token.

    Args:
        eval_pred: ``(logits, labels)`` pair. ``logits`` may be a tuple whose
            first element holds the token logits, indexed as
            ``[example, position, vocab]``; ``labels`` is indexed as
            ``[example, position]`` with the label token id at position 0.

    Returns:
        Dict with ``"accuracy"``, macro-averaged ``"recall"`` and
        macro-averaged ``"f1"``. Any token id that is not one of the label
        tokens is counted as the placeholder class ``"???"``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # Only position 0 carries the class token, so take the argmax there and
    # compare token ids directly. Decoding to text and splitting on
    # whitespace depends on how the tokenizer spaces special tokens and can
    # pick up a token from a later position.
    pred_ids = np.asarray(logits)[:, 0, :].argmax(-1)
    gold_ids = np.asarray(labels)[:, 0]

    id_to_name = {
        tokenizer.convert_tokens_to_ids(tok): name
        for tok, name in inv_label_map.items()
    }
    y_pred = [id_to_name.get(int(tok_id), "???") for tok_id in pred_ids]
    y_true = [id_to_name.get(int(tok_id), "???") for tok_id in gold_ids]

    # zero_division=0 keeps the previous numeric result (0 for a class with
    # no support) without emitting a warning on every evaluation.
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "recall":   recall_score(y_true, y_pred, average="macro", zero_division=0),
        "f1":       f1_score(y_true, y_pred, average="macro", zero_division=0),
    }

# ─── TrainingArguments & Trainer ──────────────────────────────────────────────
# Training configuration for the classification stage. This rebinds `args`,
# replacing the pretraining configuration of the same name.
args = TrainingArguments(
    output_dir="./t5_trec_labeltok",
    # Evaluate and checkpoint once per epoch, keeping one checkpoint on disk.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Reload the checkpoint with the lowest evaluation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=1e-5,
    report_to="none",
    remove_unused_columns=False
)

# NOTE(review): the evaluation set is the test split, and
# load_best_model_at_end selects the checkpoint on it, so the reported test
# metrics are not from held-out data — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_torch,
    eval_dataset=test_torch,
    compute_metrics=compute_metrics
)

# ─── Run ───────────────────────────────────────────────────────────────────────
trainer.train()
# As the last expression of the cell, the metrics dict is shown as cell output.
trainer.evaluate()
