In [1]:
import pandas as pd, numpy as np, torch, torch.nn as nn
from datasets import Dataset, DatasetDict
from collections import defaultdict
from transformers import (
    AutoTokenizer, AutoConfig,
    Trainer, TrainingArguments,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)

In [2]:
!nvidia-smi

Mon Nov  3 21:49:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:8B:00.0 Off |                    0 |
| N/A   42C    P0             82W /  400W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
### 1-a  Read tagged train ###
tagged = pd.read_csv(
    "../data/Tagged_Titles_Train.tsv",
    sep="\t", keep_default_na=False, na_values=None
)

### 1-b  Per-category allow-set ###
allow = (
    tagged.groupby("Category")["Tag"]
          .apply(lambda s: set(s.unique()) - {""})
          .to_dict()
)

### 1-c  Union BIO label list ###
BASE = set(t for t in tagged["Tag"].unique() if t and t != "O")
label_list = ["O"] + sorted(
    f"{p}-{t}" for t in BASE for p in ("B","I")
)
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for l,i in label2id.items()}

### 1-d  Category→mask tensor (bool[num_labels]) ###
allow_mask = {}
for cat in [1,2]:
    ok = {"O"}
    for t in allow[cat]:
        ok.add(f"B-{t}"); ok.add(f"I-{t}")
    allow_mask[cat] = torch.tensor([l in ok for l in label_list])

In [4]:
def rows_to_examples(df):
    """
    Correctly handle multi-token entities and separate consecutive same-tag entities.
    """
    records = []
    
    for record_num, group in df.groupby("Record Number"):
        group = group.sort_index()
        
        tokens = []
        bio_tags = []
        
        prev_tag = None
        entity_started = False
        
        for idx, row in group.iterrows():
            token = row["Token"]
            tag = row["Tag"]
            
            # Handle empty tags (continuation of previous entity)
            if pd.isna(tag) or tag == "":
                if prev_tag and prev_tag != "O":
                    # Continue previous entity
                    tokens.append(token)
                    bio_tags.append(f"I-{prev_tag}")
                else:
                    # Empty tag but no previous entity - treat as O
                    tokens.append(token)
                    bio_tags.append("O")
                    prev_tag = "O"
                continue
            
            # Non-empty tag
            tokens.append(token)
            
            if tag == "O":
                bio_tags.append("O")
                prev_tag = "O"
                entity_started = False
            else:
                # Always use B- for new non-empty tag (each is a separate entity)
                bio_tags.append(f"B-{tag}")
                prev_tag = tag
                entity_started = True
        
        records.append({
            "tokens": tokens,
            "ner_tags": [label2id[b] for b in bio_tags],
            "Category": int(group["Category"].iloc[0])
        })
    
    return records

hf_ds = Dataset.from_list(rows_to_examples(tagged))
splits = hf_ds.train_test_split(test_size=0.15, seed=42)
splits = DatasetDict({
    "train": splits["train"],
    "validation": splits["test"]
})
splits



DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'Category'],
        num_rows: 4250
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'Category'],
        num_rows: 750
    })
})

In [None]:
base_model = "../models/deberta-improved-weak-ner-mk-2"
tok = AutoTokenizer.from_pretrained(base_model)

In [6]:
def tok_fn(batch):
    enc = tok(
        batch["tokens"],
        is_split_into_words=True,
        padding=False,     # fixes input_ids length
        # padding="longest",
        truncation=True,
        max_length=256
    )

    # ---- build label matrix *already padded* ----
    all_labels = []
    for i, word_ids in enumerate(enc.word_ids(batch_index=i)
                                 for i in range(len(enc["input_ids"]))):
        gold = batch["ner_tags"][i]
        seq  = []
        prev = None
        for wid in word_ids:
            if wid is None:               # CLS / SEP / PAD
                seq.append(-100)
            elif wid != prev:             # first sub-word
                seq.append(gold[wid])
                prev = wid
            else:                         # non-first sub-word
                seq.append(-100)
        all_labels.append(seq)            # len(seq) == len(input_ids[i])
    enc["labels"]       = all_labels
    enc["word_ids"]     = [enc.word_ids(i) for i in range(len(enc["input_ids"]))]
    enc["category_id"]  = batch["Category"]
    return enc


# tokenised = splits.map(tok_fn, batched=True, remove_columns=["tokens","ner_tags","Category"])
tokenised = hf_ds.map(tok_fn, batched=True, remove_columns=["tokens","ner_tags","Category"])


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModel, PreTrainedModel, AutoConfig
from torchcrf import CRF
import torch.nn as nn
import torch

class CatAwareCRF(PreTrainedModel):
    config_class = AutoConfig
    
    def __init__(self, config, num_labels=None, allow_mask=None, 
                 base_model_name=None, use_dapt=False, **kwargs):
        super().__init__(config)
        self.num_labels = num_labels
        self.allow_mask = {k: v.bool() for k, v in allow_mask.items()} if allow_mask else {}
        
        # IMPROVED: Load DAPT encoder if specified
        if use_dapt and base_model_name:
            print(f"Loading DAPT encoder from {base_model_name}")
            self.encoder = AutoModel.from_pretrained(
                base_model_name, 
            )
        elif base_model_name and not hasattr(config, "_name_or_path"):
            self.encoder = AutoModel.from_pretrained(
                base_model_name, 
            )
        else:
            self.encoder = AutoModel.from_config(config)
        
        # Task-specific layers
        self.cat_embed = nn.Embedding(3, 64)  # INCREASED from 32
        self.dropout = nn.Dropout(0.2)  # INCREASED from config default
        self.proj = nn.Linear(config.hidden_size + 64, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        
        # Initialize new layers only (not encoder)
        self._init_task_layers()
    
    def _init_task_layers(self):
        """Initialize only task-specific layers, not encoder."""
        nn.init.normal_(self.cat_embed.weight, std=0.02)
        nn.init.normal_(self.proj.weight, std=0.02)
        nn.init.zeros_(self.proj.bias)
    
    def forward(self, input_ids=None, attention_mask=None, 
                labels=None, category_id=None, **ignored):
        h = self.encoder(input_ids, attention_mask=attention_mask).last_hidden_state
        
        # Category embedding
        cat = self.cat_embed(category_id).unsqueeze(1).expand(-1, h.size(1), -1)
        combined = torch.cat([h, cat], dim=-1)
        
        # Projection
        logits = self.proj(self.dropout(combined))
        
        # IMPROVED: Category-aware masking with proper numerical stability
        for c in (1, 2):
            bad = ~self.allow_mask[c].to(logits.device)
            idx = (category_id == c).nonzero(as_tuple=True)[0]
            if len(idx):
                # Use -1e10 instead of -1e4 for better masking
                logits[idx][:, :, bad] = -1e10
        
        if labels is not None:
            mask = attention_mask.bool()
            safe_labels = labels.clone()
            safe_labels[labels == -100] = 0
            
            # Token-mean reduction for stability
            log_lik = self.crf(logits, safe_labels, mask=mask, reduction="token_mean")
            return {"loss": -log_lik, "logits": logits}
        else:
            paths = self.crf.decode(logits, mask=attention_mask.bool())
            max_len = logits.size(1)
            out = torch.full((len(paths), max_len), -100, 
                           dtype=torch.long, device=logits.device)
            for i, seq in enumerate(paths):
                out[i, :len(seq)] = torch.tensor(seq, device=logits.device)
            return {"logits": out}

In [7]:
from transformers import AutoModel, PreTrainedModel, AutoConfig
from torchcrf import CRF
import torch.nn as nn
import torch
import torch.nn.functional as F

class CatAwareCRF(PreTrainedModel):
    config_class = AutoConfig
    
    def __init__(self, config, num_labels=None, allow_mask=None, 
                 base_model_name=None, use_dapt=False, label_smoothing=0.02, **kwargs):
        super().__init__(config)
        self.num_labels = num_labels
        self.allow_mask = {k: v.bool() for k, v in allow_mask.items()} if allow_mask else {}
        self.label_smoothing = label_smoothing
        
        # IMPROVED: Load DAPT encoder if specified
        if use_dapt and base_model_name:
            print(f"Loading DAPT encoder from {base_model_name}")
            self.encoder = AutoModel.from_pretrained(
                base_model_name, 
            )
        elif base_model_name and not hasattr(config, "_name_or_path"):
            self.encoder = AutoModel.from_pretrained(
                base_model_name, 
            )
        else:
            self.encoder = AutoModel.from_config(config)
        
        # Task-specific layers
        self.cat_embed = nn.Embedding(3, 64)  # INCREASED from 32
        self.dropout = nn.Dropout(0.2)  # INCREASED from config default
        self.proj = nn.Linear(config.hidden_size + 64, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        
        # Initialize new layers only (not encoder)
        self._init_task_layers()
    
    def _init_task_layers(self):
        """Initialize only task-specific layers, not encoder."""
        nn.init.normal_(self.cat_embed.weight, std=0.02)
        nn.init.normal_(self.proj.weight, std=0.02)
        nn.init.zeros_(self.proj.bias)
    
    def forward(self, input_ids=None, attention_mask=None, 
                labels=None, category_id=None, **ignored):
        h = self.encoder(input_ids, attention_mask=attention_mask).last_hidden_state
        
        # Category embedding
        cat = self.cat_embed(category_id).unsqueeze(1).expand(-1, h.size(1), -1)
        combined = torch.cat([h, cat], dim=-1)
        
        # Projection
        logits = self.proj(self.dropout(combined))
        
        # IMPROVED: Category-aware masking with proper numerical stability
        for c in (1, 2):
            bad = ~self.allow_mask[c].to(logits.device)
            idx = (category_id == c).nonzero(as_tuple=True)[0]
            if len(idx):
                # Use -1e10 instead of -1e4 for better masking
                logits[idx][:, :, bad] = -1e10
        
        if labels is not None:
            mask = attention_mask.bool()
            safe_labels = labels.clone()
            safe_labels[labels == -100] = 0
            
            # Add label smoothing
            if self.label_smoothing > 0:
                # Smooth the labels before CRF
                num_labels = logits.size(-1)
                smooth_labels = safe_labels.float()
                # CRF expects hard labels, so apply smoothing to loss
                log_lik = self.crf(logits, safe_labels, mask=mask, reduction="token_mean")
                
                # Add regularization term
                uniform_dist = torch.full_like(logits, 1.0 / num_labels)
                kl_loss = F.kl_div(
                    F.log_softmax(logits, dim=-1),
                    uniform_dist,
                    reduction='batchmean'
                )
                loss = -log_lik + self.label_smoothing * kl_loss
            else:
                log_lik = self.crf(logits, safe_labels, mask=mask, reduction="token_mean")
                loss = -log_lik
            
            return {"loss": loss, "logits": logits}
        else:
            paths = self.crf.decode(logits, mask=attention_mask.bool())
            max_len = logits.size(1)
            out = torch.full((len(paths), max_len), -100, 
                           dtype=torch.long, device=logits.device)
            for i, seq in enumerate(paths):
                out[i, :len(seq)] = torch.tensor(seq, device=logits.device)
            return {"logits": out}

In [8]:
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

cuda


In [None]:
use_dapt = True

cfg = AutoConfig.from_pretrained(
    base_model,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

model = CatAwareCRF(
    cfg, 
    num_labels=len(label_list), 
    allow_mask=allow_mask, 
    base_model_name=base_model,
    use_dapt=use_dapt
).to(device)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")



In [9]:
import numpy as np

def make_weighted_fbeta(eval_ds):
    cats = np.array(eval_ds["category_id"])        # 1-D, same order as eval_ds
    wids = eval_ds["word_ids"]                     # list-of-lists, same order
    β, β2 = 0.2, 0.2 ** 2

    def metric_weighted_fbeta(p):                  # ← give this to Trainer
        # ── 1. get label-ids from predictions ──────────────────────────
        if p.predictions.ndim == 3:                # logits  [B,L,C]
            preds = p.predictions.argmax(-1)
        else:                                      # already [B,L] ids (CRF decode)
            preds = p.predictions
        golds = p.label_ids

        # ── 2. build token-level maps ──────────────────────────────────
        gold_map, pred_map = {}, {}                # (cat, aspect) → {(seq,tok)}

        for i, (plab, glab, wid_list) in enumerate(zip(preds, golds, wids)):
            cat = int(cats[i])
            for tok_idx, pid, gid in zip(wid_list, plab, glab):
                if tok_idx is None or gid == -100:
                    continue                       # skip pads / second sub-words

                # gold
                g_lbl = id2label[gid]
                if g_lbl != "O":
                    asp = g_lbl.split("-", 1)[-1]
                    gold_map.setdefault((cat, asp), set()).add((i, tok_idx))

                # pred
                p_lbl = id2label[pid]
                if p_lbl != "O":
                    asp = p_lbl.split("-", 1)[-1]
                    pred_map.setdefault((cat, asp), set()).add((i, tok_idx))

        # ── 3. weighted Fβ per category ────────────────────────────────
        out = {}
        for cat in (1, 2):
            f_cat = 0.0
            aspects = {a for (c, a) in gold_map if c == cat}
            total_gold_tok = sum(len(gold_map[(cat, a)]) for a in aspects)

            for asp in aspects:
                g_set = gold_map.get((cat, asp), set())
                p_set = pred_map.get((cat, asp), set())
                tp = len(g_set & p_set)
                prec = tp / len(p_set) if p_set else 0.0
                rec  = tp / len(g_set) if g_set else 0.0
                fβ = (1 + β2) * prec * rec / (β2 * prec + rec) if (prec + rec) else 0.0
                f_cat += len(g_set) * fβ                      # weight by true count

            out[f"Fbeta_cat{cat}"] = f_cat / total_gold_tok if total_gold_tok else 0.0

        out["final_score"] = 0.5 * (out["Fbeta_cat1"] + out["Fbeta_cat2"])
        return out

    return metric_weighted_fbeta

compute_metrics_ = make_weighted_fbeta(tokenised["validation"])


KeyError: "Column validation not in the dataset. Current columns in the dataset: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids', 'category_id']"

In [9]:
def new_collator(features):
    # Determine max sequence length dynamically for this batch
    max_len = max(len(f["input_ids"]) for f in features)

    # pad function
    def pad_to_max(seq, pad_value=0):
        return seq + [pad_value] * (max_len - len(seq))

    # Build tensor fields
    input_ids = torch.tensor([pad_to_max(f["input_ids"], pad_value=tok.pad_token_id) for f in features], dtype=torch.long)
    attention_mask = torch.tensor([pad_to_max(f["attention_mask"], pad_value=0) for f in features], dtype=torch.long)

    # For labels, use -100 as ignore index pad
    labels = torch.tensor([pad_to_max(f["labels"], pad_value=-100) for f in features], dtype=torch.long)
    
    category_id = torch.tensor([f["category_id"] for f in features], dtype=torch.long)

    batch = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "category_id": category_id,
    }

    # word_ids (for metrics)
    if "word_ids" in features[0]:
        batch["word_ids"] = [f["word_ids"] for f in features]

    return batch


In [15]:
args = TrainingArguments(
    "deberta-improved-weak-ner-finetuned",
    # BATCH SIZE: Optimized for CRF memory requirements
    per_device_train_batch_size=64,      # REDUCED from 128
    per_device_eval_batch_size=48,       # REDUCED from 64
    gradient_accumulation_steps=4,       # Effective = 128
    
    # LEARNING RATE: Lower after DAPT
    learning_rate=3e-6,                  # REDUCED from 2e-5 (critical!)
    weight_decay=0.01,
    warmup_ratio=0.2,                   # INCREASED warmup
    max_grad_norm=1.0,
    
    # EPOCHS: Reduced to prevent overfitting
    num_train_epochs=20,                 # REDUCED from 60
    
    # OPTIMIZATION
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",  # CHANGED
    
    # MIXED PRECISION: BF16 for CRF stability
    bf16=True,                           # CHANGED from FP16
    fp16=False,
    
    # EVALUATION & SAVING
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="final_score",
    greater_is_better=True,
    
    # LOGGING
    logging_steps=50,
    logging_first_step=True,
    
    # EFFICIENCY
    dataloader_num_workers=16,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,
    
    # REPRODUCIBILITY
    seed=42,
    data_seed=42,
    
    # MISC
    report_to="none",
)

# num_training_steps = len(tokenised["train"]) // args.per_device_train_batch_size * args.num_train_epochs
# lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=int(0.1*num_training_steps),
#                              num_training_steps=num_training_steps)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenised["train"],
    eval_dataset=tokenised["validation"],
    data_collator=new_collator,
    processing_class=tok,
    compute_metrics=compute_metrics_,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,      # Stop after 3 epochs no improvement
            early_stopping_threshold=0.001   # 0.1% threshold
        )
    ]
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Fbeta Cat1,Fbeta Cat2,Final Score
1,5.6733,1.210911,0.694115,0.546409,0.620262
2,5.6733,1.017413,0.777951,0.686205,0.732078
3,4.8115,0.701683,0.818401,0.81572,0.817061
4,4.8115,0.472982,0.858223,0.817573,0.837898
5,4.8115,0.316786,0.877354,0.881534,0.879444
6,1.9791,0.235765,0.886563,0.904933,0.895748
7,1.9791,0.195822,0.920782,0.92007,0.920426
8,1.9791,0.169747,0.93319,0.938921,0.936056
9,0.8609,0.153683,0.939481,0.941709,0.940595
10,0.8609,0.142016,0.94412,0.94668,0.9454


TrainOutput(global_step=272, training_loss=1.6684912267853231, metrics={'train_runtime': 634.4067, 'train_samples_per_second': 133.983, 'train_steps_per_second': 0.536, 'total_flos': 4830915362263992.0, 'train_loss': 1.6684912267853231, 'epoch': 16.0})

In [17]:
test_metrics = trainer.evaluate()
print(test_metrics)

{'eval_loss': 0.12460056692361832, 'eval_Fbeta_cat1': 0.9485711999839511, 'eval_Fbeta_cat2': 0.9541353925154055, 'eval_final_score': 0.9513532962496782, 'eval_runtime': 1.5941, 'eval_samples_per_second': 470.471, 'eval_steps_per_second': 10.037, 'epoch': 16.0}


In [32]:
trainer.save_model("../models/deberta-improved-weak-ner-finetuned")

In [None]:
#### Applying to the Quiz Data

In [None]:
### ---------------------------------------- ###

In [18]:
quiz = (
    pd.read_csv("../data/Listing_Titles.tsv", sep="\t", keep_default_na=False, na_values=None)
      .query("5001 <= `Record Number` <= 30000")               # <- slice
)

# keep tokens AND metadata so we can write them back later
quiz["tokens"] = quiz["Title"].str.split()
quiz_ds = Dataset.from_pandas(
    quiz[["Record Number","Category","tokens"]],   # no Title column needed
    preserve_index=False
)

In [19]:
allow = (
    tagged.groupby("Category")["Tag"]
          .apply(lambda s: {t for t in s.unique() if t and t != "O"})
          .to_dict()
)

In [20]:
def tok_quiz(batch):
    enc = tok(
        batch["tokens"],
        is_split_into_words=True,
        padding=False,  # CHANGED: Let collator handle padding
        truncation=True,
        max_length=256  # INCREASED
    )
    
    enc["labels"] = [[-100] * len(ids) for ids in enc["input_ids"]]
    enc["word_ids"] = [enc.word_ids(i) for i in range(len(enc["input_ids"]))]
    enc["category_id"] = batch["Category"]
    enc["record_id"] = batch["Record Number"]
    enc["tokens"] = batch["tokens"]
    return enc

quiz_ds = Dataset.from_pandas(
    quiz[["Record Number","Category","tokens"]],
    preserve_index=False
)

tokenised_quiz = quiz_ds.map(tok_quiz, batched=True, remove_columns=[])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [21]:
trainer.compute_metrics = None # Don't run metrics on the quiz set
pred_logits = trainer.predict(tokenised_quiz).predictions
pred_ids    = pred_logits.argmax(-1)  


In [22]:
# Decode predictions
records = []

for i, ex in enumerate(tokenised_quiz):
    rec = int(ex["record_id"])
    cat = int(ex["category_id"])
    words = ex["tokens"]
    wids = ex["word_ids"]
    labs = [id2label[idx] if idx != -100 else "O" for idx in pred_ids[i]]
    
    # Keep only first sub-word label per word
    word_labels = []
    prev_wid = None
    for wid, lab in zip(wids, labs):
        if wid is not None and wid != prev_wid:
            word_labels.append((wid, lab))
            prev_wid = wid
    
    # Decode entities correctly
    current_tokens = []
    current_tag = None
    
    for wid, label in word_labels:
        word = words[wid]
        
        if label == "O":
            # Flush any current entity
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = []
                current_tag = None
            # Add O token
            records.append((rec, cat, "O", word))
            continue
        
        prefix, tag = label.split("-", 1)
        
        # Check if tag is legal for this category
        if tag not in allow[cat]:
            continue
        
        if prefix == "B":
            # NEW entity starts - flush previous
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
            # Start new entity
            current_tokens = [word]
            current_tag = tag
        elif prefix == "I":
            # Continuation of entity
            if tag == current_tag:
                current_tokens.append(word)
            else:
                # I- tag doesn't match current - treat as new entity (model error)
                if current_tokens and current_tag:
                    records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = [word]
                current_tag = tag
    
    # Flush final entity
    if current_tokens and current_tag:
        records.append((rec, cat, current_tag, " ".join(current_tokens)))

# Filter out O tags for submission
submission = pd.DataFrame(records, columns=["Record Number", "Category", "Tag", "Token"])
submission = submission[submission["Tag"] != "O"]

In [23]:

# BEFORE saving submission, validate categories
quiz_categories = quiz.set_index("Record Number")["Category"].to_dict()

# Validate each submission row
for idx, row in submission.iterrows():
    rec_num = row["Record Number"]
    expected_cat = quiz_categories.get(rec_num)
    if row["Category"] != expected_cat:
        print(f"⚠️ Category mismatch at record {rec_num}!")



In [24]:
import csv

submission.to_csv(
    "../results/deberta-improved-weak-ner-finetuned-1.tsv",
    sep="\t",
    header=False,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
    escapechar="\\"
)


In [36]:
import gc
del model, trainer            # or any large tensors / optimizers
gc.collect()                  # Python-side ref-count sweep
torch.cuda.empty_cache()  

In [None]:
## ENSEMBLE TRY ---------------------------------

In [None]:
def train_single_model(seed, model_name, train_dataset):
    """Train one model with specific seed."""
    print(f"\n{'='*60}")
    print(f"Training Model {seed} - {model_name}")
    print(f"{'='*60}\n")
    
    # Set seed
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    # Reinitialize model (fresh weights)
    cfg = AutoConfig.from_pretrained(
        base_model,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    
    model = CatAwareCRF(
        cfg,
        num_labels=len(label_list),
        allow_mask=allow_mask,
        base_model_name=base_model,
        use_dapt=True,
        label_smoothing=0.005
    ).to(device)
    
    # Training args
    args = TrainingArguments(
        output_dir=f"../models/deberta-ner-ensemble-seed{seed}",
        
        # A100-optimized batch size
        per_device_train_batch_size=48,   # ✅ Use A100 memory efficiently
        gradient_accumulation_steps=3,    # ✅ Effective = 144
        
        # Learning rate
        learning_rate=4e-6,               # ✅ Revert to what worked
        weight_decay=0.01,
        warmup_ratio=0.15,
        max_grad_norm=1.0,
        
        # Epochs
        num_train_epochs=45,             # ✅ Proven sweet spot
        
        # Optimization
        optim="adamw_torch_fused",        # ✅ Fastest for A100
        lr_scheduler_type="cosine_with_restarts",
        
        # Mixed precision
        bf16=True,
        fp16=False,
        
        # Evaluation
        eval_strategy="no",
        save_strategy="no",
        
        # Logging
        logging_steps=25,
        
        # Efficiency
        dataloader_num_workers=16,
        dataloader_pin_memory=True,
        gradient_checkpointing=False,
        
        # Seed
        seed=seed,
        data_seed=seed,
        
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        data_collator=new_collator,
        processing_class=tok,
    )
    
    # Train
    trainer.train()
    
    # Save
    trainer.save_model(f"../models/deberta-ner-ensemble-seed{seed}-final-confidence")
    
    return f"../models/deberta-ner-ensemble-seed{seed}-final"

In [10]:
# Train 10 models
ensemble_seeds = [42, 123, 456, 789, 2024]
ensemble_paths = []

for seed in ensemble_seeds:
    model_path = train_single_model(seed, f"model_{seed}")
    ensemble_paths.append(model_path)
    
    # Clear GPU memory
    torch.cuda.empty_cache()

print(f"\n✓ Trained {len(ensemble_paths)} models")
print("Ensemble models saved at:")
for path in ensemble_paths:
    print(f"  - {path}")


Training Model 42 - model_42

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.2724
50,12.6923
75,11.5056
100,9.722
125,7.52
150,5.5461
175,4.0106
200,3.2355
225,2.8708
250,2.6869



Training Model 123 - model_123

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,12.9741
50,12.3844
75,11.2461
100,9.5724
125,7.5061
150,5.4787
175,4.0698
200,3.264
225,2.8839
250,2.669



Training Model 456 - model_456

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.4122
50,12.8409
75,11.6812
100,9.8691
125,7.4778
150,5.3384
175,3.962
200,3.2254
225,2.8645
250,2.6937



Training Model 789 - model_789

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,12.4801
50,11.8739
75,10.665
100,8.8635
125,6.8069
150,4.986
175,3.7298
200,3.0954
225,2.8228
250,2.6657



Training Model 2024 - model_2024

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.1768
50,12.5944
75,11.4488
100,9.7268
125,7.5217
150,5.4437
175,3.976
200,3.2218
225,2.8808
250,2.6812



✓ Trained 5 models
Ensemble models saved at:
  - ../models/deberta-ner-ensemble-seed42-final
  - ../models/deberta-ner-ensemble-seed123-final
  - ../models/deberta-ner-ensemble-seed456-final
  - ../models/deberta-ner-ensemble-seed789-final
  - ../models/deberta-ner-ensemble-seed2024-final


In [11]:
torch.cuda.empty_cache()

In [None]:
# ============================================================================
# ENSEMBLE PREDICTION (Majority Voting)
# ============================================================================
# ensemble_paths = [f"../models/deberta-ner-ensemble-seed{42}-final", f"../models/deberta-ner-ensemble-seed{123}-final", f"../models/deberta-ner-ensemble-seed{456}-final", f"../models/deberta-ner-ensemble-seed{789}-final", f"../models/deberta-ner-ensemble-seed{2024}-final"]
def load_ensemble_models(model_paths):
    """Load all trained models."""
    models = []
    
    for path in model_paths:
        print(f"Loading model from {path}...")
        
        # Let AutoModel handle the loading
        from transformers import AutoModelForTokenClassification
        
        # This won't work directly, so use manual approach:
        cfg = AutoConfig.from_pretrained(path)
        
        # Reinitialize the full model structure
        model = CatAwareCRF(
            cfg,
            num_labels=len(label_list),
            allow_mask=allow_mask,
            base_model_name=None,
            use_dapt=True
        )
        
        # Now load the task-specific layers
        from safetensors.torch import load_file
        state_dict = load_file(f"{path}/model.safetensors")
        model.load_state_dict(state_dict)  # Load everything!
        
        model.to(device)
        model.eval()
        models.append(model)
    
    return models


def ensemble_predict(models, tokenised_quiz, trainer_args):
    """
    Get predictions from all models and vote.
    Returns: voted predictions (batch_size, seq_len)
    """
    all_predictions = []
    
    # Get predictions from each model
    for i, model in enumerate(models):
        print(f"Predicting with model {i+1}/{len(models)}...")
        
        # Create temporary trainer for prediction
        temp_trainer = Trainer(
            model=model,
            args=trainer_args,
            data_collator=new_collator,
            processing_class=tok,
        )
        
        # Ensure labels are NOT passed (triggers decode path)
        pred_output = temp_trainer.predict(tokenised_quiz)
        pred_output_array = pred_output.predictions
        
        # Check shape and handle accordingly
        if pred_output_array.ndim == 3:
            # Training mode was triggered - logits returned
            pred_ids = pred_output_array.argmax(-1)
        elif pred_output_array.ndim == 2:
            # Inference mode - already decoded IDs
            pred_ids = pred_output_array
        else:
            raise ValueError(f"Unexpected prediction shape: {pred_output_array.shape}")
        
        # CRF already decoded, predictions are label IDs
        all_predictions.append(pred_ids)
        
        # Clear memory
        torch.cuda.empty_cache()
    
    # Voting: majority wins
    print("Performing majority voting...")
    all_predictions = np.array(all_predictions)  # (num_models, batch_size, seq_len)
    
    voted_predictions = []
    for i in range(all_predictions.shape[1]):  # For each example
        example_preds = all_predictions[:, i, :]  # (num_models, seq_len)
        
        # Majority vote per token
        voted_seq = []
        for j in range(example_preds.shape[1]):  # For each token
            token_votes = example_preds[:, j]
            
            # Ignore -100 (padding/special tokens)
            valid_votes = token_votes[token_votes != -100]
            
            if len(valid_votes) == 0:
                voted_seq.append(-100)
            else:
                # Most common prediction
                from collections import Counter
                vote_counts = Counter(valid_votes)
                majority_vote = vote_counts.most_common(1)[0][0]
                # voted_seq.append(majority_vote)
                voted_seq.append(int(majority_vote))
        
        voted_predictions.append(voted_seq)
    
    # return np.array(voted_predictions)
    return np.array(voted_predictions, dtype=np.int64)
    
# ============================================================================
# LOAD QUIZ DATA
# ============================================================================

quiz = (
    pd.read_csv("../data/Listing_Titles.tsv", sep="\t", 
                keep_default_na=False, na_values=None)
    .query("5001 <= `Record Number` <= 30000")
)

quiz["tokens"] = quiz["Title"].str.split()
quiz_ds = Dataset.from_pandas(
    quiz[["Record Number", "Category", "tokens"]],
    preserve_index=False
)


def tok_quiz(batch):
    enc = tok(
        batch["tokens"],
        is_split_into_words=True,
        padding=False,
        truncation=True,
        max_length=256
    )
    
    enc["labels"] = [[-100] * len(ids) for ids in enc["input_ids"]]
    enc["word_ids"] = [enc.word_ids(i) for i in range(len(enc["input_ids"]))]
    enc["category_id"] = batch["Category"]
    enc["record_id"] = batch["Record Number"]
    enc["tokens"] = batch["tokens"]
    return enc

tok = AutoTokenizer.from_pretrained(base_model)
tokenised_quiz = quiz_ds.map(tok_quiz, batched=True, remove_columns=[])


# ============================================================================
# ENSEMBLE INFERENCEFl
# ============================================================================

# Load all ensemble models

ensemble_models = load_ensemble_models(ensemble_paths)

# Dummy trainer args for prediction
dummy_args = TrainingArguments(
    output_dir="../temp",
    per_device_eval_batch_size=48,
    dataloader_num_workers=16,
    bf16=True,
)

# Get ensemble predictions
pred_ids = ensemble_predict(ensemble_models, tokenised_quiz, dummy_args)

print(f"✓ Ensemble predictions shape: {pred_ids.shape}")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Loading model from ../models/deberta-ner-ensemble-seed42-final...
Loading model from ../models/deberta-ner-ensemble-seed123-final...
Loading model from ../models/deberta-ner-ensemble-seed456-final...
Loading model from ../models/deberta-ner-ensemble-seed789-final...
Loading model from ../models/deberta-ner-ensemble-seed2024-final...
Predicting with model 1/5...


Predicting with model 2/5...


Predicting with model 3/5...


Predicting with model 4/5...


Predicting with model 5/5...


Performing majority voting...
✓ Ensemble predictions shape: (25000, 49)


In [12]:
# Decode predictions
records = []

for i, ex in enumerate(tokenised_quiz):
    rec = int(ex["record_id"])
    cat = int(ex["category_id"])
    words = ex["tokens"]
    wids = ex["word_ids"]
    labs = [id2label[idx] if idx != -100 else "O" for idx in pred_ids[i]]
    
    # Keep only first sub-word label per word
    word_labels = []
    prev_wid = None
    for wid, lab in zip(wids, labs):
        if wid is not None and wid != prev_wid:
            word_labels.append((wid, lab))
            prev_wid = wid
    
    # Decode entities correctly
    current_tokens = []
    current_tag = None
    
    for wid, label in word_labels:
        word = words[wid]
        
        if label == "O":
            # Flush any current entity
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = []
                current_tag = None
            # Add O token
            records.append((rec, cat, "O", word))
            continue
        
        prefix, tag = label.split("-", 1)
        
        # Check if tag is legal for this category
        if tag not in allow[cat]:
            continue
        
        if prefix == "B":
            # NEW entity starts - flush previous
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
            # Start new entity
            current_tokens = [word]
            current_tag = tag
        elif prefix == "I":
            # Continuation of entity
            if tag == current_tag:
                current_tokens.append(word)
            else:
                # I- tag doesn't match current - treat as new entity (model error)
                if current_tokens and current_tag:
                    records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = [word]
                current_tag = tag
    
    # Flush final entity
    if current_tokens and current_tag:
        records.append((rec, cat, current_tag, " ".join(current_tokens)))

In [None]:
def build_gazetteers(tagged_df):
    """Extract known entities from training data."""
    
    gazetteers = {}
    
    for tag in tagged_df["Tag"].unique():
        if tag and tag != "O":
            values = set(
                tagged_df[tagged_df["Tag"] == tag]["Token"]
                .str.lower()
                .str.strip()
                .unique()
            )
            # Remove very common/short tokens (noise)
            gazetteers[tag] = {v for v in values if len(v) > 0}
    
    return gazetteers


# Load tagged data to get all possible tags
tagged = pd.read_csv(
    "../data/Tagged_Titles_Train.tsv",
    sep="\t", keep_default_na=False, na_values=None
)


# Create BIO label list
BASE = set(t for t in tagged["Tag"].unique() if t and t != "O")
label_list = ["O"] + sorted(
    f"{p}-{t}" for t in BASE for p in ("B", "I")
)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print(f"Created {len(label_list)} labels: {label_list[:10]}...")

gazetteers = build_gazetteers(tagged)

print(f"Gazetteers created for {len(gazetteers)} tags")
for tag, values in gazetteers.items():
    print(f"  {tag}: {len(values)} entries")


def post_process_predictions(records, gazetteers, allow):
    """Fix obvious errors using domain knowledge."""
    corrected = []
    
    for rec, cat, tag, token in records:
        token_lower = token.lower().strip()
        
        # Rule 1: Check gazetteer exclusivity
        if tag != "O":
            found_tags = [gaz_tag for gaz_tag, entities in gazetteers.items() 
                         if token_lower in entities and gaz_tag in allow[cat]]
            
            # If token exclusively in different gazetteer, correct it
            if len(found_tags) == 1 and found_tags[0] != tag:
                tag = found_tags[0]
        
        # Rule 2: Known brand/manufacturer disambiguation
        known_brands = {'bmw', 'audi', 'vw', 'mercedes', 'opel', 'ford'}
        known_manufacturers = {'bosch', 'ate', 'brembo', 'zimmermann', 'febi'}
        
        if token_lower in known_brands and tag == 'Hersteller':
            # Likely should be Marke
            tag = 'Kompatible_Fahrzeug_Marke'
        elif token_lower in known_manufacturers and tag == 'Kompatible_Fahrzeug_Marke':
            # Likely should be Hersteller
            tag = 'Hersteller'
        
        # Rule 3: Position indicators
        position_words = {'va', 'ha', 'vorne', 'hinten', 'links', 'rechts', 'vl', 'vr', 'hl', 'hr'}
        if token_lower in position_words and tag != 'Einbauposition':
            tag = 'Einbauposition'
        
        # Rule 4: Number patterns for Anzahl
        if token.isdigit() and tag != 'Anzahl_Der_Einheiten':
            # Check context - if followed by "stück", "x", etc.
            # (You'd need to track context here)
            pass
        
        corrected.append((rec, cat, tag, token))
    
    return corrected

# Apply after decoding
records = post_process_predictions(records, gazetteers, allow)

print("new records")

Created 59 labels: ['O', 'B-Anwendung', 'B-Anzahl_Der_Einheiten', 'B-Besonderheiten', 'B-Breite', 'B-Bremsscheiben-Aussendurchmesser', 'B-Bremsscheibenart', 'B-Einbauposition', 'B-Farbe', 'B-Größe']...
Gazetteers created for 29 tags
  Kompatible_Fahrzeug_Marke: 173 entries
  Kompatibles_Fahrzeug_Modell: 1902 entries
  Herstellernummer: 1084 entries
  Produktart: 196 entries
  Im_Lieferumfang_Enthalten: 340 entries
  Hersteller: 155 entries
  Modell: 40 entries
  Einbauposition: 41 entries
  Bremsscheiben-Aussendurchmesser: 345 entries
  Bremsscheibenart: 45 entries
  Oe/Oem_Referenznummer(N): 228 entries
  Maßeinheit: 10 entries
  Anzahl_Der_Einheiten: 17 entries
  Kompatibles_Fahrzeug_Jahr: 183 entries
  Produktlinie: 4 entries
  Material: 5 entries
  Größe: 11 entries
  Länge: 3 entries
  Breite: 3 entries
  Besonderheiten: 31 entries
  Menge: 11 entries
  Farbe: 1 entries
  Stärke: 10 entries
  Anwendung: 14 entries
  Oberflächenbeschaffenheit: 2 entries
  SAE_Viskosität: 2 entries


In [13]:
# Filter out O tags for submission
submission = pd.DataFrame(records, columns=["Record Number", "Category", "Tag", "Token"])
submission = submission[submission["Tag"] != "O"]

In [14]:
# BEFORE saving submission, validate categories
quiz_categories = quiz.set_index("Record Number")["Category"].to_dict()

# Validate each submission row
for idx, row in submission.iterrows():
    rec_num = row["Record Number"]
    expected_cat = quiz_categories.get(rec_num)
    if row["Category"] != expected_cat:
        print(f"⚠️ Category mismatch at record {rec_num}!")



In [15]:
import csv

submission.to_csv(
    "../results/weak-nerMK2-smooth0-01-ensemble-120-epochs.tsv",
    sep="\t",
    header=False,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
    escapechar="\\"
)


In [None]:
## CONFIDENCE

In [None]:
ensemble_paths = [f"../models/deberta-ner-ensemble-seed{42}-final", f"../models/deberta-ner-ensemble-seed{123}-final", f"../models/deberta-ner-ensemble-seed{456}-final", f"../models/deberta-ner-ensemble-seed{789}-final", f"../models/deberta-ner-ensemble-seed{2024}-final"]
def load_ensemble_models(model_paths):
    """Load all trained models."""
    models = []
    
    for path in model_paths:
        print(f"Loading model from {path}...")
        
        # Let AutoModel handle the loading
        from transformers import AutoModelForTokenClassification
        
        # This won't work directly, so use manual approach:
        cfg = AutoConfig.from_pretrained(path)
        
        # Reinitialize the full model structure
        model = CatAwareCRF(
            cfg,
            num_labels=len(label_list),
            allow_mask=allow_mask,
            base_model_name=None,
            use_dapt=True
        )
        
        # Now load the task-specific layers
        from safetensors.torch import load_file
        state_dict = load_file(f"{path}/model.safetensors")
        model.load_state_dict(state_dict)  # Load everything!
        
        model.to(device)
        model.eval()
        models.append(model)
    
    return models

ensemble_models = load_ensemble_models(ensemble_paths)

Loading model from ../models/deberta-ner-ensemble-seed42-final...
Loading model from ../models/deberta-ner-ensemble-seed123-final...
Loading model from ../models/deberta-ner-ensemble-seed456-final...
Loading model from ../models/deberta-ner-ensemble-seed789-final...
Loading model from ../models/deberta-ner-ensemble-seed2024-final...


In [11]:
# ============================================================================
# ACTIVE LEARNING: Find confident predictions to add to training
# ============================================================================
from tqdm import tqdm
def get_confident_predictions_fast(ensemble_models, unlabeled_data):
    """
    Fast batch prediction using Trainer.
    """
    from collections import Counter
    from tqdm.auto import tqdm
    
    # Trainer args for fast batch prediction
    dummy_args = TrainingArguments(
        output_dir="../temp",
        per_device_eval_batch_size=128,  # ✅ Batch processing
        dataloader_num_workers=16,
        bf16=True,
    )
    
    high_confidence_examples = []
    
    # Get predictions from all models (batched)
    print("Getting predictions from all models...")
    all_model_predictions = []
    
    for model_idx, model in enumerate(ensemble_models):
        print(f"Predicting with model {model_idx+1}/{len(ensemble_models)}...")
        model.eval()
        
        trainer = Trainer(
            model=model,
            args=dummy_args,
            data_collator=new_collator,
            processing_class=tok,
        )
        
        predictions = trainer.predict(unlabeled_data)
        pred_array = predictions.predictions
        
        # Handle argmax if needed
        if pred_array.ndim == 3:
            pred_array = pred_array.argmax(-1)
        
        all_model_predictions.append(pred_array)
        torch.cuda.empty_cache()
    
    # Check agreement
    print("Checking agreement across models...")
    all_model_predictions = np.array(all_model_predictions)  # (num_models, num_examples, seq_len)
    
    for example_idx in tqdm(range(len(unlabeled_data))):
        example = unlabeled_data[example_idx]
        example_preds = all_model_predictions[:, example_idx, :]  # (num_models, seq_len)
        
        # Check agreement for each token
        agreed_sequence = []
        all_agree = True
        
        for token_idx in range(example_preds.shape[1]):
            token_preds = example_preds[:, token_idx]
            
            # Skip padding
            if token_preds[0] == -100:
                continue
            
            # All models must agree
            if len(set(token_preds)) != 1:
                all_agree = False
                break
            
            agreed_sequence.append(int(token_preds[0]))
        
        if all_agree and len(agreed_sequence) > 0:
            high_confidence_examples.append({
                'example': example,
                'predicted_labels': agreed_sequence,
            })
    
    return high_confidence_examples


# Run on unlabeled data (records 5001-30000 that aren't in quiz scoring)
df_unsup = pd.read_csv("../data/Listing_Titles.tsv",
    sep="\t", keep_default_na=False, na_values=None
)

unlabeled_titles = df_unsup[
    (df_unsup["Record Number"] >= 30001) & 
    (df_unsup["Record Number"] <= 200000)
].sample(120000, random_state=42)  # Sample 50K

# Tokenize
unlabeled_titles["tokens"] = unlabeled_titles["Title"].str.split()
unlabeled_ds = Dataset.from_pandas(unlabeled_titles[["Record Number", "Category", "tokens"]])

def tok_quiz(batch):
    enc = tok(
        batch["tokens"],
        is_split_into_words=True,
        padding=False,
        truncation=True,
        max_length=256
    )
    
    enc["labels"] = [[-100] * len(ids) for ids in enc["input_ids"]]
    enc["word_ids"] = [enc.word_ids(i) for i in range(len(enc["input_ids"]))]
    enc["category_id"] = batch["Category"]
    enc["record_id"] = batch["Record Number"]
    enc["tokens"] = batch["tokens"]
    return enc

tok = AutoTokenizer.from_pretrained(ensemble_paths[0])
tokenised_unlabeled = unlabeled_ds.map(tok_quiz, batched=True, remove_columns=[])

# Get high-confidence predictions

# Use this instead
confident_preds = get_confident_predictions_fast(
    ensemble_models, 
    tokenised_unlabeled
)
print(f"Found {len(confident_preds)} high-confidence examples")

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Getting predictions from all models...
Predicting with model 1/5...


Predicting with model 2/5...


Predicting with model 3/5...


KeyboardInterrupt: 

In [None]:
def get_confident_predictions_two_models(ensemble_models, unlabeled_data, confidence_threshold=0.98):
    """
    Use 2 models, keep intersection where BOTH are 98%+ confident.
    """
    from tqdm.auto import tqdm
    
    # Pick 2 best models (first 2 from ensemble)
    model1, model2 = ensemble_models[0], ensemble_models[1]
    
    # Trainer args
    dummy_args = TrainingArguments(
        output_dir="../temp",
        per_device_eval_batch_size=128,
        dataloader_num_workers=16,
        bf16=True,
    )
    
    print("Getting predictions from model 1...")
    model1.eval()
    trainer1 = Trainer(
        model=model1,
        args=dummy_args,
        data_collator=new_collator,
        processing_class=tok,
    )
    preds1 = trainer1.predict(unlabeled_data)
    pred_array1 = preds1.predictions
    if pred_array1.ndim == 3:
        # Get confidences before argmax
        probs1 = torch.softmax(torch.from_numpy(pred_array1), dim=-1).numpy()
        conf1 = probs1.max(axis=-1)  # (num_examples, seq_len)
        pred_array1 = pred_array1.argmax(-1)
    else:
        # Already decoded, assume high confidence
        conf1 = np.ones_like(pred_array1, dtype=np.float32) * 0.99
    
    torch.cuda.empty_cache()
    
    print("Getting predictions from model 2...")
    model2.eval()
    trainer2 = Trainer(
        model=model2,
        args=dummy_args,
        data_collator=new_collator,
        processing_class=tok,
    )
    preds2 = trainer2.predict(unlabeled_data)
    pred_array2 = preds2.predictions
    if pred_array2.ndim == 3:
        probs2 = torch.softmax(torch.from_numpy(pred_array2), dim=-1).numpy()
        conf2 = probs2.max(axis=-1)
        pred_array2 = pred_array2.argmax(-1)
    else:
        conf2 = np.ones_like(pred_array2, dtype=np.float32) * 0.99
    
    torch.cuda.empty_cache()
    
    # Find intersection where both agree AND both confident
    print("Finding high-confidence agreement...")
    high_confidence_examples = []
    
    for example_idx in tqdm(range(len(unlabeled_data))):
        example = unlabeled_data[example_idx]
        
        preds_m1 = pred_array1[example_idx]
        preds_m2 = pred_array2[example_idx]
        conf_m1 = conf1[example_idx]
        conf_m2 = conf2[example_idx]
        
        agreed_sequence = []
        high_conf = True
        
        for token_idx in range(len(preds_m1)):
            # Skip padding
            if preds_m1[token_idx] == -100:
                continue
            
            # Both must agree
            if preds_m1[token_idx] != preds_m2[token_idx]:
                high_conf = False
                break
            
            # Both must be confident
            if conf_m1[token_idx] < confidence_threshold or conf_m2[token_idx] < confidence_threshold:
                high_conf = False
                break
            
            agreed_sequence.append(int(preds_m1[token_idx]))
        
        if high_conf and len(agreed_sequence) > 0:
            high_confidence_examples.append({
                'example': example,
                'predicted_labels': agreed_sequence,
            })
    
    return high_confidence_examples


# Use 2 models only
confident_preds = get_confident_predictions_two_models(
    ensemble_models[:2],  # Use first 2 models
    tokenised_unlabeled,
    confidence_threshold=0.98
)

print(f"Found {len(confident_preds)} high-confidence examples")

Getting predictions from model 1...


Getting predictions from model 2...


In [12]:
torch.cuda.empty_cache()

In [None]:
def convert_confident_to_training(confident_preds, tokenised_examples):
    """Convert high-confidence predictions to training format."""
    new_training_examples = []
    
    for conf_pred in confident_preds:
        example = conf_pred['example']
        pred_labels = conf_pred['predicted_labels']
        
        # Get original tokens
        tokens = example['tokens']
        word_ids = example['word_ids']
        
        # Map back to word-level labels
        word_labels = []
        prev_wid = None
        label_idx = 0
        
        for wid in word_ids:
            if wid is not None and wid != prev_wid:
                if label_idx < len(pred_labels):
                    word_labels.append(pred_labels[label_idx])
                    label_idx += 1
                prev_wid = wid
        
        if len(word_labels) == len(tokens):
            new_training_examples.append({
                'tokens': tokens,
                'ner_tags': word_labels,
                'Category': example['category_id']
            })
    
    return new_training_examples


# Convert
pseudo_labeled = convert_confident_to_training(confident_preds, tokenised_unlabeled)

# Combine with original 5K
original_examples = rows_to_examples(tagged)
combined_training = original_examples + pseudo_labeled

print(f"Original: {len(original_examples)}, Pseudo: {len(pseudo_labeled)}")
print(f"Total training: {len(combined_training)}")
# Expect: 5K + 5-10K = 10-15K total

In [15]:
# Create new dataset
expanded_ds = Dataset.from_list(combined_training)
expanded_tokenised = expanded_ds.map(tok_fn, batched=True, remove_columns=["tokens", "ner_tags", "Category"])

# Retrain 5 models (not 10, to save time)
ensemble_seeds_v2 = [42, 123, 456, 789, 2024]

for seed in ensemble_seeds_v2:
    model_path = train_single_model(
        seed, 
        f"model_{seed}_expanded",
        expanded_tokenised  # Use expanded data
    )

Map:   0%|          | 0/6187 [00:00<?, ? examples/s]


Training Model 42 - model_42_expanded

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.2974
50,12.4695
75,10.7576
100,8.2514
125,5.8829
150,4.3118
175,3.6248
200,3.3113
225,3.1158
250,3.025



Training Model 123 - model_123_expanded

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,12.878
50,12.046
75,10.412
100,8.1081
125,5.7797
150,4.31
175,3.5858
200,3.2543
225,3.1162
250,2.9969



Training Model 456 - model_456_expanded

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.4484
50,12.627
75,10.9319
100,8.2558
125,5.7201
150,4.2749
175,3.5719
200,3.307
225,3.1076
250,3.0207



Training Model 789 - model_789_expanded

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,12.5148
50,11.6562
75,9.9068
100,7.4779
125,5.3212
150,4.0522
175,3.5034
200,3.2566
225,3.101
250,3.0136



Training Model 2024 - model_2024_expanded

Loading DAPT encoder from ../models/deberta-improved-weak-ner-mk-2


Step,Training Loss
25,13.1947
50,12.3731
75,10.7277
100,8.3252
125,5.825
150,4.2714
175,3.5839
200,3.2912
225,3.1184
250,3.0424


In [16]:
ensemble_paths = [f"../models/deberta-ner-ensemble-seed{42}-final-confidence", f"../models/deberta-ner-ensemble-seed{123}-final-confidence", 
                  f"../models/deberta-ner-ensemble-seed{456}-final-confidence", f"../models/deberta-ner-ensemble-seed{789}-final-confidence", f"../models/deberta-ner-ensemble-seed{2024}-final-confidence"]

In [17]:
# ============================================================================
# ENSEMBLE PREDICTION (Majority Voting)
# ============================================================================
# ensemble_paths = [f"../models/deberta-ner-ensemble-seed{42}-final", f"../models/deberta-ner-ensemble-seed{123}-final", f"../models/deberta-ner-ensemble-seed{456}-final", f"../models/deberta-ner-ensemble-seed{789}-final", f"../models/deberta-ner-ensemble-seed{2024}-final"]
def load_ensemble_models(model_paths):
    """Load all trained models."""
    models = []
    
    for path in model_paths:
        print(f"Loading model from {path}...")
        
        # Let AutoModel handle the loading
        from transformers import AutoModelForTokenClassification
        
        # This won't work directly, so use manual approach:
        cfg = AutoConfig.from_pretrained(path)
        
        # Reinitialize the full model structure
        model = CatAwareCRF(
            cfg,
            num_labels=len(label_list),
            allow_mask=allow_mask,
            base_model_name=None,
            use_dapt=True
        )
        
        # Now load the task-specific layers
        from safetensors.torch import load_file
        state_dict = load_file(f"{path}/model.safetensors")
        model.load_state_dict(state_dict)  # Load everything!
        # # Load only the task layers (cat_embed, proj, crf)
        # task_dict = {k: v for k, v in state_dict.items() 
        #              if k.startswith(('cat_embed', 'proj', 'crf'))}
        # model.load_state_dict(task_dict, strict=False)
        
        model.to(device)
        model.eval()
        models.append(model)
    
    return models


def ensemble_predict(models, tokenised_quiz, trainer_args):
    """
    Get predictions from all models and vote.
    Returns: voted predictions (batch_size, seq_len)
    """
    all_predictions = []
    
    # Get predictions from each model
    for i, model in enumerate(models):
        print(f"Predicting with model {i+1}/{len(models)}...")
        
        # Create temporary trainer for prediction
        temp_trainer = Trainer(
            model=model,
            args=trainer_args,
            data_collator=new_collator,
            processing_class=tok,
        )
        
        # Ensure labels are NOT passed (triggers decode path)
        pred_output = temp_trainer.predict(tokenised_quiz)
        pred_output_array = pred_output.predictions
        
        # Check shape and handle accordingly
        if pred_output_array.ndim == 3:
            # Training mode was triggered - logits returned
            pred_ids = pred_output_array.argmax(-1)
        elif pred_output_array.ndim == 2:
            # Inference mode - already decoded IDs
            pred_ids = pred_output_array
        else:
            raise ValueError(f"Unexpected prediction shape: {pred_output_array.shape}")
        
        # CRF already decoded, predictions are label IDs
        all_predictions.append(pred_ids)
        
        # Clear memory
        torch.cuda.empty_cache()
    
    # Voting: majority wins
    print("Performing majority voting...")
    all_predictions = np.array(all_predictions)  # (num_models, batch_size, seq_len)
    
    voted_predictions = []
    for i in range(all_predictions.shape[1]):  # For each example
        example_preds = all_predictions[:, i, :]  # (num_models, seq_len)
        
        # Majority vote per token
        voted_seq = []
        for j in range(example_preds.shape[1]):  # For each token
            token_votes = example_preds[:, j]
            
            # Ignore -100 (padding/special tokens)
            valid_votes = token_votes[token_votes != -100]
            
            if len(valid_votes) == 0:
                voted_seq.append(-100)
            else:
                # Most common prediction
                from collections import Counter
                vote_counts = Counter(valid_votes)
                majority_vote = vote_counts.most_common(1)[0][0]
                # voted_seq.append(majority_vote)
                voted_seq.append(int(majority_vote))
        
        voted_predictions.append(voted_seq)
    
    # return np.array(voted_predictions)
    return np.array(voted_predictions, dtype=np.int64)
    
# ============================================================================
# LOAD QUIZ DATA
# ============================================================================

quiz = (
    pd.read_csv("../data/Listing_Titles.tsv", sep="\t", 
                keep_default_na=False, na_values=None)
    .query("5001 <= `Record Number` <= 30000")
)

quiz["tokens"] = quiz["Title"].str.split()
quiz_ds = Dataset.from_pandas(
    quiz[["Record Number", "Category", "tokens"]],
    preserve_index=False
)


def tok_quiz(batch):
    enc = tok(
        batch["tokens"],
        is_split_into_words=True,
        padding=False,
        truncation=True,
        max_length=256
    )
    
    enc["labels"] = [[-100] * len(ids) for ids in enc["input_ids"]]
    enc["word_ids"] = [enc.word_ids(i) for i in range(len(enc["input_ids"]))]
    enc["category_id"] = batch["Category"]
    enc["record_id"] = batch["Record Number"]
    enc["tokens"] = batch["tokens"]
    return enc

tok = AutoTokenizer.from_pretrained(base_model)
tokenised_quiz = quiz_ds.map(tok_quiz, batched=True, remove_columns=[])


# ============================================================================
# ENSEMBLE INFERENCEFl
# ============================================================================

# Load all ensemble models

ensemble_models = load_ensemble_models(ensemble_paths)

# Dummy trainer args for prediction
dummy_args = TrainingArguments(
    output_dir="../temp",
    per_device_eval_batch_size=48,
    dataloader_num_workers=16,
    bf16=True,
)

# Get ensemble predictions
pred_ids = ensemble_predict(ensemble_models, tokenised_quiz, dummy_args)

print(f"✓ Ensemble predictions shape: {pred_ids.shape}")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Loading model from ../models/deberta-ner-ensemble-seed42-final-confidence...
Loading model from ../models/deberta-ner-ensemble-seed123-final-confidence...
Loading model from ../models/deberta-ner-ensemble-seed456-final-confidence...
Loading model from ../models/deberta-ner-ensemble-seed789-final-confidence...
Loading model from ../models/deberta-ner-ensemble-seed2024-final-confidence...
Predicting with model 1/5...


Predicting with model 2/5...


Predicting with model 3/5...


Predicting with model 4/5...


Predicting with model 5/5...


Performing majority voting...
✓ Ensemble predictions shape: (25000, 49)


In [18]:
# Decode predictions
records = []

for i, ex in enumerate(tokenised_quiz):
    rec = int(ex["record_id"])
    cat = int(ex["category_id"])
    words = ex["tokens"]
    wids = ex["word_ids"]
    labs = [id2label[idx] if idx != -100 else "O" for idx in pred_ids[i]]
    
    # Keep only first sub-word label per word
    word_labels = []
    prev_wid = None
    for wid, lab in zip(wids, labs):
        if wid is not None and wid != prev_wid:
            word_labels.append((wid, lab))
            prev_wid = wid
    
    # Decode entities correctly
    current_tokens = []
    current_tag = None
    
    for wid, label in word_labels:
        word = words[wid]
        
        if label == "O":
            # Flush any current entity
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = []
                current_tag = None
            # Add O token
            records.append((rec, cat, "O", word))
            continue
        
        prefix, tag = label.split("-", 1)
        
        # Check if tag is legal for this category
        if tag not in allow[cat]:
            continue
        
        if prefix == "B":
            # NEW entity starts - flush previous
            if current_tokens and current_tag:
                records.append((rec, cat, current_tag, " ".join(current_tokens)))
            # Start new entity
            current_tokens = [word]
            current_tag = tag
        elif prefix == "I":
            # Continuation of entity
            if tag == current_tag:
                current_tokens.append(word)
            else:
                # I- tag doesn't match current - treat as new entity (model error)
                if current_tokens and current_tag:
                    records.append((rec, cat, current_tag, " ".join(current_tokens)))
                current_tokens = [word]
                current_tag = tag
    
    # Flush final entity
    if current_tokens and current_tag:
        records.append((rec, cat, current_tag, " ".join(current_tokens)))

In [19]:
# Filter out O tags for submission
submission = pd.DataFrame(records, columns=["Record Number", "Category", "Tag", "Token"])
submission = submission[submission["Tag"] != "O"]

In [20]:
# BEFORE saving submission, validate categories
quiz_categories = quiz.set_index("Record Number")["Category"].to_dict()

# Validate each submission row
for idx, row in submission.iterrows():
    rec_num = row["Record Number"]
    expected_cat = quiz_categories.get(rec_num)
    if row["Category"] != expected_cat:
        print(f"⚠️ Category mismatch at record {rec_num}!")



In [21]:
import csv

submission.to_csv(
    "../results/weak-nerMK2-smooth0-01-ensemble-confidence.tsv",
    sep="\t",
    header=False,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
    escapechar="\\"
)
