<a href="https://colab.research.google.com/github/r-kovalch/acter-ner/blob/main/notebooks/acter-gliner-multi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository; later cells run from its term_extractor/ directory.
!git clone https://github.com/r-kovalch/acter-ner

In [None]:
# Fetch the ACTER repository (presumably the raw corpus consumed by the
# combine script run below — confirm against the script).
!git clone https://github.com/AylaRT/ACTER

In [None]:
# Enter the cloned ACTER checkout.
# NOTE(review): the following cell changes directory again before anything is
# run from here — confirm this step is still needed.
%cd /content/ACTER

In [None]:
# Work from the project's term_extractor/ directory: the combine script and the
# *_full.* data files used below are all referenced by relative path.
%cd /content/acter-ner/term_extractor

In [None]:
# Sanity check: list the working directory before running the combine script.
# Explicit %ls instead of a bare `ls`: the bare form relies on IPython's
# automagic, which is only applied to single-line cells.
%ls

In [None]:
# Run the repository's corpus-combining script from the current directory.
# Presumably this writes the train/test/val *_full.tsv files that the next
# cell copies — confirm against the script.
!bash combine_corpora_gliner_multi.sh

In [None]:
# Give the combined splits a .jsonl extension; the training cell loads
# train_full.jsonl and test_full.jsonl with the "json" dataset loader.
# NOTE(review): val_full.jsonl is created here but is not read by any visible cell.
!cp train_full.tsv train_full.jsonl && \
  cp test_full.tsv test_full.jsonl && \
  cp val_full.tsv val_full.jsonl

In [None]:
# Install the training stack. %pip (rather than !pip) guarantees the packages
# are installed into the interpreter that backs this kernel.
# NOTE(review): versions are only lower-bounded and -U pulls the newest
# release, so results may drift between runs — consider pinning exact versions.
%pip install -U "gliner>=0.2.19" "transformers>=4.51.0" \
               datasets accelerate evaluate seqeval --quiet

In [None]:
# Mount Google Drive at /content/drive; the training output_dir and the
# checkpoint paths used further down are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
from datasets import load_dataset
from gliner import GLiNER
from gliner.data_processing import GLiNERDataset, WordsSplitter
from gliner.data_processing.collator import DataCollatorWithPadding
from gliner.training import Trainer, TrainingArguments
import evaluate
import torch
from transformers import EarlyStoppingCallback

In [None]:
# Score threshold passed to model.evaluate() inside compute_metrics, and the
# pretrained GLiNER checkpoint that gets fine-tuned.
GLINER_THRESHOLD = 0.35
GLINER_MODEL = "urchade/gliner_multi-v2.1"


# 2.  Load ACTER JSONL
# NOTE(review): the "validation" split is read from test_full.jsonl, while
# val_full.jsonl is never loaded in the visible cells — confirm this is intended.
_ACTER_SPLIT_FILES = {
    "train": "./train_full.jsonl",
    "validation": "./test_full.jsonl",
}
raw = load_dataset("json", data_files=_ACTER_SPLIT_FILES)

# 3.  Convert char-level spans  ➜  GLiNER format
# Every (lower-cased) label emitted by to_gliner() is accumulated here.
LABEL_SET = set()
def to_gliner(ex):
    """Convert one char-span record into GLiNER's token-span format.

    Args:
        ex: mapping with a "text" string and an "entities" list whose items
            carry "start" / "end" character offsets ("end" is treated as
            exclusive) and a "label" string.

    Returns:
        dict with "tokenized_text" (whitespace-split tokens) and "ner"
        (``[first_token, last_token, label]`` triples with inclusive token
        indices and lower-cased labels). If no entity survives the
        conversion, a "label" key is added holding the sorted labels
        collected in LABEL_SET up to that point.

    Side effects:
        Adds every emitted label to the module-level LABEL_SET.
    """
    text = ex["text"]
    tokens, char2tok, cursor = [], {}, 0
    for i, tok in enumerate(text.split()):
        # Locate the token in the raw text rather than assuming tokens are
        # separated by exactly one space: leading whitespace, tabs, newlines
        # or repeated spaces would otherwise shift every later offset and
        # silently drop or misalign entities.
        begin = text.index(tok, cursor)
        tokens.append(tok)
        char2tok.update({begin + j: i for j in range(len(tok))})
        cursor = begin + len(tok)
    ner = []
    for ent in ex["entities"]:
        s = char2tok.get(ent["start"])
        e = char2tok.get(ent["end"] - 1)      # last character of the span
        # Spans whose boundaries fall on whitespace / outside the text are skipped.
        if s is not None and e is not None:
            lbl = ent["label"].lower()
            ner.append([s, e, lbl])
            LABEL_SET.add(lbl)
    item = {"tokenized_text": tokens, "ner": ner}
    if not ner:                               # sentences without entities
        item["label"] = sorted(LABEL_SET)     # see issue #139
    return item

train_py = [to_gliner(x) for x in raw["train"]]
dev_py   = [to_gliner(x) for x in raw["validation"]]
labels   = sorted(LABEL_SET)

# to_gliner() snapshots LABEL_SET while it is still being filled, so an
# entity-less sentence converted early receives an incomplete (possibly
# empty) "label" list. Now that every example has been seen, give all such
# sentences the complete label inventory.
for _item in train_py + dev_py:
    if not _item["ner"]:
        _item["label"] = list(labels)

# 4.  Wrap with GLiNERDataset
# Load the pretrained checkpoint named by GLINER_MODEL and reuse its own
# tokenizer and word-splitter setting when building the datasets.
model     = GLiNER.from_pretrained(GLINER_MODEL)
tok       = model.data_processor.transformer_tokenizer
splitter  = WordsSplitter(model.config.words_splitter_type)

# Both splits are wrapped with the same label inventory (`labels`).
train_ds = GLiNERDataset(train_py, model.config, tok, splitter,
                         entities=labels)
dev_ds   = GLiNERDataset(dev_py,  model.config, tok, splitter,
                         entities=labels)

# 5.  Use *DataCollatorWithPadding* (NOT DataCollator)
collator = DataCollatorWithPadding(model.config)

# ---------------- 6.  compute_metrics  --------------------
# Load the seqeval metric through the `evaluate` library.
# NOTE(review): `seqeval` is not referenced again in the visible cells;
# compute_metrics relies on model.evaluate() instead.
seqeval = evaluate.load("seqeval")

def char_to_tokens(txt):
    """helper: char idx ➜ token idx map for whitespace split text

    Args:
        txt: the raw text string.

    Returns:
        dict mapping the character offset of every non-whitespace character
        in ``txt`` to the index of the ``txt.split()`` token containing it.
        Whitespace offsets are absent from the mapping.
    """
    mapping, cursor = {}, 0
    for i, tok in enumerate(txt.split()):
        # Find the token's real position instead of assuming a single space
        # between tokens; leading or repeated whitespace would otherwise
        # shift every subsequent offset.
        begin = txt.index(tok, cursor)
        mapping.update({begin + j: i for j in range(len(tok))})
        cursor = begin + len(tok)
    return mapping

def spans_to_bio(tokens, spans, label2idx):
    """Render token-level spans as a BIO tag sequence.

    Args:
        tokens: the token list; only its length is used.
        spans: iterable of ``(first, last, label)`` with inclusive token indices.
        label2idx: accepted for interface compatibility; not used.

    Returns:
        list of tags, one per token: "B-<label>" on a span's first token,
        "I-<label>" on the rest of the span, "O" elsewhere. Later spans
        overwrite earlier ones where they overlap.
    """
    bio = ["O" for _ in range(len(tokens))]
    for first, last, name in spans:
        bio[first] = "B-{}".format(name)
        inside_tag = "I-{}".format(name)
        pos = first + 1
        while pos <= last:
            bio[pos] = inside_tag
            pos += 1
    return bio

import re

# pre-compile once – matches “P: 78.42%    R: 71.95%    F1: 75.03%”
_PRF_RE = re.compile(
    r"P:\s*([0-9.]+)%\s*R:\s*([0-9.]+)%\s*F1:\s*([0-9.]+)%",
    re.IGNORECASE,
)

def compute_metrics(_eval_pred):
    """Metric hook for the Trainer, backed by GLiNER's own evaluation.

    The ``_eval_pred`` argument is ignored. Instead ``model.evaluate`` is run
    on ``dev_py`` with ``GLINER_THRESHOLD`` and ``labels``; it is expected to
    return ``(output_str, f1)`` where ``output_str`` carries the precision,
    recall and F1 percentages matched by ``_PRF_RE``.

    Returns:
        dict with "ents_p" and "ents_r" (percentages parsed from the report
        string), "ents_f" (``f1 * 100``) and "score" (``f1`` unchanged, which
        the original comments describe as a 0-1 value).

    Raises:
        ValueError: if the report string does not match ``_PRF_RE``.
    """
    out_str, f1 = model.evaluate(
        dev_py, threshold=GLINER_THRESHOLD, entity_types=labels
    )

    # Precision and recall are only available inside the formatted report.
    match = _PRF_RE.search(out_str)
    if match is None:
        raise ValueError(f"Cannot parse PRF from: {out_str!r}")
    precision_pct = float(match.group(1))
    recall_pct = float(match.group(2))

    metrics = {}
    metrics["ents_p"] = precision_pct
    metrics["ents_r"] = recall_pct
    metrics["ents_f"] = f1 * 100      # same scale as the parsed percentages
    metrics["score"] = f1             # 0-1 scalar for best-model tracking
    return metrics


# 6.  TrainingArguments
# Hyper-parameters are gathered in one mapping and unpacked into
# TrainingArguments, so the whole run configuration is a single object.
_training_config = {
    "output_dir": "/content/drive/MyDrive/ucu/ner/gliner_multi",
    "learning_rate": 5e-6,
    "per_device_train_batch_size": 24,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 20,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "fp16": True,
    "report_to": "none",
    "seed": 42,
    "remove_unused_columns": False,   # keep custom keys like 'label'
}
args = TrainingArguments(**_training_config)

# 7.  Trainer
# Fine-tune `model` on train_ds, evaluating on dev_ds with the custom
# compute_metrics hook.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tok,                 # still accepted; FutureWarning OK
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# NOTE(review): training resumes from a hard-coded checkpoint under
# ".../gliner_multi_/" (trailing underscore), which is not the output_dir
# configured in `args` (".../gliner_multi") — confirm the path is intended.
# Presumably this line fails on a runtime where that checkpoint is absent.
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/ucu/ner/gliner_multi_/checkpoint-226725/")


In [None]:
# Directory the fine-tuned model is loaded from (the Trainer's output_dir).
# NOTE(review): the visible TrainingArguments do not enable best-model
# selection, so confirm that this directory really holds the intended
# ("best") weights rather than only checkpoint-* sub-directories.
best_path = "/content/drive/MyDrive/ucu/ner/gliner_multi/"
print("Best checkpoint:", best_path)

# Fall back to CPU so the cell still runs on a runtime without a GPU.
eval_device = "cuda" if torch.cuda.is_available() else "cpu"
best_model = GLiNER.from_pretrained(best_path).to(eval_device)

# Final evaluation uses its own threshold (0.75), not GLINER_THRESHOLD.
out_str, f1 = best_model.evaluate(
    dev_py,
    threshold=0.75,
    entity_types=labels,
    batch_size=1         # adjust until it fits
)
print(out_str)          # P: 65.06%	R: 88.52%	F1: 75.00%


In [None]:
# from google.colab import runtime
# runtime.unassign()