<a href="https://colab.research.google.com/github/r-kovalch/acter-ner/blob/main/notebooks/acter-gliner-small-en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/r-kovalch/acter-ner

fatal: destination path 'acter-ner' already exists and is not an empty directory.


In [2]:
!git clone https://github.com/AylaRT/ACTER

fatal: destination path 'ACTER' already exists and is not an empty directory.


In [3]:
%cd /content/ACTER

/content/ACTER


In [4]:
%cd /content/acter-ner/term_extractor

/content/acter-ner/term_extractor


In [5]:
ls

combine_corpora-en-fr-nl.sh  [0m[01;34mgliner_acter_ft[0m/            train_full.jsonl
combine_corpora-en-fr.sh     [01;34moutput[0m/                     train_full.tsv
combine_corpora_gliner.sh    preprocess_acter_gliner.py  train_model.py
combine_corpora.sh           preprocess_acter.py         train_spacy_model.py
[01;34mconfigs[0m/                     test_full.jsonl             Untitled.ipynb
dataset_processor.py         test_full.tsv


In [6]:
!bash combine_corpora_gliner.sh

Writing JSON for GLiNER


In [7]:
!cp train_full.tsv train_full.jsonl && \
  cp test_full.tsv test_full.jsonl && \
  cp val_full.tsv val_full.jsonl

cp: cannot stat 'val_full.tsv': No such file or directory


In [8]:
!pip install -U "gliner>=0.2.19" "transformers>=4.51.0" \
               datasets accelerate evaluate seqeval --quiet

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# 1.  Imports
from datasets import load_dataset
from gliner import GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing import GLiNERDataset, WordsSplitter
from gliner.data_processing.collator import DataCollatorWithPadding
import evaluate, torch, random, itertools, json

# 2.  Load ACTER JSONL
raw = load_dataset(
    "json",
    data_files={"train": "./train_full.jsonl",
                "validation": "./test_full.jsonl"},
)

# 3.  Convert char-level spans  ➜  GLiNER format
LABEL_SET = set()
def to_gliner(ex):
    tokens, char2tok, off = [], {}, 0
    for i, tok in enumerate(ex["text"].split()):
        tokens.append(tok)
        char2tok.update({off + j: i for j in range(len(tok))})
        off += len(tok) + 1
    ner = []
    for ent in ex["entities"]:
        s = char2tok.get(ent["start"]); e = char2tok.get(ent["end"] - 1)
        if s is not None and e is not None:
            lbl = ent["label"].lower()
            ner.append([s, e, lbl]); LABEL_SET.add(lbl)
    item = {"tokenized_text": tokens, "ner": ner}
    if not ner:                               # sentences without entities
        item["label"] = sorted(LABEL_SET)     # see issue #139
    return item

train_py = [to_gliner(x) for x in raw["train"]]
dev_py   = [to_gliner(x) for x in raw["validation"]]
labels   = sorted(LABEL_SET)

# 4.  Wrap with GLiNERDataset
model     = GLiNER.from_pretrained("gliner-community/gliner_small-v2.5")
tok       = model.data_processor.transformer_tokenizer
splitter  = WordsSplitter(model.config.words_splitter_type)

train_ds = GLiNERDataset(train_py, model.config, tok, splitter,
                         entities=labels)
dev_ds   = GLiNERDataset(dev_py,  model.config, tok, splitter,
                         entities=labels)

# 5.  Use *DataCollatorWithPadding* (NOT DataCollator)
collator = DataCollatorWithPadding(model.config)

# ---------------- 6.  compute_metrics  --------------------
seqeval = evaluate.load("seqeval")

def char_to_tokens(txt):
    """helper: char idx ➜ token idx map for whitespace split text"""
    m, p = {}, 0
    for i, t in enumerate(txt.split()):
        m.update({p + j: i for j in range(len(t))}); p += len(t) + 1
    return m

def spans_to_bio(tokens, spans, label2idx):
    tags = ["O"] * len(tokens)
    for s, e, lab in spans:
        tags[s] = f"B-{lab}"
        for i in range(s + 1, e + 1):
            tags[i] = f"I-{lab}"
    return tags

import re

# pre-compile once – matches “P: 78.42%    R: 71.95%    F1: 75.03%”
_PRF_RE = re.compile(
    r"P:\s*([0-9.]+)%\s*R:\s*([0-9.]+)%\s*F1:\s*([0-9.]+)%", re.I
)

def compute_metrics(_eval_pred):
    """
    Handles   model.evaluate() -> (output_str, f1)
    where output_str looks like:  "P: 78.42%\\tR: 71.95%\\tF1: 75.03%\\n"
    Returns ents_p / ents_r / ents_f (in %) and 'score' = F1 (0-1).
    """
    out_str, f1 = model.evaluate(                 # <- your Evaluator method
        dev_py,
        threshold=0.35,
        entity_types=labels
    )

    # -------- extract P and R from the string -----------------------------
    m = _PRF_RE.search(out_str)
    if not m:
        raise ValueError(f"Cannot parse PRF from: {out_str!r}")
    p, r = (float(m.group(1)), float(m.group(2)))   # already %
    # f1 returned by evaluate() is 0-1, convert to %
    f1_pct = f1 * 100

    return {
        "ents_p": p,
        "ents_r": r,
        "ents_f": f1_pct,
        "score":  f1,           # 0-1 scalar for best-model tracking
    }


# 6.  TrainingArguments
args = TrainingArguments(
    output_dir="gliner_acter_ft",
    learning_rate=5e-6,
    per_device_train_batch_size=8*7,
    per_device_eval_batch_size=8*7,
    num_train_epochs=35,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    report_to="none",
    seed=42,
    remove_unused_columns=False,   # keep custom keys like 'label'
)

# 7.  Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tok,                 # still accepted; FutureWarning OK
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()
model.save_pretrained("/content/drive/ucu/ner/gliner_acter_en_small_v2.5_ft")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

  trainer = Trainer(
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch,Training Loss,Validation Loss,Ents P,Ents R,Ents F,Score
1,No log,7.237331,59.46,81.15,68.630849,0.686308
2,No log,7.022026,60.53,84.84,70.648464,0.706485
3,10.295800,7.06157,63.91,85.66,73.204904,0.732049
4,10.295800,6.129638,73.55,83.2,78.076923,0.780769


Epoch,Training Loss,Validation Loss,Ents P,Ents R,Ents F,Score
1,No log,7.237331,59.46,81.15,68.630849,0.686308
2,No log,7.022026,60.53,84.84,70.648464,0.706485
3,10.295800,7.06157,63.91,85.66,73.204904,0.732049
4,10.295800,6.129638,73.55,83.2,78.076923,0.780769
5,4.648500,7.253981,76.92,77.87,77.393075,0.773931
6,4.648500,7.675869,76.19,78.69,77.419355,0.774194
7,4.648500,9.382392,74.17,82.38,78.058252,0.780583
8,2.986500,10.686533,79.12,80.74,79.918864,0.799189
9,2.986500,13.530726,76.92,81.97,79.365079,0.793651
10,2.077100,19.018234,72.95,84.02,78.095238,0.780952


OSError: [Errno 95] Operation not supported: '/content/drive/ucu'

In [None]:
from google.colab import runtime
runtime.unassign()