<a href="https://colab.research.google.com/github/r-kovalch/acter-ner/blob/main/notebooks/acter-gliner-large-en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/r-kovalch/acter-ner

fatal: destination path 'acter-ner' already exists and is not an empty directory.


In [2]:
!git clone https://github.com/AylaRT/ACTER

fatal: destination path 'ACTER' already exists and is not an empty directory.


In [3]:
%cd /content/ACTER

/content/ACTER


In [4]:
%cd /content/acter-ner/term_extractor

/content/acter-ner/term_extractor


In [5]:
ls

combine_corpora-en-fr-nl.sh  [0m[01;34moutput[0m/                     train_full.tsv
combine_corpora-en-fr.sh     preprocess_acter_gliner.py  train_model.py
combine_corpora_gliner.sh    preprocess_acter.py         train_spacy_model.py
combine_corpora.sh           test_full.jsonl             Untitled.ipynb
[01;34mconfigs[0m/                     test_full.tsv
dataset_processor.py         train_full.jsonl


In [6]:
!bash combine_corpora_gliner.sh

Writing JSON for GLiNER


In [7]:
!cp train_full.tsv train_full.jsonl && \
  cp test_full.tsv test_full.jsonl && \
  cp val_full.tsv val_full.jsonl

cp: cannot stat 'val_full.tsv': No such file or directory


In [8]:
!pip install -U "gliner>=0.2.19" "transformers>=4.51.0" \
               datasets accelerate evaluate seqeval --quiet

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
from pathlib import Path
from datasets import load_dataset
from gliner import GLiNER
from gliner.data_processing import GLiNERDataset, WordsSplitter
from gliner.data_processing.collator import DataCollatorWithPadding
from gliner.training import Trainer, TrainingArguments

import torch
from transformers import EarlyStoppingCallback

In [None]:
# 2. Load ACTER JSONL
raw = load_dataset(
    "json",
    data_files={"train": "./train_full.jsonl",
                "validation": "./test_full.jsonl"},
)

# 3. Char→token conversion (unchanged)
LABEL_SET=set()
def to_gliner(ex):
    tokens, c2t, p = [], {}, 0
    for i, tok in enumerate(ex["text"].split()):
        tokens.append(tok); c2t.update({p+j:i for j in range(len(tok))}); p+=len(tok)+1
    ner=[]
    for ent in ex["entities"]:
        s,e=c2t.get(ent["start"]),c2t.get(ent["end"]-1)
        if s is not None and e is not None:
            lbl=ent["label"].lower(); ner.append([s,e,lbl]); LABEL_SET.add(lbl)
    item={"tokenized_text":tokens,"ner":ner}
    if not ner: item["label"]=sorted(LABEL_SET)
    return item
train_py=[to_gliner(x) for x in raw["train"]]
dev_py  =[to_gliner(x) for x in raw["validation"]]
labels=sorted(LABEL_SET)

MODEL_NAME = "numind/NuNerZero"           # "numind/NuNerZero_long_context" for 2048‑tok context
model = GLiNER.from_pretrained(MODEL_NAME)

# Optional: longer max_len
model.set_sampling_params(max_len=384 if "long_context" not in MODEL_NAME else 2048)

# 4. Datasets
tok       = model.data_processor.transformer_tokenizer
splitter  = WordsSplitter(model.config.words_splitter_type)
train_ds  = GLiNERDataset(train_py, model.config, tok, splitter, entities=labels)
dev_ds    = GLiNERDataset(dev_py,  model.config, tok, splitter, entities=labels)

# 5. Collator
collator = DataCollatorWithPadding(model.config)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/ucu/ner/nuner_acter_ft",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,      # effective batch ≈32
    num_train_epochs=5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",
    seed=42,
)

# 7. Trainer
# ------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tok,
    data_collator=collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    preprocess_logits_for_metrics=lambda l, _: l.float().cpu(),
)
trainer.train()


In [None]:
# path to the best-loss checkpoint
best_path = "/content/drive/MyDrive/ucu/ner/nuner_acter_ft/"
print("Best checkpoint:", best_path)
best_model = GLiNER.from_pretrained(best_path).to("cuda")
out_str, f1 = best_model.evaluate(
    dev_py,
    threshold=0.35,
    entity_types=labels,
    batch_size=1         # adjust until it fits
)
print(out_str)          # P: 65.06%	R: 88.52%	F1: 75.00%


In [None]:
# from google.colab import runtime
# runtime.unassign()