<a href="https://colab.research.google.com/github/r-kovalch/acter-ner/blob/main/notebooks/acter-nuner-rerank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; every dataset path used below lives under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import random

# Minimum number of TERM-tagged tokens the sampled training subset must contain.
K_w = 16

# Paths (adjust as needed).
# Input: TSV IOB file, one token<TAB>tag pair per line.
input_iob_tsv_path = "/content/drive/MyDrive/ucu/ner/threshold_datasets/ReRank_max_0.569_train_full.tsv"
# Output: sampled subset for the chosen K_w.
# NOTE: despite the .jsonl extension, the file is written below as ONE indented JSON array
# (json.dump with indent=2) and is read back with json.load, not line by line.
output_json_path = f"/content/drive/MyDrive/ucu/ner/threshold_datasets/ReRank_max_0.569_K_w_{K_w}.jsonl"

def load_iob_tsv(path):
    """
    Parse a UTF-8 TSV file in IOB layout into sentences.

    Each non-blank line is expected to hold exactly ``token<TAB>tag``. Blank
    (whitespace-only) lines separate sentences. Lines that do not split into
    exactly two tab-separated fields are ignored.

    Args:
        path: location of the TSV file.

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence, where both
        members are lists of strings of equal length.
    """
    corpus = []
    words, labels = [], []

    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split('\t')
                # Anything other than token + tag is treated as malformed and dropped.
                if len(fields) == 2:
                    words.append(fields[0])
                    labels.append(fields[1])
            elif words:
                # Blank line closes the sentence collected so far.
                corpus.append((words, labels))
                words, labels = [], []

    # The file may end without a trailing blank line; keep the open sentence.
    if words:
        corpus.append((words, labels))
    return corpus

def count_term_tokens(tags):
    """Return how many tags in an IOB tag list end with ``TERM`` (case-sensitive)."""
    term_total = 0
    for tag in tags:
        if tag.endswith("TERM"):
            term_total += 1
    return term_total

def sample_sentences_for_kw(pool, kw, seed=42):
    """
    Shuffle the sentences reproducibly, then take them in order until the running
    TERM-token count reaches ``kw``.

    Args:
        pool: list of ``(tokens, tags)`` sentences.
        kw: target number of TERM tokens. Sentences are added until the running
            total is >= ``kw``; if the pool holds fewer TERM tokens than ``kw``
            the whole (shuffled) pool is returned.
        seed: seed for the shuffle.

    Returns:
        A list of the selected ``(tokens, tags)`` pairs, in shuffled order.
    """
    # A private generator yields the same permutation as random.seed(seed) followed by
    # random.shuffle, but does not reseed the module-level RNG shared with other code.
    rng = random.Random(seed)
    sentences_with_counts = [(tokens, tags, count_term_tokens(tags)) for tokens, tags in pool]
    rng.shuffle(sentences_with_counts)

    selected = []
    total_terms = 0
    for tokens, tags, count in sentences_with_counts:
        if total_terms >= kw:
            break
        # Sentences without any TERM token are still selected; they just do not
        # advance the running total.
        selected.append((tokens, tags))
        total_terms += count

    return selected

def convert_to_json_records(selected_sentences):
    """
    Turn ``(tokens, tags)`` pairs into JSON-serializable dicts.

    Returns:
        One ``{"tokens": [...], "ner_tags": [...]}`` dict per input sentence,
        in the same order.
    """
    return [
        {"tokens": sentence_tokens, "ner_tags": sentence_tags}
        for sentence_tokens, sentence_tags in selected_sentences
    ]
# Load every sentence from the full training TSV.
all_sentences = load_iob_tsv(input_iob_tsv_path)

# Draw a reproducible random subset (default seed) holding at least K_w TERM tokens.
selected_sentences = sample_sentences_for_kw(all_sentences, K_w)

# One {"tokens", "ner_tags"} dict per selected sentence.
json_records = convert_to_json_records(selected_sentences)

# Write the subset as a single indented JSON array. ensure_ascii=False keeps non-ASCII
# tokens readable, so the file must be read back as UTF-8.
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(json_records, f, ensure_ascii=False, indent=2)

# Last expression of the cell: displays where the subset was written.
output_json_path


'/content/drive/MyDrive/ucu/ner/threshold_datasets/ReRank_max_0.569_K_w_16.jsonl'

In [2]:
!pip install gliner==0.1.12
!pip install --upgrade transformers




In [3]:
import json
from gliner import GLiNER

import torch
from tqdm import tqdm
from transformers import get_cosine_schedule_with_warmup
import os

In [9]:
def iob_tags_to_spans(tags):
    """
    Collapse an IOB tag sequence into inclusive ``(start, end, label)`` spans.

    A span opens at every ``B-`` tag (upper or lower case prefix) and extends over
    the ``I-`` tags that immediately follow it. The label is the text after the
    first hyphen of the ``B-`` tag, lower-cased (``"B-TERM"`` -> ``"term"``).
    ``I-`` tags that do not follow a ``B-`` tag, and all other tags, produce no span.

    NOTE(review): spans are entity-level (one span per multi-token term); confirm
    whether NuNerZero training expects one span per token instead.
    """
    spans = []
    i = 0
    while i < len(tags):
        tag = tags[i]
        if tag.startswith(("B-", "b-")):
            label = tag.split("-", 1)[1].lower()
            start = i
            i += 1
            # Consume the continuation tags that belong to this span.
            while i < len(tags) and tags[i].startswith(("I-", "i-")):
                i += 1
            spans.append((start, i - 1, label))
        else:
            i += 1
    return spans


train_path = output_json_path

# The subset was written as UTF-8 with ensure_ascii=False, so it has to be decoded as
# UTF-8 explicitly rather than with the platform default encoding.
with open(train_path, "r", encoding="utf-8") as f:
    raw_records = json.load(f)

# Re-key each record into the layout passed to the model for training:
# "tokenized_text" holds the tokens, "ner" the (start, end, label) spans.
new_data = [
    {
        "tokenized_text": example["tokens"],
        "ner": iob_tags_to_spans(example["ner_tags"]),
    }
    for example in raw_records
]

data = new_data

In [11]:
# Load the pretrained "numind/NuNerZero" checkpoint through GLiNER (downloaded from the
# Hugging Face Hub on first use).
model = GLiNER.from_pretrained("numind/NuNerZero")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/1.80G [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
from types import SimpleNamespace

# Hyperparameters read by train(); grouped in one namespace so they can be tuned in one place.
config = SimpleNamespace(
    num_steps=10000,  # regulate the number of train/eval steps depending on the data size
    eval_every=1000,  # evaluate and save a checkpoint every this many steps
    train_batch_size=2,  # regulate batch size depending on GPU memory available

    max_len=384,  # maximum sentence length, 2048 for NuNerZero_long_context
    save_directory="logs",  # log dir
    # Training device: use the GPU when one is visible, otherwise fall back to CPU
    # instead of failing when the model is moved to 'cuda'.
    device='cuda' if torch.cuda.is_available() else 'cpu',

    warmup_ratio=0.1,  # keep other parameters unchanged
    lr_encoder=1e-5,
    lr_others=5e-5,
    freeze_token_rep=False,

    max_types=25,
    shuffle_types=True,
    random_drop=True,
    max_neg_type_ratio=1,
)

In [None]:
def train(model, config, train_data, eval_data=None):
    """
    Fine-tune ``model`` for ``config.num_steps`` steps with a cosine learning-rate
    schedule, evaluating and checkpointing every ``config.eval_every`` steps.

    Args:
        model: model object; must provide ``to``, ``set_sampling_params``,
            ``create_dataloader``, ``get_optimizer``, ``evaluate`` and
            ``save_pretrained``, and return a scalar loss tensor when called on a batch.
        config: namespace with ``device``, ``max_types``, ``shuffle_types``,
            ``random_drop``, ``max_neg_type_ratio``, ``max_len``, ``train_batch_size``,
            ``lr_encoder``, ``lr_others``, ``freeze_token_rep``, ``num_steps``,
            ``warmup_ratio``, ``eval_every`` and ``save_directory``.
        train_data: training samples, handed unchanged to ``model.create_dataloader``.
        eval_data: optional dict with keys ``"samples"`` and ``"entity_types"``;
            when given it is evaluated at every checkpoint step.

    Raises:
        ValueError: if the training data loader yields no batches.

    Side effects:
        Writes checkpoints to ``{config.save_directory}/finetuned_{step}`` and leaves
        the model in training mode.
    """
    model = model.to(config.device)

    # Set sampling parameters from config
    model.set_sampling_params(
        max_types=config.max_types,
        shuffle_types=config.shuffle_types,
        random_drop=config.random_drop,
        max_neg_type_ratio=config.max_neg_type_ratio,
        max_len=config.max_len
    )

    model.train()

    # Initialize data loaders
    train_loader = model.create_dataloader(train_data, batch_size=config.train_batch_size, shuffle=True)

    # Fail early and clearly: with no batches the loop below would otherwise die on a
    # bare StopIteration (or a division by zero in the epoch computation).
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        raise ValueError("train_data produced an empty data loader; nothing to train on")

    # Optimizer
    optimizer = model.get_optimizer(config.lr_encoder, config.lr_others, config.freeze_token_rep)

    pbar = tqdm(range(config.num_steps))

    # warmup_ratio < 1 is a fraction of num_steps; otherwise it is an absolute step count.
    if config.warmup_ratio < 1:
        num_warmup_steps = int(config.num_steps * config.warmup_ratio)
    else:
        num_warmup_steps = int(config.warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=config.num_steps
    )

    iter_train_loader = iter(train_loader)

    for step in pbar:
        # Cycle through the loader indefinitely: restart it whenever an epoch ends.
        try:
            x = next(iter_train_loader)
        except StopIteration:
            iter_train_loader = iter(train_loader)
            x = next(iter_train_loader)

        for k, v in x.items():
            if isinstance(v, torch.Tensor):
                x[k] = v.to(config.device)

        loss = model(x)  # Forward pass

        # A NaN loss skips the parameter/scheduler update for this step, but the
        # periodic evaluation and checkpoint below still run.
        if not torch.isnan(loss):
            loss.backward()  # Compute gradients
            optimizer.step()  # Update parameters
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()  # Reset gradients

            description = f"step: {step} | epoch: {step // steps_per_epoch} | loss: {loss.item():.2f}"
            pbar.set_description(description)

        if (step + 1) % config.eval_every == 0:

            model.eval()

            if eval_data is not None:
                results, _ = model.evaluate(eval_data["samples"], flat_ner=True, threshold=0.5, batch_size=12,
                                            entity_types=eval_data["entity_types"])

                print(f"Step={step}\n{results}")

            # exist_ok avoids the check-then-create race of a separate os.path.exists test.
            os.makedirs(config.save_directory, exist_ok=True)

            model.save_pretrained(f"{config.save_directory}/finetuned_{step}")

            model.train()

In [None]:
# Validation split; parsed below with load_iob_tsv, so it must use the same
# token<TAB>tag IOB layout as the training TSV.
val_tsv_path = "/content/drive/MyDrive/ucu/ner/threshold_datasets/val_full.tsv"


def extract_entity_types(sentences):
    """
    Collect the distinct entity types present in a list of ``(tokens, tags)`` sentences.

    The type is the text after the first hyphen of a tag (``"TERM"`` in ``"B-TERM"``),
    lower-cased. Tags without a hyphen (``"O"``, ``"o"``, or any other un-prefixed
    value) and tags with nothing after the hyphen carry no type and are skipped
    instead of raising.

    Returns:
        The lower-cased entity types, sorted alphabetically.
    """
    types = set()
    for _, tags in sentences:
        for t in tags:
            _prefix, separator, suffix = t.partition("-")
            if not separator or not suffix:
                continue
            types.add(suffix.lower())
    return sorted(types)


def to_json_records(sentences):
    """
    Map each ``(tokens, tags)`` sentence to a ``{"tokens": [...], "ner_tags": [...]}`` dict.

    Produces the same record layout as ``convert_to_json_records``; input order is kept.
    """
    def as_record(sentence):
        words, labels = sentence
        return {"tokens": words, "ner_tags": labels}

    return list(map(as_record, sentences))



def _val_tags_to_spans(tags):
    """
    Collapse IOB tags into inclusive ``(start, end, label)`` spans with lower-cased
    labels, mirroring the conversion applied to the training records.
    """
    spans = []
    i = 0
    while i < len(tags):
        if tags[i].startswith(("B-", "b-")):
            label = tags[i].split("-", 1)[1].lower()
            start = i
            i += 1
            while i < len(tags) and tags[i].startswith(("I-", "i-")):
                i += 1
            spans.append((start, i - 1, label))
        else:
            i += 1
    return spans


val_sentences = load_iob_tsv(val_tsv_path)

entity_types = extract_entity_types(val_sentences)

val_records = to_json_records(val_sentences)

# Assemble eval_data with samples in the same layout as the training records
# ("tokenized_text" + "ner" spans) rather than the raw "tokens"/"ner_tags" dicts, so
# training and evaluation samples share one format.
eval_data = {
    "entity_types": entity_types,
    "samples": [
        {
            "tokenized_text": record["tokens"],
            "ner": _val_tags_to_spans(record["ner_tags"]),
        }
        for record in val_records
    ],
}

In [None]:
# Evaluation samples need the same preprocessing as the training data, and entity types
# must be lower-cased.
#
# Keep the validation eval_data built in the previous cell when its samples already use
# the training layout ("tokenized_text" / "ner"). Otherwise fall back to a small slice of
# the training data, with entity types taken from the labels actually present in `data`
# (already lower-cased by the IOB -> span conversion) instead of unrelated placeholder types.
eval_samples = eval_data["samples"]
if not (eval_samples and "tokenized_text" in eval_samples[0]):
    eval_data = {
        "entity_types": sorted({label for example in data for _, _, label in example["ner"]}),
        "samples": data[:10],
    }

train(model, config, data, eval_data)