In [1]:
%%capture
!pip install accelerate -U
!pip install transformers huggingface_hub
!pip install gliner[gpu]


import os
import json
import random
import torch
from gliner import GLiNER
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

if not os.path.exists("models"):
        os.makedirs("models")
if not os.path.exists("data"):
        os.makedirs("data")

In [2]:
with open('train_gliner.json', 'r') as file:
    annotated_data = json.load(file)

with open('dev_gliner.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

all_labels = []
for example in annotated_data:
    ner_data = example.get("ner", [])
    for entity in ner_data:
        label = entity[2] 
        if label not in all_labels:
            all_labels.append(label)

In [3]:
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from transformers import TrainerCallback

class MyCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} ended.")
        my_custom_function()  # Call your function here.
        return control

def my_custom_function():
    # Your custom logic here
    results, f1 = model.evaluate(test_data, flat_ner=True, threshold=0.95, batch_size=12, entity_types=all_labels)
    print(f1)


# Assuming GLiNER and GLiNERDataset are already defined/imported
os.environ["TOKENIZERS_PARALLELISM"] = "true"

def create_models_directory():
    if not os.path.exists("models"):
        os.makedirs("models")

def train_model(model_name, custom_model_name, learning_rate, weight_decay, batch_size, epochs, compile_model):
    

    train_dataset = GLiNERDataset(train_data, model.config, data_processor=model.data_processor)
    test_dataset = GLiNERDataset(test_data, model.config, data_processor=model.data_processor)

    # use it for better performance, it mimics original implementation but it's less memory efficient
    data_collator = DataCollatorWithPadding(model.config)

    if compile_model:
        print("Compiling model for faster training...")
        torch.set_float32_matmul_precision('high')
        model.to(device)
        model.compile_for_training()
    else:
        model.to(device)

    training_args = TrainingArguments(
        output_dir="models",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        others_lr=learning_rate,
        others_weight_decay=weight_decay,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        eval_strategy="epoch",
        save_total_limit=3,
        dataloader_num_workers=1,
        use_cpu=(device == torch.device('cpu')),
        report_to="none",
        save_strategy="epoch",
        metric_for_best_model="eval_loss",      # Specify the metric to monitor
        greater_is_better=False,                # Set based on the metric (False for loss)
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
        callbacks=[MyCallback]
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(f"models/{custom_model_name}")

    print("Training completed successfully.")
    return model


# Replace these with actual values as needed
model_name = "urchade/gliner_multi-v2.1"
custom_model_name = "my_custom_model"
weight_decay = 0.05
batch_size = 32
learning_rate = 0.00001
epochs = 25
compile_model = False

create_models_directory()
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

print(f"Using device: {device}")
print("Loading model...")

model = GLiNER.from_pretrained(model_name)

print("Loading and preparing data...")
with open("train_gliner.json", "r", encoding='utf-8') as f:
    train_data = json.load(f)
with open("dev_gliner.json", "r", encoding='utf-8') as f:
    test_data = json.load(f)

trained_model = train_model(model_name, custom_model_name,learning_rate, weight_decay, batch_size, epochs, compile_model)
print("Model is trained and returned.")

Using device: cuda:0
Loading model...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



Loading and preparing data...
Collecting all entities...


100%|██████████| 8928/8928 [00:00<00:00, 1816921.21it/s]


Total number of entity classes:  13
Collecting all entities...


100%|██████████| 2233/2233 [00:00<00:00, 1874300.75it/s]

Total number of entity classes:  13





Starting training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,33.599777
2,50.975700,24.002638
3,50.975700,19.792355
4,20.712600,23.617664
5,20.712600,24.734045
6,12.717300,23.25017
7,12.717300,29.923637
8,9.460300,28.631353
9,7.477300,31.627865
10,7.477300,33.498848


Epoch 1.0 ended.
0.7037842540234885
Epoch 2.0 ended.
0.7948822095857027
Epoch 3.0 ended.
0.8270676691729324
Epoch 4.0 ended.
0.8530643319375713
Epoch 5.0 ended.
0.8666423090975521
Epoch 6.0 ended.
0.8702428416092788
Epoch 7.0 ended.
0.874061135371179
Epoch 8.0 ended.
0.8816412414518675
Epoch 9.0 ended.
0.8809688581314878
Epoch 10.0 ended.
0.8846834099752388
Epoch 11.0 ended.
0.876318975955717
Epoch 12.0 ended.
0.8789610389610389
Epoch 13.0 ended.
0.8853314772925004
Epoch 14.0 ended.
0.8812619171433524
Epoch 15.0 ended.
0.8757679180887371
Epoch 16.0 ended.
0.8799171842650104
Epoch 17.0 ended.
0.8794253463314521
Epoch 18.0 ended.
0.8774676650782846
Epoch 19.0 ended.
0.8809564474807857
Epoch 20.0 ended.
0.8836814399728306
Epoch 21.0 ended.
0.8798224650051211
Epoch 22.0 ended.
0.8840629274965801
Epoch 23.0 ended.
0.8808873720136519
Epoch 24.0 ended.
0.8788703640694112
Epoch 25.0 ended.
0.8800681431005111
Training completed successfully.
Model is trained and returned.


**Choose a model and set training parameters for your needs**

In [None]:
# Final model

In [4]:
# Assuming GLiNER and GLiNERDataset are already defined/imported
os.environ["TOKENIZERS_PARALLELISM"] = "true"


def train_model(model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model):

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    print("Loading model...")
    model = GLiNER.from_pretrained(model_name)

    print("Loading and preparing data...")

    with open('full_train_gliner.json', 'r', encoding='utf-8') as file:
        train_data = json.load(file)
        random.seed(42)
        random.shuffle(train_data)
    
    with open('test_gliner.json', 'r', encoding='utf-8') as file:
        test_data = json.load(file)

    train_dataset = GLiNERDataset(train_data, model.config, data_processor=model.data_processor)
    test_dataset = GLiNERDataset(test_data, model.config, data_processor=model.data_processor)
    data_collator = DataCollatorWithPadding(model.config)

    if compile_model:
        print("Compiling model for faster training...")
        torch.set_float32_matmul_precision('high')
        model.to(device)
        model.compile_for_training()
    else:
        model.to(device)

    training_args = TrainingArguments(
        output_dir="models",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        others_lr=learning_rate,
        others_weight_decay=weight_decay,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        dataloader_num_workers=1,
        use_cpu=(device == torch.device('cpu')),
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(f"models/{custom_model_name}")

    print("Training completed successfully.")
    return model


# Replace these with actual values as needed
model_name = "urchade/gliner_multi-v2.1"
custom_model_name = "my_custom_model"
train_path = os.path.join("data", "annotated_data.json")
split_ratio = 0.9
learning_rate = 0.00001
weight_decay = 0.05
batch_size = 32
epochs = 13
compile_model = False

trained_model = train_model(model_name, custom_model_name, train_path, split_ratio,
                            learning_rate, weight_decay, batch_size, epochs, compile_model)
print("Model is trained and returned.")


Using device: cuda:0
Loading model...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



Loading and preparing data...
Collecting all entities...


100%|██████████| 11161/11161 [00:00<00:00, 1115648.88it/s]


Total number of entity classes:  13
Collecting all entities...


100%|██████████| 5087/5087 [00:00<00:00, 1848910.26it/s]

Total number of entity classes:  13



  trainer = Trainer(


Starting training...


Step,Training Loss
500,46.5749
1000,20.1543
1500,13.3298
2000,10.5228
2500,7.9316
3000,7.9921
3500,6.2783
4000,5.6427
4500,4.6373


Training completed successfully.
Model is trained and returned.


In [6]:
with open('test_gliner.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

results, f1 = model.evaluate(test_data, flat_ner=True, threshold=0.95, batch_size=12, entity_types=all_labels)
print(results)

P: 84.97%	R: 88.99%	F1: 86.93%

