In [1]:
%%capture
!pip install accelerate -U
!pip install transformers huggingface_hub
!pip install gliner[gpu]


import os
import json
import random
import torch
from gliner import GLiNER
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

# Make sure the working directories used by the notebook exist.
# exist_ok=True makes this idempotent and avoids the check-then-create race.
for _directory in ("models", "data"):
    os.makedirs(_directory, exist_ok=True)

In [16]:
# Relies on the notebook's import cell having been run first (os, json, torch,
# GLiNER, GLiNERDataset, Trainer, TrainingArguments, DataCollatorWithPadding).
# Sets the TOKENIZERS_PARALLELISM environment flag explicitly before the training code below runs.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

def create_models_directory():
    """Create the ``models`` directory in the current working directory if it is missing.

    Safe to call repeatedly: an already existing directory is left untouched.
    """
    # exist_ok=True replaces the separate existence check, which could race with
    # another process creating the directory between the check and the creation.
    os.makedirs("models", exist_ok=True)

def train_model(model_name, custom_model_name, learning_rate, weight_decay, batch_size, epochs, compile_model,
                train_path="train_gliner.json", eval_path="dev_gliner.json", output_dir="models"):
    """Fine-tune a pretrained GLiNER checkpoint, evaluating and checkpointing after every epoch.

    The trainer is configured with ``load_best_model_at_end=True`` and monitors
    ``eval_loss`` (lower is better), keeping at most three checkpoints.

    Args:
        model_name: Identifier handed to ``GLiNER.from_pretrained``.
        custom_model_name: Name of the sub-directory of ``output_dir`` the final model is saved under.
        learning_rate: Value used for both ``learning_rate`` and ``others_lr``.
        weight_decay: Value used for both ``weight_decay`` and ``others_weight_decay``.
        batch_size: Per-device batch size for both training and evaluation.
        epochs: Number of training epochs.
        compile_model: When true, ``model.compile_for_training()`` is called before training.
        train_path: JSON file holding the training examples.
        eval_path: JSON file holding the examples evaluated at the end of each epoch.
        output_dir: Directory receiving trainer checkpoints and the final saved model.

    Returns:
        The ``GLiNER`` model instance that was handed to the trainer.
    """
    os.makedirs(output_dir, exist_ok=True)

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    print("Loading model...")
    model = GLiNER.from_pretrained(model_name)

    print("Loading and preparing data...")

    with open(train_path, "r", encoding='utf-8') as f:
        train_data = json.load(f)

    with open(eval_path, "r", encoding='utf-8') as f:
        eval_data = json.load(f)

    train_dataset = GLiNERDataset(train_data, model.config, data_processor=model.data_processor)
    eval_dataset = GLiNERDataset(eval_data, model.config, data_processor=model.data_processor)
    data_collator = DataCollatorWithPadding(model.config)

    # The model is moved to the target device in every case; only compilation is optional.
    model.to(device)
    if compile_model:
        print("Compiling model for faster training...")
        torch.set_float32_matmul_precision('high')
        model.compile_for_training()

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        others_lr=learning_rate,
        others_weight_decay=weight_decay,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        eval_strategy="epoch",
        save_total_limit=3,
        dataloader_num_workers=1,
        use_cpu=(device == torch.device('cpu')),
        report_to="none",
        save_strategy="epoch",                  # Must match eval_strategy for best-model tracking
        load_best_model_at_end=True,            # Enable loading best model at end
        metric_for_best_model="eval_loss",      # Specify the metric to monitor
        greater_is_better=False,                # Set based on the metric (False for loss)
    )

    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
    # favour of `processing_class=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(os.path.join(output_dir, custom_model_name))

    print("Training completed successfully.")
    return model


# Run configuration -- adjust these to your own checkpoint, data and hardware.
model_name = "urchade/gliner_multi-v2.1"    # pretrained checkpoint to start from
custom_model_name = "my_custom_model"       # name the fine-tuned model is saved under
learning_rate = 1e-5
weight_decay = 0.05
# NOTE(review): the recorded run of this cell logged "CUDA out of memory" and skipped
# iterations with this batch size -- consider lowering it for your GPU.
batch_size = 32
epochs = 10
compile_model = False

# Keyword arguments keep each value bound to the intended parameter.
trained_model = train_model(
    model_name=model_name,
    custom_model_name=custom_model_name,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    batch_size=batch_size,
    epochs=epochs,
    compile_model=compile_model,
)
print("Model is trained and returned.")

Using device: cuda:0
Loading model...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



Loading and preparing data...
Collecting all entities...


100%|██████████| 8928/8928 [00:00<00:00, 1781481.74it/s]


Total number of entity classes:  13
Collecting all entities...


100%|██████████| 2233/2233 [00:00<00:00, 1747691.89it/s]

Total number of entity classes:  13



  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,No log,27.795115
2,41.035200,21.67465
3,41.035200,20.837543
4,16.964100,22.425411
5,16.964100,25.021345
6,11.388100,24.418964
7,11.388100,25.79841
8,8.496600,27.727438
9,7.306100,31.481363
10,7.306100,29.391993


Skipping iteration due to error: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 44.42 GiB of which 94.12 MiB is free. Process 3189696 has 44.32 GiB memory in use. Of the allocated memory 41.77 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Skipping iteration due to error: CUDA out of memory. Tried to allocate 290.00 MiB. GPU 0 has a total capacity of 44.42 GiB of which 270.12 MiB is free. Process 3189696 has 44.15 GiB memory in use. Of the allocated memory 41.92 GiB is allocated by PyTorch, and 1.72 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See docume

**Choose a model and set training parameters for your needs**

In [None]:
# Final model

In [25]:
# Relies on the notebook's import cell having been run first (os, json, random, torch,
# GLiNER, GLiNERDataset, Trainer, TrainingArguments, DataCollatorWithPadding).
# Sets the TOKENIZERS_PARALLELISM environment flag explicitly before the training code below runs.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


def train_model(model_name, custom_model_name, train_path, split_ratio, learning_rate, weight_decay, batch_size, epochs, compile_model,
                train_file='full_train_gliner.json', test_file='test_gliner.json', output_dir="models"):
    """Fine-tune a pretrained GLiNER checkpoint on the full training set and save it.

    The training examples are shuffled with a fixed seed (42) before the dataset is built.

    Args:
        model_name: Identifier handed to ``GLiNER.from_pretrained``.
        custom_model_name: Name of the sub-directory of ``output_dir`` the final model is saved under.
        train_path: Accepted for call compatibility only; this function never reads it.
            The training data comes from ``train_file``.
        split_ratio: Accepted for call compatibility only; no train/test split is performed here.
        learning_rate: Value used for both ``learning_rate`` and ``others_lr``.
        weight_decay: Value used for both ``weight_decay`` and ``others_weight_decay``.
        batch_size: Per-device batch size for both training and evaluation.
        epochs: Number of training epochs.
        compile_model: When true, ``model.compile_for_training()`` is called before training.
        train_file: JSON file holding the training examples.
        test_file: JSON file holding the examples passed to the trainer as ``eval_dataset``.
        output_dir: Directory receiving trainer checkpoints and the final saved model.

    Returns:
        The ``GLiNER`` model instance that was handed to the trainer.
    """
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device: {device}")

    print("Loading model...")
    model = GLiNER.from_pretrained(model_name)

    print("Loading and preparing data...")

    with open(train_file, 'r', encoding='utf-8') as file:
        train_data = json.load(file)
    # Fixed seed so the shuffled order is the same on every run.
    random.seed(42)
    random.shuffle(train_data)

    with open(test_file, 'r', encoding='utf-8') as file:
        test_data = json.load(file)

    train_dataset = GLiNERDataset(train_data, model.config, data_processor=model.data_processor)
    test_dataset = GLiNERDataset(test_data, model.config, data_processor=model.data_processor)
    data_collator = DataCollatorWithPadding(model.config)

    # The model is moved to the target device in every case; only compilation is optional.
    model.to(device)
    if compile_model:
        print("Compiling model for faster training...")
        torch.set_float32_matmul_precision('high')
        model.compile_for_training()

    # NOTE(review): no eval_strategy is set here, so `eval_dataset` is presumably not
    # evaluated during training (the recorded run only logged training loss) -- confirm.
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        others_lr=learning_rate,
        others_weight_decay=weight_decay,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        dataloader_num_workers=1,
        use_cpu=(device == torch.device('cpu')),
        report_to="none",
    )

    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
    # favour of `processing_class=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=model.data_processor.transformer_tokenizer,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    model.save_pretrained(os.path.join(output_dir, custom_model_name))

    print("Training completed successfully.")
    return model


# Run configuration for the final model -- adjust to your own checkpoint, data and hardware.
model_name = "urchade/gliner_multi-v2.1"    # pretrained checkpoint to start from
custom_model_name = "my_custom_model"       # name the fine-tuned model is saved under
# train_path and split_ratio are required by train_model's signature, but the
# train_model defined in this cell does not read either of them.
train_path = os.path.join("data", "annotated_data.json")
split_ratio = 0.9
learning_rate = 1e-5
weight_decay = 0.05
batch_size = 32
epochs = 3
compile_model = False

# Keyword arguments keep each value bound to the intended parameter.
trained_model = train_model(
    model_name=model_name,
    custom_model_name=custom_model_name,
    train_path=train_path,
    split_ratio=split_ratio,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    batch_size=batch_size,
    epochs=epochs,
    compile_model=compile_model,
)
print("Model is trained and returned.")


Using device: cuda:0
Loading model...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading and preparing data...
Collecting all entities...


100%|██████████| 11161/11161 [00:00<00:00, 1004972.56it/s]


Total number of entity classes:  13
Collecting all entities...


100%|██████████| 5087/5087 [00:00<00:00, 1771833.95it/s]

Total number of entity classes:  13



  trainer = Trainer(


Starting training...


Step,Training Loss
500,35.9417
1000,18.1918


Training completed successfully.
Model is trained and returned.


In [26]:
# Load the annotated corpus (used here only to enumerate the label names) and the test set.
# Both files are opened as UTF-8 explicitly so decoding does not depend on the platform default.
with open('data/annotated_data.json', 'r', encoding='utf-8') as file:
    annotated_data = json.load(file)

# NOTE(review): training read 'test_gliner.json' from the working directory, while this
# cell reads 'data/test_gliner.json' -- confirm both refer to the same file.
with open('data/test_gliner.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Unique entity labels in first-seen order. The label is read from index 2 of each
# "ner" entry; examples without a "ner" key contribute nothing.
all_labels = list(dict.fromkeys(
    entity[2]
    for example in annotated_data
    for entity in example.get("ner", [])
))

# Score the fine-tuned model on the test set; the F1 value is displayed as the cell output.
results, f1 = trained_model.evaluate(test_data, flat_ner=True, threshold=0.95, batch_size=12, entity_types=all_labels)
f1

np.float64(0.8521573604060915)