In [4]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 

# Load the "conll2003" dataset from the Hugging Face hub (or the local cache).
# trust_remote_code=True lets `datasets` execute the dataset's loading script.
ner_dataset = datasets.load_dataset("conll2003", trust_remote_code=True)
# Display the available splits and their columns.
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [23]:
# Inspect the schema of the `ner_tags` column: a sequence of class labels
# whose names are the BIO tags listed in the output below.
ner_dataset["train"].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [11]:
# Show the first training example: word-level tokens plus integer-encoded
# POS, chunk and NER tags (one tag per token).
ner_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

## 2. Prepare the NER model

In [13]:
import torch
from torch import nn, Tensor
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader

from transformers import CamembertModel, CamembertTokenizer, CamembertConfig
from datasets import load_dataset

In [14]:
class CamemBERTBaseModel(nn.Module):
    """
    Wrapper around a pre-trained CamemBERT encoder that exposes its last
    hidden states and its hidden size so task-specific heads can be attached.
    """

    def __init__(self, model_path: str, trainable: bool = False):
        """
        Initialize the base CamemBERT model.
        :param model_path: Path to the pre-trained CamemBERT model.
        :param trainable: If False, the encoder weights are frozen and the
            encoder is kept in eval mode; if True, it is fine-tuned.
        """
        super().__init__()
        self.base_model = CamembertModel.from_pretrained(model_path)
        self.trainable = trainable
        # Backward-compatible alias: the flag used to be stored under this
        # misspelled attribute name only.
        self.tranaible = trainable
        # Use the configuration of the checkpoint that was actually loaded.
        # A freshly constructed default config can disagree with the
        # checkpoint (e.g. a different hidden size), which would make
        # get_hidden_size() report the wrong width to downstream heads.
        self.config = self.base_model.config

        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
        # train(False) is equivalent to eval().
        self.base_model.train(trainable)

    def train(self, mode: bool = True):
        """
        Switch training mode, keeping a frozen encoder in eval mode.

        nn.Module.train() propagates the mode to every sub-module; without this
        override a parent calling .train() would re-enable dropout inside an
        encoder that was explicitly frozen.
        :param mode: True for training mode, False for evaluation mode.
        :return: self, as nn.Module.train() does.
        """
        super().train(mode)
        if not self.trainable:
            self.base_model.eval()
        return self

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the base model.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :return: Last hidden states from the base model.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_hidden_size(self) -> int:
        """
        Get the hidden size of the base model for dynamically attaching heads.
        :return: Hidden size read from the loaded checkpoint's configuration.
        """
        return self.config.hidden_size

In [24]:
class NerFinetuningModel(nn.Module):
    """CamemBERT encoder followed by a linear per-token tagging layer."""

    def __init__(self, model_path: str, num_labels: int = 9, trainable: bool = True):
        """
        Build the encoder and the NER classification head.
        :param model_path: Path to the pre-trained CamemBERT model.
        :param num_labels: Number of NER labels (e.g., 9).
        :param trainable: Whether to fine-tune the CamemBERT base model.
        """
        super().__init__()
        encoder = CamemBERTBaseModel(model_path, trainable=trainable)
        self.base_model = encoder
        self.hidden_size = encoder.get_hidden_size()

        # One linear projection from encoder width to the label space.
        self.ner_head = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        """
        Run the encoder and the tagging head, optionally scoring against labels.
        :param input_ids: Tensor of token IDs (batch_size, seq_len).
        :param attention_mask: Attention mask (batch_size, seq_len).
        :param labels: Optional tensor of NER labels (batch_size, seq_len).
        :return: Dict with "logits" and "loss" (loss is None without labels).
        """
        token_states = self.base_model(input_ids, attention_mask)
        logits = self.ner_head(token_states)

        # Inference path: nothing to score against.
        if labels is None:
            return {"logits": logits, "loss": None}

        # Collapse batch and sequence dimensions so every token is one sample.
        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.view(-1)
        criterion = nn.CrossEntropyLoss()
        return {"logits": logits, "loss": criterion(flat_logits, flat_labels)}


In [31]:
# Relative path to the pre-trained CamemBERT checkpoint used as the encoder.
model_path = "../../../models/4gb_oscar"
# 9 output classes; the encoder is fine-tuned rather than frozen.
ner_model = NerFinetuningModel(
    model_path,
    num_labels=9,
    trainable=True,
)

In [None]:
import pytorch_lightning as pl
from torchmetrics import Accuracy

class NER(pl.LightningModule):
    """
    PyTorch Lightning wrapper that trains and validates a token-classification
    model returning a dict with "logits" and "loss".
    """

    def __init__(self, model, lr=5e-5, total_steps=10000, num_labels=9):
        """
        NER model for training with PyTorch Lightning.
        :param model: Instance of the fine-tuning model.
        :param lr: Learning rate.
        :param total_steps: Fallback number of scheduler steps, used when the
            trainer cannot estimate the total number of optimizer steps.
        :param num_labels: Number of NER labels (e.g., 9).
        """
        super().__init__()
        self.model = model
        self.lr = lr
        self.total_steps = total_steps
        self.num_labels = num_labels

        # Metrics. Label positions set to -100 are excluded so accuracy is
        # computed on the same tokens as the loss (nn.CrossEntropyLoss ignores
        # -100 by default) and out-of-range targets do not reach the metric.
        self.train_accuracy = Accuracy(
            task="multiclass", num_classes=num_labels, average="weighted", ignore_index=-100
        )
        self.val_accuracy = Accuracy(
            task="multiclass", num_classes=num_labels, average="weighted", ignore_index=-100
        )

        # Per-epoch history. Only the accuracy lists are filled by this class;
        # the loss lists are kept for callers that read or append to them.
        self.train_losses_epoch = []
        self.train_accuracies_epoch = []
        self.val_losses_epoch = []
        self.val_accuracies_epoch = []

    @staticmethod
    def _unpack_batch(batch):
        """Return (input_ids, attention_mask, labels) from a mapping or a sequence batch."""
        if hasattr(batch, "keys"):
            # Mapping-style batch; labels may be stored under either key.
            if "label" in batch:
                labels = batch["label"]
            elif "labels" in batch:
                labels = batch["labels"]
            else:
                labels = None
            return batch["input_ids"], batch["attention_mask"], labels

        # Sequence-style batch: (input_ids, attention_mask[, labels]).
        input_ids, attention_mask, *rest = batch
        labels = rest[0] if rest else None
        return input_ids, attention_mask, labels

    def _shared_step(self, batch, accuracy_metric):
        """Run the model on a labelled batch and return (loss, token-level accuracy)."""
        input_ids, attention_mask, labels = self._unpack_batch(batch)
        if labels is None:
            raise KeyError("batch must provide labels under 'label' or 'labels'")

        outputs = self.model(input_ids, attention_mask, labels)
        preds = torch.argmax(outputs["logits"], dim=-1)
        # Calling the metric updates its running state and returns the
        # value for this batch.
        acc = accuracy_metric(preds.view(-1), labels.view(-1))
        return outputs["loss"], acc

    def _resolve_total_steps(self):
        """Number of scheduler steps: the trainer's estimate, else `self.total_steps`."""
        estimated = getattr(self.trainer, "estimated_stepping_batches", None)
        if estimated is not None and estimated != float("inf") and estimated > 0:
            return int(estimated)
        return int(self.total_steps)

    def forward(self, batch):
        """
        Forward pass for inference.
        :param batch: Mapping with "input_ids"/"attention_mask" or a sequence
            (input_ids, attention_mask[, labels]); labels are ignored.
        :return: Logits produced by the wrapped model.
        """
        input_ids, attention_mask, _ = self._unpack_batch(batch)
        outputs = self.model(input_ids, attention_mask)
        return outputs["logits"]

    def training_step(self, batch, batch_index):
        """
        Training step for the model.
        :return: The training loss for this batch.
        """
        loss, acc = self._shared_step(batch, self.train_accuracy)

        # Log metrics
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=False)
        self.log("train_acc", acc, prog_bar=True, on_step=True, on_epoch=False)

        return loss

    def validation_step(self, batch, batch_index):
        """
        Validation step for the model.
        :return: The validation loss for this batch.
        """
        loss, acc = self._shared_step(batch, self.val_accuracy)

        # Log metrics
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, on_step=False, on_epoch=True)

        return loss

    def on_train_epoch_end(self):
        """
        Store average metrics by epoch for training.
        """
        avg_acc = self.train_accuracy.compute().item()
        self.train_accuracies_epoch.append(avg_acc)
        print(f"[Epoch {self.current_epoch}] Train Accuracy: {avg_acc:.4f}")
        self.train_accuracy.reset()

    def on_validation_epoch_end(self):
        """
        Store average metrics by epoch for validation.
        """
        avg_acc = self.val_accuracy.compute().item()
        self.val_accuracies_epoch.append(avg_acc)
        print(f"[Epoch {self.current_epoch}] Val Accuracy: {avg_acc:.4f}")
        self.val_accuracy.reset()

    def configure_optimizers(self):
        """
        Configure the optimizer and learning rate scheduler.
        :return: Lightning optimizer/scheduler dict; the scheduler steps every batch.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)

        # OneCycleLR raises once it is stepped more than `total_steps` times,
        # so the schedule length must match the real run rather than a
        # hard-coded steps-per-epoch value tied to one dataset/batch size.
        total_steps = self._resolve_total_steps()

        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=self.lr,
            total_steps=total_steps,
            pct_start=0.1,
            anneal_strategy="linear",
        )
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}

In [33]:
# Load `bert-base-uncased` with a 9-class token-classification head. As the log
# below reports, the classifier weights are newly initialised and must be trained.
# This `model` is a separate object from the CamemBERT-based `ner_model` above.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer for the `camembert-base` vocabulary, and a collator built on it
# for batching token-classification examples.
# NOTE(review): `model` above is `bert-base-uncased`, whose vocabulary differs from
# CamemBERT's — confirm which model this tokenizer is meant to be paired with.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for a Hugging Face `Trainer` run; "test-ner" is the output directory.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Load the "seqeval" metric used to score sequence-labelling predictions.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = datasets.load_metric("seqeval") 

In [None]:
# Names of the NER classes, indexed by the integer ids stored in `ner_tags`.
label_list = ner_dataset["train"].features["ner_tags"].feature.names 
label_list