### Import 

In [1]:
import datasets_l 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import CamembertModel, CamembertTokenizer, AdamW, get_cosine_schedule_with_warmup 
from transformers import CamembertModel, CamembertTokenizer, CamembertConfig
from datasets import load_dataset
from sklearn.metrics import f1_score
import os

ner_dataset = datasets_l.load_dataset("conll2003",
                                trust_remote_code=True) 
ner_dataset

from datasets_l import load_dataset
ds = load_dataset("unimelb-nlp/wikiann", "fr", trust_remote_code=True)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


## Dataset

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch


class NERDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128, label_all_tokens=True):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_all_tokens = label_all_tokens

    def tokenize_and_align_labels(self, tokens, ner_tags):
        tokenized_inputs = self.tokenizer(
            tokens,
            truncation=True,
            is_split_into_words=True,
            padding='max_length',
            max_length=self.max_length
        )
        labels = []
        for i, label in enumerate(ner_tags):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)  # Special tokens
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        # Convert all outputs to tensors
        tokenized_inputs = {key: torch.tensor(val, dtype=torch.long) for key, val in tokenized_inputs.items()}
        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.long)

        return tokenized_inputs

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        tokenized_data = self.tokenize_and_align_labels(
            [data['tokens']], [data['ner_tags']]
        )

        return {
            'input_ids': tokenized_data['input_ids'].squeeze(),
            'attention_mask': tokenized_data['attention_mask'].squeeze(),
            'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
        }


# Load dataset
dataset = load_dataset("unimelb-nlp/wikiann", "fr")

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")#AutoTokenizer.from_pretrained("camembert-base")

# Prepare datasets
train_data = NERDataset(dataset['train'], tokenizer)
val_data = NERDataset(dataset['validation'], tokenizer)
test_data = NERDataset(dataset['test'], tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)
test_loader = DataLoader(test_data, batch_size=16)

# Example usage
for batch in train_loader:
    print("Input IDs:", batch['input_ids'].shape)
    print("Attention Mask:", batch['attention_mask'].shape)
    print("Labels:", batch['labels'].shape)
    print(batch['labels'][0])
    print(batch['input_ids'][0])
    print(tokenizer.convert_ids_to_tokens(batch['input_ids'][0]))
    break




Input IDs: torch.Size([16, 128])
Attention Mask: torch.Size([16, 128])
Labels: torch.Size([16, 128])
tensor([-100,    5,    5,    6,    6,    6,    6,    6,    6,    6,    6, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100])
tensor([  101,  2862,  2063,  4078, 16569,

  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),


## Model 

In [3]:
class CamemBERTBaseModel(nn.Module):
    def __init__(self, model_path: str, trainable: bool = False):
        super(CamemBERTBaseModel, self).__init__()
        self.base_model = CamembertModel.from_pretrained(model_path)
        self.trainable = trainable
        self.config = CamembertConfig()

        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
            self.base_model.eval()
        else:
            self.base_model.train()

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_hidden_size(self) -> int:
        return self.base_model.config.hidden_size

## Finetunning

In [4]:
class NerFinetuningModel(nn.Module):
    def __init__(self, model_path: str, num_labels: int = 9, trainable: bool = True):
        super(NerFinetuningModel, self).__init__()
        self.base_model = CamemBERTBaseModel(model_path, trainable=trainable)
        self.hidden_size = self.base_model.get_hidden_size()
        self.ner_head = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        hidden_states = self.base_model(input_ids, attention_mask)
        logits = self.ner_head(hidden_states)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"logits": logits, "loss": loss}

In [5]:
def train_model(model, train_loader, val_loader, num_epochs, device, lr=5e-5, num_labels=9, save_dir="./models"):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = num_epochs * len(train_loader)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    model.to(device)

    for epoch in range(num_epochs):
        # Training
        model.train()
        total_train_loss = 0
        correct, total = 0, 0
        all_preds, all_labels = [], []

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, labels)
            logits = outputs["logits"]
            loss = outputs["loss"]

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Accumulate loss
            total_train_loss += loss.item()

            # Compute accuracy
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.view(-1).tolist())
            all_labels.extend(labels.view(-1).tolist())
            correct += (preds.view(-1) == labels.view(-1)).sum().item()
            total += labels.view(-1).numel()

        # Compute metrics
        valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
        valid_labels = [l for l in all_labels if l != -100]
        train_f1 = f1_score(valid_labels, valid_preds, average="weighted", labels=list(range(num_labels)))
        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = correct / total

        # Validation
        model.eval()
        total_val_loss = 0
        correct, total = 0, 0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask, labels)
                logits = outputs["logits"]
                loss = outputs["loss"]

                # Accumulate loss
                total_val_loss += loss.item()

                # Compute accuracy
                preds = torch.argmax(logits, dim=-1)
                all_preds.extend(preds.view(-1).tolist())
                all_labels.extend(labels.view(-1).tolist())
                correct += (preds.view(-1) == labels.view(-1)).sum().item()
                total += labels.view(-1).numel()

        # Compute metrics
        valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
        valid_labels = [l for l in all_labels if l != -100]
        val_f1 = f1_score(valid_labels, valid_preds, average="weighted", labels=list(range(num_labels)))
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = correct / total

        # Print metrics
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        # Save model
        os.makedirs(save_dir, exist_ok=True)
        torch.save(model.state_dict(), f"{save_dir}/ner_model_epoch_{epoch + 1}.pth")
        print(f"Model saved to {save_dir}/ner_model_epoch_{epoch + 1}.pth")

In [6]:

train_data = NERDataset(dataset['train'], tokenizer)
val_data = NERDataset(dataset['validation'], tokenizer)
test_data = NERDataset(dataset['test'], tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)
test_loader = DataLoader(test_data, batch_size=16)

    # Initialize model
model_path = "camembert-base"
num_labels = len(dataset["train"].features["ner_tags"].feature.names)
model = NerFinetuningModel(model_path, num_labels=num_labels, trainable=True)

    # Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, val_loader, num_epochs=5, device=device, save_dir="./models")

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


: 

In [None]:
from sklearn.metrics import f1_score

def evaluate_model(model, data_loader, device, num_labels):
    """
    Evaluate the model on a dataset.
    :param model: Trained NER model.
    :param data_loader: DataLoader for the evaluation dataset.
    :param device: Device (CPU or GPU) to run the evaluation.
    :param num_labels: Number of NER labels.
    :return: Dictionary containing evaluation loss, accuracy, and F1-score.
    """
    model.eval()
    total_loss = 0
    correct, total = 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, labels)
            logits = outputs["logits"]
            loss = outputs["loss"]

            # Accumulate loss
            total_loss += loss.item()

            # Predictions
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.view(-1).tolist())
            all_labels.extend(labels.view(-1).tolist())

            # Compute accuracy
            correct += (preds.view(-1) == labels.view(-1)).sum().item()
            total += labels.view(-1).numel()

    # Compute metrics
    avg_loss = total_loss / len(data_loader)
    accuracy = correct / total

    # Filter out padding tokens (-100) for F1-score
    valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
    valid_labels = [l for l in all_labels if l != -100]

    f1 = f1_score(valid_labels, valid_preds, average="weighted", labels=list(range(num_labels)))

    return {
        "loss": avg_loss,
        "accuracy": accuracy,
        "f1_score": f1
    }


In [None]:
# Evaluate the model on the test dataset
test_metrics = evaluate_model(model, test_loader, device, num_labels)
print(f"Test Loss: {test_metrics['loss']:.4f}")
print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Test F1-Score: {test_metrics['f1_score']:.4f}")
