In [None]:
!pip install transformers datasets evaluate

In [None]:
import pandas as pd
import re
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel, EarlyStoppingCallback
import evaluate
import numpy as np
import torch
from peft import LoraConfig, get_peft_model, TaskType

# Project: fine-tuning DistilBERT for named entity recognition



This notebook was created to fine-tune a pre-trained model for named entity recognition in English.

The data comes from news articles published in October 2025. The articles were downloaded from the Europresse platform.

The articles were first annotated using the `en_core_web_trf` pipeline from spaCy. The annotations were then corrected by four annotators and exported to a .csv file, which is used as the data to fine-tune the model.

Pre-trained model that was fine-tuned: https://huggingface.co/distilbert/distilbert-base-uncased

The model is then evaluated using a confusion matrix. The results are very encouraging.
<br></br>
*Please note that much of the code used here is either inspired by or directly taken from Ms. Delphine Bernhard's Machine Learning course given at the Université de Strasbourg.*
___

# Data retrieval and formatting

In [None]:
# load the corrected annotations from the exported CSV file
csv_path = "/content/correction_annotation - corpus_anno_no_text.csv"
dataset = Dataset.from_csv(csv_path)
dataset

In [None]:
# list the distinct values of the "correction_finale" column
# to spot the ones that have to be discarded
dataset.unique("correction_finale")

In [None]:
# drop the annotations that were marked as wrong:
# rows whose final correction is missing or equal to " NULL"
dataset = dataset.filter(
    lambda row: row["correction_finale"] not in (None, " NULL")
)

In [None]:
# keep only the text of the expression and its corrected tag
columns_to_keep = {"expression", "correction_finale"}
columns_to_drop = [name for name in dataset.column_names if name not in columns_to_keep]
dataset = dataset.remove_columns(columns_to_drop)
dataset

In [None]:
# set of tags
tags = set(dataset['correction_finale'])
# encode labels as ClassLabel
# the names are sorted so that every tag gets the same integer ID on every run:
# iterating over a set of strings gives an order that can change between kernel sessions
dataset = dataset.cast_column("correction_finale", ClassLabel(names=sorted(tags)))
dataset = dataset.rename_column("correction_finale", "label")

In [None]:
# target classes: the class names stored in the ClassLabel feature of the "label" column
target_classes = dataset.features['label'].names
target_classes

In [None]:
# numerical IDs for classes, listed in the same order as target_classes
[dataset.features['label'].str2int(c) for c in target_classes]

In [None]:
# two-way mapping between the class names and their position in target_classes
label2id = {name: idx for idx, name in enumerate(target_classes)}
id2label = dict(enumerate(target_classes))

In [None]:
# 80/20 split between the training data and a held-out part
# shuffled because the data is currently in chronological order
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=47)
# the held-out part is split in half to get a validation set and a test set
dataset2 = dataset["test"].train_test_split(test_size=0.5, shuffle=True, seed=47)
# DatasetDict that gathers the three splits
ds = DatasetDict(
    train=dataset["train"],
    validation=dataset2["train"],
    test=dataset2["test"],
)

In [None]:
# check structure of DatasetDict (splits "train", "validation" and "test")
ds

In [None]:
# features (columns and their types) of the training split
ds['train'].features

In [None]:
# first three examples of the training split
ds['train'][0:3]

# Tokenization

In [None]:
# identifier of the pre-trained checkpoint that is fine-tuned in this notebook
model_dBERT = "distilbert/distilbert-base-uncased"
# load model's pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dBERT)

In [None]:
# tokenize function
def preprocess_function(dataset):
    """Tokenize the "expression" column of a batch of examples.

    No padding is applied here: the sequences keep their own length and are padded
    batch by batch by the DataCollatorWithPadding handed to the Trainer. Padding at
    this stage would align every example on the longest sequence of the whole batch
    passed to this function.

    Args:
        dataset: batch of examples with an "expression" entry holding the texts.

    Returns:
        The tokenizer output for the batch (truncated, not padded).
    """
    return tokenizer(dataset["expression"], truncation=True)

In [None]:
# tokenize the three splits; batch_size=None hands each split to the function as one batch
tokenized_data = ds.map(
    function=preprocess_function,
    batched=True,
    batch_size=None,
)

# Evaluation settings

In [None]:
# metrics used to score the predictions: accuracy and F1
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from the raw model outputs.

    Args:
        eval_pred: pair (predictions, labels); the predicted class of each example
            is the index of the highest score along axis 1 of the predictions.

    Returns:
        dict with the keys "accuracy" and "f1-macro".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy_value = accuracy.compute(
        predictions=predicted_ids, references=references
    )["accuracy"]
    macro_f1_value = f1_metric.compute(
        predictions=predicted_ids, references=references, average="macro"
    )["f1"]
    return {"accuracy": accuracy_value, "f1-macro": macro_f1_value}

# Fine-tuning

In [None]:
# pick the device: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# load the pre-trained encoder and move it to that device
# NOTE(review): encoder_model is not referenced by any later cell visible here;
# the classifier that gets trained is built separately in get_model() - confirm it is still needed
encoder_model = AutoModel.from_pretrained(model_dBERT)
encoder_model = encoder_model.to(device)

In [None]:
# number of examples per device and per step, for both training and evaluation
batch_size = 8

# training arguments
training_args = TrainingArguments(
    output_dir=f"{model_dBERT}-finetuned-NER-LoRA",
    # fp16 mixed precision needs a GPU: it is enabled only when CUDA is available,
    # so that the notebook also runs on CPU
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=3e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # evaluate and save at the end of every epoch; both strategies have to match
    # for the best checkpoint to be reloaded at the end of training
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # key returned by compute_metrics that is used to select the best checkpoint
    metric_for_best_model="f1-macro",
    logging_steps=50,
    report_to="none",
)

# LoRA parameters for fine-tuning process
# NOTE(review): only the "q_lin" and "v_lin" modules receive LoRA adapters; DistilBERT's
# classification head also seems to contain a "pre_classifier" layer - check whether it
# should be listed in modules_to_save so that it is trained too (TODO confirm)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8, # previously: 4
    lora_alpha=32, # previously: 16
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_lin", "v_lin"]
)

In [None]:
# early stopping (patience of 2 evaluations, threshold of 0.001) to avoid unnecessary training
early_stop = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)
# dynamic padding: the collator pads the examples when it assembles a batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def get_model():
    """Build the sequence classifier to fine-tune and wrap it with the LoRA adapters.

    The classifier is loaded from the model_dBERT checkpoint with one output per
    target class, moved to `device`, then passed to get_peft_model with lora_config.

    Returns:
        The PEFT-wrapped model.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_dBERT,
        num_labels=len(target_classes),
        id2label=id2label,
        label2id=label2id,
    )
    classifier = classifier.to(device)
    return get_peft_model(classifier, lora_config)

def init_trainer():
    """Create a fresh LoRA model and the Trainer that fine-tunes it.

    The Trainer uses the "train" split for training and the "validation" split for
    the evaluations run during training.

    Returns:
        tuple (trainer, model).
    """
    import inspect

    model = get_model()
    # Recent versions of transformers take the tokenizer through `processing_class`
    # and deprecate the `tokenizer` keyword; pick the name supported by the
    # installed version so that the notebook works with both.
    trainer_parameters = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_parameters:
        tokenizer_kwarg = "processing_class"
    else:
        tokenizer_kwarg = "tokenizer"
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"],
        data_collator=data_collator,
        callbacks=[early_stop],
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )
    return trainer, model

In [None]:
# initialization of training: builds the LoRA model and the Trainer that wraps it
trainer, model = init_trainer()

In [None]:
# model fine-tuning (evaluated every epoch, with early stopping)
trainer.train()

# Performance analysis

In [None]:
# predictions and metrics of the fine-tuned model on the validation split
# NOTE(review): the validation split is also the one used during training for early stopping
# and best-model selection; the "test" split is not used anywhere in the visible cells -
# consider reporting the final scores on it
preds_output = trainer.predict(tokenized_data['validation'])

In [None]:
# metrics returned by trainer.predict
preds_output.metrics

In [None]:
# save results into json file
import json

# field written to the file -> key of the metric in preds_output.metrics
metric_keys = {"f1": "test_f1-macro", "accuracy": "test_accuracy", "loss": "test_loss"}
results = {field: preds_output.metrics.get(key) for field, key in metric_keys.items()}

with open("results.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=2))

In [None]:
# predicted class of each example = index of the highest score
y_preds = np.argmax(preds_output.predictions, axis=-1) if preds_output.predictions.ndim == 2 else np.argmax(preds_output.predictions, axis=1)
# gold labels of the validation split
validation_split = tokenized_data['validation']
y_valid = validation_split['label']

In [None]:
# generate a confusion matrix to analyze performances qualitatively
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot and save the row-normalized confusion matrix.

    The figure is written to "confusion_matrix.png" and then displayed.

    Args:
        y_preds: predicted class IDs.
        y_true: reference class IDs.
        labels: class names; assumes the class IDs are the positions in this
            list (0 .. len(labels) - 1), as is the case for target_classes.
    """
    # Give the full list of class IDs explicitly: otherwise a class that appears
    # neither in y_true nor in y_preds is left out of the matrix, whose size then
    # no longer matches the display labels.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_preds, labels=class_ids, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    # shorten long names to 4 characters; the "." marks a name that was actually cut
    labels_for_fig = [name if len(name) <= 4 else name[:4] + "." for name in labels]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=labels_for_fig)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    ax.set_title("Normalized confusion matrix")
    fig.savefig("confusion_matrix.png", dpi=200, bbox_inches="tight")
    plt.show()

# confusion matrix of the predictions made on the validation split
plot_confusion_matrix(y_preds, y_valid, target_classes)