In [None]:
import transformers
import warnings

# Silence the tokenizer/model log output from transformers (verbosity set to errors only)
transformers.logging.set_verbosity_error()
# Suppress every Python warning for the rest of the session
warnings.filterwarnings("ignore")

In [1]:
import torch
from torch import nn
from transformers import CamembertModel, CamembertTokenizer, CamembertConfig

from torch import nn, Tensor
from torch.nn.functional import softmax

from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

## 1. Prepare Data :

In [2]:
# Directory passed as `cache_directory` when the XNLI datasets are built below
data_path = "../../../data/XNLI"
# Tokenizer shared by the datasets: it is the default `tokenizer` argument of XNLIDataset
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

In [3]:
class XNLIDataset(Dataset):
    """
    PyTorch dataset exposing one split of the XNLI corpus.

    Every item is the tokenized (premise, hypothesis) pair together with its label.
    """

    def __init__(self, split="train", language="fr", tokenizer=tokenizer, cache_directory="../data/xnli", max_length=128):
        """
        Load the requested XNLI split and remember how to tokenize it.

        Args:
            split (str): Data partition ("train", "test", "validation").
            language (str): Target language, used as the dataset configuration name.
            tokenizer: Tokenizer applied to each (premise, hypothesis) pair.
            cache_directory (str): Directory where the downloaded dataset is stored.
            max_length (int): Maximum length used for padding/truncation.
        """
        super().__init__()
        self.split, self.language = split, language
        self.cache_directory = cache_directory
        self.max_length = max_length
        self.tokenizer = tokenizer

        # Load every partition, then keep only the requested one
        all_splits = load_dataset(
            "facebook/xnli",
            name=language,
            cache_dir=cache_directory,
        )
        self.data = all_splits[split]

    def __len__(self):
        """Number of examples in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenize and return a single example.

        Args:
            idx (int): Index of the example.

        Returns:
            dict: The tokenizer outputs (without their leading batch dimension)
            plus a `label` entry holding the class index as a long tensor.
        """
        row = self.data[idx]
        encoded = self.tokenizer(
            row["premise"],
            row["hypothesis"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer returns tensors with a leading batch dimension of 1; drop it
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)

        sample["label"] = torch.tensor(row["label"], dtype=torch.long)
        return sample

In [4]:
# First build of the French train/validation datasets, with short sequences (max_length=32).
# NOTE: these four names are reassigned in a later cell with max_length=256 and batch_size=8.
xnli_train_dataset = XNLIDataset(split="train", language="fr", cache_directory=data_path, max_length=32)
xnli_val_dataset = XNLIDataset(split="validation", language="fr", cache_directory=data_path, max_length=32)

# Only the training loader is shuffled
train_loader = DataLoader(xnli_train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(xnli_val_dataset, batch_size=2, shuffle=False)

## 2. Prepare the model

In [6]:
class CamemBERTBaseModel(nn.Module):
    """
    Wrapper around a pre-trained CamemBERT encoder that can be frozen or fine-tuned.
    """

    def __init__(self, model_path: str, trainable: bool = False):
        """
        Initialize the base CamemBERT model.
        :param model_path: Path to the pre-trained CamemBERT model.
        :param trainable: If False, the encoder weights are frozen and the encoder
                          is kept in eval mode; if True, it is fine-tuned.
        """
        super().__init__()
        self.base_model = CamembertModel.from_pretrained(model_path)
        self.trainable = trainable
        # Misspelled name used by the original implementation; kept as an alias
        # so existing code reading it keeps working.
        self.tranaible = trainable
        # Use the configuration of the checkpoint that was actually loaded. A fresh
        # default config would report a wrong hidden size for any non-default model.
        self.config = self.base_model.config

        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
            self.base_model.eval()
        else:
            self.base_model.train()

    def train(self, mode: bool = True):
        """
        Switch train/eval mode while keeping a frozen encoder in eval mode.

        Parent modules and training frameworks call `.train()` recursively; without
        this override the frozen encoder would silently get dropout re-enabled.
        :param mode: True for training mode, False for evaluation mode.
        :return: self, as `nn.Module.train` does.
        """
        super().train(mode)
        if not self.trainable:
            self.base_model.eval()
        return self

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the base model.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :return: Last hidden states from the base model.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_hidden_size(self) -> int:
        """
        Get the hidden size of the base model for dynamically attaching heads.
        :return: Hidden size read from the loaded model's configuration.
        """
        return self.config.hidden_size

In [7]:
class NLIHead(nn.Module):
    """Single linear layer mapping a sentence representation to NLI class logits."""

    def __init__(self, hidden_size: int, num_labels: int = 3):
        """
        Build the classification layer.
        :param hidden_size: Size of the incoming representation (e.g., 768 for CamemBERT).
        :param num_labels: Number of NLI classes (default: 3).
        """
        super().__init__()
        self.hidden_size, self.num_labels = hidden_size, num_labels
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, cls_output: Tensor) -> Tensor:
        """
        Project the first-token representation onto the label space.
        :param cls_output: Tensor of shape (batch_size, hidden_size).
        :return: Logits of shape (batch_size, num_labels).
        """
        logits = self.classifier(cls_output)
        return logits

In [10]:
class NLIFinetuningModel(nn.Module):
    """CamemBERT encoder followed by a linear layer that classifies NLI pairs."""

    def __init__(self, base_model: CamemBERTBaseModel, num_labels: int = 3):
        """
        Attach a linear classification layer to an existing encoder.
        :param base_model: Instance of the base CamemBERT model.
        :param num_labels: Number of labels for NLI.
        """
        super().__init__()
        self.base_model = base_model

        self.num_labels = num_labels
        self.hidden_size = base_model.get_hidden_size()

        # Classification layer applied to the first token's representation
        self.nli_head = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        """
        Encode the inputs, classify them and optionally compute the loss.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :param labels: Optional tensor of labels (batch_size).
        :return: Dictionary with "logits" and "loss" ("loss" is None without labels).
        """
        # Last hidden states of the encoder, indexed below as (batch, position, feature)
        sequence_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Representation of the first token of every sequence
        first_token = sequence_output[:, 0, :]
        logits = self.nli_head(first_token)

        # Cross-entropy is only evaluated when labels are supplied
        loss = nn.CrossEntropyLoss()(logits, labels) if labels is not None else None

        return {"logits": logits, "loss": loss}

## 3. Train 

In [13]:
import pytorch_lightning as pl
import torch
from torchmetrics import Accuracy
from pytorch_lightning import Trainer

class NLI(pl.LightningModule):
    """
    Lightning wrapper that trains and validates an NLIFinetuningModel.

    Besides logging through Lightning, it keeps plain Python lists with the
    per-step and per-epoch loss/accuracy so they can be plotted afterwards.
    """

    def __init__(self, model: NLIFinetuningModel, lr: float = 5e-5, warmup_steps: int = 0, total_steps: int = 10000):
        """
        Initialize the NLI model with PyTorch Lightning.
        :param model: Instance of the NLIFinetuningModel.
        :param lr: Learning rate for the optimizer (also the peak LR of the scheduler).
        :param warmup_steps: Stored on the instance; not used by the current scheduler.
        :param total_steps: Total number of optimizer steps given to the scheduler.
        """
        super().__init__()
        self.model = model
        self.lr = lr
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps

        # Metrics
        self.train_acc = Accuracy(task="multiclass", num_classes=3)
        self.val_acc = Accuracy(task="multiclass", num_classes=3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

        # Per-step history (one entry per batch)
        self.train_losses = []
        self.val_losses = []
        self.train_accuracies = []
        self.val_accuracies = []

        # Accumulators for the epoch in progress; emptied by the epoch-end hooks
        self.train_loss_accum = []
        self.train_acc_accum = []
        self.val_loss_accum = []
        self.val_acc_accum = []

        # Per-epoch history (one entry per finished epoch)
        self.train_losses_epoch = []
        self.train_acc_epoch = []
        self.val_losses_epoch = []
        self.val_acc_epoch = []

    def _shared_step(self, batch, metric):
        """Run the model on one batch; return (loss tensor, batch accuracy tensor)."""
        labels = batch["label"]
        outputs = self.model(batch["input_ids"], batch["attention_mask"], labels)
        preds = torch.argmax(outputs["logits"], dim=1)
        # Calling the metric updates its running state and returns the batch value
        batch_acc = metric(preds, labels)
        return outputs["loss"], batch_acc

    @staticmethod
    def _mean(values):
        """Arithmetic mean of a non-empty list of floats."""
        return sum(values) / len(values)

    def training_step(self, batch, batch_index):
        """
        Training step for the model.
        :return: The loss tensor used by Lightning for the backward pass.
        """
        loss, acc = self._shared_step(batch, self.train_acc)

        # Store metrics as plain floats so no graph or GPU memory is retained
        loss_value = loss.detach().item()
        acc_value = acc.item()
        self.train_losses.append(loss_value)
        self.train_accuracies.append(acc_value)
        self.train_loss_accum.append(loss_value)
        self.train_acc_accum.append(acc_value)

        # Log metrics
        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc', self.train_acc, prog_bar=True, on_step=False, on_epoch=True)

        return loss

    def on_train_epoch_end(self):
        """
        At the end of each training epoch, average the metrics and store them.
        """
        if not self.train_loss_accum:
            return  # no training batch was seen; nothing to average

        self.train_losses_epoch.append(self._mean(self.train_loss_accum))
        self.train_acc_epoch.append(self._mean(self.train_acc_accum))

        # Clear temporary accumulators
        self.train_loss_accum.clear()
        self.train_acc_accum.clear()

    def validation_step(self, batch, idx):
        """
        Validation step for the model.
        :return: The validation loss tensor for this batch.
        """
        loss, acc = self._shared_step(batch, self.val_acc)

        # Store metrics
        loss_value = loss.detach().item()
        acc_value = acc.item()
        self.val_losses.append(loss_value)
        self.val_accuracies.append(acc_value)
        self.val_loss_accum.append(loss_value)
        self.val_acc_accum.append(acc_value)

        # Log loss
        self.log('val_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def on_validation_epoch_end(self):
        """
        Aggregate metrics at the end of each validation epoch.
        """
        if not self.val_loss_accum:
            return  # no validation batch was seen; nothing to aggregate

        avg_loss = self._mean(self.val_loss_accum)
        # Accuracy over all validation samples of this epoch. The metric is not
        # logged as an object, so it has to be reset here, otherwise its state
        # would carry over into the next epoch.
        avg_acc = self.val_acc.compute().item()
        self.val_acc.reset()

        self.val_losses_epoch.append(avg_loss)
        self.val_acc_epoch.append(avg_acc)

        # Clear temporary accumulators
        self.val_loss_accum.clear()
        self.val_acc_accum.clear()

        # Log global validation accuracy
        self.log('val_accuracy', avg_acc, prog_bar=True)

    def configure_optimizers(self):
        """
        Configure optimizer and learning rate scheduler.
        :return: Lightning optimizer/scheduler dictionary; the scheduler is stepped every batch.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=self.lr,
            total_steps=self.total_steps,
            pct_start=0.1,
            anneal_strategy='linear'
        )
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1}}

    def forward(self, batch):
        """
        Forward pass for inference.
        :param batch: Either a mapping with "input_ids" and "attention_mask" (what
                      the training and validation steps receive) or a sequence whose
                      first two items are those tensors.
        :return: Logits produced by the wrapped model.
        """
        if hasattr(batch, "keys"):
            # Unpacking a mapping would yield its keys (strings), not the tensors
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
        else:
            input_ids, attention_mask = batch[0], batch[1]
        outputs = self.model(input_ids, attention_mask)
        return outputs["logits"]

In [14]:
# Location of the pre-trained CamemBERT checkpoint
model_path = "../../../models/oscar_4gb"

# 1. Load the base model
# NOTE(review): `base_camembert` is not referenced again in the visible notebook;
# the wrapper below loads its own copy from `model_path`.
base_camembert = CamembertModel.from_pretrained(model_path)
# 2. Load the NLI model :
nli_camembert = NLIFinetuningModel(base_model=CamemBERTBaseModel(model_path, trainable=True), 
                                   num_labels=3)

nb_epochs = 3
# NOTE(review): `train_loader` here is the loader built earlier with batch_size=2;
# a later cell rebinds `train_loader` with batch_size=8, so this count may not match
# the loader that is actually passed to the trainer.
nb_steps_per_epoch = len(train_loader)

# total_steps is the length of the learning-rate schedule
pl_camembert = NLI(model=nli_camembert, total_steps=nb_epochs*nb_steps_per_epoch)

In [15]:
# Rebuild the datasets and loaders with the settings used for the real run
# (longer sequences, larger batches).
xnli_train_dataset = XNLIDataset(split="train", language="fr", cache_directory=data_path, max_length=256)
xnli_val_dataset = XNLIDataset(split="validation", language="fr", cache_directory=data_path, max_length=256)

# Only the training loader is shuffled
train_loader = DataLoader(xnli_train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(xnli_val_dataset, batch_size=8, shuffle=False)

# The scheduler length was computed from the previous, smaller-batch loader.
# Refresh it so the one-cycle schedule spans exactly the steps that will be run.
# configure_optimizers() reads this attribute when fitting starts.
pl_camembert.total_steps = nb_epochs * len(train_loader)

In [16]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


# TensorBoard logs are written under my_logs/my_experiment
logger = TensorBoardLogger("my_logs", name="my_experiment")

# Checkpoint configuration
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",         # Rank checkpoints by validation loss
    dirpath="checkpoints/",     # Directory where checkpoints are written
    filename="nli-{epoch:02d}-{val_loss:.2f}",  # Checkpoint file name pattern
    save_top_k=2,               # Keep the two best checkpoints
    mode="min"                  # Lower val_loss is better
)

use_gpu = torch.cuda.is_available()

# Trainer configuration
trainer = Trainer(
    max_epochs=nb_epochs,        # Same epoch count that sized the LR schedule
    logger=logger,               # TensorBoard logger
    callbacks=[checkpoint_callback],  # Checkpoint callback
    accelerator="gpu" if use_gpu else "cpu",  # Use the GPU when available
    devices=1,                   # Number of devices to use
    log_every_n_steps=10,        # Logging frequency
    # `precision=16` is deprecated in favour of "16-mixed"; mixed precision is only
    # requested on GPU, the CPU fallback stays in full precision.
    precision="16-mixed" if use_gpu else "32-true",
)

/home/amine/.local/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Fit the Lightning module, passing the training loader and the validation loader
trainer.fit(pl_camembert, train_loader, val_loader)

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | NLIFinetuningModel | 110 M  | train
1 | train_acc | MulticlassAccuracy | 0      | train
2 | val_acc   | MulticlassAccuracy | 0      | train
3 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.497   Total estimated model params size (MB)
234       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/amine/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.
/home/amine/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=47` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Validation: |          | 0/? [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Plot the training curves
# matplotlib is not imported anywhere else in the notebook, so import it here
import matplotlib.pyplot as plt

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss (one point per batch; training and validation have different lengths)
loss_ax.plot(pl_camembert.train_losses, label="Training Loss")
loss_ax.plot(pl_camembert.val_losses, label="Validation Loss")
loss_ax.set_xlabel("Iterations")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training and Validation Loss")
loss_ax.legend()

# Accuracy (one point per batch)
acc_ax.plot(pl_camembert.train_accuracies, label="Training Accuracy")
acc_ax.plot(pl_camembert.val_accuracies, label="Validation Accuracy")
acc_ax.set_xlabel("Iterations")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Training and Validation Accuracy")
acc_ax.legend()

fig.tight_layout()
plt.show()

# To do :

- 1. il faut aussi avoir une métrique ainsi qu'une visualisation
- 2. régler le problème dans JupyterLab, surtout avec les fichiers de données (faire un git clone)
- 3. voir s'il faut tokenizer toute la base à l'avance
- 4. voir la différence entre fine-tuning et transfer learning

# Comments :
- on utilise l'accuracy car les classes sont équilibrées dans le dataset NLI ;
  si ce n'était pas le cas, le score F1 ou la matrice de confusion auraient été de bonnes alternatives