### 1. CamemBERT Base :

In [1]:
from transformers import CamembertModel, CamembertConfig

# Directory holding the pre-trained checkpoint (relative path).
model_path = "../../models/4gb_oscar"
# Load the CamembertModel weights stored in that directory.
camembert = CamembertModel.from_pretrained(model_path)

In [4]:
# NOTE(review): CamembertConfig() is built without arguments, so it holds the
# library defaults rather than the configuration of the checkpoint loaded into
# `camembert`; the printed values need not match the loaded model.
# TODO confirm whether `camembert.config` was intended here.
config = CamembertConfig()
print(config)
from torchinfo import summary
# Per-layer parameter counts of the loaded model.
summary(camembert)

CamembertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Layer (type:depth-idx)                                       Param #
CamembertModel                                               --
├─CamembertEmbeddings: 1-1                                   --
│    └─Embedding: 2-1                                        24,579,840
│    └─Embedding: 2-2                                        394,752
│    └─Embedding: 2-3                                        768
│    └─LayerNorm: 2-4                                        1,536
│    └─Dropout: 2-5                                          --
├─CamembertEncoder: 1-2                                      --
│    └─ModuleList: 2-6                                       --
│    │    └─CamembertLayer: 3-1                              7,087,872
│    │    └─CamembertLayer: 3-2                              7,087,872
│    │    └─CamembertLayer: 3-3                              7,087,872
│    │    └─CamembertLayer: 3-4                              7,087,872
│    │    └─CamembertLayer: 3-5                       

In [5]:
print(camembert)

CamembertModel(
  (embeddings): CamembertEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CamembertEncoder(
    (layer): ModuleList(
      (0-11): 12 x CamembertLayer(
        (attention): CamembertAttention(
          (self): CamembertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): CamembertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 

In [8]:
import torch
from torch import nn
from transformers import CamembertModel, CamembertTokenizer, CamembertConfig
from transformers import CamembertTokenizer


class CamemBERTBaseModel(nn.Module):
    """Wrapper around a pre-trained ``CamembertModel`` that returns its last hidden states."""

    def __init__(self, model_path: str, trainable: bool = False):
        """
        Initialize the base CamemBERT model.
        :param model_path: Path to the pre-trained CamemBERT model.
        :param trainable: If False (default), the encoder parameters are frozen and
            the encoder is kept in eval mode. If True, the encoder is put in train mode.
        """
        super().__init__()
        self.base_model = CamembertModel.from_pretrained(model_path)
        self.trainable = trainable
        # Backward-compatible alias for the original, misspelled attribute name.
        self.tranaible = trainable
        # Use the configuration of the loaded checkpoint, not a default-constructed
        # one, so sizes reported here always match the actual weights.
        self.config = self.base_model.config

        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
            self.base_model.eval()
        else:
            self.base_model.train()

    def train(self, mode: bool = True):
        """
        Switch between train and eval mode.

        A frozen encoder is forced back to eval mode, so that a parent module
        calling ``.train()`` does not silently re-enable its dropout.
        :param mode: True for train mode, False for eval mode.
        :return: self.
        """
        super().train(mode)
        if not self.trainable:
            self.base_model.eval()
        return self

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the base model.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :return: Last hidden states from the base model.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_hidden_size(self) -> int:
        """
        Get the hidden size of the base model for dynamically attaching heads.
        :return: Hidden size read from the loaded checkpoint's configuration.
        """
        return self.config.hidden_size


In [9]:
# Example used to smoke-test the base model:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
base_model = CamemBERTBaseModel(model_path=model_path, trainable=False)

sentence = "Ceci est <s> un exemple de <s> phrase pour démontrer le fonctionnement de CamemBERT. Je suis content"

# `tokens` comes from tokenize() alone; `inputs` is the full encoding padded/truncated to 128 positions.
tokens = tokenizer.tokenize(sentence)
inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

# Show the tokens and their IDs
print("Tokens :", tokens)
print("Token IDs :", inputs["input_ids"].squeeze().tolist())

Tokens : ['▁Ceci', '▁est', '<s>', '▁un', '▁exemple', '▁de', '<s>', '▁phrase', '▁pour', '▁démontrer', '▁le', '▁fonctionnement', '▁de', '▁Cam', 'em', 'BERT', '.', '▁Je', '▁suis', '▁content']
Token IDs : [5, 2978, 30, 5, 23, 411, 8, 5, 3572, 24, 8310, 16, 1625, 8, 5628, 1868, 20703, 9, 100, 146, 2945, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [11]:
# Run the encoded sentence through the model (no gradients needed here)
with torch.no_grad():
    embeddings = base_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Show the shape of the embeddings
print("Embeddings shape :", embeddings.shape)

# Example of reading one specific vector (here: the first token)
print("Embedding du premier token (CLS) :", embeddings[:, 0, :].shape)

Embeddings shape : torch.Size([1, 128, 768])
Embedding du premier token (CLS) : torch.Size([1, 768])


### 2. The NLI HEAD : (just for testing)

In [12]:
from torch import nn, Tensor

class NLIHead(nn.Module):
    """Single linear layer turning a sentence-level vector into NLI logits."""

    def __init__(self, hidden_size: int, num_labels: int = 3):
        """
        Build the classification layer.
        :param hidden_size: Width of the vectors fed to the head (e.g., 768 for CamemBERT).
        :param num_labels: Number of output classes (default: 3, one per NLI class).
        """
        super().__init__()
        self.hidden_size, self.num_labels = hidden_size, num_labels
        self.classifier = nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, cls_output: Tensor) -> Tensor:
        """
        Project the pooled representation onto the label space.
        :param cls_output: Tensor of shape (batch_size, hidden_size) holding the first-token representation.
        :return: Logits of shape (batch_size, num_labels).
        """
        logits = self.classifier(cls_output)
        return logits

In [13]:
from torch.nn.functional import softmax

# Build the NLI head
hidden_size = base_model.get_hidden_size()  # e.g. 768
nli_head = NLIHead(hidden_size=hidden_size, num_labels=3)

# Example input sentences
premise = "Ceci est une phrase pour tester la logique."
hypothesis = "C'est un test pour valider une hypothèse."

# Tokenization: premise and hypothesis are encoded together as one pair,
# padded/truncated to 256 positions.
inputs = tokenizer(
    premise, hypothesis,
    return_tensors="pt",
    max_length=256,
    padding="max_length",
    truncation=True
)

In [14]:
# Run the pair through the base model
with torch.no_grad():
    embeddings = base_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Take the [CLS] vector (first token)
cls_output = embeddings[:, 0, :]  # (batch_size, hidden_size)
cls_output.shape

torch.Size([1, 768])

In [15]:
# Run the [CLS] vector through the NLI head
logits = nli_head(cls_output)
print(logits)
# Normalize the logits over the label dimension to get probabilities
probs = softmax(logits, dim=-1)
print(probs)

tensor([[ 0.4450, -0.0039,  0.1467]], grad_fn=<AddmmBackward0>)
tensor([[0.4201, 0.2682, 0.3117]], grad_fn=<SoftmaxBackward0>)


In [11]:
# Index of the largest entry of `probs`, as a Python int.
torch.argmax(probs).item()

2

In [16]:
# Run the [CLS] vector through the NLI head
logits = nli_head(cls_output)

# Apply a softmax to obtain probabilities. Recomputed here so this cell does not
# depend on a `probs` value left over from a previously executed cell.
probs = softmax(logits, dim=-1)

# Map the most likely class index to its label
labels = ["cohérent", "neutre", "contradictoire"]
predicted_label = labels[torch.argmax(probs).item()]

# Display the results
print(f"Logits : {logits}")
print(f"Probabilités : {probs}")
print(f"Label prédit : {predicted_label}")

Logits : tensor([[ 0.4450, -0.0039,  0.1467]], grad_fn=<AddmmBackward0>)
Probabilités : tensor([[0.4201, 0.2682, 0.3117]], grad_fn=<SoftmaxBackward0>)
Label prédit : cohérent


## 3. Modèle NLI complet :

In [17]:
# Token IDs of the encoded premise/hypothesis pair; the last expression displays their shape.
input_ids = inputs["input_ids"]
input_ids.shape

torch.Size([1, 256])

On sépare le modèle de base de la tête NLI : comme on voudra changer de modèle de base à un moment donné, cela évite de réécrire tout le code depuis le début.

In [31]:
class NLIFinetuningModel(nn.Module):
    """Base encoder followed by an NLI classification head."""

    def __init__(self, base_model: CamemBERTBaseModel, num_labels: int = 3):
        """
        Assemble the encoder and its classification head.
        :param base_model: Base model instance; its hidden size determines the head's input width.
        :param num_labels: Number of NLI classes.
        """
        super().__init__()
        self.base_model = base_model
        head_input_size = base_model.get_hidden_size()
        self.nli_head = NLIHead(head_input_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        """
        Compute logits and, when labels are given, the cross-entropy loss.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :param labels: Optional tensor of class indices (batch_size).
        :return: Dict with "logits" and "loss" ("loss" is None when no labels are passed).
        """
        # Hidden states returned by the base model: (batch_size, seq_len, hidden_size)
        sequence_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # The vector at position 0 stands for the whole input: (batch_size, hidden_size)
        first_token = sequence_output[:, 0, :]

        # Classification scores: (batch_size, num_labels)
        logits = self.nli_head(first_token)

        if labels is None:
            return {"logits": logits, "loss": None}

        loss = nn.CrossEntropyLoss()(logits, labels)
        return {"logits": logits, "loss": loss}
    

In [20]:
# Load the pre-trained CamemBERT model
base_model = CamemBERTBaseModel(model_path=model_path, trainable=True)
# Create a NLIFinetuningModel instance
model = NLIFinetuningModel(base_model=base_model, num_labels=3)

# Hand-written example inputs (batch_size=2)
input_ids = torch.tensor([[1, 23, 45, 2], [1, 67, 89, 2]])  # Token IDs
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 1]])  # Attention mask
labels = torch.tensor([0, 2])  # Labels (coherent=0, contradictory=2)

# Forward pass
output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print("Logits:", output["logits"])
print("Loss:", output["loss"])

Logits: tensor([[ 0.1743,  0.1455,  0.1853],
        [-0.0622, -0.2431,  0.1681]], grad_fn=<AddmmBackward0>)
Loss: tensor(0.9959, grad_fn=<NllLossBackward0>)


## 4. Test with real data :

In [23]:
import torch
from torch.utils.data import Dataset



from datasets import load_dataset
from transformers import CamembertTokenizer


class XNLIDataset(Dataset):
    """PyTorch dataset serving tokenized premise/hypothesis pairs from XNLI."""

    def __init__(self, split="train", language="fr", tokenizer=None, cache_directory="../data/xnli", max_length=128):
        """
        Load one split of XNLI and keep the tokenizer used to encode its examples.

        Args:
            split (str): Data partition ("train", "test", "validation").
            language (str): Target language (XNLI configuration name).
            tokenizer: Tokenizer called on (premise, hypothesis) pairs. When None,
                the "camembert-base" CamembertTokenizer is loaded, so the class no
                longer depends on a notebook-level `tokenizer` variable existing
                when the class is defined.
            cache_directory (str): Directory where the downloaded dataset is stored.
            max_length (int): Maximum length used for padding/truncation.
        """
        super().__init__()
        self.split = split
        self.language = language
        self.cache_directory = cache_directory
        self.max_length = max_length

        # Request only the needed split instead of loading every split and indexing.
        self.data = load_dataset(
            "facebook/xnli",
            name=self.language,
            split=self.split,
            cache_dir=self.cache_directory
        )

        if tokenizer is None:
            tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of examples in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Fetch and encode one example.

        Args:
            idx (int): Index of the example.

        Returns:
            dict: The tokenizer outputs (e.g. `input_ids`, `attention_mask`) without
            the batch dimension, plus `label` as a long tensor.
        """
        example = self.data[idx]
        encoded = self.tokenizer(
            example["premise"],
            example["hypothesis"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # return_tensors="pt" yields a leading batch dimension of 1; drop it so the
        # DataLoader can stack examples itself.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["label"] = torch.tensor(example["label"], dtype=torch.long)

        return item

In [27]:
from torch.utils.data import DataLoader
# Directory where the downloaded XNLI files are cached.
cache_directory ="../data/xnli"
xnli = XNLIDataset(split="train", cache_directory=cache_directory, max_length=256)

# Draw one shuffled batch of 16 examples for inspection.
data_loader = DataLoader(xnli, batch_size=16, shuffle=True)
batch = next(iter(data_loader))

In [26]:
# Shapes of the token-ID and attention-mask tensors, then the batch labels.
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['label'])

# Unpack the batch into the names used by the forward-pass cells.
input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch['label']

torch.Size([16, 128])
torch.Size([16, 128])
tensor([0, 0, 1, 1, 0, 0, 2, 2, 0, 2, 0, 1, 0, 2, 2, 0])


In [28]:
# Wrap the current `base_model` with an NLI head (num_labels left at its default of 3).
nli_model = NLIFinetuningModel(base_model=base_model)

In [29]:
# Forward pass on the inspected batch; the last expression shows the loss as a Python float.
output = nli_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss = output["loss"]
loss.item()

1.0578067302703857

In [30]:
# Same forward pass again, this time printing the logits and the loss tensor.
# NOTE(review): the recorded loss differs from the previous cell's; presumably
# dropout is active because the model is in train mode — TODO confirm.
output = nli_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print("Logits:", output["logits"])
print("Loss:", output["loss"])

Logits: tensor([[ 0.2395, -0.1110, -0.1766],
        [ 0.3280, -0.0443, -0.1854],
        [ 0.2464, -0.0946, -0.1660],
        [ 0.3357, -0.0239, -0.0630],
        [ 0.3458, -0.0148, -0.1879],
        [ 0.2138, -0.0526, -0.1945],
        [ 0.2906, -0.0869, -0.1904],
        [ 0.2087, -0.0788, -0.2027],
        [ 0.2006, -0.1566, -0.1919],
        [ 0.2682, -0.0122, -0.2245],
        [ 0.3817, -0.0137, -0.2667],
        [ 0.2620, -0.1228, -0.1829],
        [ 0.2008, -0.1305, -0.2211],
        [ 0.2664, -0.0700, -0.1728],
        [ 0.4032, -0.1006, -0.1706],
        [ 0.2362, -0.1486, -0.1505]], grad_fn=<AddmmBackward0>)
Loss: tensor(1.0649, grad_fn=<NllLossBackward0>)


## 5. Train the model :

### 5.1 Prepare data :

In [32]:
from torch.utils.data import DataLoader
from transformers import CamembertTokenizer

# Instantiate the tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Load the train and validation splits
train_dataset = XNLIDataset(split="train", language="fr", tokenizer=tokenizer, cache_directory="../data/xnli", max_length=128)
val_dataset = XNLIDataset(split="validation", language="fr", tokenizer=tokenizer, cache_directory="../data/xnli", max_length=128)

# Create the DataLoaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)


### 5.2 Configure model :

In [33]:
# Load the CamemBERT model with a trainable encoder
base_model = CamemBERTBaseModel(model_path="../../models/4gb_oscar", trainable=True)

# Build the complete model with the NLI head
model = NLIFinetuningModel(base_model=base_model, num_labels=3)

### 5.3 Optimizer and LossFunction :

In [34]:
from torch.optim import AdamW

# Optimizer over all parameters of the complete model
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loss function
criterion = nn.CrossEntropyLoss()

### 5.4 Training configuration :

In [25]:
# device = torch.device("cude" if torch.cuda.is_available() else "cpu")
# model.to(device)

# def train_for_one_epoch():
#     model.train() # activant le dropout et la normalisation par exemple 
#     running_loss = 0.0
#     for batch in train_loader:
#         input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["label"].to(device)
#         #optimizer.zero_grad()

#         # Forward pass
#         output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss, logits = output["loss"], output["logits"]
#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item() # loss.item() retourne la valeur de la loss

#     return running_loss / len(train_loader) # we devide by the number of batchs to get the average loss

In [26]:
# from tqdm import tqdm
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# def train_one_epoch(model, loader, optimizer, criterion, device):
#     model.train()
#     running_loss = 0.0

#     for batch in tqdm(loader, desc="Training"):
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         # Forward pass
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs["loss"]

#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#     return running_loss / len(loader)


# def evaluate(model, loader, criterion, device):
#     model.eval()
#     running_loss = 0.0
#     correct_predictions = 0
#     total_predictions = 0

#     with torch.no_grad():
#         for batch in tqdm(loader, desc="Validation"):
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["label"].to(device)

#             # Forward pass
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs["loss"]
#             logits = outputs["logits"]

#             running_loss += loss.item()

#             # Prédictions
#             predictions = torch.argmax(logits, dim=-1)
#             correct_predictions += (predictions == labels).sum().item()
#             total_predictions += labels.size(0)

#     accuracy = correct_predictions / total_predictions
#     return running_loss / len(loader), accuracy

In [37]:
# nb_epochs = 2

# for epoch in tqdm(range(nb_epochs)):
#     print(f"Epoch {epoch + 1}/{nb_epochs}")

#     # Entraînement
#     train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
#     print(f"Training Loss: {train_loss:.4f}")

#     # Validation
#     val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
#     print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

In [None]:
import torch
from tqdm import tqdm
import time

def _move_batch(batch, device):
    """Return ``(input_ids, attention_mask, labels)`` taken from ``batch`` and moved to ``device``."""
    return (
        batch["input_ids"].to(device),
        batch["attention_mask"].to(device),
        batch["label"].to(device),
    )


def _batch_loss(outputs, labels, criterion):
    """Loss of one batch: ``criterion`` on the logits, or the model's own loss when ``criterion`` is None."""
    if criterion is None:
        return outputs["loss"]
    return criterion(outputs["logits"], labels)


def _train_epoch(model, loader, optimizer, criterion, device, log_interval):
    """Run one optimization pass over ``loader`` and return the mean batch loss (NaN if the loader is empty)."""
    model.train()
    num_steps = len(loader)
    running_loss = 0.0

    for step, batch in enumerate(tqdm(loader, desc="Training"), 1):
        input_ids, attention_mask, labels = _move_batch(batch, device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = _batch_loss(outputs, labels, criterion)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # A falsy log_interval (0 or None) disables the periodic logs but keeps the final one.
        if (log_interval and step % log_interval == 0) or step == num_steps:
            avg_loss = running_loss / step
            print(f"Step {step}/{num_steps} - Training Loss: {avg_loss:.4f}")

    return running_loss / num_steps if num_steps else float("nan")


def _evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and return ``(mean batch loss, accuracy)`` (NaN for an empty loader)."""
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            input_ids, attention_mask, labels = _move_batch(batch, device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            running_loss += _batch_loss(outputs, labels, criterion).item()

            predictions = torch.argmax(outputs["logits"], dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    num_batches = len(loader)
    avg_loss = running_loss / num_batches if num_batches else float("nan")
    accuracy = correct_predictions / total_predictions if total_predictions else float("nan")
    return avg_loss, accuracy


def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    device,
    num_epochs=2,
    log_interval=1000,
    scheduler=None,
    save_best_model=True,
    checkpoint_path="best_model.pth",
):
    """
    Train a model with metric tracking and a validation pass after every epoch.

    The model is expected to return a dict with "logits" and "loss" when called
    with ``input_ids``, ``attention_mask`` and ``labels``; batches are dicts with
    the keys "input_ids", "attention_mask" and "label".

    Args:
        model (nn.Module): Model to train.
        train_loader (DataLoader): Loader for the training data.
        val_loader (DataLoader): Loader for the validation data.
        optimizer (Optimizer): Optimizer to use.
        criterion (callable or None): Loss applied as ``criterion(logits, labels)``.
            When None, the loss computed by the model itself is used.
        device (torch.device): Training device (CPU or GPU).
        num_epochs (int): Number of training epochs.
        log_interval (int): Number of steps between training-loss logs; 0 or None
            logs only at the end of each epoch.
        scheduler (torch.optim.lr_scheduler, optional): Learning-rate scheduler,
            stepped once per epoch. ``ReduceLROnPlateau`` receives the validation
            loss; any other scheduler is stepped without arguments.
        save_best_model (bool): Save the model whenever the validation loss improves.
        checkpoint_path (str): Path of the saved state dict.

    Returns:
        dict: Per-epoch histories under "train_loss", "val_loss" and "val_accuracy".
    """
    model.to(device)

    history = {
        "train_loss": [],
        "val_loss": [],
        "val_accuracy": [],
    }
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        print(f"\n=== Epoch {epoch + 1}/{num_epochs} ===")
        start_time = time.time()

        # Training
        avg_train_loss = _train_epoch(model, train_loader, optimizer, criterion, device, log_interval)
        history["train_loss"].append(avg_train_loss)

        # Validation
        avg_val_loss, val_accuracy = _evaluate(model, val_loader, criterion, device)
        history["val_loss"].append(avg_val_loss)
        history["val_accuracy"].append(val_accuracy)

        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

        # Only ReduceLROnPlateau takes a metric; for other schedulers a positional
        # argument to step() would be interpreted as an epoch index.
        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

        # Keep the checkpoint with the lowest validation loss (a NaN loss never qualifies).
        if save_best_model and avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Best model saved to {checkpoint_path}")

        # Wall-clock time of the epoch
        end_time = time.time()
        epoch_time = end_time - start_time
        print(f"Epoch {epoch + 1} completed in {epoch_time:.2f} seconds")

    return history

In [30]:
 # model summary
from torchinfo import summary
from torch.utils.data import DataLoader
from transformers import CamembertForMaskedLM, CamembertTokenizer, TrainingArguments, AdamW, Trainer , CamembertConfig
from datasets import load_from_disk
from dataset import OscarDataset

# NOTE(review): in the recorded run this cell stopped at `from dataset import OscarDataset`
# (ModuleNotFoundError), so the statements below did not execute.
# NOTE(review): this cell rebinds the notebook-level names `tokenizer`, `config` and `model`.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
# === Model === #
#config = CamembertConfig(
 #    vocab_size=tokenizer.vocab_size)  # Adjust to match your tokenizer's vocab size
#     hidden_size=768,                 # Hidden size (RoBERTa_BASE)
#     num_hidden_layers=12,            # Number of transformer layers
#     num_attention_heads=12,          # Number of attention heads
#     intermediate_size=3072,          # FFN inner hidden size
#     hidden_dropout_prob=0.1,         # Dropout probability
#     attention_probs_dropout_prob=0.1, # Attention dropout probability
#     max_position_embeddings=514,     # Maximum sequence length + special tokens
#     type_vocab_size=1,               # No token type embeddings
#     initializer_range=0.02           # Standard deviation for weight initialization
# )

# Configuration built without arguments, i.e. from the library defaults.
config = CamembertConfig()
print(config)
# Initialize a randomly weighted CamembertForMaskedLM model
model = CamembertForMaskedLM(config) 
#model.to("cuda")

print("Model initialized")
print(model)




ModuleNotFoundError: No module named 'dataset'

In [35]:
from huggingface_hub import whoami
# Ask the Hugging Face Hub which account the locally available token belongs to.
# NOTE(review): the recorded run raised LocalTokenNotFoundError, i.e. no token was
# found; log in first (see the error message) before running this cell.
user_info = whoami()
print(f"Vous êtes connecté en tant que : {user_info['name']}")

LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [None]:
import torch
from transformers import (
    CamembertTokenizer,
    CamembertForSequenceClassification,
    CamembertForTokenClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset

class CamemBERTFineTuner:
    """Helper bundling a CamemBERT tokenizer and task model with data loading, training and prediction."""

    def __init__(self, model_path, num_labels, task="classification"):
        """
        Load the tokenizer and the task-specific model.

        :param model_path: Path or identifier passed to ``from_pretrained``.
        :param num_labels: Number of output labels of the task head.
        :param task: "classification" (e.g., NLI) or "token_classification" (e.g., NER).
        :raises ValueError: If ``task`` is not one of the supported values.
        """
        # Validate first so an unsupported task fails before anything is loaded.
        if task not in ("classification", "token_classification"):
            raise ValueError("Unsupported task. Use 'classification' or 'token_classification'.")

        self.model_path = model_path
        self.tokenizer = CamembertTokenizer.from_pretrained(model_path)
        self.task = task

        # Load the model matching the task
        if task == "classification":  # e.g., NLI
            self.model = CamembertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
        else:  # "token_classification", e.g., NER
            self.model = CamembertForTokenClassification.from_pretrained(model_path, num_labels=num_labels)

    def tokenize_function(self, examples, text_column="text"):
        """
        Tokenize a batch of examples.

        :param examples: Mapping from column name to a list of values.
        :param text_column: Name of the column holding the text (default: "text").
        :return: Tokenizer output for the batch.
        """
        return self.tokenizer(
            examples[text_column],
            padding=True,
            truncation=True,
            max_length=512
        )

    def load_and_preprocess_data(self, dataset_name, split, text_column, label_column):
        """
        Load a dataset split and prepare it for training.

        :param dataset_name: Name passed to ``load_dataset``.
        :param split: Split to load.
        :param text_column: Column to tokenize; it is removed afterwards.
        :param label_column: Column holding the labels; it is renamed to "labels".
        :return: Dataset formatted as torch tensors with "input_ids", "attention_mask" and "labels".
        """
        dataset = load_dataset(dataset_name, split=split)
        dataset = dataset.map(
            self.tokenize_function,
            batched=True,
            # Tokenize the column the caller asked for rather than a fixed "text".
            fn_kwargs={"text_column": text_column},
            remove_columns=[text_column]
        )
        # Renaming a column onto its own name is rejected, so skip it when not needed.
        if label_column != "labels":
            dataset = dataset.rename_column(label_column, "labels")
        dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        return dataset

    def train(self, train_dataset, eval_dataset, output_dir, batch_size=8, epochs=3, lr=5e-5):
        """
        Fine-tune the model with the Hugging Face ``Trainer``.

        :param train_dataset: Training dataset.
        :param eval_dataset: Evaluation dataset, evaluated once per epoch.
        :param output_dir: Directory for checkpoints; logs go to ``<output_dir>/logs``.
        :param batch_size: Per-device batch size for training and evaluation.
        :param epochs: Number of training epochs.
        :param lr: Learning rate.
        """
        # NOTE(review): newer transformers releases rename `evaluation_strategy` to
        # `eval_strategy` and Trainer's `tokenizer` to `processing_class` — confirm
        # against the installed version.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            logging_steps=10,
            save_total_limit=2
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer
        )

        trainer.train()

    def predict(self, texts):
        """
        Run the model on new texts.

        :param texts: A string or a list of strings.
        :return: Logits produced by the model.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        # Training may have moved the model to another device; inputs must follow it.
        inputs = inputs.to(self.model.device)
        # Disable dropout so predictions are deterministic.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.logits


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from torch import nn
from transformers import CamembertModel

class CamemBERTBaseModel(nn.Module):
    """
    Minimal wrapper exposing the last hidden states of a pre-trained ``CamembertModel``.

    NOTE(review): a class with this same name is defined earlier in the notebook;
    running this cell rebinds the name to this version, which takes no
    ``trainable`` argument.
    """

    def __init__(self, model_path: str):
        """
        Load the pre-trained encoder.
        :param model_path: Path to the pre-trained CamemBERT model.
        """
        super().__init__()
        encoder = CamembertModel.from_pretrained(model_path)
        self.base_model = encoder

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Encode a batch and return the encoder's final layer output.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :return: The ``last_hidden_state`` of the encoder output.
        """
        return self.base_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

    def get_hidden_size(self) -> int:
        """
        Report the encoder width, used to size heads attached on top.
        :return: ``hidden_size`` from the loaded model's configuration.
        """
        encoder_config = self.base_model.config
        return encoder_config.hidden_size


In [None]:
class NLIFinetuningModel(nn.Module):
    """Encoder plus NLI head, with the head's input width passed explicitly."""

    def __init__(self, base_model: nn.Module, hidden_size: int, num_labels: int = 3):
        """
        Assemble the encoder and its classification head.
        :param base_model: Module returning hidden states for ``input_ids``/``attention_mask``.
        :param hidden_size: Width of the base model's output vectors.
        :param num_labels: Number of NLI classes (default: 3).
        """
        super().__init__()
        # Encoder first, then the task-specific head on top of it.
        self.base_model = base_model
        self.nli_head = NLIHead(hidden_size, num_labels)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        labels: Tensor = None,
    ) -> dict:
        """
        Compute logits and, when labels are given, the cross-entropy loss.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :param labels: Optional tensor of class indices (batch_size).
        :return: Dict with "logits" and "loss" ("loss" is None when no labels are passed).
        """
        sequence_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Position 0 of every sequence: (batch_size, hidden_size)
        first_token = sequence_output[:, 0, :]

        # Classification scores: (batch_size, num_labels)
        logits = self.nli_head(first_token)

        if labels is None:
            return {"logits": logits, "loss": None}

        loss = nn.CrossEntropyLoss()(logits, labels)
        return {"logits": logits, "loss": loss}


In [None]:
class NLIFinetuningModel(nn.Module):
    """Base encoder followed by an NLI head sized from the encoder's hidden size."""

    def __init__(self, base_model: CamemBERTBaseModel, num_labels: int = 3):
        """
        Assemble the encoder and its classification head.
        :param base_model: Base model instance; ``get_hidden_size()`` sets the head's input width.
        :param num_labels: Number of NLI classes.
        """
        super().__init__()
        self.base_model = base_model
        encoder_width = base_model.get_hidden_size()
        self.nli_head = NLIHead(encoder_width, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        """
        Compute logits and, when labels are given, the cross-entropy loss.
        :param input_ids: Tensor of token IDs.
        :param attention_mask: Tensor of attention masks.
        :param labels: Optional tensor of class indices (batch_size).
        :return: Dict with "logits" and "loss" ("loss" is None when no labels are passed).
        """
        # Hidden states returned by the base model
        sequence_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Head applied to the vector at position 0: (batch_size, num_labels)
        logits = self.nli_head(sequence_output[:, 0, :])

        result = {"logits": logits, "loss": None}
        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            result["loss"] = criterion(logits, labels)
        return result

In [36]:
# Toy premise/hypothesis pairs with their labels.
premises = ["Le ciel est bleu.", "Marie mange une pomme."]
hypotheses = ["Le ciel est vert.", "Marie est au marché."]
labels = [2, 1]  # Contradiction, Neutral

# NOTE(review): no `NLIDataset` definition is visible in this part of the notebook,
# and the recorded run failed with NameError — define or import it before this cell.
dataset = NLIDataset(premises, hypotheses, labels, model_path="camembert-base")
print(dataset[0])  # First example

NameError: name 'NLIDataset' is not defined