In [110]:
import torch
import torch.nn as nn
import math
from typing import List, Optional, Tuple, Union
from packaging import version
from transformers.utils import logging
import torch.nn.functional as F
from dataclasses import dataclass


@dataclass
class CamembertConfig:
    """
    This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. 
    It defines the model architecture and is used to instantiate a Camembert model with specified arguments.
    """

    vocab_size: int = 32005
    hidden_size: int = 768
    num_hidden_layers: int = 12
    num_attention_heads: int = 12
    intermediate_size: int = 3072
    hidden_act: str = "gelu"
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    max_position_embeddings: int = 514
    type_vocab_size: int = 1
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-05
    pad_token_id: int = 1
    bos_token_id: int = 0
    eos_token_id: int = 2
    position_embedding_type: str = "absolute"
    use_cache: bool = True
    head_type: str = "MLM"
    classifier_dropout: float = None


class CamembertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_ids = torch.arange(config.max_position_embeddings).unsqueeze(0)  # Shape (1, max_position_embeddings)
        self.token_type_ids = torch.zeros_like(self.position_ids, dtype=torch.long)  # Shape (1, max_position_embeddings)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length].to(input_ids.device)
        if token_type_ids is None:
            token_type_ids = self.token_type_ids[:, :seq_length].expand(batch_size, seq_length).to(input_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeds + token_type_embeds

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings
    

class CamembertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        batch_size, seq_length, hidden_size = x.size()
        x = x.view(batch_size, seq_length, self.num_attention_heads, self.attention_head_size)
        return x.permute(0, 2, 1, 3)  # [batch, num_heads, seq_len, head_size]

    def forward(self, hidden_states, attention_mask=None):
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores /= math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, seq_len]
            attention_scores += attention_mask

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(hidden_states.size(0), -1, self.all_head_size)

        return context_layer
    

class CamembertMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.hidden_size, config.intermediate_size)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x


class CamembertBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = CamembertSelfAttention(config)  # Multi-Head Attention
        self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # LayerNorm après l'attention
        self.mlp = CamembertMLP(config)  # Feed Forward Network (MLP)
        self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # LayerNorm après le MLP

    def forward(self, hidden_states, attention_mask=None):
        # Multi-Head Attention avec résiduel
        attention_output = self.attention(hidden_states, attention_mask)
        hidden_states = hidden_states + attention_output  # Résiduel
        hidden_states = self.ln_1(hidden_states)  # Normalisation

        # Feed Forward Network (MLP) avec résiduel
        mlp_output = self.mlp(hidden_states)
        hidden_states = hidden_states + mlp_output  # Résiduel
        hidden_states = self.ln_2(hidden_states)  # Normalisation

        return hidden_states


class Camembert(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings = CamembertEmbeddings(config)
        self.encoder = nn.ModuleList([CamembertBlock(config) for _ in range(config.num_hidden_layers)])
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids, attention_mask=None):
        hidden_states = self.embeddings(input_ids)
        for block in self.encoder:
            hidden_states = block(hidden_states, attention_mask)
        hidden_states = self.final_layer_norm(hidden_states)
        logits = self.lm_head(hidden_states)
        return logits
    

model = Camembert(CamembertConfig())

In [1]:
import os
import pickle   
from datasets import load_from_disk
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
import torch.nn as nn

from tqdm import tqdm
import time

In [2]:
import os
import sys
os.chdir(os.path.abspath("../../../"))
os.getcwd()

'c:\\Users\\Napster\\Desktop\\M2_ISI\\MLA\\CamemBERT\\MLA-CamemBERT'

# 1. Prepare Data :

In [6]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

In [5]:
class Tokenized_oscar_dataset(Dataset):
    def __init__(self, raw_tokenised_data):
        """
        Initialiser le dataset avec les données sous forme de dictionnaire.
        """
        self.raw_data = raw_tokenised_data  # Le dictionnaire avec 'input_ids', 'attention_mask', et 'labels'
        self.data = {
            "texts": self.raw_data['text'],
            "input_ids": self.raw_data['input_ids'],
            "attention_mask": self.raw_data['attention_mask'],
            "labels": self.raw_data['labels']
        }
        
    def __len__(self):
        """
        Retourne le nombre d'exemples dans le dataset.
        """
        return len(self.data['input_ids'])
    
    def __getitem__(self, idx):
        """
        Retourne un exemple individuel sous forme de dictionnaire.
        """
        return {
            'input_ids': torch.tensor(self.data['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.data['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor(self.data['labels'][idx], dtype=torch.long)
        }

    
def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked tokens for MLM."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace 80% of masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of masked tokens with random words
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels


def preprocess_function(texts, tokenizer, max_length=512):
    """
    Préparer les données pour l'entraînement MLM.
    :param texts: Liste de textes.
    :param tokenizer: Tokenizer de CamemBERT.
    :return: Dictionnaire avec 'input_ids', 'attention_mask', et 'labels'.
    """
    # Tokeniser les textes
    tokenized = tokenizer(texts, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Appliquer le masquage
    masked_inputs, labels = mask_tokens(input_ids, tokenizer)

    return { 
        "text": list(texts),
        "input_ids": masked_inputs.tolist(),
        "attention_mask": attention_mask.tolist(),
        "labels": labels.tolist()
    }

load the downlaoded Oscar :

In [13]:
dataset_path ="data/oscar.Arrow"
mini_oscar_dataset = load_from_disk(dataset_path)
train_texts, val_texts = train_test_split(mini_oscar_dataset['text'], test_size=0.2, random_state=42)

In [15]:
train_data = preprocess_function(train_texts, tokenizer)
val_data = preprocess_function(val_texts, tokenizer)

In [17]:
import pickle

def save_tokenized_data(data, filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as f:
        pickle.dump(data, f)

save_tokenized_data(train_data, "data/tokenized_data/tokenized_train_data.pkl")
save_tokenized_data(val_data, "data/tokenized_data/tokenized_val_data.pkl")

In [18]:
def load_tokenized_data(filename):
    with open(filename, "rb") as f:
        data = pickle.load(f)
    return data

train_data = load_tokenized_data("data/tokenized_data/tokenized_train_data.pkl")
val_data = load_tokenized_data("data/tokenized_data/tokenized_val_data.pkl")

In [19]:
# Créer les datasets
train_dataset = Tokenized_oscar_dataset(train_data)
val_dataset = Tokenized_oscar_dataset(val_data)

# Créer les DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8192, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8192, shuffle=False)

In [20]:
# tester la donnée :
batch = next(iter(train_loader))
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)


print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
Training samples: 8, Validation samples: 2


# 2. Model :

In [30]:
def init_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        # print(f"Initializing {module.__class__.__name__} with Xavier Uniform")
        nn.init.xavier_uniform_(module.weight)
    elif isinstance(module, nn.LayerNorm):
        # print(f"Initializing {module.__class__.__name__} with Constant Weights")
        nn.init.constant_(module.bias, 0)
        nn.init.constant_(module.weight, 1)
    if hasattr(module, 'bias') and module.bias is not None:
        # print(f"Initializing Bias for {module.__class__.__name__}")
        nn.init.constant_(module.bias, 0)

# 3. Train :

In [31]:
def save_checkpoint(epoch, model, optimizer, loss, save_dir):
    """
    Sauvegarder l'état actuel du modèle, optimiseur et perte.
    """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch}.pth")
    torch.save(checkpoint, save_path)
    print(f"Checkpoint saved at {save_path}")

In [42]:
# 1. data :
batch_size = 4096
train_data = load_tokenized_data("data/tokenized_data/tokenized_train_data.pkl")
val_data = load_tokenized_data("data/tokenized_data/tokenized_val_data.pkl")

train_dataset = Tokenized_oscar_dataset(train_data)
val_dataset = Tokenized_oscar_dataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [104]:
# Hugging Face Model :  
from transformers import CamembertForMaskedLM
hf_model = CamembertForMaskedLM.from_pretrained("camembert-base")
# hf_model.apply(init_weights)
print()

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





training loop :

In [None]:
# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

max_lr = 6e-4
min_lr = max_lr * 0.1  # 10 %
warmup_steps = 10

save_interval = 40  # Save the model every 40 epochs 
eval_interval = 40  # Evaluate the model every 40 epochs
num_epochs = 420  # Total epochs
learning_rate = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr=learning_rate)
history = {'train_loss': [], 'val_loss': []}

# Dossier de sauvegarde
save_dir = "models/mlm_training/model_checkpoints"
os.makedirs(save_dir, exist_ok=True)

############################## Training Loop ################################
for epoch in range(num_epochs):
    model.train()  # Mettre le modèle en mode entraînement
    train_loss = 0.0  # Accumulateur pour la perte par époque

    t0 = time.time()
    
    # once in a while evaluate our validation loss : (only for one step, next we'll consider more steps validation)
    if epoch % 40 ==0 or epoch == num_epochs - 1:
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            for batch_index, batch in enumerate(val_loader):
                # 1. Prepare val data :
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # 2. Forward pass :
                # optimizer.zero_grad() not needed since torch.no_grad is used 
                logits = model(input_ids, attention_mask=attention_mask)['logits']
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

                val_loss += loss.item()
            avg_val_loss = val_loss / len(val_loader)
            print(f"Epoch {epoch + 1 }, Validation Loss: {avg_val_loss:.4f}")
            history['val_loss'].append(avg_val_loss)

    # Update the parameters of the model :
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}") as pbar:
        for batch_index, batch in enumerate(train_loader):
            # 1. Prepare trin data :
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 2. Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

            # 3. Backward pass et optimisation
            loss.backward()
            optimizer.step()

            # Suivi de la progression
            train_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})
            pbar.update(1)
        
    avg_train_loss = train_loss / len(train_loader)

    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Epoch time: {dt:.2f} s")

    # Sauvegarder la perte moyenne de l'époque
    history['train_loss'].append(avg_train_loss)

    # Sauvegarder tous les 40 epochs
    if (epoch + 1) % save_interval == 0 or (epoch + 1) == num_epochs:
        save_checkpoint(epoch + 1, model, optimizer, avg_train_loss, save_dir)

___

In [90]:
# 1. data :
batch_size = 2
train_data = load_tokenized_data("data/tokenized_data/tokenized_train_data.pkl")
val_data = load_tokenized_data("data/tokenized_data/tokenized_val_data.pkl")

train_dataset = Tokenized_oscar_dataset(train_data)
val_dataset = Tokenized_oscar_dataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

batch = next(iter(val_loader))
batch['input_ids'].shape

torch.Size([2, 512])

test our model :

In [108]:
# Our Model : 
model_from_scratch = Camembert(CamembertConfig()) 
model = model_from_scratch

In [109]:
batch  = next(iter(train_loader))

input_ids = batch['input_ids']  # Les ID des tokens
attention_mask = batch["attention_mask"]  # Masque d'attention

print("Input IDs:", input_ids.shape)
print("Attention Mask:", attention_mask.shape)

# Étape 3 : Passer les données dans le modèle
logits = model(input_ids, attention_mask)



# Étape 4 : Interpréter les résultats
print("Logits Shape:", logits.shape)  # Devrait être (batch_size, seq_length, vocab_size)

# Étape 5 : Décoder les prédictions pour voir les tokens les plus probables
predicted_tokens = torch.argmax(logits, dim=-1)
print(torch.tensor(predicted_tokens).shape)
decoded_tokens = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_tokens]

print("Predicted Tokens:", decoded_tokens)

Input IDs: torch.Size([2, 512])
Attention Mask: torch.Size([2, 512])
Logits Shape: torch.Size([2, 512, 32005])
torch.Size([2, 512])
Predicted Tokens: ['Roland Roland Roland Roland Roland FORMATION règlementation intéressantes Rolandrine Roland Roland FORMATIONrine règlementation Roland Roland es règlementation Roland Roland Roland Roland Roland Roland Roland règlementation Roland règlementation Eventrine Roland Roland Roland rêvé intéressantes Rolandrine FORMATION FORMATION Roland Roland Roland Roland règlementation Roland Roland Roland Roland FORMATION FORMATION Roland Roland règlementation Roland physiques Roland FORMATION Roland bd Roland Roland Roland Roland Roland Roland Roland Roland Roland Roland Rolandrine Roland rêvé Roland FORMATION Roland intéressantes règlementation Roland règlementation FORMATION règlementation Roland Roland Roland Roland Roland règlementation Roland FORMATION es FORMATION Rolandrine Rolandrine FORMATION règlementationrine Roland Roland Roland Roland FORMA

  print(torch.tensor(predicted_tokens).shape)


train our own model :

In [98]:
# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

max_lr = 6e-4
min_lr = max_lr * 0.1  # 10 %
warmup_steps = 10

save_interval = 40  # Save the model every 40 epochs 
eval_interval = 40  # Evaluate the model every 40 epochs
num_epochs = 3 # 420  # Total epochs
learning_rate = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr=learning_rate)
history = {'train_loss': [], 'val_loss': []}

# Dossier de sauvegarde
save_dir = "models/mlm_training/model_checkpoints"
os.makedirs(save_dir, exist_ok=True)

############################## Training Loop ################################
for epoch in range(num_epochs):
    model.train()  # Mettre le modèle en mode entraînement
    train_loss = 0.0  # Accumulateur pour la perte par époque

    t0 = time.time()
    
    # once in a while evaluate our validation loss : (only for one step, next we'll consider more steps validation)
    if epoch % 40 ==0 or epoch == num_epochs - 1:
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            for batch_index, batch in enumerate(val_loader):
                # 1. Prepare val data :
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # 2. Forward pass :
                # optimizer.zero_grad() not needed since torch.no_grad is used 
                logits = model(input_ids, attention_mask=attention_mask)
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

                val_loss += loss.item()
            avg_val_loss = val_loss / len(val_loader)
            print(f"Epoch {epoch + 1 }, Validation Loss: {avg_val_loss:.4f}")
            history['val_loss'].append(avg_val_loss)

    # Update the parameters of the model :
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}") as pbar:
        for batch_index, batch in enumerate(train_loader):
            # 1. Prepare trin data :
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 2. Forward pass
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask)
    
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

            # 3. Backward pass et optimisation
            loss.backward()
            optimizer.step()

            # Suivi de la progression
            train_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})
            pbar.update(1)
        
    avg_train_loss = train_loss / len(train_loader)

    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Epoch time: {dt:.2f} s")

    # Sauvegarder la perte moyenne de l'époque
    history['train_loss'].append(avg_train_loss)

    # Sauvegarder tous les 40 epochs
    if (epoch + 1) % save_interval == 0 or (epoch + 1) == num_epochs:
        save_checkpoint(epoch + 1, model, optimizer, avg_train_loss, save_dir)

Epoch 1, Validation Loss: 10.5864


Epoch 1/3: 100%|██████████| 4/4 [00:16<00:00,  4.03s/it, loss=9.79]


Epoch 1, Training Loss: 10.1843, Epoch time: 17.36 s


Epoch 2/3: 100%|██████████| 4/4 [00:18<00:00,  4.52s/it, loss=8.98]


Epoch 2, Training Loss: 9.1851, Epoch time: 18.08 s
Epoch 3, Validation Loss: 9.2178


Epoch 3/3: 100%|██████████| 4/4 [00:16<00:00,  4.08s/it, loss=8.78]


Epoch 3, Training Loss: 8.7238, Epoch time: 17.69 s
Checkpoint saved at models/mlm_training/model_checkpoints\checkpoint_epoch_3.pth


In [106]:
model = hf_model

batch  = next(iter(train_loader))

input_ids = batch['input_ids']  # Les ID des tokens
attention_mask = batch["attention_mask"]  # Masque d'attention

print("Input IDs:", input_ids.shape)
print("Attention Mask:", attention_mask.shape)

# Étape 3 : Passer les données dans le modèle
logits = model(input_ids, attention_mask).logits



# Étape 4 : Interpréter les résultats
print("Logits Shape:", logits.shape)  # Devrait être (batch_size, seq_length, vocab_size)

# Étape 5 : Décoder les prédictions pour voir les tokens les plus probables
predicted_tokens = torch.argmax(logits, dim=-1)
print(torch.tensor(predicted_tokens).shape)
decoded_tokens = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_tokens]


print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
print("Predicted Tokens:", decoded_tokens)

Input IDs: torch.Size([2, 512])
Attention Mask: torch.Size([2, 512])
Logits Shape: torch.Size([2, 512, 32005])
torch.Size([2, 512])
ne pouvez pas directement coller images. Envoyez-les depuis votre ordinateur ou insérez- depuis une URL.
Predicted Tokens: ['Vous ne pouvez pas directement coller les images. Envoyez-les depuis votre ordinateur ou insérez- les depuis une URL.', "La troisième prestation est une revue ponctuelle ou continue de la qualité du codage. Cette activité est complémentaire à l’audit du codage, mais elle ne le remplace pas: l’audit annuel non obligatoire, est réalisé par un organisme certifié mandaté pour vérifier que le travail est exempt d’erreurs, «erreur comprise » , compris strictement dans le sens de «fraudes». En re ant la qualité », nous nous rons une analyse détaillée des dossiers en vue d' identifier s »il y a ddes erreurs ou des oublis justifiant, le cas échéant, une refacturation. L'optimisation du codage: 3 étapes: < SERVICES ANALYSE DU POTENTIEL Dans ce

  print(torch.tensor(predicted_tokens).shape)
