In [8]:
import sys, os
from dotenv import load_dotenv
load_dotenv()

project_root = os.getenv("ROOT_DIR")
os.chdir(project_root)
print(os.getcwd())

C:\Users\Napster\Desktop\M2_ISI\MLA\CamemBERT\MLA-CamemBERT


In [9]:
import torch
from torch.utils.data import DataLoader

from transformers import CamembertTokenizer, CamembertForMaskedLM, logging
from datasets import load_from_disk

from src.dataset.oscar_dataset import OscarDataset

In [6]:
# Charger le dataset Hugging Face sauvegardé
mini_oscar_path = os.path.abspath("../../data/CamemBERT/data/mini_oscar/mini_dataset.arrow")
hf_dataset = load_from_disk(mini_oscar_path)

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

dataset = OscarDataset(hf_dataset, tokenizer, max_length=512)
dataloader = DataLoader(dataset, batch_size=8)

# Boucle pour vérifier les données
for batch in dataloader:
    print(batch["input_ids"].shape)  # Shape : (batch_size, max_length)
    print(batch["attention_mask"].shape)  # Shape : (batch_size, max_length)
    print(batch['input_ids'][0])
    print(batch['attention_mask'][0])
    break

# Let's test CamemBERT :

In [19]:
logging.set_verbosity_error()

# Vérifier si un GPU est disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Charger le tokenizer et le modèle
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model = model.to(device)

# 2. Charger le dataset
data_path = os.path.abspath("../../data/CamemBERT/data/mini_oscar/mini_dataset.arrow")
hf_dataset = load_from_disk(data_path)

# 3. Créer le DataLoader
oscar_dataset = OscarDataset(hf_dataset, tokenizer, max_length=512)
dataloader = DataLoader(oscar_dataset, batch_size=4)  #  shuffle=True

Using device: cuda
model_loaded
dataset loaded
datalaoder created


In [20]:
batch = next(iter(dataloader))

In [None]:
# 4. Tester le modèle avec un batch
model.eval()  # Mode évaluation
for batch in dataloader:
    input_ids = batch["input_ids"].to(device)  # (B, 512)
    attention_mask = batch["attention_mask"].to(device)  # (B, 512)

    # Passer le batch dans le modèle
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Afficher les résultats
    print("Logits shape:", outputs.logits.shape)  # (B, 512, vocab_size)
    break  # Une seule itération pour tester


In [33]:
print(model)

CamembertForMaskedLM(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
       

In [32]:
# Create dummy inputs to check input-output shapes
batch_size = 2
seq_length = 512
input_ids = torch.randint(0, model.config.vocab_size, (batch_size, seq_length))
attention_mask = torch.ones_like(input_ids)

# Pass dummy inputs through the model
outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Print the shapes of the outputs
print("\nOutput Shapes:")
print(f"Logits: {outputs.logits.shape}")  # Logits for masked LM


Output Shapes:
Logits: torch.Size([2, 512, 32005])


# CamembertForMaskedLM Architecture with Input and Output Shapes

### **Model Input:**
- **Input Shape:** `(batch_size, sequence_length)`

---

### **Embeddings:**
1. **Word Embeddings**: 
   - **Input:** `(batch_size, sequence_length)` (token IDs, vocab size = 32,005)
   - **Output:** `(batch_size, sequence_length, 768)` (embedding dimension)

2. **Position Embeddings**:
   - **Input:** `(batch_size, sequence_length)` (positions in sequence, max = 514)
   - **Output:** `(batch_size, sequence_length, 768)` (embedding dimension)

3. **Token Type Embeddings**:
   - **Input:** `(batch_size, sequence_length)` (token type IDs, type size = 1)
   - **Output:** `(batch_size, sequence_length, 768)` (embedding dimension)

4. **Layer Normalization**:
   - **Input:** `(batch_size, sequence_length, 768)`
   - **Output:** `(batch_size, sequence_length, 768)`

5. **Dropout**:
   - **Input:** `(batch_size, sequence_length, 768)`
   - **Output:** `(batch_size, sequence_length, 768)`

---

### **Encoder (CamembertEncoder):**
- Composed of **12 CamembertLayer** modules.

**For each CamembertLayer:**

#### **Attention (Self-Attention + Output):**
6. **Self-Attention Query, Key, Value:**
   - **Input:** `(batch_size, sequence_length, 768)`
   - **Output:** `(batch_size, sequence_length, 768)` (attention heads combined)

7. **Self-Attention Output (Dense + LayerNorm):**
   - **Input:** `(batch_size, sequence_length, 768)`
   - **Output:** `(batch_size, sequence_length, 768)`

#### **Intermediate Layer:**
8. **Dense + GELU Activation:**
   - **Input:** `(batch_size, sequence_length, 768)`
   - **Output:** `(batch_size, sequence_length, 3072)` (intermediate dimension)

#### **Output Layer:**
9. **Dense + LayerNorm:**
   - **Input:** `(batch_size, sequence_length, 3072)`
   - **Output:** `(batch_size, sequence_length, 768)` (back to embedding dimension)

---

### **LM Head (CamembertLMHead):**
10. **Dense**:
    - **Input:** `(batch_size, sequence_length, 768)`
    - **Output:** `(batch_size, sequence_length, 768)`

11. **LayerNorm**:
    - **Input:** `(batch_size, sequence_length, 768)`
    - **Output:** `(batch_size, sequence_length, 768)`

12. **Decoder (Final Linear Layer):**
    - **Input:** `(batch_size, sequence_length, 768)`
    - **Output:** `(batch_size, sequence_length, 32005)` (logits for vocabulary)

---

### **Model Output:**
- **Logits Shape**: `(batch_size, sequence_length, 32005)` (vocabulary scores for each token in the sequence).


In [41]:
# Exemple avec un batch de deux phrases
texts = [
    "La programmation en [MASK] est fascinante et facile.",
    "J'aime apprendre avec [MASK]."
]

# Tokenisation
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Envoyer au modèle
with torch.no_grad():
    outputs = model(**inputs)

# Récupérer les logits
logits = outputs.logits  # Shape: (batch_size, sequence_length, vocab_size)

# Trouver les indices des tokens [MASK]
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)

# Extraire les logits uniquement pour les positions [MASK]
mask_logits = logits[mask_token_index]  # Shape: (num_masked_tokens, vocab_size)

# Prédictions pour chaque [MASK]
predicted_ids = mask_logits.argmax(dim=-1)  # Shape: (num_masked_tokens,)
predicted_tokens = tokenizer.decode(predicted_ids)

print("Tokens prédits pour les positions masquées :")
print(predicted_tokens)


Tokens prédits pour les positions masquées :



In [42]:
inputs

{'input_ids': tensor([[    5,    61,  4732,    22,   403,  3654,   229,   707,   374,    30,
         25094,    14,   811,     9,     6],
        [    5,   121,    11,   660,  1891,    42,   403,  3654,   229,   707,
          2805,     6,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [45]:
logits = outputs.logits
print(logits)
logits.shape

tensor([[[ 22.6741,  -3.2752,   6.6612,  ...,  -6.5251,  -2.7296,   5.9366],
         [  4.9040,  -3.4535,  15.1498,  ...,  -9.6906,  -0.9981,   7.8614],
         [  5.1116,  -3.5781,   3.1757,  ..., -15.3700, -10.0413,   0.2270],
         ...,
         [  3.2980,  -7.6721,   4.0967,  ...,  -6.3892,  -8.3578,   5.3333],
         [  5.6675,  -7.0110,   6.9816,  ...,  -5.7003,  -6.4600,   6.2704],
         [  3.8989,  -3.5776,  27.2475,  ...,  -9.1564,  -5.2341,   5.9207]],

        [[ 22.8516,  -3.5508,   8.0211,  ...,  -5.7155,  -2.0732,   6.0353],
         [ -2.1751,  -4.6412,  15.2779,  ..., -13.9153,  -0.7081,  -2.5885],
         [  2.0429,  -6.4475,   2.5401,  ..., -11.6387,  -7.5492,   1.1310],
         ...,
         [  7.7672,  -3.3566,  27.2912,  ...,  -8.2482,  -3.4188,   6.0584],
         [  7.7672,  -3.3566,  27.2912,  ...,  -8.2482,  -3.4188,   6.0584],
         [  7.7672,  -3.3566,  27.2912,  ...,  -8.2482,  -3.4188,   6.0584]]])


torch.Size([2, 15, 32005])

In [46]:
mask_token_index

(tensor([], dtype=torch.int64), tensor([], dtype=torch.int64))

In [47]:
tokenizer.mask_token_id

32004

In [48]:
mask_logits

tensor([], size=(0, 32005))

In [51]:
from transformers import CamembertTokenizer, CamembertForMaskedLM
import torch

# Charger le tokenizer et le modèle
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# Phrase avec un token masqué
text = "La programmation en <MASK> est fascinante."

# Tokenisation
inputs = tokenizer(text, return_tensors="pt")

# Inférence pour obtenir les logits
with torch.no_grad():
    outputs = model(**inputs)

# Récupérer les logits
logits = outputs.logits  # Shape: (batch_size, sequence_length, vocab_size)

# Trouver la position du token [MASK]
mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

# Extraire les logits pour le token [MASK]
mask_logits = logits[0, mask_token_index, :]  # Shape: (num_masked_tokens, vocab_size)

# Trouver l'ID du token prédit
predicted_token_id = mask_logits.argmax(dim=-1)

# Décoder l'ID pour obtenir le mot prédict
predicted_token = tokenizer.decode(predicted_token_id)

# Remplacer le token [MASK] par le mot prédit
reconstructed_text = text.replace(tokenizer.mask_token, predicted_token)

# Afficher les résultats
print(f"Phrase originale : {text}")
print(f"Mot prédit pour [MASK] : {predicted_token}")
print(f"Phrase reconstruite : {reconstructed_text}")


Phrase originale : La programmation en <MASK> est fascinante.
Mot prédit pour [MASK] : 
Phrase reconstruite : La programmation en <MASK> est fascinante.


In [50]:
# Charger le tokenizer et le modèle
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# Phrase avec un token masqué
text = "La programmation en <mask> est fascinante."

# Tokenisation
inputs = tokenizer(text, return_tensors="pt")

# Inférence pour obtenir les logits
with torch.no_grad():
    outputs = model(**inputs)

# Récupérer les logits
logits = outputs.logits  # Shape: (batch_size, sequence_length, vocab_size)

# Trouver la position du token <mask>
mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]

# Extraire les logits pour le token <mask>
mask_logits = logits[0, mask_token_index, :]  # Shape: (num_masked_tokens, vocab_size)

# Trouver l'ID du token prédit
predicted_token_id = mask_logits.argmax(dim=-1)

# Décoder l'ID pour obtenir le mot prédit
predicted_token = tokenizer.decode(predicted_token_id)

# Remplacer le token <mask> par le mot prédit
reconstructed_text = text.replace("<mask>", predicted_token)

# Afficher les résultats
print(f"Phrase originale : {text}")
print(f"Mot prédit pour <mask> : {predicted_token}")
print(f"Phrase reconstruite : {reconstructed_text}")


Phrase originale : La programmation en <mask> est fascinante.
Mot prédit pour <mask> : ligne
Phrase reconstruite : La programmation en ligne est fascinante.
