In [1]:
from transformers import CamembertConfig, CamembertForMaskedLM, AdamW, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling, CamembertTokenizer
from datasets import Dataset
import torch
import tqdm

In [5]:
from datasets import load_from_disk
from transformers import CamembertTokenizer

# Load the dataset from a local directory
dataset_path = "/home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow"  # Replace with the path to your saved dataset
hf_dataset = load_from_disk(dataset_path)

# Load the Camembert tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Define masking function
def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked tokens for MLM."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace 80% of masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of masked tokens with random words
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels

# Tokenization and masking
def preprocess_data(examples):
    tokenized = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    input_ids, labels = mask_tokens(tokenized["input_ids"], tokenizer)
    return {
        "input_ids": input_ids,
        "attention_mask": tokenized["attention_mask"],  # Include attention mask
        "labels": labels,
    }

# Preprocess the dataset
tokenized_dataset = hf_dataset.map(preprocess_data, batched=True, remove_columns=["text"])


Map:   0%|          | 0/1200000 [00:00<?, ? examples/s]

In [4]:
from transformers import DataCollatorForLanguageModeling

# Define a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # Enable MLM
    mlm_probability=0.15,  # Masking probability
)


NameError: name 'tokenizer' is not defined

In [None]:
from transformers import CamembertForMaskedLM

# Load CamembertForMaskedLM
model = CamembertForMaskedLM.from_pretrained("camembert-base")


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./camembert-mlm",  # Directory to save checkpoints
    overwrite_output_dir=True,  # Overwrite previous outputs
    evaluation_strategy="steps",  # Evaluate every `eval_steps`
    save_strategy="steps",  # Save checkpoint every `save_steps`
    per_device_train_batch_size=8,  # Batch size per GPU
    gradient_accumulation_steps=64,  # Effective batch size = 8 * 64 = 512
    learning_rate=1e-4,  # Learning rate from Camembert paper
    weight_decay=0.01,  # Weight decay for regularization
    warmup_steps=10000,  # Warmup steps
    max_steps=1000000,  # Total training steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=500,  # Log every 500 steps
    save_steps=10000,  # Save every 10,000 steps
    eval_steps=10000,  # Evaluate every 10,000 steps
    fp16=True,  # Enable mixed precision training
    num_train_epochs=5,  # Train for 5 epochs
)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Define a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # Enable MLM
    mlm_probability=0.15,  # Masking probability
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,  # Ensures proper tokenization and batching
    data_collator=data_collator,  # Handles attention_mask and MLM masking
)


In [None]:
trainer.train()



In [3]:
from torch.utils.data import DataLoader
from transformers import CamembertForMaskedLM, CamembertTokenizer, TrainingArguments, Trainer
from datasets import load_from_disk
from dataset import OscarDataset

# === Initialize Tokenizer and Dataset === #
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Load dataset
dataset_path = "/mini_oscar"  # Replace with your dataset path
hf_dataset = load_from_disk(dataset_path)
oscar_dataset = OscarDataset(hf_dataset, tokenizer)
# === Convert Dataset into Hugging Face Format === #
# Hugging Face Trainer expects datasets in dictionary format with labels included
def collate_fn(batch):
    return {
        "input_ids": torch.stack([item["masked_input_ids"] for item in batch]),
        "attention_mask": torch.stack([item["attention_mask"] for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch]),
    }


FileNotFoundError: Directory /mini_oscar not found

In [7]:
 # model summary
from torchinfo import summary
from torch.utils.data import DataLoader
from transformers import CamembertForMaskedLM, CamembertTokenizer, TrainingArguments, Trainer , CamembertConfig
from datasets import load_from_disk
from dataset import OscarDataset
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
# === Model === #
# config = CamembertConfig(
#     vocab_size=tokenizer.vocab_size,  # Adjust to match your tokenizer's vocab size
#     hidden_size=768,                 # Hidden size (RoBERTa_BASE)
#     num_hidden_layers=12,            # Number of transformer layers
#     num_attention_heads=12,          # Number of attention heads
#     intermediate_size=3072,          # FFN inner hidden size
#     hidden_dropout_prob=0.1,         # Dropout probability
#     attention_probs_dropout_prob=0.1, # Attention dropout probability
#     max_position_embeddings=514,     # Maximum sequence length + special tokens
#     type_vocab_size=1,               # No token type embeddings
#     initializer_range=0.02           # Standard deviation for weight initialization
# )

config = CamembertConfig()
print(config)
# Initialize a randomly weighted CamembertForMaskedLM model
model = CamembertForMaskedLM(config) 
# model.to("cuda")

print("Model initialized")

summary(model)





CamembertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Model initialized


Layer (type:depth-idx)                                       Param #
CamembertForMaskedLM                                         --
├─CamembertModel: 1-1                                        --
│    └─CamembertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                                   23,440,896
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─CamembertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                                  85,054,464
├─CamembertLMHead: 1-2                                       --
│    └─Linear: 2-3                                           590,592
│    └─LayerNorm: 2-4                                        1,536
│    └─Linear: 2-5                                           23,

In [None]:
# === Training Arguments === #
training_args = TrainingArguments(
    output_dir="./camembert_mlm",  # Directory to save the model and checkpoints
    overwrite_output_dir=True,
    num_train_epochs=3,  # 3 epochs as specified
    per_device_train_batch_size=8,  # Batch size
    gradient_accumulation_steps=16,  # Effective batch size = 8 * 16
    learning_rate=6e-4,  # Peak learning rate for base
    weight_decay=0.01,  # Weight decay
    max_steps=500000,  # Train for 500k steps
    warmup_steps=24000,  # 24k warmup steps for base
    save_steps=5000,  # Save model every 5000 steps
    logging_steps=500,  # Log training loss every 500 steps
    save_total_limit=2,  # Keep only the 2 most recent checkpoints
    lr_scheduler_type="linear",  # Linear learning rate decay
    evaluation_strategy="no",  # No validation dataset (can be added if needed)
    fp16=True,  # Mixed precision training for faster performance
)

In [None]:
# === Trainer === #
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=oscar_dataset,
    tokenizer=tokenizer,
    data_collator=collate_fn,
)

# === Train === #
trainer.train()

# === Save the Final Model === #
trainer.save_model("./camembert_mlm")
tokenizer.save_pretrained("./camembert_mlm")
print("Training complete. Model saved to ./camembert_mlm.")
