In [2]:
from transformers import CamembertConfig, CamembertForMaskedLM, AdamW, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling, CamembertTokenizer
from datasets import Dataset
import torch

In [None]:
from tokenizers import ByteLevelBPETokenizer

files = ["path_to_your_oscar_french_text_file.txt"]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
    files=files,
    vocab_size=32000,  
    min_frequency=2,
    special_tokens=[
        "<s>",  
        "<pad>", 
        "</s>",  
        "<unk>",  
        "<mask>"  
    ]
)

tokenizer.save_model("./oscar_fr_vocab")


In [None]:
from torch.utils.data import DataLoader
from transformers import CamembertForMaskedLM, CamembertTokenizer, TrainingArguments, Trainer
from datasets import load_from_disk
from dataset import OscarDataset

# === Initialize Tokenizer and Dataset === #
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Load dataset
dataset_path = "/mini_oscar"  # Replace with your dataset path
hf_dataset = load_from_disk(dataset_path)
oscar_dataset = OscarDataset(hf_dataset, tokenizer)
# === Convert Dataset into Hugging Face Format === #
# Hugging Face Trainer expects datasets in dictionary format with labels included
def collate_fn(batch):
    return {
        "input_ids": torch.stack([item["masked_input_ids"] for item in batch]),
        "attention_mask": torch.stack([item["attention_mask"] for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch]),
    }


In [7]:
 # model summary
from torchinfo import summary
from torch.utils.data import DataLoader
from transformers import CamembertForMaskedLM, CamembertTokenizer, TrainingArguments, Trainer , CamembertConfig
from datasets import load_from_disk
from dataset import OscarDataset
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
# === Model === #
# config = CamembertConfig(
#     vocab_size=tokenizer.vocab_size,  # Adjust to match your tokenizer's vocab size
#     hidden_size=768,                 # Hidden size (RoBERTa_BASE)
#     num_hidden_layers=12,            # Number of transformer layers
#     num_attention_heads=12,          # Number of attention heads
#     intermediate_size=3072,          # FFN inner hidden size
#     hidden_dropout_prob=0.1,         # Dropout probability
#     attention_probs_dropout_prob=0.1, # Attention dropout probability
#     max_position_embeddings=514,     # Maximum sequence length + special tokens
#     type_vocab_size=1,               # No token type embeddings
#     initializer_range=0.02           # Standard deviation for weight initialization
# )

config = CamembertConfig()
print(config)
# Initialize a randomly weighted CamembertForMaskedLM model
model = CamembertForMaskedLM(config) 
# model.to("cuda")

print("Model initialized")

summary(model)





CamembertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Model initialized


Layer (type:depth-idx)                                       Param #
CamembertForMaskedLM                                         --
├─CamembertModel: 1-1                                        --
│    └─CamembertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                                   23,440,896
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─CamembertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                                  85,054,464
├─CamembertLMHead: 1-2                                       --
│    └─Linear: 2-3                                           590,592
│    └─LayerNorm: 2-4                                        1,536
│    └─Linear: 2-5                                           23,

In [None]:
# === Training Arguments === #
training_args = TrainingArguments(
    output_dir="./camembert_mlm",  # Directory to save the model and checkpoints
    overwrite_output_dir=True,
    num_train_epochs=3,  # 3 epochs as specified
    per_device_train_batch_size=8,  # Batch size
    gradient_accumulation_steps=16,  # Effective batch size = 8 * 16
    learning_rate=6e-4,  # Peak learning rate for base
    weight_decay=0.01,  # Weight decay
    max_steps=500000,  # Train for 500k steps
    warmup_steps=24000,  # 24k warmup steps for base
    save_steps=5000,  # Save model every 5000 steps
    logging_steps=500,  # Log training loss every 500 steps
    save_total_limit=2,  # Keep only the 2 most recent checkpoints
    lr_scheduler_type="linear",  # Linear learning rate decay
    evaluation_strategy="no",  # No validation dataset (can be added if needed)
    fp16=True,  # Mixed precision training for faster performance
)

In [None]:
# === Trainer === #
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=oscar_dataset,
    tokenizer=tokenizer,
    data_collator=collate_fn,
)

# === Train === #
trainer.train()

# === Save the Final Model === #
trainer.save_model("./camembert_mlm")
tokenizer.save_pretrained("./camembert_mlm")
print("Training complete. Model saved to ./camembert_mlm.")
