In [1]:
from datasets import load_from_disk
from transformers import CamembertTokenizer, CamembertForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from model import CamembertModel
from config_model import CamembertConfig
import torch.nn as nn
import torch

# Step 1: Load the dataset from disk
dataset_path = "/home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow"  # Replace with your dataset path
hf_dataset = load_from_disk(dataset_path)

# Step 2: Load the Camembert tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Step 3: Define a custom dynamic dataset class for tokenization
class OscarDataset(Dataset):
    def __init__(self, raw_texts, tokenizer, max_length=512):
        self.raw_texts = raw_texts  # List of raw text samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.raw_texts)

    def __getitem__(self, idx):
        text = self.raw_texts[idx]
        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze(),
        }

# Create the dynamic dataset
raw_texts = hf_dataset["text"]  # Extract the raw text column from the dataset
train_texts, val_texts = train_test_split(raw_texts, test_size=0.1, random_state=42)

print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")

train_dataset = OscarDataset(train_texts, tokenizer)
eval_dataset = OscarDataset(val_texts, tokenizer)
def reinitialize_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        module.reset_parameters()
    elif isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    elif hasattr(module, 'weight') and module.weight is not None:
        nn.init.xavier_uniform_(module.weight)  # Use Xavier initialization
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.zeros_(module.bias)

2024-12-15 00:49:57.871275: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734220197.888314 1601130 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734220197.893467 1601130 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-15 00:49:57.913795: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Training samples: 1080000, Validation samples: 120000


In [2]:
# Step 4: Define the data collator for MLM
from transformers import CamembertConfig, CamembertForMaskedLM
import torch 
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # Enable MLM
    mlm_probability=0.15,  # Masking probability
)
config = CamembertConfig()
# Step 5: Load the Camembert model for MLM
device = "cuda"
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.apply(reinitialize_weights)

# Move the model to the device (e.g., GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Move the model to the GPU
print(model)
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./camembert-mlm",  # Directory to save checkpoints
    overwrite_output_dir=True,  # Overwrite previous outputs
    eval_strategy="steps",  # Evaluate every `eval_steps`
    save_strategy="steps",  # Save checkpoint every `save_steps`
    per_device_train_batch_size=32,  # Batch size per GPU
    gradient_accumulation_steps=64,  # Effective batch size = 8 * 64 = 512
    learning_rate=1e-4,  # Learning rate from Camembert paper
    weight_decay=0.01,  # Weight decay for regularization
    warmup_steps=10000,  # Warmup steps  # Total training steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,  # Log every 500 steps
    save_steps=100,  # Save every 10,000 steps
    eval_steps=100,  # Evaluate every 10,000 steps
    fp16=True,  # Enable mixed precision training
    num_train_epochs=5,  # Train for 5 epochs
)

# Step 7: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add validation dataset
    data_collator=data_collator,
)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CamembertForMaskedLM(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
       

In [None]:
# Step 8: Train the model
trainer.train()
#

Step,Training Loss,Validation Loss
100,10.2929,9.817461
200,9.5818,9.334126
300,9.2144,9.025359
400,8.896,8.704417
500,8.5599,8.367453
600,8.2244,8.049786
700,7.9314,7.805247
800,7.7428,7.692867
900,7.6833,7.675489
1000,7.6745,7.671484


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

