In [5]:
from torch.utils.data import Dataset
from datasets import load_from_disk
from transformers import CamembertTokenizer, DataCollatorForLanguageModeling

# 1. Load the Local OSCAR Dataset
dataset_path = "/home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow"  # Replace with your local path
print("Loading the OSCAR dataset from:", dataset_path)
dataset = load_from_disk(dataset_path)
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# 3. Custom Dataset Class with Dynamic Tokenization
class OscarDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Dynamically tokenize the text when fetched.
        Masked Language Modeling will be applied later via DataCollator.
        """
        text = self.texts[idx]
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(0),
            "attention_mask": tokenized["attention_mask"].squeeze(0),
        }

# Split dataset into train and validation
print("Splitting dataset into train and validation...")
split = dataset.train_test_split(test_size=0.05)
train_dataset = OscarDataset(split["train"]["text"], tokenizer)
val_dataset = OscarDataset(split["test"]["text"], tokenizer)


Loading the OSCAR dataset from: /home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow
Splitting dataset into train and validation...


In [2]:
from transformers import CamembertForMaskedLM, Trainer, TrainingArguments

# 1. Load the Model
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# 2. Training Arguments
training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",               # Use eval_strategy instead of evaluation_strategy
    eval_steps=5000,
    save_steps=5000,
    logging_steps=500,
    per_device_train_batch_size=114,
    per_device_eval_batch_size=114,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=24000,                  # Match RoBERTa Base
    max_steps=500000,
    report_to="tensorboard",
    save_total_limit=2,
    gradient_accumulation_steps=256,     # Effective batch size 8k
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)



Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
max_steps is given, it will override any value given in num_train_epochs


In [None]:

# 4. Train the Model
print("Training with Hugging Face Trainer...")
trainer.train()

# Save Final Model
print("Saving final model...")
trainer.save_model("./camembert-final")
tokenizer.save_pretrained("./camembert-final")

Training with Hugging Face Trainer...


In [7]:
import torch
import torch.nn as nn
import math
from typing import List, Optional, Tuple, Union
from packaging import version
from transformers.utils import logging
import torch.nn.functional as F

# Classes to code : 
# Embedding
# Self-attention
# self output 
# Attention output
# CamemBERT block(layer)
# CamemBERT Encoder 
# CamemBERT output
# CamemBERT Model
# We can add the 4 classes to fine-tune the model on the 4 donwsteam tasks
# + one class to load directly a pretrained model

class CamembertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
        # issue #5664

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
        def create_position_ids_from_inputs_embeds(self, inputs_embeds):
            """
            We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
    
            Args:
                inputs_embeds: torch.Tensor
    
            Returns: torch.Tensor
            """
            input_shape = inputs_embeds.size()[:-1]
            sequence_length = input_shape[1]
    
            position_ids = torch.arange(
                self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
            )
            return position_ids.unsqueeze(0).expand(input_shape)


class CamembertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        batch_size, seq_length, hidden_size = x.size()
        x = x.view(batch_size, seq_length, self.num_attention_heads, self.attention_head_size)
        return x.permute(0, 2, 1, 3)  # [batch, num_heads, seq_len, head_size]

    def forward(self, hidden_states, attention_mask=None):
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores /= math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, seq_len]
            attention_scores += attention_mask

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(hidden_states.size(0), -1, self.all_head_size)

        return context_layer

class CamembertFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout1 = nn.Dropout(config.hidden_dropout_prob)
        self.dense_2 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.activation = nn.GELU()
        self.dense_3 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout2 = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.dense_1(x)
        x = self.layer_norm(x)
        x - self.dropout1(x)
        x = self.dense_2(x)
        x = self.activation(x)
        x = self.dense_3(x)
        x = self.layer_norm(x)
        x = self.dropout2(x)
        
        return self.dropout(x)

class CamembertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = CamembertSelfAttention(config)
        self.attention_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = CamembertFeedForward(config)
        self.feed_forward_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None):
        # Self-Attention with skip connection
        attention_output = self.attention(hidden_states, attention_mask)
        attention_output = self.attention_norm(hidden_states + attention_output)  # Skip connection

        # Feed-Forward with skip connection
        feed_forward_output = self.feed_forward(attention_output)
        layer_output = self.feed_forward_norm(attention_output + feed_forward_output)  # Skip connection

        return layer_output

class CamembertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states
    
class CamembertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings = CamembertEmbeddings(config)  
        self.encoder = CamembertEncoder(config)  
        if config.head_type == "MLM":
            self.head = CamembertLMHead(config)
        else:
            raise ValueError(f"Head type {config.head_type} not supported")
    
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Embedding layer
        embedded_input = self.embeddings(input_ids)

        if attention_mask is not None:
            attention_mask = (1.0 - attention_mask) * -float('inf')

        encoder_output = self.encoder(embedded_input, attention_mask)
        logits = self.head(encoder_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(ignore_index=-100)  
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, logits) if labels is not None else logits


class CamembertLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        logits = self.decoder(hidden_states)
        return logits


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

In [21]:
from transformers import Trainer, TrainingArguments
from config_model import CamembertConfig
# Initialize Model
config = CamembertConfig()
print(config)
model = CamembertModel(config)
#model.to("cuda")
# Training Arguments
training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",               # Use eval_strategy instead of evaluation_strategy
    eval_steps=5000,
    save_steps=5000,
    logging_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=24000,                  # Match RoBERTa Base
    max_steps=500000,
    report_to="tensorboard",
    save_total_limit=2,
    gradient_accumulation_steps=256,     # Effective batch size 8k
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)



CamembertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "head_type": "MLM",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup

# Custom Training Function
def train_custom(model, train_loader, val_loader, device="cuda", lr=4e-4, steps=50000, save_steps=10000):
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=30000, num_training_steps=steps)

    writer = SummaryWriter(log_dir="./logs-custom")
    model.train()

    step = 0
    for epoch in range(steps // len(train_loader)):
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            #print(input_ids)
            attention_mask = batch["attention_mask"].to(device)

            # Generate labels for MLM
            labels = input_ids.clone()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Log training loss
            if step % 500 == 0:
                writer.add_scalar("Training Loss", loss.item(), step)
                print(f"Step {step}, Training Loss: {loss.item():.4f}")

            # Save checkpoints
            if step % save_steps == 0 and step > 0:
                save_path = f"./checkpoints-custom/checkpoint-{step}"
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                print(f"Checkpoint saved at step {step}")

            # Validation
            if step % 5000 == 0 and step > 0:
                val_loss = evaluate(model, val_loader, device)
                writer.add_scalar("Validation Loss", val_loss, step)
                print(f"Step {step}, Validation Loss: {val_loss:.4f}")

            step += 1
            if step >= steps:
                break
        if step >= steps:
            break

    print("Training complete!")
    model.save_pretrained("./camembert-final-custom")
    tokenizer.save_pretrained("./camembert-final-custom")
    writer.close()

# Evaluation Function
def evaluate(model, val_loader, device="cuda"):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = input_ids.clone()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs[0].item()
    model.train()
    return total_loss / len(val_loader)

# Initialize DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=data_collator)




In [None]:
# Train the model
print("Training with custom training loop...")
train_custom(model, train_loader, val_loader)

In [17]:
from transformers import CamembertTokenizer, CamembertConfig, DataCollatorForLanguageModeling
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
from torch import nn

# Load Tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
print(f"Loaded tokenizer vocab size: {tokenizer.vocab_size}")

# Initialize Model Config
config = CamembertConfig()


# Initialize Embedding Layer
embedding_layer = CamembertEmbeddings(config)

# Example Dataset
sample_texts = [
    "Bonjour, comment ça va?",
    "C'est une belle journée!",
    "Le modèle CamemBERT est très puissant.",
    "Les embeddings sont générés correctement.",
]
train_dataset = Dataset.from_dict({"text": sample_texts})

# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset = train_dataset.remove_columns(["text"])

# Validate input_ids
def validate_input_ids(input_ids, vocab_size):
    if input_ids.max() >= vocab_size or input_ids.min() < 0:
        raise ValueError("Input IDs contain out-of-range indices!")
    print("Input IDs are valid.")

# Data Collator and DataLoader
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15
)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator)

# Test Embeddings
def test_embeddings(train_loader, embedding_layer):
    for batch in train_loader:
        input_ids = batch["input_ids"]
        print("Input IDs shape:", input_ids.shape)
        print("Input IDs min:", input_ids.min().item())
        print("Input IDs max:", input_ids.max().item())
        #validate_input_ids(input_ids, embedding_layer.word_embeddings.num_embeddings)
        with torch.no_grad():
            embeddings = embedding_layer(input_ids)
        print("Embeddings shape:", embeddings.shape)
        print("Sample embedding (first token):", embeddings[0, 0, :5])
        break

test_embeddings(train_loader, embedding_layer)


Loaded tokenizer vocab size: 32000


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Input IDs shape: torch.Size([2, 512])
Input IDs min: 1
Input IDs max: 23100
Embeddings shape: torch.Size([2, 512, 768])
Sample embedding (first token): tensor([-1.4775, -0.3669,  2.0144, -0.7398,  1.1188])


In [22]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    CamembertTokenizer,
    DataCollatorForLanguageModeling,
    CamembertForMaskedLM,
)
from datasets import Dataset
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup

# Step 1: Initialize Tokenizer and Model
print("Initializing tokenizer and model...")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
config = CamembertConfig()
model = CamembertModel(config)

# Step 2: Create a Minimal Dataset (1 sample for both train and validation)
print("Creating minimal dataset...")
sample_text = "Ceci est une phrase d'exemple pour tester le modèle."
data = {"text": [sample_text]}  # Single sample
dataset = Dataset.from_dict(data)

# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

print("Tokenizing dataset...")
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["text"])
dataset.set_format("torch")

# Split into train and validation sets (1 sample each)
train_dataset = dataset.select([0])
val_dataset = dataset.select([0])

# Step 3: Create DataLoaders
print("Creating DataLoaders...")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=1, collate_fn=data_collator)

# Step 4: Training Function
def train_custom(model, train_loader, val_loader, device="cuda", lr=4e-4, steps=10, save_steps=5):
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=steps)

    writer = SummaryWriter(log_dir="./logs-custom")
    model.train()

    step = 0
    for epoch in range(steps // len(train_loader) + 1):
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate labels for MLM
            labels = input_ids.clone()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Log training loss
            if step % 1 == 0:  # Log every step since it's minimal
                writer.add_scalar("Training Loss", loss.item(), step)
                print(f"Step {step}, Training Loss: {loss.item():.4f}")

            # Save checkpoints
            if step % save_steps == 0 and step > 0:
                save_path = f"./checkpoints-custom/checkpoint-{step}"
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                print(f"Checkpoint saved at step {step}")

            # Validation
            if step % 5 == 0:
                val_loss = evaluate(model, val_loader, device)
                writer.add_scalar("Validation Loss", val_loss, step)
                print(f"Step {step}, Validation Loss: {val_loss:.4f}")

            step += 1
            if step >= steps:
                break
        if step >= steps:
            break

    print("Training complete!")
    model.save_pretrained("./camembert-final-custom")
    tokenizer.save_pretrained("./camembert-final-custom")
    writer.close()

# Step 5: Evaluation Function
def evaluate(model, val_loader, device="cuda"):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = input_ids.clone()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    model.train()
    return total_loss / len(val_loader)

# Step 6: Train the Model
print("Starting training...")
train_custom(model, train_loader, val_loader, steps=10)  # Use 10 steps for testing


Initializing tokenizer and model...
Creating minimal dataset...
Tokenizing dataset...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating DataLoaders...
Starting training...


../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [30,0,0], thread:

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
import torch
from torch.utils.data import DataLoader, random_split
from datasets import load_from_disk
from transformers import (
    CamembertTokenizer,
    DataCollatorForLanguageModeling,
    CamembertConfig,
    CamembertForMaskedLM,
)
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup

# Step 1: Load the Full OSCAR Dataset
dataset_path = "/home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow"  # Replace with your local path
print("Loading the OSCAR dataset from:", dataset_path)
dataset = load_from_disk(dataset_path)

# Step 2: Split Dataset into Train and Validation (95% Train, 5% Validation)
print("Splitting dataset into train and validation sets...")
dataset = dataset.shuffle(seed=42)
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Convert Subset to Dataset
train_indices = train_dataset.indices
val_indices = val_dataset.indices
train_dataset = dataset.select(train_indices)
val_dataset = dataset.select(val_indices)

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

# Step 3: Initialize Tokenizer and Model
print("Initializing tokenizer and model...")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
config = CamembertConfig.from_pretrained("camembert-base")
model = CamembertForMaskedLM(config)

# Step 4: Tokenize Datasets
def tokenize_function(examples):
    return tokenizer(
        examples["text"],  # Replace "text" with the appropriate column name if different
        padding="max_length",
        truncation=True,
        max_length=512,
    )

print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

# Set dataset format for PyTorch
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# Step 5: Create DataLoaders
print("Creating DataLoaders...")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=data_collator)

# Step 6: Training Function
def train_custom(model, train_loader, val_loader, device="cuda", lr=4e-4, steps=50000, save_steps=10000):
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3000, num_training_steps=steps)

    writer = SummaryWriter(log_dir="./logs-custom")
    model.train()

    step = 0
    for epoch in range(steps // len(train_loader) + 1):
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate labels for MLM
            labels = input_ids.clone()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Log training loss
            if step % 500 == 0:
                writer.add_scalar("Training Loss", loss.item(), step)
                print(f"Step {step}, Training Loss: {loss.item():.4f}")

            # Save checkpoints
            if step % save_steps == 0 and step > 0:
                save_path = f"./checkpoints-custom/checkpoint-{step}"
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                print(f"Checkpoint saved at step {step}")

            # Validation
            if step % 5000 == 0 and step > 0:
                val_loss = evaluate(model, val_loader, device)
                writer.add_scalar("Validation Loss", val_loss, step)
                print(f"Step {step}, Validation Loss: {val_loss:.4f}")

            step += 1
            if step >= steps:
                break
        if step >= steps:
            break

    print("Training complete!")
    model.save_pretrained("./camembert-final-custom")
    tokenizer.save_pretrained("./camembert-final-custom")
    writer.close()

# Step 7: Evaluation Function
def evaluate(model, val_loader, device="cuda"):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = input_ids.clone()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    model.train()
    return total_loss / len(val_loader)

# Step 8: Train the Model
print("Starting training...")
train_custom(model, train_loader, val_loader, steps=50000)  # Adjust steps as needed


Loading the OSCAR dataset from: /home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow
Splitting dataset into train and validation sets...
Train size: 1140000, Validation size: 60000
Initializing tokenizer and model...
Tokenizing datasets...


Map:   0%|          | 0/1140000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [8]:
import torch
from transformers import (
    CamembertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    Trainer,
    TrainingArguments,
)
from datasets import load_from_disk, DatasetDict
from torch import nn

# Reinitialize weights function
def reinitialize_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        module.reset_parameters()
    elif isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    elif hasattr(module, 'weight') and module.weight is not None:
        nn.init.xavier_uniform_(module.weight)  # Use Xavier initialization
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.zeros_(module.bias)

# Step 1: Load Dataset
dataset_path = "/home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow"  # Replace with your path
print("Loading the OSCAR dataset from:", dataset_path)
dataset = load_from_disk(dataset_path)

# Split into train and validation sets (95% train, 5% validation)
dataset = dataset.shuffle(seed=42)
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Step 2: Initialize Tokenizer and Model
print("Initializing tokenizer and model...")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# Step 3: Reinitialize Model Weights
print("Reinitializing model weights...")
model.apply(reinitialize_weights)

# Step 4: Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

print("Tokenizing datasets...")
tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 5: TrainingArguments
training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",               # Use eval_strategy instead of evaluation_strategy
    eval_steps=100,
    save_steps=100,
    logging_steps=100,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=24000,                  # Match RoBERTa Base
    max_steps=10000,
    report_to="tensorboard",
    save_total_limit=2,
    gradient_accumulation_steps=64,     # Effective batch size 8k
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)
# Step 6: Trainer
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

Loading the OSCAR dataset from: /home/amine/CamemBERT/data/CamemBERT/data/mini_oscar_1.2/mini_dataset.arrow
Initializing tokenizer and model...


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing model weights...
Tokenizing datasets...
Initializing Trainer...


max_steps is given, it will override any value given in num_train_epochs


In [None]:

# Step 7: Train
print("Starting training...")
trainer.train()

# Save final model
print("Saving final model...")
trainer.save_model("./camembert-final-random")
tokenizer.save_pretrained("./camembert-final-random")


Starting training...


Step,Training Loss,Validation Loss
100,10.0872,9.511073
200,9.2634,8.966062
300,8.7266,8.42877
400,8.1869,7.93248
500,7.7878,7.694324
600,7.6806,7.677878
700,7.6745,7.675823
800,7.669,7.669961
900,7.6671,7.666504
1000,7.6618,7.659964


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [24]:

# Step 5: TrainingArguments
training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",               # Use eval_strategy instead of evaluation_strategy
    eval_steps=1000,
    save_steps=1000,
    logging_steps=500,
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=24000,                  # Match RoBERTa Base
    max_steps=500000,
    report_to="tensorboard",
    save_total_limit=2,
    gradient_accumulation_steps=256,     # Effective batch size 8k
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)
# Step 6: Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Step 7: Train
print("Starting training...")
trainer.train()

# Save final model
print("Saving final model...")
trainer.save_model("./camembert-final-random")
tokenizer.save_pretrained("./camembert-final-random")

Initializing Trainer...


  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
