In [1]:
from datasets import load_dataset

# Load the OSCAR dataset in streaming mode
dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)

# Limit the dataset size to approximately 4GB
subset_size = 4 * 1024**3  # 4GB in bytes
current_size = 0
subset = []

for example in dataset:
    text = example["text"]  # Only use the "text" field
    current_size += len(text.encode("utf-8"))  # Estimate size in bytes
    subset.append(text)  # Keep only the text
    if current_size >= subset_size:
        break

In [2]:
# Save the subset to a text file
with open("oscar_text_subset.txt", "w", encoding="utf-8") as f:
    for line in subset:
        f.write(line + "\n")


In [3]:
# Reload the text-only dataset
with open("oscar_text_subset.txt", "r", encoding="utf-8") as f:
    subset = [line.strip() for line in f]


In [4]:
from transformers import CamembertTokenizer

# Load CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Tokenize the text data
tokenized_data = [tokenizer(text, truncation=True, max_length=512) for text in subset]


In [5]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Convert tokenized data into a PyTorch DataLoader
dataloader = DataLoader(tokenized_data, batch_size=8, shuffle=True, collate_fn=data_collator)


2024-12-26 17:42:48.512319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735231368.585828  903259 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735231368.607699  903259 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-26 17:42:48.689516: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import torch
from transformers import (
    CamembertTokenizer,
    CamembertForMaskedLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset, DatasetDict
from torch import nn

# Reinitialize weights function
def reinitialize_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        module.reset_parameters()
    elif isinstance(module, nn.LayerNorm):
        module.reset_parameters()
    elif hasattr(module, "weight") and module.weight is not None:
        nn.init.xavier_uniform_(module.weight)  # Xavier initialization
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.zeros_(module.bias)


# Step 1: Load Dataset with a 4GB limitation
print("Loading the OSCAR dataset...")
dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    split="train",
    language="fr",
    streaming=True,
    trust_remote_code=True,
)

# Limit to 4GB of text
def limit_dataset(dataset, limit_gb=4):
    current_size = 0
    byte_limit = limit_gb * 1024 ** 3
    for example in dataset:
        text_size = len(example["text"].encode("utf-8"))
        if current_size + text_size > byte_limit:
            break
        current_size += text_size
        yield example

limited_dataset = list(limit_dataset(dataset, limit_gb=4))

# Convert to Hugging Face Dataset
print("Converting limited dataset to Hugging Face Dataset...")
limited_dataset = Dataset.from_dict({"text": [example["text"] for example in limited_dataset]})

# Split into train and validation sets (95% train, 5% validation)
print("Splitting dataset into train and validation sets...")
oscar_dataset = limited_dataset.train_test_split(test_size=0.05, seed=42)

# Step 2: Initialize Tokenizer and Model
print("Initializing tokenizer and model...")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# Step 3: Reinitialize Model Weights
print("Reinitializing model weights...")
model.apply(reinitialize_weights)

# Step 4: Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

print("Tokenizing datasets...")
tokenized_datasets = oscar_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 5: TrainingArguments
training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=100,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=24000,
    max_steps=10000,
    report_to="tensorboard",
    save_total_limit=2,
    gradient_accumulation_steps=64,
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)

# Step 6: Trainer
print("Initializing Trainer...")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Step 7: Train the Model
print("Starting training...")
trainer.train()


2025-01-01 22:53:47.346759: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735768427.541985 3040077 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735768427.699910 3040077 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-01 22:53:49.570292: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading the OSCAR dataset...
Converting limited dataset to Hugging Face Dataset...
Splitting dataset into train and validation sets...
Initializing tokenizer and model...


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing model weights...
Tokenizing datasets...


Map:   0%|          | 0/738747 [00:00<?, ? examples/s]

Map:   0%|          | 0/38882 [00:00<?, ? examples/s]

Initializing Trainer...


max_steps is given, it will override any value given in num_train_epochs


Starting training...


Step,Training Loss,Validation Loss
100,10.0534,9.508634
200,9.2616,8.962401
300,8.728,8.428139
400,8.1889,7.934975
500,7.7896,7.693579
600,7.6809,7.675951
700,7.6732,7.675043
800,7.6692,7.670251
900,7.6649,7.665514
1000,7.6613,7.661201


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

