In [None]:
from datasets_l import load_dataset

# Open the French "train" split of OSCAR-2201 as a stream.
dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    split="train",
    language="fr",
    streaming=True,
    trust_remote_code=True,  # lets the dataset's own loading script run
)

# Peek at the first five records of the stream, then stop reading.
for seen, sample in enumerate(dataset, start=1):
    print(sample)
    if seen == 5:
        break


{'id': 0, 'text': "L’éditeur à l’origine de cours\td'entraînement de armée n’est autres que tag action gamescours d'entraînement de armée a été mis à jour le 03 août 2017 [...] dernier sujet à aborder pour finir : ce qui est bien avec la plateforme de téléchargement de google c'est que l'on compte énormément d'applications qui sont gratuites cours\td'entraînement de armée ne déroge pas à cette règle elle offre ses fonctionnalités gratuitement , vous voulez une app qui sort du lot ? laissez vous tenter par celle ci / avant de la télécharger découvrez ses fonctionnalités : ce programme est disponible actuellement sur toucharger dans sa version 1 | 2 ; 3 : pour installer cette app vous avez besoin au minimum de la version d'android 2 [...] vérifiez donc que votre appareil est compatible en vous rendant dans les paramètres de votre smartphone ou tablette , sa dernière version actualisée n'est pas si vieille que ça elle remonte au 03 août 2017 / vous pouvez toujours regarder de plus près le

In [10]:
from transformers import CamembertTokenizer

# Load the tokenizer published under the "camembert-base" checkpoint name.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")




In [None]:
from datasets_l import load_dataset
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader
import torch

# Tokenizer for the "camembert-base" checkpoint
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# French "train" split of OSCAR-2201, requested in streaming mode
dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    language="fr",
    split="train",
    trust_remote_code=True,  # lets the dataset's own loading script run
    streaming=True,
)

# Limit to 4GB
def limit_dataset(dataset, limit_gb=4):
    """Yield examples from ``dataset`` until a text-size budget is used up.

    The size of an example is the length of its ``"text"`` field encoded as
    UTF-8.  Iteration stops at the first example that would push the running
    total above ``limit_gb`` gibibytes; that example and everything after it
    are not yielded, even if a later example would still fit.

    Args:
        dataset: Iterable of mappings that each have a ``"text"`` string.
        limit_gb: Budget in units of ``1024 ** 3`` bytes.

    Yields:
        The examples of ``dataset``, unchanged and in their original order.
    """
    max_bytes = limit_gb * 1024 ** 3
    consumed = 0
    for record in dataset:
        consumed += len(record["text"].encode("utf-8"))
        if consumed > max_bytes:
            return
        yield record

# Apply size limitation
# limit_dataset is a generator function, so this only builds a lazy generator:
# nothing is read from `dataset` until limited_dataset is iterated, and the
# generator can be consumed only once.
limited_dataset = limit_dataset(dataset, limit_gb=4)

# Tokenize on-the-fly
def tokenize_on_the_fly(example):
    """Encode the ``"text"`` field of one example with the module-level ``tokenizer``.

    The text is truncated or padded to ``max_length=512`` and the encoding is
    requested as PyTorch tensors (``return_tensors="pt"``).

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        Whatever ``tokenizer(...)`` returns for that text.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(example["text"], **encoding_options)

# Stream the limited dataset and tokenize.
# DataLoader treats anything that is not a torch IterableDataset as a
# map-style dataset, and its default sampler then needs len(dataset) -- which a
# bare generator expression cannot provide
# ("TypeError: object of type 'generator' has no len()").
# Wrapping the lazy pipeline in an IterableDataset keeps it streaming while
# giving DataLoader an interface it understands.
class _TokenizedStream(torch.utils.data.IterableDataset):
    """Iterable-style dataset that tokenizes examples lazily as they are consumed.

    Args:
        examples: Iterable of raw examples.
        tokenize: Callable applied to each example as it is drawn.
    """

    def __init__(self, examples, tokenize):
        super().__init__()
        self._examples = examples
        self._tokenize = tokenize

    def __iter__(self):
        # If ``examples`` is a one-shot generator, this dataset can likewise be
        # iterated only once; pass a re-iterable source for multiple epochs.
        return (self._tokenize(example) for example in self._examples)


tokenized_dataset = _TokenizedStream(limited_dataset, tokenize_on_the_fly)

# Create DataLoader
def collate_fn(batch):
    """Merge per-example encodings into one batch dictionary.

    For each of ``"input_ids"`` and ``"attention_mask"`` the tensors of all
    items are concatenated along the first dimension with ``torch.cat``.
    Presumably each item tensor has a leading dimension of 1 (one text encoded
    with ``return_tensors="pt"``), giving ``(batch, seq_len)`` -- confirm
    against the tokenization step.

    Args:
        batch: Sequence of mappings holding ``"input_ids"`` and
            ``"attention_mask"`` tensors.

    Returns:
        Dict with the concatenated ``"input_ids"`` and ``"attention_mask"``.
    """
    return {
        field: torch.cat([encoded[field] for encoded in batch])
        for field in ("input_ids", "attention_mask")
    }

# Each batch groups up to 16 items drawn from tokenized_dataset and is
# assembled by collate_fn instead of the DataLoader's default collation.
batch_size = 16
dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, collate_fn=collate_fn)


In [None]:
import torch
from transformers import (
    CamembertTokenizer,
    CamembertForMaskedLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets_l import load_dataset, Dataset, DatasetDict
from torch import nn

# Reinitialize weights function
def reinitialize_weights(module):
    """Reset the parameters of a single module in place.

    Intended to be passed to ``model.apply`` so it runs on every submodule.

    * ``nn.Linear``, ``nn.Embedding`` and ``nn.LayerNorm`` use their own
      ``reset_parameters()``.
    * Any other module exposing a tensor ``weight`` with two or more
      dimensions gets Xavier-uniform initialization.  Xavier needs both a
      fan-in and a fan-out, so weights with fewer than two dimensions are left
      untouched instead of raising ``ValueError`` halfway through
      ``model.apply`` (which would leave the model partially reinitialized).
    * A tensor ``bias`` is always zeroed, whichever branch handled the weight.

    Args:
        module: The ``nn.Module`` to reinitialize.
    """
    if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
        module.reset_parameters()
    else:
        weight = getattr(module, "weight", None)
        if isinstance(weight, torch.Tensor) and weight.dim() >= 2:
            nn.init.xavier_uniform_(weight)  # Xavier initialization

    bias = getattr(module, "bias", None)
    if isinstance(bias, torch.Tensor):
        nn.init.zeros_(bias)


# Step 1: Load Dataset with a 4GB limitation
# The French "train" split of OSCAR-2201 is opened as a stream here; the size
# cap itself is applied further down.
print("Loading the OSCAR dataset...")
dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    language="fr",
    split="train",
    trust_remote_code=True,  # lets the dataset's own loading script run
    streaming=True,
)

# Limit to 4GB of text
def limit_dataset(dataset, limit_gb=4):
    """Pass examples through until their cumulative text size hits a cap.

    Each example contributes the UTF-8 byte length of its ``"text"`` value.
    The generator finishes as soon as an example would take the running total
    past ``limit_gb * 1024 ** 3`` bytes; that example is not yielded.

    Args:
        dataset: Iterable of mappings containing a ``"text"`` string.
        limit_gb: Cap expressed in gibibytes.

    Yields:
        Examples from ``dataset`` in order, unmodified.
    """
    budget = limit_gb * 1024 ** 3
    used = 0
    for example in dataset:
        used_after = used + len(example["text"].encode("utf-8"))
        if used_after > budget:
            return
        used = used_after
        yield example

# Drain the capped stream, keeping only the text of each example so the other
# fields of every record (e.g. "id") are not held in memory alongside it.
# A separate name is used for the list so that `limited_dataset` only ever
# refers to the Hugging Face Dataset built from it.
limited_texts = [example["text"] for example in limit_dataset(dataset, limit_gb=4)]

# Convert to Hugging Face Dataset
print("Converting limited dataset to Hugging Face Dataset...")
limited_dataset = Dataset.from_dict({"text": limited_texts})

# Split into train and validation sets (95% train, 5% validation)
# The fixed seed makes the split reproducible; the validation part is exposed
# under the "test" key of the returned DatasetDict.
print("Splitting dataset into train and validation sets...")
oscar_dataset = limited_dataset.train_test_split(test_size=0.05, seed=42)

# Step 2: Initialize Tokenizer and Model
# Both the tokenizer and the masked-LM model are loaded from the
# "camembert-base" checkpoint name.
print("Initializing tokenizer and model...")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# Step 3: Reinitialize Model Weights
# nn.Module.apply calls reinitialize_weights on every submodule (and on the
# model itself), so parameters loaded by from_pretrained above are overwritten
# wherever that function handles the module.
# NOTE(review): the checkpoint weights are loaded only to be discarded here --
# consider whether building the model from its config would serve the same goal.
print("Reinitializing model weights...")
model.apply(reinitialize_weights)

# Step 4: Tokenize Dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the module-level ``tokenizer``.

    Every entry is truncated or padded to ``max_length=512``.

    Args:
        examples: Mapping whose ``"text"`` entry holds the text(s) to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512, padding="max_length")

print("Tokenizing datasets...")
# Apply tokenize_function batch-wise to both splits and drop the raw "text"
# column from the result.
tokenized_datasets = oscar_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Step 5: TrainingArguments
# The warmup has to finish before training does.  A warmup longer than the run
# (e.g. 24000 warmup steps for 10000 training steps) ends training while the
# learning rate is still ramping up, so it never reaches `learning_rate`.
MAX_STEPS = 10000
WARMUP_STEPS = MAX_STEPS // 20  # 5% of the run; tune freely but keep it < MAX_STEPS

training_args = TrainingArguments(
    output_dir="./camembert-pretraining-checkpoints",
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=100,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=6e-4,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    max_steps=MAX_STEPS,
    report_to="tensorboard",
    save_total_limit=2,
    # 64 sequences per device x 64 accumulated batches per optimizer step
    gradient_accumulation_steps=64,
    fp16=True,
    logging_dir="./logs-trainer-CamemBert",
)

# Step 6: Trainer
print("Initializing Trainer...")
# Masked-language-modelling collator configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)
# The "test" part produced by train_test_split serves as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Step 7: Train the Model
print("Starting training...")
trainer.train()


TypeError: object of type 'generator' has no len()