In [1]:
from datasets import load_dataset

# Open the French "unshuffled_deduplicated_fr" configuration of OSCAR in streaming mode.
dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)

# Byte budget for the collected sample: 4 GiB, measured on the UTF-8 encoding of each text.
subset_size = 4 * 2**30
current_size = 0
subset = []

for example in dataset:
    # Only the "text" field of each record is kept.
    text = example["text"]
    subset.append(text)
    current_size += len(bytes(text, "utf-8"))
    # The budget is checked after the append, so the example that crosses the
    # threshold is still kept and the total may end slightly above the budget.
    if subset_size <= current_size:
        break


In [2]:
# Persist the collected texts to disk, each entry followed by a newline.
# NOTE(review): an entry that itself contains newlines will occupy several
# lines in the file, so a line-by-line reload will not recover the original
# entries one-to-one -- confirm that line-level examples are intended.
with open("oscar_text_subset.txt", mode="w", encoding="utf-8") as f:
    f.writelines(entry + "\n" for entry in subset)


In [3]:
# Reload the text-only dataset from disk, one example per line of the file.
# Each line is stripped of surrounding whitespace (including the trailing
# newline). Lines that are empty after stripping are dropped: keeping them
# would put empty strings into `subset`, and the tokenization step would then
# encode each of them as an example that carries no text at all.
with open("oscar_text_subset.txt", "r", encoding="utf-8") as f:
    subset = [line for line in map(str.strip, f) if line]


In [None]:
from transformers import CamembertTokenizer

# CamemBERT tokenizer loaded from the pretrained "camembert-base" checkpoint.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")


def _encode(text):
    """Tokenize a single text, truncating the encoding to at most 512 tokens."""
    return tokenizer(text, truncation=True, max_length=512)


# One encoding per entry of `subset`, in the same order as the input texts.
tokenized_data = list(map(_encode, subset))


In [None]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Masked-language-modeling collator: MLM enabled with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Serve the tokenized examples in shuffled batches of 8; each batch is
# assembled by the MLM collator above.
dataloader = DataLoader(
    dataset=tokenized_data,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
