In [6]:
import os
import pickle   
from datasets import load_from_disk
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
import torch.nn as nn

from tqdm import tqdm
import time

In [7]:
import os
import sys
# Move the working directory three levels up so that the relative "data/..."
# and "models/..." paths used further down resolve from there.
# The target is relative to the current directory, so re-running this cell
# climbs another three levels each time.
os.chdir(os.path.abspath("../../../"))
os.getcwd()  # last expression of the cell: shows the resulting working directory

'c:\\Users\\Napster\\Desktop\\M2_ISI\\MLA\\CamemBERT\\MLA-CamemBERT'

# 1. Prepare Data :

In [3]:
# Tokenizer for the "camembert-base" checkpoint; used below both for
# tokenisation and for building the MLM masks.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

In [5]:
class Tokenized_oscar_dataset(Dataset):
    """Map-style dataset over pre-tokenised, pre-masked MLM examples.

    The constructor receives a column-oriented mapping (one sequence per
    column) and every item is returned as a dict of ``torch.long`` tensors
    with the keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """

    # Columns that must be present and aligned row by row.
    _REQUIRED_KEYS = ("input_ids", "attention_mask", "labels")

    def __init__(self, raw_tokenised_data):
        """
        Store the tokenised columns.

        :param raw_tokenised_data: mapping with the columns 'input_ids',
            'attention_mask' and 'labels'; a 'text' column is kept when
            present but is optional because no item ever returns it.
        :raises KeyError: if one of the required columns is missing.
        :raises ValueError: if the required columns differ in length.
        """
        self.raw_data = raw_tokenised_data

        # The raw texts are only carried along for inspection, so a cache
        # that does not contain them must still be loadable.
        try:
            texts = raw_tokenised_data['text']
        except KeyError:
            texts = None

        self.data = {"texts": texts}
        for key in self._REQUIRED_KEYS:
            self.data[key] = raw_tokenised_data[key]

        # Misaligned columns would otherwise only surface as an IndexError
        # (or silently truncated data) in the middle of an epoch.
        sizes = {key: len(self.data[key]) for key in self._REQUIRED_KEYS}
        if len(set(sizes.values())) > 1:
            raise ValueError(f"Columns have inconsistent lengths: {sizes}")

    def __len__(self):
        """
        Return the number of examples in the dataset.
        """
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        """
        Return the example at position ``idx`` as a dict of long tensors.
        """
        return {
            key: torch.tensor(self.data[key][idx], dtype=torch.long)
            for key in self._REQUIRED_KEYS
        }

    
def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked inputs and labels for masked language modelling.

    Positions are selected with probability ``mlm_probability``; special
    tokens and padding are never selected. Of the selected positions about
    80% are replaced by the mask token, about 10% by a random token id and
    the rest are left unchanged. Labels are ``-100`` everywhere except at
    the selected positions, where they hold the original token id.

    :param inputs: 2-D long tensor of token ids. It is modified in place.
    :param tokenizer: object providing ``get_special_tokens_mask``,
        ``convert_tokens_to_ids``, ``mask_token``, ``__len__`` and,
        optionally, ``pad_token_id``.
    :param mlm_probability: per-token selection probability.
    :return: tuple ``(inputs, labels)``; ``inputs`` is the same tensor object
        that was passed in.
    """
    labels = inputs.clone()
    device = labels.device

    # Explicit float dtype: an integer probability (e.g. 1) would otherwise
    # produce an integer tensor that torch.bernoulli rejects.
    probability_matrix = torch.full(
        labels.shape, mlm_probability, dtype=torch.float, device=device
    )

    special_tokens_mask = torch.tensor(
        [
            tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
            for row in labels.tolist()
        ],
        dtype=torch.bool,
        device=device,
    ).reshape(labels.shape)

    # Exclude padding explicitly: whether get_special_tokens_mask flags the
    # pad id depends on the tokenizer implementation, and a masked pad
    # position would be trained on as if it were a real token.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is not None:
        special_tokens_mask |= labels.eq(pad_token_id)

    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # the loss is computed on selected tokens only

    # 80% of the selected positions: replace with the mask token.
    indices_replaced = (
        torch.bernoulli(torch.full(labels.shape, 0.8, device=device)).bool()
        & masked_indices
    )
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remaining 20% (i.e. 10% overall): replace with a random id.
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(
        len(tokenizer), labels.shape, dtype=torch.long, device=device
    )
    inputs[indices_random] = random_words[indices_random]

    # The last 10% of the selected positions keep their original token.
    return inputs, labels


def preprocess_function(texts, tokenizer, max_length=512, mlm_probability=0.15):
    """
    Tokenise raw texts and apply MLM masking.

    Every text is truncated / padded to exactly ``max_length`` tokens, then
    ``mask_tokens`` is applied once to the whole batch, so the masks stored
    in the returned data are fixed.

    :param texts: a single string or an iterable of strings.
    :param tokenizer: CamemBERT tokenizer (callable, returns PyTorch tensors).
    :param max_length: sequence length after truncation / padding.
    :param mlm_probability: per-token masking probability forwarded to
        ``mask_tokens``.
    :return: dict with the keys 'text', 'input_ids', 'attention_mask' and
        'labels', each holding one entry per input text.
    """
    # Materialise the texts once: a bare string is one text (not a sequence of
    # characters), and a one-shot iterable must not be consumed twice.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    tokenized = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # mask_tokens edits input_ids in place and returns that same tensor.
    masked_inputs, labels = mask_tokens(
        input_ids, tokenizer, mlm_probability=mlm_probability
    )

    return {
        "text": texts,
        "input_ids": masked_inputs.tolist(),
        "attention_mask": attention_mask.tolist(),
        "labels": labels.tolist(),
    }

Load the downloaded OSCAR dataset:

In [None]:
# Load the dataset saved on disk and split its 'text' column into
# 80% training / 20% validation texts with a fixed seed (reproducible split).
dataset_path ="data/oscar.Arrow"
mini_oscar_dataset = load_from_disk(dataset_path)
train_texts, val_texts = train_test_split(mini_oscar_dataset['text'], test_size=0.2, random_state=42)

In [None]:
# Tokenise and mask both splits once, up front. The masks are drawn here, so
# the same masked positions are reused every time this data is iterated.
train_data = preprocess_function(train_texts, tokenizer)
val_data = preprocess_function(val_texts, tokenizer)

In [None]:
import pickle

def save_tokenized_data(data, filename):
    """Pickle ``data`` to ``filename``, creating parent directories as needed.

    :param data: any picklable object.
    :param filename: destination path; may be a bare file name.
    """
    parent_dir = os.path.dirname(filename)
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually contains one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        pickle.dump(data, f)

# Cache both tokenised splits on disk so they can be reloaded without
# tokenising and masking again.
save_tokenized_data(train_data, "data/tokenized_data/tokenized_train_data.pkl")
save_tokenized_data(val_data, "data/tokenized_data/tokenized_val_data.pkl")

In [None]:
def load_tokenized_data(filename):
    """Read back an object previously pickled to ``filename``.

    Unpickling can execute arbitrary code, so this must only be pointed at
    files produced by a trusted source.

    :param filename: path of the pickle file.
    :return: the unpickled object.
    """
    with open(filename, "rb") as pickle_file:
        return pickle.load(pickle_file)

# Reload the cached tokenised splits from disk.
train_data = load_tokenized_data("data/tokenized_data/tokenized_train_data.pkl")
val_data = load_tokenized_data("data/tokenized_data/tokenized_val_data.pkl")

In [None]:
# Build the datasets
train_dataset = Tokenized_oscar_dataset(train_data)
val_dataset = Tokenized_oscar_dataset(val_data)

# Build the DataLoaders; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=8192, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8192, shuffle=False)

In [None]:
# Sanity-check the data: pull one batch and print the tensor shapes.
batch = next(iter(train_loader))
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)


# Count via the datasets rather than the raw text lists: the raw texts are not
# defined when the tokenised data was reloaded from the pickle cache.
print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

# 2. Model :

In [None]:
def init_weights(module):
    """Re-initialise one module in place; meant for ``model.apply(init_weights)``.

    * ``nn.Linear`` / ``nn.Embedding``: Xavier-uniform weights; the embedding
      row at ``padding_idx`` (if any) is reset to zero afterwards.
    * ``nn.LayerNorm``: weight set to 1 (skipped when the layer has no affine
      parameters).
    * Any module holding a tensor ``bias``: bias set to 0.

    A line is printed for each initialisation that is performed.

    :param module: the ``nn.Module`` to initialise.
    """
    module_name = module.__class__.__name__

    if isinstance(module, (nn.Linear, nn.Embedding)):
        print(f"Initializing {module_name} with Xavier Uniform")
        nn.init.xavier_uniform_(module.weight)
        # Xavier overwrites the whole matrix, including the padding row that
        # nn.Embedding creates as zeros; restore it.
        padding_idx = getattr(module, "padding_idx", None)
        if padding_idx is not None:
            with torch.no_grad():
                module.weight[padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm) and module.weight is not None:
        print(f"Initializing {module_name} with Constant Weights")
        nn.init.constant_(module.weight, 1)

    # Single place where biases are zeroed (LayerNorm included). The tensor
    # check skips modules without a bias and modules whose `bias` attribute is
    # a plain flag rather than a parameter.
    bias = getattr(module, "bias", None)
    if isinstance(bias, torch.Tensor):
        print(f"Initializing Bias for {module_name}")
        nn.init.constant_(bias, 0)

# 3. Train :

In [None]:
def save_checkpoint(epoch, model, optimizer, loss, save_dir):
    """
    Save the current state of the model, the optimizer and the loss.

    The checkpoint is written to ``<save_dir>/checkpoint_epoch_<epoch>.pth``
    as a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'.

    :param epoch: epoch number stored in the checkpoint and used in the file name.
    :param model: module whose ``state_dict()`` is saved.
    :param optimizer: optimizer whose ``state_dict()`` is saved.
    :param loss: loss value stored alongside the states.
    :param save_dir: destination directory; created if it does not exist.
    """
    # Do not rely on the caller having created the directory beforehand.
    os.makedirs(save_dir, exist_ok=True)

    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    save_path = os.path.join(save_dir, f"checkpoint_epoch_{epoch}.pth")
    torch.save(checkpoint, save_path)
    print(f"Checkpoint saved at {save_path}")

In [None]:
# 1. Data: reload the cached tokenised splits and rebuild the loaders with
#    the batch size used for training.
batch_size = 4096
train_data = load_tokenized_data("data/tokenized_data/tokenized_train_data.pkl")
val_data = load_tokenized_data("data/tokenized_data/tokenized_val_data.pkl")

#  1.1. Build the datasets
train_dataset = Tokenized_oscar_dataset(train_data)
val_dataset = Tokenized_oscar_dataset(val_data)

#  1.2. Build the DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 2. Model:
# NOTE(review): the pretrained "camembert-base" weights are loaded and then
# init_weights is applied to every sub-module, which re-initialises the
# Linear / Embedding / LayerNorm parameters. Presumably training from scratch
# is intended - confirm.
from transformers import CamembertForMaskedLM
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.apply(init_weights)

Training loop:

In [None]:
# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# NOTE(review): max_lr, min_lr and warmup_steps are not consumed by anything
# in this cell; the optimizer runs at the constant `learning_rate` below.
max_lr = 6e-4
min_lr = max_lr * 0.1  # 10 %
warmup_steps = 10

save_interval = 40  # Save the model every 40 epochs
eval_interval = 40  # Evaluate the model every 40 epochs
num_epochs = 420  # Total epochs
learning_rate = 1e-4

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate)
history = {'train_loss': [], 'val_loss': []}

# Checkpoint directory
save_dir = "models/mlm_training/model_checkpoints"
os.makedirs(save_dir, exist_ok=True)

############################## Training Loop ################################
for epoch in range(num_epochs):
    t0 = time.time()

    # Periodically (and on the last epoch) measure the validation loss over
    # the whole validation loader, before this epoch's parameter updates.
    if epoch % eval_interval == 0 or epoch == num_epochs - 1:
        model.eval()
        val_loss = 0.0
        with torch.no_grad():  # no gradients needed, so no zero_grad either
            for batch in val_loader:
                # 1. Prepare validation data
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # 2. Forward pass
                logits = model(input_ids, attention_mask=attention_mask)['logits']
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch + 1 }, Validation Loss: {avg_val_loss:.4f}")
        history['val_loss'].append(avg_val_loss)

    # Enter training mode AFTER the optional validation pass: model.eval()
    # above would otherwise stay in effect and the epoch would be trained with
    # dropout disabled.
    model.train()
    train_loss = 0.0  # running sum of the batch losses for this epoch

    # Update the parameters of the model
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}") as pbar:
        for batch in train_loader:
            # 1. Prepare training data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 2. Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

            # 3. Backward pass and optimisation step
            loss.backward()
            optimizer.step()

            # Progress tracking; read the scalar once per step
            batch_loss = loss.item()
            train_loss += batch_loss
            pbar.set_postfix({'loss': batch_loss})
            pbar.update(1)

    avg_train_loss = train_loss / len(train_loader)

    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Epoch time: {dt:.2f} s")

    # Record the mean training loss of the epoch
    history['train_loss'].append(avg_train_loss)

    # Save a checkpoint every `save_interval` epochs and after the last epoch
    if (epoch + 1) % save_interval == 0 or (epoch + 1) == num_epochs:
        save_checkpoint(epoch + 1, model, optimizer, avg_train_loss, save_dir)