In [11]:
import datasets
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import CamembertModel, CamembertTokenizer, AdamW, get_cosine_schedule_with_warmup 
from transformers import CamembertModel, CamembertTokenizer
from datasets import load_dataset
from sklearn.metrics import f1_score
import os

In [12]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
class CamembertConfig:
    def __init__(self):
        self.vocab_size = 32005
        self.hidden_size = 768
        self.num_hidden_layers = 12
        self.num_attention_heads = 12
        self.intermediate_size = 3072
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 514
        self.type_vocab_size = 1
        self.initializer_range = 0.02
        self.layer_norm_eps = 1e-5
        self.pad_token_id = 1
        self.head_type = "MLM"
class CamembertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeds + token_type_embeds
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Debug prints
        # print(f"Embeddings NaN: {torch.isnan(embeddings).any()}")

        return embeddings

class CamembertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(0.2)  # Increased dropout rate

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(new_x_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Debug query, key, value
        # print(f"Query NaN: {torch.isnan(query_layer).any()}")
        # print(f"Key NaN: {torch.isnan(key_layer).any()}")
        # print(f"Value NaN: {torch.isnan(value_layer).any()}")

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores /= math.sqrt(self.attention_head_size)

        # Clamp scores to prevent overflow
        attention_scores = torch.clamp(attention_scores, min=-1e9, max=1e9)
        attention_probs = nn.functional.softmax(attention_scores, dim=-1) + 1e-9
        attention_probs = self.dropout(attention_probs)

        # Debug attention scores and probabilities
        # print(f"Attention Scores NaN Before Clamp: {torch.isnan(attention_scores).any()}")
        # print(f"Attention Probs NaN: {torch.isnan(attention_probs).any()}")

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(context_layer.size(0), -1, self.all_head_size)

        # Debug context layer
        # print(f"Context Layer NaN: {torch.isnan(context_layer).any()}")

        return context_layer



class CamembertFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.activation = F.gelu if config.hidden_act == "gelu" else nn.ReLU()
        self.dense_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(0.2)  # Increased dropout rate

    def forward(self, hidden_states):
        intermediate_output = self.activation(self.dense_1(hidden_states))
        intermediate_output = torch.clamp(intermediate_output, min=-1e9, max=1e9)

        output = self.dense_2(intermediate_output)
        output = self.dropout(output)
        output = self.LayerNorm(output + hidden_states)

        # Debug intermediate and final outputs
        # print(f"Intermediate Output NaN: {torch.isnan(intermediate_output).any()}")
        # print(f"Final Output NaN: {torch.isnan(output).any()}")

        return output


class CamembertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = CamembertSelfAttention(config)
        self.attention_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = CamembertFeedForward(config)

    def forward(self, hidden_states, attention_mask=None):
        attention_output = self.attention(hidden_states, attention_mask)
        hidden_states = self.attention_norm(hidden_states + attention_output)
        return self.feed_forward(hidden_states)

class CamembertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for i, layer in enumerate(self.layers):
            hidden_states = layer(hidden_states, attention_mask)

            # Debug prints for each layer
            # print(f"Layer {i} Hidden States NaN: {torch.isnan(hidden_states).any()}")

        return hidden_states

class CamembertLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states):
        hidden_states = F.gelu(self.dense(hidden_states))
        hidden_states = self.layer_norm(hidden_states)
        logits = self.decoder(hidden_states)

        # Debug prints
        # print(f"Logits NaN: {torch.isnan(logits).any()}")

        return logits
        
class Camembert_emb_en(nn.Module) : 
    def __init__(self, config):
        super(Camembert_emb_en , self).__init__()
        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)
        self.config = config 
    def forward(self, input_ids, attention_mask=None):
        embedded_input = self.embeddings(input_ids)

        if attention_mask is not None:
            attention_mask = (1.0 - attention_mask) * -float('inf')

        encoder_output = self.encoder(embedded_input, attention_mask)
        return encoder_output

In [13]:
class CamemBERTBaseModel(nn.Module):
    def __init__(self, model_path: str, trainable: bool = False):
        super(CamemBERTBaseModel, self).__init__()
        self.base_model = CamembertModel.from_pretrained(model_path)
        self.trainable = trainable
        self.config = CamembertConfig()

        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
            self.base_model.eval()
        else:
            self.base_model.train()

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_hidden_size(self) -> int:
        return self.base_model.config.hidden_size

class CamemBERTBaseModelv2(nn.Module):
    def __init__(self, model_path: str, config=None, trainable: bool = False):
        super(CamemBERTBaseModelv2, self).__init__()
        self.base_model = Camembert_emb_en(config)
        checkpoint = torch.load(model_path)
        state_dict = checkpoint['model_state_dict']

        # Filter out head-related keys
        #filtered_state_dict = {k: v for k, v in state_dict.items() if not k.startswith("head.")}
        
        # Load the filtered state_dict
        self.base_model.load_state_dict(state_dict, strict=False)
        print("Model loaded from {}".format(model_path))

        self.trainable = trainable
        self.config = config

        # Set trainable state
        if not trainable:
            for param in self.base_model.parameters():
                param.requires_grad = False
            self.base_model.eval()
        else:
            self.base_model.train()

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # Get structured output from base_model
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs


    def get_hidden_size(self) -> int:
        return self.base_model.config.hidden_size


In [18]:
class NerFinetuningModel(nn.Module):
    def __init__(self, model_path: str, num_labels: int = 9, trainable: bool = True , our_model = False , config = None):
        super(NerFinetuningModel, self).__init__()
        if our_model : 
            self.base_model = CamemBERTBaseModelv2(model_path, trainable=trainable , config=config)
        else :
            self.base_model = CamemBERTBaseModel(model_path, trainable=trainable)
        self.hidden_size = self.base_model.get_hidden_size()
        self.ner_head = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        hidden_states = self.base_model(input_ids, attention_mask)
    
        logits = self.ner_head(hidden_states)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"logits": logits, "loss": loss}

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

class NERDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128, label_all_tokens=True):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_all_tokens = label_all_tokens

    def tokenize_and_align_labels(self, tokens, ner_tags):
        tokenized_inputs = self.tokenizer(
            tokens,
            truncation=True,
            is_split_into_words=True,
            padding='max_length',
            max_length=self.max_length
        )
        labels = []
        for i, label in enumerate(ner_tags):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)  
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        # Convert all outputs to tensors
        tokenized_inputs = {key: torch.tensor(val, dtype=torch.long) for key, val in tokenized_inputs.items()}
        tokenized_inputs["labels"] = torch.tensor(labels, dtype=torch.long)

        return tokenized_inputs

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        tokenized_data = self.tokenize_and_align_labels(
            [data['tokens']], [data['ner_tags']]
        )

        return {
            'input_ids': tokenized_data['input_ids'].squeeze(),
            'attention_mask': tokenized_data['attention_mask'].squeeze(),
            'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
        }



In [20]:
def train_model(model, train_loader, val_loader, num_epochs, device, lr=5e-5, num_labels=9, save_dir="./models"):
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = num_epochs * len(train_loader)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    model.to(device)

    for epoch in range(num_epochs):
        # Training
        model.train()
        total_train_loss = 0
        correct, total = 0, 0
        all_preds, all_labels = [], []

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, labels)
            logits = outputs["logits"]
            loss = outputs["loss"]

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Accumulate loss
            total_train_loss += loss.item()

            # Compute accuracy
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.view(-1).tolist())
            all_labels.extend(labels.view(-1).tolist())
            correct += (preds.view(-1) == labels.view(-1)).sum().item()
            total += labels.view(-1).numel()

        # Compute metrics
        valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
        valid_labels = [l for l in all_labels if l != -100]
        train_f1 = f1_score(valid_labels, valid_preds, average="weighted", labels=list(range(num_labels)))
        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = correct / total

        # Validation
        model.eval()
        total_val_loss = 0
        correct, total = 0, 0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask, labels)
                logits = outputs["logits"]
                loss = outputs["loss"]

                # Accumulate loss
                total_val_loss += loss.item()

                # Compute accuracy
                preds = torch.argmax(logits, dim=-1)
                all_preds.extend(preds.view(-1).tolist())
                all_labels.extend(labels.view(-1).tolist())
                correct += (preds.view(-1) == labels.view(-1)).sum().item()
                total += labels.view(-1).numel()

        # Compute metrics
        valid_preds = [p for p, l in zip(all_preds, all_labels) if l != -100]
        valid_labels = [l for l in all_labels if l != -100]
        val_f1 = f1_score(valid_labels, valid_preds, average="weighted", labels=list(range(num_labels)))
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = correct / total

        # Print metrics
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        # Save model
        os.makedirs(save_dir, exist_ok=True)
        torch.save(model.state_dict(), f"{save_dir}/ner_model_epoch_{epoch + 1}.pth")
        print(f"Model saved to {save_dir}/ner_model_epoch_{epoch + 1}.pth")

In [21]:
# Load tokenizer

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")#AutoTokenizer.from_pretrained("camembert-base")
dataset = load_dataset("unimelb-nlp/wikiann", "fr", trust_remote_code=True)
train_data = NERDataset(dataset['train'], tokenizer)
val_data = NERDataset(dataset['validation'], tokenizer)
test_data = NERDataset(dataset['test'], tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)
config = CamembertConfig()
    # Initialize model
model_path = "/home/amine/Noureddine/MLA-CamemBERT/notebooks/trainings/models/Pretraining/model_checkpoints/checkpoint_epoch_9.pth"#"camembert-base"
num_labels = len(dataset["train"].features["ner_tags"].feature.names)
model = NerFinetuningModel(model_path, num_labels=num_labels, trainable=True , our_model=True , config=config)

    # Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, val_loader, num_epochs=5, device=device, save_dir="./models/NER_finetune_from_our_pretrained_model")

  checkpoint = torch.load(model_path)


Model loaded from /home/amine/Noureddine/MLA-CamemBERT/notebooks/trainings/models/Pretraining/model_checkpoints/checkpoint_epoch_9.pth


  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/5
Train Loss: 1.6680, Train Accuracy: 0.0402, Train F1: 0.2705
Val Loss: 1.6727, Val Accuracy: 0.0397, Val F1: 0.2666
Model saved to ./models/NER_finetune_from_our_pretrained_model/ner_model_epoch_1.pth


  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/5
Train Loss: 1.6653, Train Accuracy: 0.0402, Train F1: 0.2705
Val Loss: 1.6670, Val Accuracy: 0.0397, Val F1: 0.2666
Model saved to ./models/NER_finetune_from_our_pretrained_model/ner_model_epoch_2.pth


  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/5
Train Loss: 1.6639, Train Accuracy: 0.0402, Train F1: 0.2705
Val Loss: 1.6669, Val Accuracy: 0.0397, Val F1: 0.2666
Model saved to ./models/NER_finetune_from_our_pretrained_model/ner_model_epoch_3.pth


  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/5
Train Loss: 1.6628, Train Accuracy: 0.0402, Train F1: 0.2705
Val Loss: 1.6657, Val Accuracy: 0.0397, Val F1: 0.2666
Model saved to ./models/NER_finetune_from_our_pretrained_model/ner_model_epoch_4.pth


  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  'labels': torch.tensor(tokenized_data['labels'][0], dtype=torch.long),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/5
Train Loss: 1.6619, Train Accuracy: 0.0402, Train F1: 0.2705
Val Loss: 1.6655, Val Accuracy: 0.0397, Val F1: 0.2666
Model saved to ./models/NER_finetune_from_our_pretrained_model/ner_model_epoch_5.pth
