In [1]:
import functools
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import CamembertForTokenClassification_xval, CamembertForMaskedLM_xval, AutoTokenizer, AutoConfig
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns
import numpy as np
from LabelClassif.data import plot_confusion_matrix
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release cached, currently unused GPU memory before models are loaded.
torch.cuda.empty_cache()

In [3]:
def stats(data):
    """Return the mean and the population standard deviation of `data` along axis 0.

    The deviation uses the biased (divide-by-N) estimator: it is the square
    root of the mean squared distance to the axis-0 mean.

    Returns:
        (average, std) as a tuple.
    """
    mean = np.mean(data, axis=0)
    squared_deviation = (data - mean) ** 2
    return mean, np.sqrt(np.mean(squared_deviation, axis=0))

In [4]:
def decode_properly(batch_input_ids, tokenizer):
    """Decode a batch of token-id sequences back into sentences.

    Args:
        batch_input_ids: iterable of token-id sequences, one per sentence.
        tokenizer: object exposing `decode(ids, skip_special_tokens=...)`.

    Returns:
        A list with one decoded string per sequence; special tokens are
        skipped (`skip_special_tokens=True`).
    """
    return [
        tokenizer.decode(input_ids, skip_special_tokens= True)
        for input_ids in batch_input_ids
    ]

In [5]:
class CustomizedDataset(Dataset):
    """Map-style dataset that exposes an in-memory sequence of examples as-is."""

    def __init__(self, dataset_list):
        # The examples are stored untouched under the attribute name `files`.
        self.files = dataset_list

    def __getitem__(self, idx):
        """Return the example stored at position `idx`."""
        return self.files[idx]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.files)

In [6]:
"""
tokenizer = AutoTokenizer.from_pretrained('../camembert-bio-model')
tokenizer.add_tokens("NUM")
tokenizer.save_pretrained("../checkpoints/tokenizer_xval/")
"""
# Load the tokenizer from the directory written by the disabled snippet above
# (which adds the "NUM" token to the base tokenizer and saves it).
tokenizer = AutoTokenizer.from_pretrained('../checkpoints/tokenizer_xval/')

# NOTE(review): presumably the vocabulary id of the added "NUM" token — TODO confirm,
# e.g. with tokenizer.convert_tokens_to_ids("NUM"), instead of hard-coding it.
keyword_id = 32005
# Tokenizer length; LightningModel uses it to resize the token embeddings.
len_tokenizer = len(tokenizer)

In [7]:
# Offset between exponent values and exponent class indices: the MLM collate function
# stores -min_exponents for non-number tokens, and LightningModel.predict_step adds
# min_exponents back to the predicted class index.
min_exponents = -4

In [8]:
def custom_precision_recall_score(y_true, y_pred, nb_labels):
    """Compute per-label precision and recall, plus the overall accuracy.

    The confusion matrix is restricted to the labels 0..nb_labels-1. A small
    epsilon (1e-7) is added to every denominator so that labels that never
    occur yield 0 instead of a division by zero.

    Returns:
        (precision, recall, accuracy): two arrays of length `nb_labels` and
        the ratio of the matrix diagonal to the total number of counted pairs.
    """
    matrix = confusion_matrix(y_true, y_pred, labels = list(range(nb_labels)))
    sum_cols = np.sum(matrix, axis = 0)
    sum_lines = np.sum(matrix, axis = 1)
    correct = np.diagonal(matrix)
    # Column sums divide the diagonal for precision, row sums for recall.
    precision = correct/(sum_cols + 1e-7)
    recall = correct/(sum_lines + 1e-7)
    return precision, recall, correct.sum()/sum_cols.sum()

def custom_f1_score(y_true, y_pred, nb_labels, average = None):
    """Compute F1 scores from `custom_precision_recall_score`.

    Args:
        y_true, y_pred: gold and predicted label sequences.
        nb_labels: number of labels; labels are taken to be 0..nb_labels-1.
        average: None to get one score per label; any other value (the
            notebook uses "macro") returns the unweighted mean over labels.

    Returns:
        An array of per-label F1 scores, or their mean.
    """
    precision, recall, _ = custom_precision_recall_score(y_true, y_pred, nb_labels)
    # The epsilon keeps the score at 0 when precision and recall are both 0.
    scores = 2*precision*recall/(precision + recall + 1e-7)
    # Identity test: `== None` would invoke __eq__ on arbitrary objects.
    if average is None:
        return scores
    return scores.mean()

In [9]:
class MSLELoss(torch.nn.Module):
    """Mean squared logarithmic error.

    Computes the mean squared error between log(1 + pred) and log(1 + actual).
    """

    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss()

    def forward(self, pred, actual):
        """Return the MSLE between `pred` and `actual` (tensors of equal shape)."""
        # log1p(x) stays accurate for x close to 0, where log(x + 1) first rounds
        # x + 1 to 1 and returns exactly 0.
        return self.mse(torch.log1p(pred), torch.log1p(actual))

In [19]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper around the two xVal CamemBERT models used in this notebook.

    The variant is selected by `model_name`:

    * 'XVAL2_BertForMaskedLM': masked language modelling plus a number head that
      outputs a significand (regression) and an exponent class for every token.
      The three losses are combined with learned uncertainty weights
      (Kendall et al., 2017) stored in `log_sigma_2`.
    * 'XVAL2_BertForTokClassif': token classification trained with cross-entropy.

    The class reads the notebook globals `model_directory`, `len_tokenizer`,
    `keyword_id` and `min_exponents`; they must be defined before the model is
    built or used.

    Args:
        model_name: one of the two names above; anything else raises ValueError.
        optim: 'AdamW' or 'SGD' (validated in `configure_optimizers`).
        lr: learning rate given to the optimizer.
        weight_decay: weight decay given to the optimizer.
        T_0: if not None, restart period for `CosineAnnealingWarmRestarts`.
    """

    def __init__(self, model_name, optim, lr, weight_decay, T_0=None):
        super().__init__()
        self.save_hyperparameters('model_name', 'optim', 'lr', 'weight_decay', 'T_0')
        self.optim = optim
        self.lr = lr
        self.weight_decay = weight_decay
        self.T_0 = T_0
        # We apply the multitask optimization approach as explained in Kendall et. al. 2017
        self.log_sigma_2 = torch.nn.Parameter(torch.zeros(3)) #We have three losses
        self.name = model_name
        if self.name == 'XVAL2_BertForMaskedLM':
            self.model = CamembertForMaskedLM_xval.from_pretrained(model_directory)
            self.num_labels = len_tokenizer #self.model.lm_head.decoder.out_features
        elif self.name == 'XVAL2_BertForTokClassif':
            self.model = CamembertForTokenClassification_xval.from_pretrained(model_directory)
            self.num_labels = self.model.num_labels
        else:
            raise ValueError('model name: ' + str(model_name) + ' is unknown')

        # The tokenizer was extended with an extra token, so the embedding
        # matrix must match its length.
        self.model.resize_token_embeddings(len_tokenizer)

    def forward(self, batch):
        """Run the wrapped model on a collated batch and return its output."""
        return self.model(
            input_ids=batch["input_ids"],
            label_inputs = batch["label_inputs"],
            h_num = batch["h_nums"],
            attention_mask=batch["attention_mask"]
            )

    def _mlm_loss(self, out, batch):
        """Return the uncertainty-weighted masked-LM + number-head loss.

        `out.logits` is unpacked as (token logits, (significand logits,
        exponent logits)). The token loss is taken over every position; the
        exponent (cross-entropy) and significand (MSE) losses only over the
        positions whose label equals `keyword_id`.
        """
        pred_logits, num_logits = out.logits
        significand_logits, exponent_logits = num_logits
        flat_labels = batch["labels"].view(-1)

        cross_entropy = torch.nn.CrossEntropyLoss()
        loss_1 = cross_entropy(pred_logits.view(-1, self.num_labels), flat_labels)

        #selecting the tokens to be classified by NUM head
        condition = torch.flatten((flat_labels==keyword_id).nonzero())
        if condition.numel() > 0:
            loss_2 = cross_entropy(
                (exponent_logits.view(-1, exponent_logits.size(2)))[condition],
                (batch["exponents"].view(-1))[condition],
            )
            loss_3 = torch.nn.MSELoss()(
                (significand_logits.view(-1))[condition],
                (batch["significands"].view(-1))[condition],
            )
        else:
            # A mean-reduced loss over zero elements is NaN and would poison the
            # whole step. Use zeros that stay attached to the number head so the
            # graph still reaches its parameters.
            loss_2 = exponent_logits.sum() * 0.0
            loss_3 = significand_logits.sum() * 0.0

        # Kendall et al.: exp(-s) * L for the classification losses, 0.5 * exp(-s) * L
        # for the regression loss, plus 0.5 * s as regulariser, with s = log(sigma^2).
        weights = torch.exp(-self.log_sigma_2)
        return weights[0]*loss_1 + weights[1]*loss_2 + .5*weights[2]*loss_3 + .5*self.log_sigma_2.sum()

    def training_step(self, batch):
        """Compute and log the training loss ("train/loss") for one batch."""
        out = self.forward(batch)

        if self.name != 'XVAL2_BertForMaskedLM':
            loss_fn = torch.nn.CrossEntropyLoss()#weight=28000/torch.tensor([28000, 21, 80, 57, 143, 130, 41, 58], device='cuda'))
            loss = loss_fn(out.logits.view(-1, self.num_labels), batch["labels"].view(-1))
        else:
            loss = self._mlm_loss(out, batch)

        self.log("train/loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_index):
        """Log "valid/f1" (token classification) or "valid/loss" (masked LM)."""
        out = self.forward(batch)

        if self.name != 'XVAL2_BertForMaskedLM':
            preds = torch.max(out.logits, -1).indices
            # NOTE(review): this is a macro F1 computed per batch and then averaged
            # over batches by Lightning. Labels absent from a batch score 0, so the
            # logged value is presumably well below a corpus-level macro F1 —
            # confirm before comparing it with test-set scores.
            f1 = custom_f1_score(batch["labels"].view(-1).cpu().numpy(), preds.view(-1).cpu().numpy(),
                 self.num_labels, average = "macro")
            self.log("valid/f1", f1, prog_bar=True, on_step=False, on_epoch=True)

        else:
            loss = self._mlm_loss(out, batch)
            self.log("valid/loss", loss, prog_bar=True, on_step=False, on_epoch=True)

    def predict_step(self, batch, batch_idx):
        """Predict on one batch; similar to `validation_step` without the metrics.

        Token classification returns the arg-max class index per token. The
        masked-LM variant returns a float tensor of shape
        (batch_size, sentence_length) holding the predicted token ids, where
        every position predicted as `keyword_id` is replaced by the number
        decoded from the number head.
        """
        out = self.forward(batch)
        if self.name != 'XVAL2_BertForMaskedLM':
            return torch.max(out.logits, -1).indices

        batch_size, sentence_length = batch["input_ids"].size()
        pred_logits, num_logits = out.logits
        significand_logits, exponent_logits = num_logits

        texts = torch.max(pred_logits.view(-1, self.num_labels), -1).indices
        exponents = torch.max(exponent_logits.view(-1, exponent_logits.size(2)), -1).indices
        significands = significand_logits.view(-1)
        # Class index -> exponent value is `index + min_exponents` (the collate
        # function stores -min_exponents for the neutral value 1.0).
        # NOTE(review): assumes base-10 scientific notation, i.e.
        # value = significand * 10**exponent — confirm against the preprocessing
        # that produced "significands"/"exponents".
        exponent_values = (exponents + min_exponents).to(significands.dtype)
        numbers = significands * torch.pow(10.0, exponent_values)
        # we replace the masked tokens corresponding to numbers by the predicted numbers
        condition = torch.flatten((texts==keyword_id).nonzero())

        # Indexed assignment needs matching dtypes: move the ids to the float
        # dtype of the numbers before writing the numbers in.
        texts = texts.to(numbers.dtype)
        texts[condition] = numbers[condition]
        return texts.view(batch_size, sentence_length)

    def configure_optimizers(self):
        """Build the optimizer (and the optional warm-restart scheduler).

        The wrapped model's parameters use `lr` and `weight_decay`; the loss
        weights `log_sigma_2` are added as a second parameter group.

        Raises:
            ValueError: if `optim` is neither 'AdamW' nor 'SGD'.
        """
        optimizer_classes = {'AdamW': torch.optim.AdamW, 'SGD': torch.optim.SGD}
        if self.optim not in optimizer_classes:
            raise ValueError('Optim name: ' + str(self.optim) + ' is unknown')

        optimizer = optimizer_classes[self.optim](
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        optimizer.add_param_group({'params': self.log_sigma_2})

        if self.T_0 is None:
            return optimizer
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, self.T_0)
        return [optimizer], [scheduler]

In [11]:
#key words for the sequence classification task
# One row of three French key words per label; rows presumably follow the label
# order given next to num_labels below — TODO confirm.
key_words = np.array([['mot', 'patient', 'historique'],
                      ['fraction', 'ejection', 'raccourcissement'],
                      ['cardiaque', 'coeur', 'fréquence'],
                      ['diamètre', 'pulmonaire', 'artère'], 
                      ['oxygène', 'O2', 'saturation'],
                      ['apgar', 'minute', 'nombre'],
                      ['gradient', 'pulmonaire', 'ventricule'],
                      ['cia', 'civ', 'inter']
                     ])
num_labels = 8 #['O', 'Cp', 'FC', 'D', 'SO2', 'AGPR', 'G', 'CI']
num_words_per_label = 3
# (num_labels, num_words_per_label) integer tensor of token ids; both collate
# functions attach it to every batch as "label_inputs".
label_inputs = torch.zeros((num_labels, num_words_per_label), dtype = int)

for label in range(num_labels):
    for i in range(num_words_per_label):
        # Only the first sub-token id of each key word is kept ([0]).
        label_inputs[label, i] = tokenizer.encode(key_words[label, i],
                   add_special_tokens = False)[0]

# Training language modeling on the whole unlabeled dataset

In [None]:
# model_directory is read as a global by LightningModel.__init__ (from_pretrained);
# checkpoint_path is the masked-LM checkpoint loaded below. The commented-out
# paths are alternative checkpoints kept for reference.
model_directory = '../camembert-bio-model'
#checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=130-step=96678.ckpt' #trained (with logsigma) on AdamW, lr=3e-5
#checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=12-step=9594.ckpt' #trained (with logsigma) on AdamW, lr=3e-5 then SGD
#checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=61-step=45756.ckpt' #LESA XVAL trained (with logsigma) on AdamW, lr=3e-5
checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=9-step=7380.ckpt' #LESA XVAL trained (with logsigma) on AdamW, lr=3e-5 then SGD

In [None]:
# Load the pickled masked-LM validation and training examples.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it on
# files produced by this project.
with open("../sadcsip/xval_mlm_val", "rb") as fp:   # Unpickling
   mlm_val_ds = pickle.load(fp)
with open("../sadcsip/xval_mlm_train", "rb") as fp:   # Unpickling
   mlm_train_ds = pickle.load(fp)

# The names are rebound from the raw unpickled objects to their Dataset wrappers,
# so this cell must be re-run from the top if executed twice.
mlm_val_ds = CustomizedDataset(mlm_val_ds)
mlm_train_ds = CustomizedDataset(mlm_train_ds)

In [None]:
def tokenize_and_align_labels(examples, tokenizer):
    """Collate examples into a masked-language-modelling batch.

    Each example is a dict with the keys "tokens", "h_nums", "significands"
    and "exponents", the last three being indexed by word position. The batch
    is tokenized word by word and padded to the longest sequence.

    The returned encoding carries these extra keys:

    * "labels": a copy of the unmasked input ids.
    * "h_nums", "significands", "exponents": per-token values taken from the
      example for words equal to "NUM"; every other token (special tokens
      included) gets 1, 1.0 and -min_exponents respectively.
    * "label_inputs": the notebook-level `label_inputs` tensor.

    Positions drawn with probability 0.15 that are neither BOS, SEP nor PAD
    have their input id replaced by the mask token and their "h_nums" reset
    to 1.

    Reads the notebook globals `min_exponents` and `label_inputs`.
    """
    text = [example["tokens"] for example in examples]
    tokenized_inputs = tokenizer(
        text,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        is_split_into_words=True,
    )

    tokenized_inputs["labels"] = tokenized_inputs['input_ids'].detach().clone()

    # Values used for every token that is not a number.
    neutral_exponent = -min_exponents

    # Align the word-level numeric features with the sub-word tokens.
    h_nums, significands, exponents = [], [], []
    for batch_index, example in enumerate(examples):
        h_num = example["h_nums"]
        significand = example["significands"]
        exponent = example["exponents"]
        words = text[batch_index]

        h_row, significand_row, exponent_row = [], [], []
        # word_ids maps each token to its source word (None for special tokens).
        for word_idx in tokenized_inputs.word_ids(batch_index = batch_index):
            if word_idx is not None and words[word_idx] == "NUM":
                h_row.append(h_num[word_idx])
                significand_row.append(significand[word_idx])
                exponent_row.append(exponent[word_idx])
            else:
                h_row.append(1)
                significand_row.append(1.0)
                exponent_row.append(neutral_exponent)

        h_nums.append(h_row)
        significands.append(significand_row)
        exponents.append(exponent_row)

    tokenized_inputs["h_nums"] = torch.tensor(h_nums, dtype=torch.float32)
    tokenized_inputs["significands"] = torch.tensor(significands, dtype=torch.float32)
    tokenized_inputs["exponents"] = torch.tensor(exponents, dtype=tokenized_inputs["labels"].dtype)

    # Draw one uniform number per position; mask those below 0.15 that are not
    # BOS, SEP or PAD.
    input_ids = tokenized_inputs['input_ids']
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < 0.15) * (input_ids != tokenizer.bos_token_id) * \
           (input_ids != tokenizer.sep_token_id) * (input_ids != tokenizer.pad_token_id)

    # Boolean-mask assignment writes in place into the tensors held by the encoding.
    input_ids[mask_arr] = tokenizer.mask_token_id
    tokenized_inputs["h_nums"][mask_arr] = 1.

    tokenized_inputs["label_inputs"] = label_inputs
    return tokenized_inputs

In [None]:
"""
examples = [{"tokens":["le", "ph", "=", "nombre", "."], "h_nums":[1, 1, 1, 7, 1]}, 
            {"tokens":["TA", "nombre", "mmgh", "."], "h_nums":[1, 120, 1, 1]}]
tokenize_and_align_labels(examples, tokenizer)
"""

In [None]:
# Resume the masked-LM model from checkpoint_path; the commented line instead
# builds a fresh model from the base weights.
#camembert_mlm = LightningModel(model_name = 'XVAL2_BertForMaskedLM', optim='AdamW', lr=3e-5, weight_decay=0.01)
camembert_mlm = LightningModel.load_from_checkpoint(checkpoint_path=checkpoint_path)#, optim='SGD', lr=1e-7, weight_decay=0.01, T_0=10)

In [None]:
# NOTE(review): val_dataloader is only created in a later cell, so this debugging
# cell fails on a fresh kernel run top to bottom.
it= iter(val_dataloader);

In [None]:
# Debugging cell: recompute the three masked-LM losses by hand on one validation
# batch, summed without the learned uncertainty weights.
batch=next(it)
# NOTE(review): the batch is moved to the GPU; this presumably requires
# camembert_mlm to already be on the GPU — confirm.
batch = {k:v.cuda() for k,v in batch.items()}
out = camembert_mlm.model(
        input_ids=batch["input_ids"],
        label_inputs = batch["label_inputs"],
        h_num = batch["h_nums"],
        attention_mask=batch["attention_mask"]
        )
pred_logits, num_logits = out.logits
significand_logits, exponent_logits = num_logits

loss_fn = torch.nn.CrossEntropyLoss()
loss_1 = loss_fn(pred_logits.view(-1, camembert_mlm.num_labels), batch["labels"].view(-1))
            
#selecting the tokens to be classified by NUM head
condition = torch.flatten((batch["labels"].view(-1)==keyword_id).nonzero())
#print(batch["labels"].view(-1)[condition])
#print(batch["h_nums"].view(-1)[condition])
loss_2 = loss_fn((exponent_logits.view(-1, exponent_logits.size(2)))[condition], (batch["exponents"].view(-1))[condition])
            
loss_fn = torch.nn.MSELoss()
loss_3 = loss_fn((significand_logits.view(-1))[condition], (batch["significands"].view(-1))[condition])
loss = loss_1 + loss_2 + loss_3
            

In [None]:
# Token loss, exponent loss, significand loss and their unweighted sum.
print(loss_1.item(), loss_2.item(), loss_3.item(), loss.item())

In [None]:
# Predicted significands at the positions whose label is keyword_id.
(significand_logits.view(-1))[condition]

In [None]:
# Target significands at the same positions.
(batch["significands"].view(-1))[condition]

In [None]:
# Learned uncertainty parameters, one per loss term.
camembert_mlm.log_sigma_2

In [None]:
# Predicted exponent class indices at the positions whose label is keyword_id.
torch.max(exponent_logits.view(-1, exponent_logits.size(2)), -1).indices[condition]

In [None]:
# Target exponent class indices at the same positions.
(batch["exponents"].view(-1))[condition]

In [None]:
# Presumably set to avoid the tokenizers fork/parallelism warning, since the
# tokenizer runs inside DataLoader worker processes (collate_fn) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Checkpoint callback monitoring "valid/loss" (lower is better), which
# LightningModel.validation_step logs for the masked-LM variant.
camembert_mlm_checkpoint = pl.callbacks.ModelCheckpoint(dirpath = '../checkpoints/xval2_camembert_mlm',
                                                      monitor="valid/loss", mode="min")

# No max_epochs is set: training length is governed by early stopping
# (patience of 10 on "valid/loss") or the Trainer's default epoch limit.
camembert_mlm_trainer = pl.Trainer(
    #max_epochs=100,
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="valid/loss", patience=10, mode="min"),
        camembert_mlm_checkpoint,
    ],
    #detect_anomaly=True
)

In [None]:
# Training batches are tokenized and randomly masked on the fly by the collate function.
train_dataloader = DataLoader(
    mlm_train_ds, 
    batch_size=24, 
    shuffle=True, 
    collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer),
    num_workers=8
)

# NOTE(review): the same collate function draws a fresh random mask (torch.rand) for
# validation batches too, so "valid/loss" is not computed on identical inputs
# from one epoch to the next.
val_dataloader = DataLoader(
    mlm_val_ds,
    batch_size=24, 
    shuffle=False, 
    collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer),
    num_workers=8
)

In [None]:
# Train the masked-LM model, validating on val_dataloader.
camembert_mlm_trainer.fit(camembert_mlm, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# Token Classification

In [21]:
# Base model directory and the masked-LM checkpoint whose encoder is reused by the
# token classifier. Both names were already assigned in the language-modelling
# section; they are set again here, presumably so that this section can run on
# its own — confirm.
model_directory = "../camembert-bio-model"

#checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=12-step=9594.ckpt' #trained (with logsigma) on AdamW, lr=3e-5 then SGD
checkpoint_path = '../checkpoints/xval2_camembert_mlm/epoch=9-step=7380.ckpt' #LESA XVAL trained (with logsigma) on AdamW, lr=3e-5 then SGD

In [13]:
# Load the pickled test / validation / training examples for token classification.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it on
# files produced by this project.
with open("../sadcsip/xval_test", "rb") as fp:   # Unpickling
   test_ds = pickle.load(fp)

with open("../sadcsip/xval_val", "rb") as fp:   # Unpickling
   val_ds = pickle.load(fp)
with open("../sadcsip/xval_train", "rb") as fp:   # Unpickling
   train_ds = pickle.load(fp)

# The names are rebound from the raw unpickled objects to their Dataset wrappers.
test_ds = CustomizedDataset(test_ds)
val_ds = CustomizedDataset(val_ds)
train_ds = CustomizedDataset(train_ds)

In [14]:
def tokenize_and_align_labels(examples, tokenizer):
    """Collate examples into a token-classification batch.

    Note: this redefines (and therefore shadows) the masked-LM collate function
    of the same name defined earlier in the notebook.

    Each example is a dict with the keys "tokens", "h_nums" and "classes", the
    last two being indexed by word position. The batch is tokenized word by
    word and padded to the longest sequence.

    The returned encoding carries these extra keys:

    * "h_nums": the example value for tokens of words equal to "NUM", 1 for
      every other token (special tokens included).
    * "labels": the word's class on the first token of each word, -100 on
      special tokens and on the remaining sub-tokens of a word.
    * "label_inputs": the notebook-level `label_inputs` tensor.
    """
    text = [example["tokens"] for example in examples]
    tokenized_inputs = tokenizer(
        text,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        is_split_into_words=True,
    )

    # Align the word-level numeric values with the sub-word tokens.
    h_nums = []
    for batch_index, example in enumerate(examples):
        h_num = example["h_nums"]
        words = text[batch_index]
        h_nums.append([
            h_num[word_idx] if word_idx is not None and words[word_idx] == "NUM" else 1
            for word_idx in tokenized_inputs.word_ids(batch_index = batch_index)
        ])
    tokenized_inputs["h_nums"] = torch.tensor(h_nums, dtype=torch.float32)

    # Align the word-level classes with the sub-word tokens.
    labels = []
    for batch_index, example in enumerate(examples):
        label = example["classes"]
        word_ids = list(tokenized_inputs.word_ids(batch_index = batch_index))
        # Pair every token's word index with the one of the preceding token.
        previous_word_ids = [None] + word_ids[:-1]
        labels.append([
            -100 if word_idx is None or word_idx == previous_word_idx else label[word_idx]
            for previous_word_idx, word_idx in zip(previous_word_ids, word_ids)
        ])

    tokenized_inputs["labels"] = torch.tensor(labels)
    tokenized_inputs["label_inputs"] = label_inputs

    return tokenized_inputs

In [15]:
# Presumably set to avoid the tokenizers fork/parallelism warning, since the
# tokenizer runs inside DataLoader worker processes (collate_fn) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [16]:
# Validation and test loaders for token classification: fixed order (shuffle=False),
# batches built by the token-classification collate function.
# Note: this rebinds val_dataloader, which the language-modelling section also uses.
val_dataloader = DataLoader(
        val_ds, 
        batch_size=24, 
        shuffle=False,
        num_workers = 8,
        collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer)
    )
    #next(iter(val_dataloader))
test_dataloader = DataLoader(
    test_ds,
    batch_size=24,
    shuffle=False,
    num_workers = 8,
    collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer)
    )

In [22]:
# First-stage training of the token classifier, repeated once per seed.
seeds = [10, 100, 1000, 10000, 100000, 12, 123, 1234, 12345, 123456]
# Only the first two seeds are used; the full range is commented out.
for rep in range(2):#len(seeds)):
    # Only torch's RNG is seeded here.
    torch.manual_seed(seeds[rep])
    train_dataloader = DataLoader(
        train_ds, 
        batch_size=24, 
        shuffle=True, 
        num_workers = 8,
        collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer)
    )
    # The masked-LM checkpoint is reloaded on every repetition so that each run
    # starts from its own copy of the pretrained encoder.
    camembert_mlm = LightningModel.load_from_checkpoint(checkpoint_path=checkpoint_path)
    
    camembert_tok = LightningModel("XVAL2_BertForTokClassif", optim='AdamW', lr=3e-5, weight_decay=0.01)
    #adding the lesa_bert model trained previously
    # The classifier's encoder is replaced by the encoder of the masked-LM model.
    camembert_tok.model.roberta = camembert_mlm.model.roberta

    #camembert_tok = LightningModel.load_from_checkpoint(checkpoint_path= checkpoint_path)
    #for training
    # Every repetition writes its checkpoint into the same directory.
    model_checkpoint = pl.callbacks.ModelCheckpoint(#dirpath = '../checkpoints/xval2_camembert_tok',
                                                    dirpath = '../checkpoints/xval2_camembert_tok_lesa',
                                                    monitor="valid/f1", mode="max")

    camembert_trainer = pl.Trainer(
        #max_epochs=100,
        callbacks=[
            pl.callbacks.EarlyStopping(monitor="valid/f1", patience=5, mode="max"),
            model_checkpoint,
        ]
    )
    camembert_trainer.fit(camembert_tok, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

                                                                           

  rank_zero_warn(


Epoch 31: 100%|██████████| 31/31 [00:12<00:00,  2.55it/s, v_num=786, valid/f1=0.239, train/loss=0.220]


Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

                                                                           

  rank_zero_warn(


Epoch 18: 100%|██████████| 31/31 [00:12<00:00,  2.43it/s, v_num=787, valid/f1=0.200, train/loss=0.246]


In [18]:
# Second-stage fine-tuning: resume every first-stage checkpoint found in `path`
# with SGD and a small learning rate, saving the results to a separate directory.
#path = '../checkpoints/xval2_camembert_tok'
path = '../checkpoints/xval2_camembert_tok_lesa'
seeds = [10, 100, 1000, 10000, 100000, 12, 123, 1234, 12345, 123456]

# os.listdir returns entries in arbitrary order: sort them so that the pairing of
# seeds and checkpoints is reproducible, and ignore anything that is not a checkpoint.
checkpoint_files = sorted(name for name in os.listdir(path) if name.endswith('.ckpt'))

for rep, filename in enumerate(checkpoint_files):
    # A dedicated name is used so the global `checkpoint_path` (the masked-LM
    # checkpoint read by the first-stage training cell) is not overwritten.
    tok_checkpoint_path = os.path.join(path, filename)
    torch.manual_seed(seeds[rep])
    train_dataloader = DataLoader(
        train_ds,
        batch_size=24,
        shuffle=True,
        num_workers = 8,
        collate_fn=functools.partial(tokenize_and_align_labels, tokenizer=tokenizer)
    )
    # The keyword arguments override the hyperparameters stored in the checkpoint.
    camembert_tok = LightningModel.load_from_checkpoint(checkpoint_path=tok_checkpoint_path, optim='SGD',
             lr=1e-6, weight_decay=0.01)
    #for training
    model_checkpoint = pl.callbacks.ModelCheckpoint(#dirpath = '../checkpoints/xval2_camembert_tok_tuned',
                                                    dirpath = '../checkpoints/xval2_camembert_tok_lesa_tuned',
                                                    monitor="valid/f1", mode="max")

    camembert_trainer = pl.Trainer(
        #max_epochs=100,
        callbacks=[
            pl.callbacks.EarlyStopping(monitor="valid/f1", patience=5, mode="max"),
            model_checkpoint,
        ]
    )
    camembert_trainer.fit(camembert_tok, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

                                                                           

  rank_zero_warn(


Epoch 8: 100%|██████████| 31/31 [00:12<00:00,  2.56it/s, v_num=784, valid/f1=0.240, train/loss=0.593]


Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

                                                                           

  rank_zero_warn(


Epoch 5: 100%|██████████| 31/31 [00:12<00:00,  2.45it/s, v_num=785, valid/f1=0.256, train/loss=0.146]


# Predicting with Token Classifier

In [None]:
# Load one trained token classifier for prediction.
# NOTE(review): this path points to 'xval_camembert_tok', not to the
# 'xval2_camembert_tok*' directories written by the training cells — confirm it
# is the intended checkpoint.
camembert_tok = LightningModel.load_from_checkpoint(checkpoint_path= '../checkpoints/xval_camembert_tok/epoch=33-step=2108.ckpt', optim='AdamW')

In [None]:
# Class names in label-index order, and their count.
label_names = ['O', 'Cp', 'FC', 'D', 'SO2', 'AGPR', 'G', 'CI']
nb_labels = len(label_names)

In [None]:
#"""
#for prediction
# NOTE(review): only .predict() is called on this trainer, so max_epochs and the
# EarlyStopping callback presumably have no effect here — confirm and consider
# removing them.
prediction_trainer = pl.Trainer(
    max_epochs=20,
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="valid/f1", patience=4, mode="max"),
    ]
)
# One prediction tensor per test batch.
camembert_preds = prediction_trainer.predict(camembert_tok, dataloaders=test_dataloader)
#"""

In [None]:
# Number of prediction batches returned by predict().
print(len(camembert_preds))

In [None]:
# Flatten every batch of predictions and concatenate them into one 1-D tensor.
# The name camembert_preds is rebound from the list of batches to that tensor.
camembert_preds = [batch_preds.view(-1) for batch_preds in camembert_preds]
camembert_preds = torch.cat(camembert_preds, -1)

In [None]:
# collecting the labels
# Flatten the gold labels of every test batch into one 1-D tensor, in the same
# batch order as the predictions. Positions labelled -100 by the collate function
# are kept so the vector stays aligned with the flattened predictions.
# A plain iteration replaces the manual next()/StopIteration loop, whose flag
# variable shadowed the builtin `exit`.
labels = torch.cat([batch['labels'].view(-1) for batch in test_dataloader], -1)

In [None]:
# Plot the confusion matrix of gold labels against predictions (project helper).
plot_confusion_matrix(labels, camembert_preds, label_names, 'camembert_tok')

In [None]:
# Per-label precision, recall and F1 on the test set (the accuracy is discarded).
precision, recall, _ = custom_precision_recall_score(labels, camembert_preds, nb_labels)
print('precision =', precision)
print('recall    =', recall)
print('f1_score  =', custom_f1_score(labels, camembert_preds, nb_labels, average= None))

## Collecting results across a sample of training runs

In [34]:
# Accumulator for the per-label F1 arrays, one per evaluated checkpoint. Re-run
# this cell before re-running the evaluation loop, which only appends to it.
f1_scores=[]

In [35]:
# Class names in label-index order, and their count (same values as in the
# prediction section above).
label_names = ['O', 'Cp', 'FC', 'D', 'SO2', 'AGPR', 'G', 'CI']
nb_labels = len(label_names)

# Trainer with default settings, used only for predict().
prediction_trainer = pl.Trainer(
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [36]:
# Evaluate every checkpoint in `path` on the test set and append its per-label
# F1 scores to f1_scores.
#path = '../checkpoints/xval2_camembert_tok'
path = '../checkpoints/xval2_camembert_tok_lesa'

# collecting the labels
# The gold labels do not depend on the checkpoint (test_dataloader is not
# shuffled), so they are gathered once instead of once per checkpoint.
labels = torch.cat([batch['labels'].view(-1) for batch in test_dataloader], -1)

# os.listdir returns entries in arbitrary order: sort them so that f1_scores has
# a reproducible order, and ignore anything that is not a checkpoint.
for filename in sorted(name for name in os.listdir(path) if name.endswith('.ckpt')):
    # A dedicated name is used so the global `checkpoint_path` (the masked-LM
    # checkpoint read by the training cell) is not overwritten.
    tok_checkpoint_path = os.path.join(path, filename)
    camembert_tok = LightningModel.load_from_checkpoint(checkpoint_path= tok_checkpoint_path)
    batch_preds = prediction_trainer.predict(camembert_tok, dataloaders=test_dataloader)
    camembert_preds = torch.cat([preds.view(-1) for preds in batch_preds], -1)

    # Appended directly: the former temporary was named `f1_score` and shadowed
    # the function imported from sklearn.metrics.
    f1_scores.append(custom_f1_score(labels, camembert_preds, nb_labels, average= None))

Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 10.55it/s]


Some weights of the model checkpoint at ../camembert-bio-model were not used when initializing CamembertForTokenClassification_xval: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification_xval from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification_xval were not initialized from the model checkpoint at ../camembert-bio-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

Predicting DataLoader 0: 100%|██████████| 7/7 [00:00<00:00, 11.49it/s]


In [37]:
# Display the collected per-label F1 arrays (one per checkpoint).
f1_scores

[array([0.98894956, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([0.9690077 , 0.        , 0.        , 0.        , 0.19277106,
        0.        , 0.        , 0.        ])]

In [21]:
#normal xval2
# stats() averages `f1_scores` along axis 0 and also returns the matching
# population standard deviation; the mean is displayed here.
# NOTE(review): presumably `f1_scores` must first be recomputed for the
# configuration named in the label above — confirm before trusting the output.
average, std = stats(f1_scores)
average

array([0.99147411, 0.        , 0.33939144, 0.46152437, 0.48000128,
       0.62043578, 0.3447996 , 0.20099754])

In [22]:
# Standard deviation returned by the most recent stats(f1_scores) call.
std

array([0.00149897, 0.        , 0.09256756, 0.13617706, 0.15368698,
       0.11845154, 0.14788489, 0.14761816])

In [34]:
#lesa xval2
# stats() averages `f1_scores` along axis 0 and also returns the matching
# population standard deviation; the mean is displayed here.
# NOTE(review): presumably `f1_scores` must first be recomputed for the
# configuration named in the label above — confirm before trusting the output.
average, std = stats(f1_scores)
average

array([0.98866672, 0.        , 0.29476533, 0.481562  , 0.40216464,
       0.3238592 , 0.        , 0.12054146])

In [35]:
# Standard deviation returned by the most recent stats(f1_scores) call.
std

array([0.00237697, 0.        , 0.11647507, 0.07861173, 0.09929846,
       0.06139502, 0.        , 0.10661928])

In [None]:
#normal dataset
# stats() averages `f1_scores` along axis 0 and also returns the matching
# population standard deviation; the mean is displayed here.
# NOTE(review): presumably `f1_scores` must first be recomputed for the
# configuration named in the label above — confirm before trusting the output.
average, std = stats(f1_scores)
average

In [None]:
# Unweighted mean of eight hard-coded scores.
# NOTE(review): presumably pasted from an earlier `average` output — confirm;
# the values are literals and are not recomputed from `f1_scores`.
sum([0.99908665, 0.90285707, 0.96295646, 0.93019133, 0.9433118 ,
       0.97099422, 0.87677542, 0.83167779])/8

In [None]:
# Standard deviation returned by the most recent stats(f1_scores) call.
std

In [None]:
#blind dataset
# stats() averages `f1_scores` along axis 0 and also returns the matching
# population standard deviation; the mean is displayed here.
# NOTE(review): presumably `f1_scores` must first be recomputed for the
# configuration named in the label above — confirm before trusting the output.
average, std = stats(f1_scores)
average

In [None]:
# Standard deviation returned by the most recent stats(f1_scores) call.
std

In [None]:
# Unweighted mean of eight hard-coded scores.
# NOTE(review): presumably pasted from an earlier `average` output — confirm;
# the values are literals and are not recomputed from `f1_scores`.
sum([0.99029107, 0.58714279, 0.50960482, 0.46767567, 0.69331114,
       0.91340833, 0.49791413, 0.65334018])/8

In [None]:
# Standard deviation returned by the most recent stats(f1_scores) call.
# NOTE(review): this display cell repeats an identical earlier one — confirm it is still needed.
std

## Some illustrations

In [None]:
def get_extended_attention_mask(attention_mask, dtype=torch.float):
    """Turn a padding mask into an additive attention mask.

    Args:
        attention_mask: tensor of shape [batch_size, seq_length] holding 1 for
            positions to attend to and 0 for masked (padding) positions.
        dtype: floating-point dtype of the returned mask. Defaults to
            ``torch.float`` (the previous hard-coded behaviour); pass the
            dtype of the attention scores, e.g. ``torch.float16``, when the
            model does not run in float32.

    Returns:
        Tensor of shape [batch_size, 1, 1, seq_length], broadcastable to
        [batch_size, num_heads, seq_length, seq_length]. It is 0.0 where
        attention is allowed and the smallest finite value of ``dtype`` where
        it is masked.
    """
    # Insert the head and query axes so the mask broadcasts over both.
    extended_attention_mask = attention_mask[:, None, None, :].to(dtype=dtype)
    # The result is added to the raw scores before the softmax, so the most
    # negative finite value effectively removes the masked positions. It has to
    # come from the same dtype as the mask, otherwise it overflows.
    return (1.0 - extended_attention_mask) * torch.finfo(dtype).min

In [None]:
# Selection of the batch: walk through the test batches until one contains the
# sentence used for the illustrations. `tokenized_batch` and `sentences` are
# left pointing at that batch (or at the last batch if nothing matches).
# The flag previously named `exit` is gone: it rebound the built-in `exit`
# helper for the rest of the kernel session.
target_sentence_prefix = 'Née à terme, Grossess et accouchement sans complication PN'
for tokenized_batch in test_dataloader:
    sentences = decode_properly(tokenized_batch["input_ids"], tokenizer)
    if any(sentence.startswith(target_sentence_prefix) for sentence in sentences):
        break
else:
    # Keep the best-effort behaviour (continue with the last batch), but say so.
    print("Target sentence not found in test_dataloader; keeping the last batch.")

In [None]:
# Run the classifier's underlying model on the selected batch, keeping the
# hidden states of every layer so that one layer can be replayed afterwards.
camembert_tok_classifier.eval()
# Move the inputs to the device that holds the model weights instead of
# assuming a GPU: a hard-coded .cuda() fails when the model is elsewhere.
model_device = next(camembert_tok_classifier.model.parameters()).device
with torch.no_grad():
    model_output = camembert_tok_classifier.model(
        input_ids=tokenized_batch["input_ids"].to(model_device),
        attention_mask=tokenized_batch["attention_mask"].to(model_device),
        output_hidden_states=True,
    )
sentences = decode_properly(tokenized_batch["input_ids"], tokenizer)

In [None]:
# Padding mask of the selected batch, cast to float32.
key_padding_mask = tokenized_batch["attention_mask"].float()

In [None]:
# Build the additive mask straight from the batch's padding mask rather than
# from `key_padding_mask` itself, so that re-running this cell does not extend
# an already extended mask.
key_padding_mask = get_extended_attention_mask(
    tokenized_batch['attention_mask'].type(torch.float)
)

In [None]:
# Index of the encoder layer to inspect, and position of the sentence to
# display inside the selected batch.
# NOTE(review): sentence_id is a fixed value; presumably it should point at the
# sentence matched by the batch-selection cell — confirm.
layer_id, sentence_id = 11, 1

In [None]:
# Show the decoded text of the sentence selected by sentence_id.
sentences[sentence_id]

In [None]:
# Hidden states fed to encoder layer `layer_id`.
# NOTE(review): relies on the hidden_states tuple starting with the embedding
# output, so that index i is the input of layer i (0-based) — confirm against
# the transformers documentation.
entry_layer = model_output["hidden_states"][layer_id] #input of layer layer_id

In [None]:
# Replay only the self-attention sub-module of the chosen layer on its input,
# asking it to also return the attention weights.
# The mask is moved to the device of `entry_layer` rather than through a
# hard-coded .cuda(), so the call works wherever the model was run.
attn_output, attn_output_weights = camembert_tok_classifier.model.roberta.encoder.layer[layer_id].attention.self(
    entry_layer,
    attention_mask=key_padding_mask.to(entry_layer.device),
    output_attentions=True,
)

In [None]:
# Keep the weights of one attention head (axis 1 indexes the heads).
# Averaging over the heads with attn_output_weights.mean(1) is an alternative.
head_id = 4
attn_output_head_weights = attn_output_weights[:, head_id]
attn_output_head_weights.shape

In [None]:
# Restrict the plot to the first tokens of the sentence. A single constant
# drives both the weight matrix and its tick labels so they cannot drift apart.
nb_tokens_shown = 15
attn_output_weights_visual = (
    attn_output_head_weights[sentence_id, :nb_tokens_shown, :nb_tokens_shown]
    .detach()
    .cpu()
    .numpy()
)
# Plain Python ints are what convert_ids_to_tokens is documented to accept.
token_names = tokenizer.convert_ids_to_tokens(
    tokenized_batch["input_ids"][sentence_id][:nb_tokens_shown].tolist()
)

In [None]:
# Heatmap of the selected head's attention weights, annotated with the values.
# Rows are the attending (query) tokens and columns the attended (key) tokens:
# each row is a softmax over the keys.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    attn_output_weights_visual,
    annot=True,
    cbar=False,
    xticklabels=token_names,
    yticklabels=token_names,
    cmap="viridis",
    ax=ax,
)
# The trailing semicolon keeps the notebook from printing the return value.
ax.set(
    title=f"Attention weights - layer {layer_id}, head {head_id}",
    xlabel="Attended token (key)",
    ylabel="Attending token (query)",
);

## Completion task

In [None]:
# The notebook's import cell only brings in CamembertForMaskedLM_xval, so the
# plain class is imported here to keep this cell runnable on a fresh kernel.
# NOTE(review): move this import to the top import cell; if the _xval variant
# was intended for the completion task, switch the class instead — confirm.
from transformers import CamembertForMaskedLM

# Masked-language-model head loaded from model_directory.
camembert = CamembertForMaskedLM.from_pretrained(model_directory)

In [None]:
# Tokenize one sentence that contains a single <mask> placeholder for the
# model to fill in. The sequence is padded to the maximum length, truncated if
# needed, and returned as PyTorch tensors.
tokenizer_output = tokenizer(
    ["Patient en détresse respiratoire, gradient VG-VD ad <mask> mmgh."],
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

In [None]:
# Forward pass of the masked-language model without tracking gradients.
with torch.no_grad():
    model_output = camembert(**tokenizer_output, output_hidden_states=True)
# A bare `model_output` inside the `with` body displayed nothing. Show the
# shape of the logits instead of dumping every tensor of the output.
model_output.logits.shape

In [None]:
def get_probas_from_logits(logits):
    """Convert raw scores to probabilities with a softmax over the last axis."""
    return torch.softmax(logits, dim=-1)


def visualize_mlm_predictions(tokenizer_output, model_output, tokenizer, nb_candidates=10):
    """Plot the most probable tokens for every masked position of a batch.

    One horizontal bar chart is shown per masked position, titled with the
    decoded sentence that the position belongs to.

    Args:
        tokenizer_output: tokenizer result exposing ``input_ids``
            ([batch_size, seq_length]).
        model_output: model result exposing ``logits``
            ([batch_size, seq_length, vocab_size]).
        tokenizer: tokenizer providing ``batch_decode``, ``mask_token_id`` and
            ``convert_ids_to_tokens``.
        nb_candidates: number of top tokens displayed for each masked position.

    NOTE(review): relies on a module-level ``px`` (presumably
    ``plotly.express``), which the notebook's visible import cell does not
    define — confirm it is imported before this function is called.
    """
    # Decode the tokenized sentences and clean up the special tokens.
    decoded_tokenized_sents = [
        sent.replace('<pad>', '').replace('<mask>', ' <mask>')
        for sent in tokenizer.batch_decode(tokenizer_output.input_ids)
    ]

    # Retrieve the probabilities at the masked positions.
    masked_tokens_mask = tokenizer_output.input_ids == tokenizer.mask_token_id
    batch_mask_probas = get_probas_from_logits(model_output.logits[masked_tokens_mask])
    # Sentence index of every masked position, in the same row-major order as
    # the boolean indexing above. Pairing sentences with masked positions one
    # to one would give wrong titles as soon as a sentence has zero or several
    # masks.
    mask_sentence_ids = masked_tokens_mask.nonzero()[:, 0].tolist()

    for mask_sentence_id, mask_probas in zip(mask_sentence_ids, batch_mask_probas):
        sentence = decoded_tokenized_sents[mask_sentence_id]
        # Get the top probabilities and plot them, most probable token on top.
        top_probas, top_token_ids = mask_probas.topk(nb_candidates, -1)
        top_tokens = tokenizer.convert_ids_to_tokens(top_token_ids.tolist())
        # Plain Python floats, not 0-d tensors, are handed to the plotting library.
        bar_chart = px.bar({"tokens": top_tokens[::-1], "probas": top_probas.tolist()[::-1]},
                        x="probas", y="tokens", orientation='h', title=sentence, width=800)
        bar_chart.show(config={'staticPlot': True})

In [None]:
# Plot the top candidate tokens (nb_candidates defaults to 10) for the masked
# position of the example sentence.
visualize_mlm_predictions(tokenizer_output, model_output, tokenizer)