## BSV Multilabel classification with camembert
### The code and data are adapted from:  https://medium.com/@vitalshchutski/french-nlp-entamez-le-camembert-avec-les-librairies-fast-bert-et-transformers-14e65f84c148

In [1]:
# !conda install torch
# !pip install fast-bert==1.9.1
# !mkdir model
# !mkdir finetuned_model

In [9]:
import torch
from fast_bert.data_cls import BertDataBunch 
from fast_bert.learner_cls import BertLearner
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from fast_bert.metrics import fbeta, roc_auc
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path
import pandas as pd
import logging
import datetime
import sys


# Device handed to both learners below.
# NOTE(review): this names the CUDA device unconditionally, and both learners are also created with
# is_fp16=True, so a CUDA-capable GPU appears to be required — confirm before running on a CPU-only machine.
device_cuda = torch.device("cuda")

In [10]:
# Working directories, all relative to the directory the notebook is run from.
DATA_PATH = Path('./data/')
LOG_PATH = Path('./logs/')
MODEL_PATH = Path('./model/')
LABEL_PATH = Path('./labels/')

# Create the directories up front: later cells open a log file under LOG_PATH and write
# labels.txt under LABEL_PATH, and both fail on a fresh checkout if the directory is missing.
# exist_ok=True makes re-running this cell harmless.
for _directory in (DATA_PATH, LOG_PATH, MODEL_PATH, LABEL_PATH):
    _directory.mkdir(parents=True, exist_ok=True)

In [12]:
import logging

# One timestamped log file per run, so successive runs never overwrite each other.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, "BSV Multilabel classification with camembert"))

# Log to both the file and the notebook output.
# The file handler is opened as UTF-8 explicitly: the notebook handles French text and the
# platform default encoding (e.g. cp1252 on Windows) cannot represent every character.
# Note: logging.basicConfig() does nothing if the root logger already has handlers, so
# re-running this cell in a live kernel keeps logging to the first run's file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile, encoding='utf-8'),
        logging.StreamHandler(sys.stdout)
    ])
logger = logging.getLogger()

In [13]:
# Labelled chunks used to fine-tune the classifier; resolved through DATA_PATH so that the
# data location is configured in a single place.
df = pd.read_csv(DATA_PATH / 'bsv_chunk256_ba_mld_1201-1500.csv')

In [14]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
val_set = df.sample(frac=0.2, replace=False, random_state=42)
train_set = df.drop(index = val_set.index)

# Dropping by index label only yields a clean partition when the index is unique; fail loudly otherwise.
assert len(val_set) + len(train_set) == len(df), "train/validation split does not partition df"

print('Nombre de commentaires dans le val_set:',len(val_set))
print('Nombre de commentaires dans le train_set:', len(train_set))

# Written under DATA_PATH with the exact file names later passed to BertDataBunch
# (train_file / val_file), so both sides stay in sync if DATA_PATH changes.
val_set.to_csv(DATA_PATH / 'val_set.csv')
train_set.to_csv(DATA_PATH / 'train_set.csv')

Nombre de commentaires dans le val_set: 860
Nombre de commentaires dans le train_set: 3441


In [15]:
# Label names, one per line, in the file that BertDataBunch is later pointed at
# (LABEL_PATH / 'labels.txt').
# NOTE(review): the labels are picked by position (2nd and 3rd columns of df). The classification
# data bunch names them explicitly as ['Bioagressor', 'Disease'] — confirm the two agree.
labels = df.columns[1:3].to_list()
with open(LABEL_PATH / 'labels.txt', 'w') as f:
    f.writelines(label + "\n" for label in labels)

In [16]:
# Raw (unlabelled) report chunks used to fine-tune the language model; resolved through
# DATA_PATH so that the data location is configured in a single place.
df_texts = pd.read_csv(DATA_PATH / 'bsv_chunk256_raw_1001-1200.csv')
all_texts = df_texts['report_text'].to_list()
print('Nombre de bloc de texte:', len(all_texts))

Nombre de bloc de texte: 2801


### Création de LMDataBunch

In [17]:
# Build the language-model data bunch from the raw report texts.
# The log output of this cell shows lm_train.txt / lm_val.txt being written under the data
# directory and tokenized features being loaded from data/lm_cache.
# NOTE(review): features are loaded from an existing cache file; presumably a stale cache is
# reused when the corpus changes — confirm, and clear data/lm_cache if so.
databunch_lm = BertLMDataBunch.from_raw_corpus(
                    data_dir=DATA_PATH,
                    text_list=all_texts,
                    tokenizer='camembert-base',
                    batch_size_per_gpu=4, # was 16; even 8 did not work here (presumably GPU memory — TODO confirm)
                    max_seq_length=256, # was 512
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=logger)

03/01/2021 23:00:59 - INFO - root -   Formatting corpus for data\lm_train.txt


03/01/2021 23:00:59 - INFO - root -   Formatting corpus for data\lm_val.txt


03/01/2021 23:00:59 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\Raja/.cache\torch\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
03/01/2021 23:00:59 - INFO - root -   Loading features from cached file data\lm_cache\cached_camembert-base_train_256
03/01/2021 23:00:59 - INFO - root -   Loading features from cached file data\lm_cache\cached_camembert-base_dev_256


### Création de LMLearner

In [18]:
# Wrap the pretrained 'camembert-base' checkpoint in a language-model learner.
# No extra metrics are requested; the model is later saved under MODEL_PATH (output_dir).
lm_learner = BertLMLearner.from_pretrained_model(
                            dataBunch=databunch_lm,
                            pretrained_path='camembert-base',
                            output_dir=MODEL_PATH,
                            metrics=[],
                            device=device_cuda,
                            logger=logger,
                            multi_gpu=False,
                            logging_steps=500,
                            is_fp16=True) # mixed precision; was True when running on GPU

03/01/2021 23:01:13 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json from cache at C:\Users\Raja/.cache\torch\transformers\5152a7b8b97da26abdad9b3babb600e77c52a002331ea52a9eaf96ea8b31ef8f.5bd7a9a60b9a2d311368226259eaf870cfb2248e0752f28b444ec112977cf8fc
03/01/2021 23:01:13 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

03/01

In [None]:
# Fine-tune the language model on the report corpus, validating during training.
lm_learner.fit(epochs=2, # was 30
            lr=1e-4,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

In [21]:
# Evaluate the fine-tuned language model; the displayed result holds its loss and perplexity.
lm_learner.validate()

03/01/2021 23:07:48 - INFO - root -   Running evaluation
03/01/2021 23:07:48 - INFO - root -   Num examples = 28
03/01/2021 23:07:48 - INFO - root -   Validation Batch size = 8


{'loss': 0.11013594828546047, 'perplexity': 1.1164298057556152}

In [22]:
# Persist the fine-tuned language model; the log below shows config.json and
# pytorch_model.bin being written to model/model_out, which the classifier loads later.
lm_learner.save_model()

03/01/2021 23:07:52 - INFO - transformers.configuration_utils -   Configuration saved in model\model_out\config.json
03/01/2021 23:07:53 - INFO - transformers.modeling_utils -   Model weights saved in model\model_out\pytorch_model.bin


In [23]:
# Drop the reference to the language-model learner before building the classifier
# (presumably to let its memory be reclaimed — TODO confirm).
del lm_learner

### Création de databunch pour la classification

In [24]:
# Data bunch for the multi-label classifier. It reads train_set.csv / val_set.csv from
# DATA_PATH and labels.txt from LABEL_PATH, i.e. the files written by the earlier cells.
# The text lives in 'report_text'; each label is its own column.
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='camembert-base',
                          train_file='train_set.csv',
                          val_file='val_set.csv',
                          label_file='labels.txt',
                          text_col='report_text',
                          label_col=['Bioagressor','Disease'],
                          batch_size_per_gpu=8,
                          max_seq_length=256,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='camembert-base')

03/01/2021 23:08:35 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json from cache at C:\Users\Raja/.cache\torch\transformers\5152a7b8b97da26abdad9b3babb600e77c52a002331ea52a9eaf96ea8b31ef8f.5bd7a9a60b9a2d311368226259eaf870cfb2248e0752f28b444ec112977cf8fc
03/01/2021 23:08:35 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

03/01

### Création de Learner pour la classification

In [71]:
from fast_bert.metrics import roc_auc, accuracy_thresh, fbeta # accuracy_multilabel,
from sklearn.metrics import hamming_loss, accuracy_score, roc_curve, auc, roc_auc_score, f1_score, multilabel_confusion_matrix
from torch import Tensor

#### Metrics functions:

In [112]:
#metrics for multi-label
threshold = 0.5
def Hamming_loss(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None):
    """Hamming loss (via sklearn's ``hamming_loss``) of thresholded predictions.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        sample_weight: Forwarded unchanged to ``hamming_loss``.

    Returns:
        Whatever ``hamming_loss`` returns for the binarized predictions.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    binarized = (scores > thresh).float()
    return hamming_loss(targets, binarized, sample_weight=sample_weight)

def Exact_Match_Ratio(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None):
    """Exact-match score: sklearn's ``accuracy_score`` on the thresholded predictions.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        normalize: Forwarded unchanged to ``accuracy_score``.
        sample_weight: Forwarded unchanged to ``accuracy_score``.

    Returns:
        Whatever ``accuracy_score`` returns for the binarized predictions.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    binarized = (scores > thresh).float()
    return accuracy_score(targets, binarized, normalize=normalize, sample_weight=sample_weight)

def roc_auc_score_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, average = 'macro', sample_weight = None):
    """ROC AUC (via sklearn's ``roc_auc_score``), macro-averaged by default.

    The scores are not thresholded: the (optionally sigmoid-ed) values are ranked as-is.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        average: Averaging mode forwarded to ``roc_auc_score``.
        sample_weight: Forwarded unchanged to ``roc_auc_score``.

    Returns:
        Whatever ``roc_auc_score`` returns for the given averaging mode.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    return roc_auc_score(targets, scores, average=average, sample_weight=sample_weight)

def roc_auc_score_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged ROC AUC; a thin wrapper around ``roc_auc_score_macro`` with ``average='micro'``.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
    """
    targets, scores = y_true.detach().cpu(), y_pred.detach().cpu()
    return roc_auc_score_macro(scores, targets, sigmoid=sigmoid, average='micro')

def roc_auc_score_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Per-label ROC AUC; a thin wrapper around ``roc_auc_score_macro`` with ``average=None``.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
    """
    targets, scores = y_true.detach().cpu(), y_pred.detach().cpu()
    return roc_auc_score_macro(scores, targets, sigmoid=sigmoid, average=None)

def ROC_AUC_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, labels:list = labels):
    """ROC AUC per label, computed from each label's own ROC curve.

    Args:
        y_pred: Model outputs, indexed as ``[:, label_position]``; passed through a sigmoid
            first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators, indexed the same way.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        labels: Label names; position ``i`` names column ``i``.

    Returns:
        A dict mapping each label name to the area under its ROC curve.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    auc_per_label = {}
    for column, label_name in enumerate(labels):
        false_pos_rate, true_pos_rate, _ = roc_curve(targets[:, column], scores[:, column])
        auc_per_label[label_name] = auc(false_pos_rate, true_pos_rate)
    return auc_per_label

def F1(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, threshold:float = threshold):
    """F1 as computed by fast-bert's ``fbeta`` with ``beta=1``.

    NOTE(review): in the training logs of this notebook this value (about 0.26-0.38) is far
    below the per-label F1 scores reported by ``F1_by_label``; presumably ``fbeta`` aggregates
    differently (e.g. per sample) — confirm against the fast-bert source before comparing them.

    Args:
        y_pred: Model outputs.
        y_true: Ground-truth label indicators.
        sigmoid: Forwarded to ``fbeta``.
        threshold: Forwarded to ``fbeta`` as ``thresh``.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    return fbeta(scores, targets, beta=1, thresh=threshold, sigmoid=sigmoid)

def F1_macro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, average = 'macro', sample_weight = None):
    """F1 (via sklearn's ``f1_score``) of thresholded predictions, macro-averaged by default.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        average: Averaging mode forwarded to ``f1_score``.
        sample_weight: Forwarded unchanged to ``f1_score``.

    Returns:
        Whatever ``f1_score`` returns for the given averaging mode.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    binarized = (scores > thresh).float()
    return f1_score(targets, binarized, average=average, sample_weight=sample_weight)

def F1_micro(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True):
    """Micro-averaged F1; a thin wrapper around ``F1_macro`` with ``average='micro'``.

    The decision threshold is not exposed here, so ``F1_macro``'s default applies.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
    """
    targets, scores = y_true.detach().cpu(), y_pred.detach().cpu()
    return F1_macro(scores, targets, sigmoid=sigmoid, average='micro')

def F1_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, labels:list = labels):
    """Per-label F1 (sklearn's ``f1_score`` with ``average=None``) of thresholded predictions.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        sample_weight: Per-sample weights forwarded to ``f1_score``. Previously this argument
            was accepted but silently ignored; the default (None) gives the same result as before.
        labels: Unused; kept so the signature matches the other ``*_by_label`` metrics.

    Returns:
        One F1 score per label, in column order.
    """
    y_true = y_true.detach().cpu()
    y_pred = y_pred.detach().cpu()
    if sigmoid: y_pred = y_pred.sigmoid()
    y_pred = (y_pred > thresh).float()
    return f1_score(y_true, y_pred, average = None, sample_weight = sample_weight)

def accuracy_by_label(y_pred: Tensor, y_true: Tensor, sigmoid:bool = True, thresh:float = threshold, normalize:bool = True, sample_weight = None, labels:list = labels):
    """Accuracy of the thresholded predictions, computed separately for each label column.

    Args:
        y_pred: Model outputs, indexed as ``[:, label_position]``; passed through a sigmoid
            first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators, indexed the same way.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        normalize: Forwarded unchanged to ``accuracy_score``.
        sample_weight: Forwarded unchanged to ``accuracy_score``.
        labels: Label names; position ``i`` names column ``i``.

    Returns:
        A dict mapping each label name to the ``accuracy_score`` of its column.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    binarized = (scores > thresh).float()
    return {
        label_name: accuracy_score(
            targets[:, column],
            binarized[:, column],
            normalize=normalize,
            sample_weight=sample_weight,
        )
        for column, label_name in enumerate(labels)
    }

def confusion_matrix_by_label(y_pred:Tensor, y_true:Tensor, sigmoid:bool = True, thresh:float = threshold, sample_weight = None, samplewise = False, labels:list = labels):
    """Confusion matrices of the thresholded predictions via sklearn's ``multilabel_confusion_matrix``.

    Args:
        y_pred: Model outputs; passed through a sigmoid first when ``sigmoid`` is True.
        y_true: Ground-truth label indicators.
        sigmoid: Whether ``y_pred`` still needs a sigmoid applied.
        thresh: Scores strictly above this value count as a positive prediction.
        sample_weight: Forwarded unchanged to ``multilabel_confusion_matrix``.
        samplewise: Forwarded unchanged to ``multilabel_confusion_matrix``.
        labels: Label names; only their count is used, to request columns ``0..len(labels)-1``.

    Returns:
        Whatever ``multilabel_confusion_matrix`` returns for the requested label indices.
    """
    targets = y_true.detach().cpu()
    scores = y_pred.detach().cpu()
    if sigmoid:
        scores = scores.sigmoid()
    binarized = (scores > thresh).float()
    label_indices = list(range(len(labels)))
    return multilabel_confusion_matrix(
        targets,
        binarized,
        labels=label_indices,
        sample_weight=sample_weight,
        samplewise=samplewise,
    )

In [113]:
# Metrics reported at every evaluation. Each entry is registered under the metric
# function's own name, which is exactly the name that appears in the logs.
metrics = [
    {'name': metric_fn.__name__, 'function': metric_fn}
    for metric_fn in (Hamming_loss, accuracy_by_label, roc_auc_score_by_label, F1_by_label, F1)
]
# Directory the fine-tuned classifier is written to.
OUTPUT_DIR = Path('./finetuned_model')
# Weights of the language model fine-tuned and saved earlier in this notebook.
WGTS_PATH = Path('model/model_out/pytorch_model.bin')

In [114]:
# Build the multi-label classifier on top of the language model fine-tuned above
# (configuration from model/model_out, weights from WGTS_PATH).
# Known issue: newer fast-bert releases fail here with a pos_weight-related error;
# downgrading to fast-bert 1.9.1 solved the problem.
cl_learner = BertLearner.from_pretrained_model(
                        databunch,
                        pretrained_path='model/model_out',
                        metrics=metrics,
                        device=device_cuda, # CUDA device defined alongside the imports
                        logger=logger,
                        output_dir=OUTPUT_DIR,
                        finetuned_wgts_path=WGTS_PATH,
                        warmup_steps=300,
                        multi_gpu=False,
                        multi_label=True,
                        is_fp16=True,# keep True only when running on CUDA
                        logging_steps=8000)# was 50; TODO: fix TensorBoard logging

03/02/2021 01:49:20 - INFO - transformers.configuration_utils -   loading configuration file model/model_out\config.json
03/02/2021 01:49:20 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

03/02/2021 01:49:20 - INFO - transformers.modeling_utils -   loading weights file model/model_out\pytorch_model.bin
- This IS expected if you are initializing CamembertForMultiLabelSequenceClassification from the checkpoint of a model train

In [115]:
# Fine-tune the classifier; with validate=True the metrics above are logged after every epoch.
cl_learner.fit(epochs=5,# was 30
            lr=1e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
03/02/2021 01:49:27 - INFO - root -   ***** Running training *****
03/02/2021 01:49:27 - INFO - root -     Num examples = 3441
03/02/2021 01:49:27 - INFO - root -     Num Epochs = 5
03/02/2021 01:49:27 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
03/02/2021 01:49:27 - 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
03/02/2021 01:51:13 - INFO - root -   Running evaluation
03/02/2021 01:51:13 - INFO - root -     Num examples = 860
03/02/2021 01:51:13 - INFO - root -     Batch size = 16


03/02/2021 01:51:16 - INFO - root -   eval_loss after epoch 1: 0.44288925771360044: 
03/02/2021 01:51:16 - INFO - root -   eval_Hamming_loss after epoch 1: 0.17616279069767443: 
03/02/2021 01:51:16 - INFO - root -   eval_accuracy_by_label after epoch 1: {'Bioagressor': 0.8058139534883721, 'Disease': 0.8418604651162791}: 
03/02/2021 01:51:16 - INFO - root -   eval_roc_auc_score_by_label after epoch 1: [0.86541721 0.85737604]: 
03/02/2021 01:51:16 - INFO - root -   eval_F1_by_label after epoch 1: [0.70546737 0.47286822]: 
03/02/2021 01:51:16 - INFO - root -   eval_F1 after epoch 1: 0.26356589794158936: 
03/02/2021 01:51:16 - INFO - root -   lr after epoch 1: 9.877450257768792e-06
03/02/2021 01:51:16 - INFO - root -   train_loss after epoch 1: 0.5960515837796603
03/02/2021 01:51:16 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
03/02/2021 01:53:07 - INFO - root -   Running evaluation
03/02/2021 01:53:07 - INFO - root -     Num examples = 860
03/02/2021 01:53:07 - INFO - root -     Batch size = 16


03/02/2021 01:53:10 - INFO - root -   eval_loss after epoch 2: 0.3520360691127954: 
03/02/2021 01:53:10 - INFO - root -   eval_Hamming_loss after epoch 2: 0.13837209302325582: 
03/02/2021 01:53:10 - INFO - root -   eval_accuracy_by_label after epoch 2: {'Bioagressor': 0.8511627906976744, 'Disease': 0.872093023255814}: 
03/02/2021 01:53:10 - INFO - root -   eval_roc_auc_score_by_label after epoch 2: [0.89867773 0.90955061]: 
03/02/2021 01:53:10 - INFO - root -   eval_F1_by_label after epoch 2: [0.80487805 0.67647059]: 
03/02/2021 01:53:10 - INFO - root -   eval_F1 after epoch 2: 0.3709302246570587: 
03/02/2021 01:53:10 - INFO - root -   lr after epoch 2: 7.901120577837316e-06
03/02/2021 01:53:10 - INFO - root -   train_loss after epoch 2: 0.38296379138588077
03/02/2021 01:53:10 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
03/02/2021 01:55:00 - INFO - root -   Running evaluation
03/02/2021 01:55:00 - INFO - root -     Num examples = 860
03/02/2021 01:55:00 - INFO - root -     Batch size = 16


03/02/2021 01:55:02 - INFO - root -   eval_loss after epoch 3: 0.34444435382330857: 
03/02/2021 01:55:02 - INFO - root -   eval_Hamming_loss after epoch 3: 0.1313953488372093: 
03/02/2021 01:55:02 - INFO - root -   eval_accuracy_by_label after epoch 3: {'Bioagressor': 0.8511627906976744, 'Disease': 0.8860465116279069}: 
03/02/2021 01:55:02 - INFO - root -   eval_roc_auc_score_by_label after epoch 3: [0.90002912 0.91812437]: 
03/02/2021 01:55:02 - INFO - root -   eval_F1_by_label after epoch 3: [0.80368098 0.73655914]: 
03/02/2021 01:55:02 - INFO - root -   eval_F1 after epoch 3: 0.37984499335289: 
03/02/2021 01:55:02 - INFO - root -   lr after epoch 3: 4.446489064626526e-06
03/02/2021 01:55:02 - INFO - root -   train_loss after epoch 3: 0.29475090804827464
03/02/2021 01:55:02 - INFO - root -   





03/02/2021 01:56:53 - INFO - root -   Running evaluation
03/02/2021 01:56:53 - INFO - root -     Num examples = 860
03/02/2021 01:56:53 - INFO - root -     Batch size = 16


03/02/2021 01:56:56 - INFO - root -   eval_loss after epoch 4: 0.3392933562949852: 
03/02/2021 01:56:56 - INFO - root -   eval_Hamming_loss after epoch 4: 0.13430232558139535: 
03/02/2021 01:56:56 - INFO - root -   eval_accuracy_by_label after epoch 4: {'Bioagressor': 0.8465116279069768, 'Disease': 0.8848837209302326}: 
03/02/2021 01:56:56 - INFO - root -   eval_roc_auc_score_by_label after epoch 4: [0.89966798 0.92392155]: 
03/02/2021 01:56:56 - INFO - root -   eval_F1_by_label after epoch 4: [0.79310345 0.73458445]: 
03/02/2021 01:56:56 - INFO - root -   eval_F1 after epoch 4: 0.3705427050590515: 
03/02/2021 01:56:56 - INFO - root -   lr after epoch 4: 1.2739058870670368e-06
03/02/2021 01:56:56 - INFO - root -   train_loss after epoch 4: 0.2622312100954233
03/02/2021 01:56:56 - INFO - root -   





03/02/2021 01:58:46 - INFO - root -   Running evaluation
03/02/2021 01:58:46 - INFO - root -     Num examples = 860
03/02/2021 01:58:46 - INFO - root -     Batch size = 16


03/02/2021 01:58:48 - INFO - root -   eval_loss after epoch 5: 0.34560933916105163: 
03/02/2021 01:58:48 - INFO - root -   eval_Hamming_loss after epoch 5: 0.13546511627906976: 
03/02/2021 01:58:48 - INFO - root -   eval_accuracy_by_label after epoch 5: {'Bioagressor': 0.8453488372093023, 'Disease': 0.8837209302325582}: 
03/02/2021 01:58:48 - INFO - root -   eval_roc_auc_score_by_label after epoch 5: [0.90017475 0.92288664]: 
03/02/2021 01:58:48 - INFO - root -   eval_F1_by_label after epoch 5: [0.79379845 0.7382199 ]: 
03/02/2021 01:58:48 - INFO - root -   eval_F1 after epoch 5: 0.37558144330978394: 
03/02/2021 01:58:48 - INFO - root -   lr after epoch 5: 0.0
03/02/2021 01:58:48 - INFO - root -   train_loss after epoch 5: 0.23524393644313524
03/02/2021 01:58:48 - INFO - root -   





(2155, 0.35424828595047486)

In [116]:
# Evaluate the final classifier on the validation set; the displayed dict holds the loss
# plus one entry per configured metric.
cl_learner.validate()

03/02/2021 01:58:49 - INFO - root -   Running evaluation
03/02/2021 01:58:49 - INFO - root -     Num examples = 860
03/02/2021 01:58:49 - INFO - root -     Batch size = 16


{'loss': 0.34560933916105163,
 'Hamming_loss': 0.13546511627906976,
 'accuracy_by_label': {'Bioagressor': 0.8453488372093023,
  'Disease': 0.8837209302325582},
 'roc_auc_score_by_label': array([0.90017475, 0.92288664]),
 'F1_by_label': array([0.79379845, 0.7382199 ]),
 'F1': 0.37558144330978394}

In [117]:
# Persist the fine-tuned classifier; the log below shows it being written to
# finetuned_model/model_out, where the predictor loads it from.
cl_learner.save_model()

03/02/2021 01:58:51 - INFO - transformers.configuration_utils -   Configuration saved in finetuned_model\model_out\config.json
03/02/2021 01:58:52 - INFO - transformers.modeling_utils -   Model weights saved in finetuned_model\model_out\pytorch_model.bin


### Prédictions

In [118]:
# Load the saved classifier for inference, using the label names stored under labels/.
predictor = BertClassificationPredictor(
                model_path='finetuned_model/model_out',
                label_path='labels/',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

03/02/2021 01:58:52 - INFO - transformers.configuration_utils -   loading configuration file finetuned_model/model_out\config.json
03/02/2021 01:58:52 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

03/02/2021 01:58:52 - INFO - transformers.tokenization_utils_base -   Model name 'finetuned_model/model_out' not found in model shortcut name list (camembert-base). Assuming 'finetuned_model/model_out' is a

In [119]:
# Case: disease = 0, bioagressor = 1 — leafhoppers ("cicadelle")
predictor.predict("Cicadelles La cicadelle Edwardsiana est toujours observée sur les parcelles en été")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1




[('bioagressor', 0.8876953125), ('disease', 0.0850830078125)]

In [120]:
# Out-of-domain probe: a short phrase about an election, unrelated to crop health.
predictor.predict("election américane")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.05450439453125), ('disease', 0.047882080078125)]

In [121]:
# Case: disease = 0, bioagressor = 0 — text about carrot fly, but no risk
predictor.predict("mouche de la carotte :ajouter trichloronate")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.73046875), ('disease', 0.056549072265625)]

In [122]:
# Case: disease = 0, bioagressor = 0 — powdery mildew ("oïdium"): no intervention
predictor.predict("oïdium : pas d'intervention dans l'immediat les conditions très chaudes et l'absence de rosées nocturnes sont défavorables à cette maladie. aucun symptôme n'a encore été observé.")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('disease', 0.7099609375), ('bioagressor', 0.060211181640625)]

In [123]:
# Case: disease = 1, bioagressor = 0 — powdery mildew ("oïdium") => good
predictor.predict("oïdium du pommierce champignon est à l'heure actuelle en pleine fructification. il est nécessaire d'ajouter un antioïdium aux bouillies pour assurer'la protection du jeune feuillage.")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('disease', 0.77734375), ('bioagressor', 0.07452392578125)]

In [124]:
# Case: disease = 1, bioagressor = 0 — downy mildew ("mildiou")
predictor.predict("vigilant : en particulier vis-à-vis du mildiou, de l’oïdium et de la bactériose / cladosporiose. Les prévisions pour les prochains jours restent peu favorables à l’expression de la bactériose et la cladosporiose, si elles se confirment. Mildiou (Pseudoperonospora cubensis) : Le modèle annonce un risque élevé pour toutes les dates de plantation avec les données de la station de Thurageau. Avec les données de la station de Maulay, les plantations en S25 et S26 montrent un risque modéré. Le risque est un peu plus élevé dans le sud de la Charente- Maritime que dans le Poitou. Niveau de risque Faible Moyen Élevé Très élevé Indice : Log (Nb de taches/unité de surface) -14 à -9 -9 à -4 -4 +4 Équivalent en unité de surface 1 tâche par hectare par 100 m2 1 tâche par 100 m2 par m2 1 tâche par m2à 1 % de surface atteinte 1 % à 100 % de surface atteinte Évaluation du risque : les conditions restent favorables à ce microorganisme (qui n’est pas un champignon, mais proche d’une algue). Les BSV sont disponibles en accès direct sur le site  (rubrique : Nos publications - Bulletin de santé du végétal) ou par abonnement en ligne gratuit sur le site  BSV CULTURES LÉGUMIÈRES DE PLEIN")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('disease', 0.92529296875), ('bioagressor', 0.219970703125)]

In [125]:
# Case: disease = 0, bioagressor = 0 — text about Trump
predictor.predict("L'avance de Donald Trump dans cet Etat où 4,9 millions d'électeurs ont voté, a fondu vendredi 6 novembre. Le candidat républicain compte")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.05450439453125), ('disease', 0.047515869140625)]

In [126]:
# Case: disease = 1, bioagressor = 1 — "#mildiou" (removed) but no risk
predictor.predict("Pomme de te r re du 17 mai 2013 BS omme Ce - N° 9 N° 9 En résumé : Risque  au 17 mai : entrée en phase de risque pour les variétés sensibles pour 5 secteurs. Pour autant, le seuil de nuisibilité n’est pas atteint. Utilisation du modèle Mileos® (www.mileos.fr) Le BSV pomme de terre de la région Centre mobilise le modèle Mileos® qui se base sur le cycle épidémique de Phytophtora infestans. • Quand démarre le risque  ? Le suivi du nombre des générations de  est un bon indicateur pour connaître le début de la période à risque de cette maladie. En fonction de la sensibilité variétale, le risque démarre : - à la sortie de")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('disease', 0.0567626953125), ('bioagressor', 0.046478271484375)]

In [127]:
# Case: disease = 0, bioagressor = 0 — no keyword at all
predictor.predict("par jour. Sur les autres secteurs, en production de carotte, les captures sont nulles. Évaluation du risque")

03/02/2021 01:58:55 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.06280517578125), ('disease', 0.040985107421875)]

In [128]:
# Case: disease = 0, bioagressor = 0, plus the word "mildiou" in the text
predictor.predict("7 Action pilotée par le ministère chargé de l’agriculture mildiou, avec l’appui financier de l’Office National de l’Eau et des Milieux Aquatiques (ONEMA), par les crédits issus de la redevance pour pollutions diffuses attribués au")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('disease', 0.371826171875), ('bioagressor', 0.036285400390625)]

In [129]:
# Case: disease = 0, bioagressor = 0
predictor.predict("financement du plan Ecophyto. Ce bulletin est rédigé par l'ACPEL avec la collaboration de référents par culture (techniciens des Chambres d'Agriculture de la Charente, de la Charente-Maritime et d'Indre et Loire et de la Vienne) sur la base d'observations réalisées par des producteurs et techniciens : Charentes- Alliance, les entreprises de production de melon, la coopérative AGROLEG, la coopérative UNIRE, des producteurs d'Agrobio Poitou-Charentes. Ce bulletin est réalisé à partir d'observations ponctuelles. Il a pour vocation de donner une tendance de la situation sanitaire régionale. Celle-ci ne peut être transposée telle quelle dans les parcelles de production légumières (conditions très variables). La Chambre Régionale d'Agriculture de Poitou-Charentes et le rédacteur dégagent toute responsabilité quant aux décisions prises par les producteurs pour la protection de leurs cultures. Elle les invite à prendre ces décisions sur la base des observation s qu'ils auront réalisées dans leurs parcelles. Les")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.05560302734375), ('disease', 0.044586181640625)]

In [130]:
# Case: disease = 0, bioagressor = 0 — French crop usage text
predictor.predict("Ensemble de plantes cultivées pour leurs fruits ou leurs graines riches en matières grasses (lipides). De ces fruits et graines sont extrait une huile à usage alimentaire humaine, alimentaire animal ou industriel. Les résidus de l'extraction constituent les tourteaux utilisés pour l'alimentation animale.")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.055938720703125), ('disease', 0.044769287109375)]

In [131]:
# Case: disease = 0, bioagressor = 0 — French crop usage text
predictor.predict("Orge semé après le 1er février, principalement de mars à mai. Orge de printemps est toujours à deux rangs")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.05389404296875), ('disease', 0.046722412109375)]

In [132]:
# Case: disease = 1, bioagressor = 0 — tweet about yellows disease ("jaunisse")
predictor.predict("des parcelles qui ont eu une croissance presque nulle depuis cet été en majeur partie dû à la jaunisse.")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('disease', 0.64501953125), ('bioagressor', 0.055938720703125)]

In [133]:
# Case: disease = 1, bioagressor = 0 — tweet about drought ("sécheresse")
predictor.predict("Parfois,on nous demande. Que faites-vous,par cette sécheresse?")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.0552978515625), ('disease', 0.046905517578125)]

In [134]:
# Case: disease = 0, bioagressor = 1 — tweet about aphids ("pucerons")

predictor.predict("Attention pucerons dans les blés")

03/02/2021 01:58:56 - INFO - root -   Writing example 0 of 1


[('bioagressor', 0.90673828125), ('disease', 0.10894775390625)]

### Analyse the fine-tuned model

In [135]:
# Load the saved classifier for the analysis below.
# NOTE(review): this repeats the predictor construction from the "Prédictions" section with
# identical arguments; it is only needed when this section is run on its own.
predictor = BertClassificationPredictor(
                model_path='finetuned_model/model_out',
                label_path='labels/',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

03/02/2021 01:58:56 - INFO - transformers.configuration_utils -   loading configuration file finetuned_model/model_out\config.json
03/02/2021 01:58:56 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

03/02/2021 01:58:56 - INFO - transformers.tokenization_utils_base -   Model name 'finetuned_model/model_out' not found in model shortcut name list (camembert-base). Assuming 'finetuned_model/model_out' is a

In [136]:
#evaluation on fine-tuning set 
# Labelled chunks used to evaluate the fine-tuned classifier, resolved through DATA_PATH.
# NOTE(review): this is a different file from the one the classifier was trained on
# ('bsv_chunk256_ba_mld_1201-1500.csv'), although the original comment called it the
# "fine-tuning set" — confirm it is the intended evaluation set.
df_val = pd.read_csv(DATA_PATH / 'bsv_chunk256_ba_mld_0-1000.csv')

In [137]:
# Store the Disease label as integers so it lines up with the 0/1 Bioagressor
# column when the two are used together as ground truth.
df_val['Disease'] = df_val['Disease'].astype(int)
df_val.tail()

Unnamed: 0,report_text,Bioagressor,Disease,hazard_list,source_name
13109,"climatiques soient favorables à la libération de spores. A noter que la sensibilité sur feuillage n'est pas corrélée avec la sensibilité sur tubercules. Ainsi, une variété peut être résistante au mildiou sur feuillage et sensible au mildiou sur tubercules et inversement. Planter suffisamment profond, avec un buttage bien appuyé, pour éviter la",0,1,#mildiou,draaf.centre-val-de-loire.agriculture.gouv.fr_IMG_pdf_BSV_Pomme_de_terre_09_cle4c4d2f
13110,"formation de crevasses, permet de limiter la contamination des tubercules par les spores de mildiou. Analyse du risque et prévisions Prévisions météo : soleil et nuages accompagnés de températures plutôt fraiches cette semaine. Quelques pluies sont annoncées lundi prochain avec une hausse des températures. Analyse du risque : Les 4èmes et 5èmes générations ayant déjà terminé leur incubation sur l’ensemble des secteurs de l’Eure-et-Loir et du Loir-et-Cher, le risque est à prendre en compte sur tous les types variétaux dans ces départements. Dans le Loiret et l’Essone, le risque est à prendre en compte uniquement sur les variétés sensibles et intermédiaires. Cependant, sur la base des données prévisionnelles, le seuil de nuisibilité ne devrait pas être atteint du 27/05 au 28/05 sur tous les types de variétés sur l’ensemble de la Région (conditions fraiches et persistance d’un temps sec avec de faibles hygrométries). Il faut malgré tout rester vigilant, notamment en cas de retour de l’humidité en fin de semaine. Rappel, pour que le seuil de nuisibilité du mildiou soit atteint, il faut qu’il existe un potentiel de sporulation (« quantité de maladie qui pourrait apparaître ») et que les conditions climatiques soient favorables à son expression. Bulletin co-rédigé par la Chambre d'Agriculture du Loir-et-Cher et ARVALIS-Institut du végétal. Avec les",0,1,#mildiou,draaf.centre-val-de-loire.agriculture.gouv.fr_IMG_pdf_BSV_Pomme_de_terre_09_cle4c4d2f
13111,"observations de AGRI BEAUCE, BEAUCE PRODUCTIONS, Chambre d ’Agriculture 41, Chambre d ’Agricult ure 28, COMITE CENTRE ET SUD, PARMENTINE, POM ALLIANCE SA,, Les 3 Laboureurs - Agropithiviers, SOUFFLET AGRICULTURE, TERRE DE France,.",0,0,,draaf.centre-val-de-loire.agriculture.gouv.fr_IMG_pdf_BSV_Pomme_de_terre_09_cle4c4d2f
13112,"Directeur de publication : Jean-Pierre LEVEILLARD, Président de la Chambre régionale d’agriculture du Centre 13 avenue",0,0,,draaf.centre-val-de-loire.agriculture.gouv.fr_IMG_pdf_BSV_Pomme_de_terre_09_cle4c4d2f
13113,"des Droits de l’Homme – 45921 ORLEANS Ce bulletin est produit à partir d’observations ponctuelles. Il donne une tendance de la situation sanitaire régionale, qui ne peut pas être transposée telle quelle à la parcelle. La Chambre régionale d’agriculture du Centre dégage donc toute responsabilité quant aux décisions prises par les agriculteurs pour la protection de leurs cultures. Action pilotée par le Ministère chargé de l'agriculture avec l'appui financier de l'ONEMA, par les crédits issus de la redevance pour pollution diffuses attribués au financement du plan Ecophyto 2018",0,0,,draaf.centre-val-de-loire.agriculture.gouv.fr_IMG_pdf_BSV_Pomme_de_terre_09_cle4c4d2f


In [138]:
# Score every report chunk in one call; the result has one entry per input text.
batch_predictions = predictor.predict_batch(
    df_val['report_text'].to_list()
)

03/02/2021 01:58:59 - INFO - root -   Writing example 0 of 13114
03/02/2021 01:59:03 - INFO - root -   Writing example 10000 of 13114




In [139]:
# Peek at the first ten results: each entry is a list of (label, score) pairs,
# and the label order varies from row to row (see the output below).
batch_predictions[:10]

[[('disease', 0.95703125), ('bioagressor', 0.55126953125)],
 [('bioagressor', 0.9326171875), ('disease', 0.295654296875)],
 [('bioagressor', 0.921875), ('disease', 0.9072265625)],
 [('bioagressor', 0.055206298828125), ('disease', 0.044525146484375)],
 [('bioagressor', 0.05633544921875), ('disease', 0.04638671875)],
 [('bioagressor', 0.050994873046875), ('disease', 0.04949951171875)],
 [('disease', 0.837890625), ('bioagressor', 0.0975341796875)],
 [('disease', 0.89599609375), ('bioagressor', 0.1461181640625)],
 [('disease', 0.9248046875), ('bioagressor', 0.257568359375)],
 [('bioagressor', 0.8740234375), ('disease', 0.07305908203125)]]

In [140]:
# Each prediction is a list of (label, score) pairs whose order is not fixed;
# turning it into a dict lets the scores be looked up by label name.
list_y_pred = list(map(dict, batch_predictions))

In [141]:
# One row per text, one score column per label; the columns are then renamed
# to the capitalised names used by the ground-truth frame.
df_y_pred = (
    pd.DataFrame(list_y_pred, columns=['bioagressor', 'disease'])
    .rename(columns={"bioagressor": "Bioagressor", "disease": "Disease"})
)
df_y_pred.tail()

Unnamed: 0,Bioagressor,Disease
13109,0.093689,0.82959
13110,0.110107,0.860352
13111,0.054504,0.045593
13112,0.054291,0.046112
13113,0.053986,0.046112


In [142]:
# Summary statistics of the predicted scores, per label.
df_y_pred.describe()

Unnamed: 0,Bioagressor,Disease
count,13114.0,13114.0
mean,0.405534,0.267286
std,0.384451,0.342006
min,0.032837,0.021988
25%,0.054596,0.045776
50%,0.149841,0.063843
75%,0.866211,0.48761
max,0.958008,0.958008


In [143]:
# Ground-truth labels: only the two label columns of the evaluation frame,
# in the same column order as df_y_pred.
df_y_real = df_val.reindex(columns=['Bioagressor', 'Disease'])
df_y_real.tail()

Unnamed: 0,Bioagressor,Disease
13109,0,1
13110,0,1
13111,0,0
13112,0,0
13113,0,0


In [144]:
# Summary statistics of the 0/1 ground-truth labels; the mean of each column is
# the fraction of rows carrying that label.
df_y_real.describe()

Unnamed: 0,Bioagressor,Disease
count,13114.0,13114.0
mean,0.376849,0.227696
std,0.484615,0.419361
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,0.0
max,1.0,1.0


In [145]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

In [146]:
# Precision per label (average=None returns one score per class); scores above
# 0.5 count as positive predictions.
precision_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average=None,
)

array([0.80314077, 0.7771973 ])

In [147]:
# Precision averaged over the labels with average='weighted'; scores above 0.5
# count as positive predictions.
precision_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average='weighted',
)

0.7933694285773337

In [148]:
# Recall per label (average=None returns one score per class); scores above
# 0.5 count as positive predictions.
recall_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average=None,
)

array([0.86928369, 0.84695244])

In [149]:
# Recall averaged over the labels with average='weighted'; scores above 0.5
# count as positive predictions.
recall_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average='weighted',
)

0.8608728557013118

In [150]:
# F1 per label (average=None returns one score per class); scores above 0.5
# count as positive predictions.
f1_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average=None,
)

array([0.83490429, 0.81057692])

In [151]:
# F1 averaged over the labels with average='weighted'; scores above 0.5 count
# as positive predictions.
f1_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
    average='weighted',
)

0.8257416334825408

In [152]:
# Accuracy over both labels at once. For 2-D multilabel input scikit-learn
# reports subset accuracy: a row is correct only when every label matches,
# which is why this is lower than either per-label accuracy below.
accuracy_score(
    df_y_real.values,
    df_y_pred.values > 0.5,
)

0.7952569772761934

In [153]:
# Accuracy for the Bioagressor label alone (score above 0.5 = positive).
accuracy_score(
    df_y_real['Bioagressor'].values,
    df_y_pred['Bioagressor'].values > 0.5,
)


0.8704438005185298

In [154]:
# Accuracy for the Disease label alone (score above 0.5 = positive).
accuracy_score(
    df_y_real['Disease'].values,
    df_y_pred['Disease'].values > 0.5,
)


0.9098673173707488

In [155]:
from sklearn.metrics import average_precision_score

# Average precision per label, computed from the raw scores (no 0.5 cut-off);
# average=None returns one value per class.
average_precision_score(
    y_true=df_y_real.values,
    y_score=df_y_pred.values,
    average=None,
)

array([0.82784661, 0.80852299])

In [156]:
from sklearn.metrics import average_precision_score

# Average precision from the raw scores (no 0.5 cut-off), combined across the
# labels with average='weighted'.
average_precision_score(
    y_true=df_y_real.values,
    y_score=df_y_pred.values,
    average='weighted',
)

0.820568569928822

In [157]:
# Attach the thresholded (boolean) predictions to the evaluation rows, side by side.
# The original passed ignore_index=True, which with axis=1 replaces every column
# label by 0..n-1, so the exported file lost its header names. Labels are kept
# here, and the predicted columns are renamed so they do not collide with the
# ground-truth "Bioagressor"/"Disease" columns already present in df_val.
df_results = pd.concat(
    [
        df_val,
        (df_y_pred > 0.5).rename(
            columns={"Bioagressor": "Bioagressor_pred", "Disease": "Disease_pred"}
        ),
    ],
    axis=1,
)

In [158]:
# Write the evaluation rows together with the thresholded predictions to a CSV
# file in the current working directory.
df_results.to_csv('predictions_camembert-base.csv')