### The code and data are adapted from:  https://medium.com/@vitalshchutski/french-nlp-entamez-le-camembert-avec-les-librairies-fast-bert-et-transformers-14e65f84c148

In [None]:
# !conda install torch
# !pip install fast-bert==1.9.1
# !mkdir model
# !mkdir finetuned_model

In [2]:
import torch
from fast_bert.data_cls import BertDataBunch 
from fast_bert.learner_cls import BertLearner
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from fast_bert.metrics import fbeta, roc_auc
from fast_bert.prediction import BertClassificationPredictor
import sys

from pathlib import Path
import pandas as pd
import logging
#create logger
logfile = str('logfile.txt')

logging.basicConfig(
    level=logging.INFO,  #CRITICAL ERROR WARNING  INFO    DEBUG    NOTSET
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile, 'w', 'utf-8'),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()


device_cuda = torch.device("cuda")

In [60]:
DATA_PATH = Path('./data/')
LOG_PATH = Path('./logs/')
MODEL_PATH = Path('./model/')
LABEL_PATH = Path('./labels/')

In [10]:
df = pd.read_csv('./data/pyrale+maïs_tweets_2019-2020_labelled.csv')

In [11]:
val_set = df.sample(frac=0.2, replace=False, random_state=42)
train_set = df.drop(index = val_set.index)
print('Nombre de commentaires dans le val_set:',len(val_set))
print('Nombre de commentaires dans le train_set:', len(train_set))
val_set.to_csv('./data/twt_val_set.csv')
train_set.to_csv('./data/twt_train_set.csv')

Nombre de commentaires dans le val_set: 53
Nombre de commentaires dans le train_set: 213


In [61]:
labels = ['Observation']
with open('./labels/twt_labels_1.txt', 'w') as f:
    for i in labels:
        f.write(i + "\n")

In [67]:
#df_texts = pd.read_csv('./data/raw_xml_bsv_0-200.csv')
df_texts = pd.read_csv('./data/bsv_chunk256_raw_1001-1200.csv')

In [68]:
import nltk
import re

# make all elements string
df_texts['report_text'] = df_texts['report_text'].astype(str)
# Remove null fields
df_texts['report_text'] = df_texts['report_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Make all text lowercase
df_texts['report_text'] = df_texts['report_text'].apply(lambda x: x.lower())
# Delete stop-words => to be tesred later
#stopwords = nltk.corpus.stopwords.words('french')


In [70]:
all_texts = df_texts['report_text'].to_list()
print('Nombre de bloc de texte:', len(all_texts))

Nombre de bloc de texte: 2801


### Création de LMDataBunch

In [71]:
databunch_lm = BertLMDataBunch.from_raw_corpus(
                    data_dir=DATA_PATH,
                    text_list=all_texts,
                    tokenizer='camembert-base',
                    batch_size_per_gpu=4, #was 16, even 8 won't do
                    max_seq_length=256, #was 512
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=logger)

08/30/2021 20:16:53 - INFO - root -   Formatting corpus for data\lm_train.txt


08/30/2021 20:16:53 - INFO - root -   Formatting corpus for data\lm_val.txt


08/30/2021 20:16:54 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model from cache at C:\Users\Shufan/.cache\torch\transformers\3715e3a4a2de48834619b2a6f48979e13ddff5cabfb1f3409db689f9ce3bb98f.28d30f926f545047fc59da64289371eef0fbdc0764ce9ec56f808a646fcfec59
08/30/2021 20:16:54 - INFO - root -   Loading features from cached file data\lm_cache\cached_camembert-base_train_256
08/30/2021 20:16:54 - INFO - root -   Loading features from cached file data\lm_cache\cached_camembert-base_dev_256


### Création de LMLearner

In [72]:
lm_learner = BertLMLearner.from_pretrained_model(
                            dataBunch=databunch_lm,
                            pretrained_path='camembert-base',
                            output_dir=MODEL_PATH,
                            metrics=[],
                            device=device_cuda,
                            logger=logger,
                            multi_gpu=False,
                            logging_steps=50,
                            is_fp16=False) #was true with gpu

08/30/2021 20:17:36 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json from cache at C:\Users\Shufan/.cache\torch\transformers\5152a7b8b97da26abdad9b3babb600e77c52a002331ea52a9eaf96ea8b31ef8f.5bd7a9a60b9a2d311368226259eaf870cfb2248e0752f28b444ec112977cf8fc
08/30/2021 20:17:36 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/

In [73]:
lm_learner.fit(epochs=2, #was 30
            lr=1e-4,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

08/30/2021 20:19:19 - INFO - root -   ***** Running training *****
08/30/2021 20:19:19 - INFO - root -     Num examples = 281
08/30/2021 20:19:19 - INFO - root -     Num Epochs = 2
08/30/2021 20:19:19 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 4
08/30/2021 20:19:19 - INFO - root -     Gradient Accumulation steps = 1
08/30/2021 20:19:19 - INFO - root -     Total optimization steps = 142


08/30/2021 20:19:29 - INFO - root -   Running evaluation
08/30/2021 20:19:29 - INFO - root -   Num examples = 28
08/30/2021 20:19:29 - INFO - root -   Validation Batch size = 8


08/30/2021 20:19:29 - INFO - root -   eval_loss after step 50: 0.14754392579197884: 
08/30/2021 20:19:29 - INFO - root -   eval_perplexity after step 50: 1.1589841842651367: 
08/30/2021 20:19:29 - INFO - root -   lr after step 50: 7.240333919937893e-05
08/30/2021 20:19:29 - INFO - root -   train_loss after step 50: 3.683648777008057




08/30/2021 20:19:34 - INFO - root -   Running evaluation
08/30/2021 20:19:34 - INFO - root -   Num examples = 28
08/30/2021 20:19:34 - INFO - root -   Validation Batch size = 8


08/30/2021 20:19:34 - INFO - root -   eval_loss after epoch 1: 0.16351912170648575: 
08/30/2021 20:19:34 - INFO - root -   eval_perplexity after epoch 1: 1.1776478290557861: 
08/30/2021 20:19:34 - INFO - root -   lr after epoch 1: 5e-05
08/30/2021 20:19:34 - INFO - root -   train_loss after epoch 1: 3.536128430299356
08/30/2021 20:19:34 - INFO - root -   

08/30/2021 20:19:40 - INFO - root -   Running evaluation
08/30/2021 20:19:40 - INFO - root -   Num examples = 28
08/30/2021 20:19:40 - INFO - root -   Validation Batch size = 8


08/30/2021 20:19:40 - INFO - root -   eval_loss after step 100: 0.15777598693966866: 
08/30/2021 20:19:40 - INFO - root -   eval_perplexity after step 100: 1.1709039211273193: 
08/30/2021 20:19:40 - INFO - root -   lr after step 100: 2.0076384291297134e-05
08/30/2021 20:19:40 - INFO - root -   train_loss after step 100: 3.1521368074417113
08/30/2021 20:19:48 - INFO - root -   Running evaluation
08/30/2021 20:19:48 - INFO - root -   Num examples = 28
08/30/2021 20:19:48 - INFO - root -   Validation Batch size = 8


08/30/2021 20:19:49 - INFO - root -   eval_loss after epoch 2: 0.14986704289913177: 
08/30/2021 20:19:49 - INFO - root -   eval_perplexity after epoch 2: 1.161679744720459: 
08/30/2021 20:19:49 - INFO - root -   lr after epoch 2: 0.0
08/30/2021 20:19:49 - INFO - root -   train_loss after epoch 2: 3.0767343245761496
08/30/2021 20:19:49 - INFO - root -   



(142, 3.306431377437753)

In [74]:
lm_learner.validate()

08/30/2021 20:21:56 - INFO - root -   Running evaluation
08/30/2021 20:21:56 - INFO - root -   Num examples = 28
08/30/2021 20:21:56 - INFO - root -   Validation Batch size = 8


{'loss': 0.1531503163278103, 'perplexity': 1.1655001640319824}

In [75]:
lm_learner.save_model()

08/30/2021 20:23:00 - INFO - transformers.configuration_utils -   Configuration saved in model\model_out\config.json
08/30/2021 20:23:00 - INFO - transformers.modeling_utils -   Model weights saved in model\model_out\pytorch_model.bin


In [76]:
del lm_learner

### Création de databunch pour la classification

In [77]:
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='camembert-base',
                          train_file='twt_train_set.csv',
                          val_file='twt_val_set.csv',
                          label_file='twt_labels_1.txt',
                          text_col='text',
                          label_col=['Observation'],
                          batch_size_per_gpu=8,
                          max_seq_length=256,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='camembert-base')

08/30/2021 20:23:12 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json from cache at C:\Users\Shufan/.cache\torch\transformers\5152a7b8b97da26abdad9b3babb600e77c52a002331ea52a9eaf96ea8b31ef8f.5bd7a9a60b9a2d311368226259eaf870cfb2248e0752f28b444ec112977cf8fc
08/30/2021 20:23:12 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/

### Création de Learner

In [78]:
from fast_bert.metrics import roc_auc, accuracy_thresh, F1, fbeta, confusion_matrix

metrics = [{'name': 'fbeta', 'function': fbeta},
           {'name': 'roc_auc', 'function': roc_auc}
          ]
OUTPUT_DIR = Path('./twt_cls_model')
WGTS_PATH = Path('model/model_out/pytorch_model.bin')

In [79]:
# issue fast-bert pos_weight <= downgrade to 1.9.1 solve the prob
cl_learner = BertLearner.from_pretrained_model(
                        databunch,
                        pretrained_path='model/model_out',
                        metrics=metrics,
                        device=device_cuda, #was device_cuda
                        logger=logger,
                        output_dir=OUTPUT_DIR,
                        finetuned_wgts_path=WGTS_PATH,
                        warmup_steps=300,
                        multi_gpu=False,
                        multi_label=True,
                        is_fp16=False,#True when is cuda
                        logging_steps=50)

08/30/2021 20:24:05 - INFO - transformers.configuration_utils -   loading configuration file model/model_out\config.json
08/30/2021 20:24:05 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/30/2021 20:24:05 - INFO - transformers.modeling_utils -   loading weights file model/model_out\pytorch_model.bin
- This IS expected if you are initializing Cam

In [80]:
cl_learner.fit(epochs=10,# was 30
            lr=2e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")

08/30/2021 20:24:19 - INFO - root -   ***** Running training *****
08/30/2021 20:24:19 - INFO - root -     Num examples = 213
08/30/2021 20:24:19 - INFO - root -     Num Epochs = 10
08/30/2021 20:24:19 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
08/30/2021 20:24:19 - INFO - root -     Gradient Accumulation steps = 1
08/30/2021 20:24:19 - INFO - root -     Total optimization steps = 270


08/30/2021 20:24:21 - INFO - root -   Running evaluation
08/30/2021 20:24:21 - INFO - root -     Num examples = 53
08/30/2021 20:24:21 - INFO - root -     Batch size = 16


08/30/2021 20:24:21 - INFO - root -   eval_loss after epoch 1: 0.6761570870876312: 
08/30/2021 20:24:21 - INFO - root -   eval_fbeta after epoch 1: 0.1320754736661911: 
08/30/2021 20:24:21 - INFO - root -   eval_roc_auc after epoch 1: 0.7049689440993789: 
08/30/2021 20:24:21 - INFO - root -   lr after epoch 1: 1.8000000000000001e-06
08/30/2021 20:24:21 - INFO - root -   train_loss after epoch 1: 0.6917647542776885
08/30/2021 20:24:21 - INFO - root -   





08/30/2021 20:24:23 - INFO - root -   Running evaluation
08/30/2021 20:24:23 - INFO - root -     Num examples = 53
08/30/2021 20:24:23 - INFO - root -     Batch size = 16


08/30/2021 20:24:23 - INFO - root -   eval_loss after step 50: 0.6357951015233994: 
08/30/2021 20:24:23 - INFO - root -   eval_fbeta after step 50: 0.1320754736661911: 
08/30/2021 20:24:23 - INFO - root -   eval_roc_auc after step 50: 0.670807453416149: 
08/30/2021 20:24:23 - INFO - root -   lr after step 50: 3.3333333333333333e-06
08/30/2021 20:24:23 - INFO - root -   train_loss after step 50: 0.6830101251602173




08/30/2021 20:24:24 - INFO - root -   Running evaluation
08/30/2021 20:24:24 - INFO - root -     Num examples = 53
08/30/2021 20:24:24 - INFO - root -     Batch size = 16


08/30/2021 20:24:24 - INFO - root -   eval_loss after epoch 2: 0.6256362497806549: 
08/30/2021 20:24:24 - INFO - root -   eval_fbeta after epoch 2: 0.1320754736661911: 
08/30/2021 20:24:24 - INFO - root -   eval_roc_auc after epoch 2: 0.65527950310559: 
08/30/2021 20:24:24 - INFO - root -   lr after epoch 2: 3.6000000000000003e-06
08/30/2021 20:24:24 - INFO - root -   train_loss after epoch 2: 0.6682611328584177
08/30/2021 20:24:24 - INFO - root -   





08/30/2021 20:24:26 - INFO - root -   Running evaluation
08/30/2021 20:24:26 - INFO - root -     Num examples = 53
08/30/2021 20:24:26 - INFO - root -     Batch size = 16


08/30/2021 20:24:26 - INFO - root -   eval_loss after epoch 3: 0.5546899139881134: 
08/30/2021 20:24:26 - INFO - root -   eval_fbeta after epoch 3: 0.1320754736661911: 
08/30/2021 20:24:26 - INFO - root -   eval_roc_auc after epoch 3: 0.577639751552795: 
08/30/2021 20:24:26 - INFO - root -   lr after epoch 3: 5.400000000000001e-06
08/30/2021 20:24:26 - INFO - root -   train_loss after epoch 3: 0.6338703676506325
08/30/2021 20:24:26 - INFO - root -   





08/30/2021 20:24:28 - INFO - root -   Running evaluation
08/30/2021 20:24:28 - INFO - root -     Num examples = 53
08/30/2021 20:24:28 - INFO - root -     Batch size = 16


08/30/2021 20:24:28 - INFO - root -   eval_loss after step 100: 0.5070816352963448: 
08/30/2021 20:24:28 - INFO - root -   eval_fbeta after step 100: 0.1320754736661911: 
08/30/2021 20:24:28 - INFO - root -   eval_roc_auc after step 100: 0.5807453416149069: 
08/30/2021 20:24:28 - INFO - root -   lr after step 100: 6.666666666666667e-06
08/30/2021 20:24:28 - INFO - root -   train_loss after step 100: 0.6222876858711243




08/30/2021 20:24:28 - INFO - root -   Running evaluation
08/30/2021 20:24:28 - INFO - root -     Num examples = 53
08/30/2021 20:24:28 - INFO - root -     Batch size = 16


08/30/2021 20:24:29 - INFO - root -   eval_loss after epoch 4: 0.4885435253381729: 
08/30/2021 20:24:29 - INFO - root -   eval_fbeta after epoch 4: 0.1320754736661911: 
08/30/2021 20:24:29 - INFO - root -   eval_roc_auc after epoch 4: 0.5714285714285714: 
08/30/2021 20:24:29 - INFO - root -   lr after epoch 4: 7.2000000000000005e-06
08/30/2021 20:24:29 - INFO - root -   train_loss after epoch 4: 0.5952279104126824
08/30/2021 20:24:29 - INFO - root -   





08/30/2021 20:24:31 - INFO - root -   Running evaluation
08/30/2021 20:24:31 - INFO - root -     Num examples = 53
08/30/2021 20:24:31 - INFO - root -     Batch size = 16


08/30/2021 20:24:31 - INFO - root -   eval_loss after epoch 5: 0.44193675369024277: 
08/30/2021 20:24:31 - INFO - root -   eval_fbeta after epoch 5: 0.056603774428367615: 
08/30/2021 20:24:31 - INFO - root -   eval_roc_auc after epoch 5: 0.670807453416149: 
08/30/2021 20:24:31 - INFO - root -   lr after epoch 5: 9e-06
08/30/2021 20:24:31 - INFO - root -   train_loss after epoch 5: 0.5628496927243692
08/30/2021 20:24:31 - INFO - root -   





08/30/2021 20:24:32 - INFO - root -   Running evaluation
08/30/2021 20:24:32 - INFO - root -     Num examples = 53
08/30/2021 20:24:32 - INFO - root -     Batch size = 16


08/30/2021 20:24:32 - INFO - root -   eval_loss after step 150: 0.4238702207803726: 
08/30/2021 20:24:32 - INFO - root -   eval_fbeta after step 150: 0.0: 
08/30/2021 20:24:32 - INFO - root -   eval_roc_auc after step 150: 0.6677018633540373: 
08/30/2021 20:24:32 - INFO - root -   lr after step 150: 1e-05
08/30/2021 20:24:32 - INFO - root -   train_loss after step 150: 0.564663223028183




08/30/2021 20:24:33 - INFO - root -   Running evaluation
08/30/2021 20:24:33 - INFO - root -     Num examples = 53
08/30/2021 20:24:33 - INFO - root -     Batch size = 16


08/30/2021 20:24:34 - INFO - root -   eval_loss after epoch 6: 0.40656592696905136: 
08/30/2021 20:24:34 - INFO - root -   eval_fbeta after epoch 6: 0.0: 
08/30/2021 20:24:34 - INFO - root -   eval_roc_auc after epoch 6: 0.7546583850931676: 
08/30/2021 20:24:34 - INFO - root -   lr after epoch 6: 1.0800000000000002e-05
08/30/2021 20:24:34 - INFO - root -   train_loss after epoch 6: 0.5399255233782309
08/30/2021 20:24:34 - INFO - root -   





08/30/2021 20:24:36 - INFO - root -   Running evaluation
08/30/2021 20:24:36 - INFO - root -     Num examples = 53
08/30/2021 20:24:36 - INFO - root -     Batch size = 16


08/30/2021 20:24:36 - INFO - root -   eval_loss after epoch 7: 0.3811151012778282: 
08/30/2021 20:24:36 - INFO - root -   eval_fbeta after epoch 7: 0.0: 
08/30/2021 20:24:36 - INFO - root -   eval_roc_auc after epoch 7: 0.7732919254658385: 
08/30/2021 20:24:36 - INFO - root -   lr after epoch 7: 1.2600000000000001e-05
08/30/2021 20:24:36 - INFO - root -   train_loss after epoch 7: 0.5316968703711474
08/30/2021 20:24:36 - INFO - root -   





08/30/2021 20:24:37 - INFO - root -   Running evaluation
08/30/2021 20:24:37 - INFO - root -     Num examples = 53
08/30/2021 20:24:37 - INFO - root -     Batch size = 16


08/30/2021 20:24:37 - INFO - root -   eval_loss after step 200: 0.3643853887915611: 
08/30/2021 20:24:37 - INFO - root -   eval_fbeta after step 200: 0.056603774428367615: 
08/30/2021 20:24:37 - INFO - root -   eval_roc_auc after step 200: 0.8260869565217391: 
08/30/2021 20:24:37 - INFO - root -   lr after step 200: 1.3333333333333333e-05
08/30/2021 20:24:37 - INFO - root -   train_loss after step 200: 0.5062865763902664




08/30/2021 20:24:38 - INFO - root -   Running evaluation
08/30/2021 20:24:38 - INFO - root -     Num examples = 53
08/30/2021 20:24:38 - INFO - root -     Batch size = 16


08/30/2021 20:24:38 - INFO - root -   eval_loss after epoch 8: 0.3714302107691765: 
08/30/2021 20:24:38 - INFO - root -   eval_fbeta after epoch 8: 0.09433962404727936: 
08/30/2021 20:24:38 - INFO - root -   eval_roc_auc after epoch 8: 0.826086956521739: 
08/30/2021 20:24:38 - INFO - root -   lr after epoch 8: 1.4400000000000001e-05
08/30/2021 20:24:38 - INFO - root -   train_loss after epoch 8: 0.4591172613479473
08/30/2021 20:24:38 - INFO - root -   





08/30/2021 20:24:41 - INFO - root -   Running evaluation
08/30/2021 20:24:41 - INFO - root -     Num examples = 53
08/30/2021 20:24:41 - INFO - root -     Batch size = 16


08/30/2021 20:24:41 - INFO - root -   eval_loss after epoch 9: 0.4249720349907875: 
08/30/2021 20:24:41 - INFO - root -   eval_fbeta after epoch 9: 0.09433962404727936: 
08/30/2021 20:24:41 - INFO - root -   eval_roc_auc after epoch 9: 0.8478260869565217: 
08/30/2021 20:24:41 - INFO - root -   lr after epoch 9: 1.62e-05
08/30/2021 20:24:41 - INFO - root -   train_loss after epoch 9: 0.3319021667595263
08/30/2021 20:24:41 - INFO - root -   





08/30/2021 20:24:41 - INFO - root -   Running evaluation
08/30/2021 20:24:41 - INFO - root -     Num examples = 53
08/30/2021 20:24:41 - INFO - root -     Batch size = 16


08/30/2021 20:24:42 - INFO - root -   eval_loss after step 250: 0.3984830304980278: 
08/30/2021 20:24:42 - INFO - root -   eval_fbeta after step 250: 0.09433962404727936: 
08/30/2021 20:24:42 - INFO - root -   eval_roc_auc after step 250: 0.860248447204969: 
08/30/2021 20:24:42 - INFO - root -   lr after step 250: 1.6666666666666667e-05
08/30/2021 20:24:42 - INFO - root -   train_loss after step 250: 0.372293955385685




08/30/2021 20:24:43 - INFO - root -   Running evaluation
08/30/2021 20:24:43 - INFO - root -     Num examples = 53
08/30/2021 20:24:43 - INFO - root -     Batch size = 16


08/30/2021 20:24:43 - INFO - root -   eval_loss after epoch 10: 0.3055841773748398: 
08/30/2021 20:24:43 - INFO - root -   eval_fbeta after epoch 10: 0.09433962404727936: 
08/30/2021 20:24:43 - INFO - root -   eval_roc_auc after epoch 10: 0.8478260869565217: 
08/30/2021 20:24:43 - INFO - root -   lr after epoch 10: 1.8e-05
08/30/2021 20:24:43 - INFO - root -   train_loss after epoch 10: 0.20875743473017658
08/30/2021 20:24:43 - INFO - root -   





(270, 0.5223373114510819)

In [81]:
cl_learner.validate()

08/30/2021 20:25:14 - INFO - root -   Running evaluation
08/30/2021 20:25:14 - INFO - root -     Num examples = 53
08/30/2021 20:25:14 - INFO - root -     Batch size = 16


{'loss': 0.3055841773748398,
 'fbeta': 0.09433962404727936,
 'roc_auc': 0.8478260869565217}

In [82]:
cl_learner.save_model()

08/30/2021 20:27:27 - INFO - transformers.configuration_utils -   Configuration saved in twt_cls_model\model_out\config.json
08/30/2021 20:27:28 - INFO - transformers.modeling_utils -   Model weights saved in twt_cls_model\model_out\pytorch_model.bin


In [83]:
del cl_learner

In [None]:
cl_learner.model.eval()

### Prédictions

In [84]:
predictor = BertClassificationPredictor(
                model_path='twt_cls_model/model_out',
                label_path='twt_cls_labels',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

08/30/2021 22:26:28 - INFO - transformers.configuration_utils -   loading configuration file twt_cls_model/model_out\config.json
08/30/2021 22:26:28 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/30/2021 22:26:28 - INFO - transformers.tokenization_utils_base -   Model name 'twt_cls_model/model_out' not found in model shor

In [85]:
#observation 2018
predictor.predict("[ Broyer les tiges pour lutter contre les pyrales ] &gt;&gt;&gt; https://t.co/sksqutcqEW &gt;&gt;&gt; - Les dégâts causés par la pyrale sont en progression en Bretagne. Les cultures de maïs fourrage, jusqu’ici peu concernées, présentent des ... https://t.co/bZdzvMPVhB")

08/30/2021 22:26:51 - INFO - root -   Writing example 0 of 1




[('Observation', 0.22204630076885223)]

In [86]:
# Observation 2018 => raté
predictor.predict("Dégâts de pyrale sur un essai variétés maïs chez #trichogrammes #luttebiologique #surleterrain https://t.co/qz4OKVTeRh")

08/30/2021 22:26:54 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10423579066991806)]

In [87]:
# Observation 2018
predictor.predict("Début des vols de #pyrale en Bretagne : lutte biologique avec des #trichogrammes via #maïs https://t.co/9UvTqRI3jd https://t.co/RcumfTOx7O")

08/30/2021 22:26:56 - INFO - root -   Writing example 0 of 1


[('Observation', 0.8562948703765869)]

In [88]:
# Observation 2018
predictor.predict("Pyrale du maïs : la pression monte en #Bretagne mais pas seulement !! https://t.co/ERMrVfKEim … via Dès les premiers vols, luttez naturellement avec les #trichogrammes (micro-insectes) plus d'infos https://t.co/yNzPIiusLS https://t.co/hYhikBGsq4")

08/30/2021 22:27:04 - INFO - root -   Writing example 0 of 1


[('Observation', 0.8135897517204285)]

In [89]:
# Info 2018
predictor.predict("Démonstration lutte alternative contre la pyrale du maïs : sucre et insecticide biologique, piégeage des pyrales pour cibler le pic de vol https://t.co/8Q8jZ9euFo")

08/30/2021 22:27:05 - INFO - root -   Writing example 0 of 1


[('Observation', 0.09959875792264938)]

In [90]:
# Info 2018
predictor.predict("Rencontre ce matin avec un cultivateur de la #Beauce heureux de m’expliquer comment il lutte de manière écologique contre la pyrale, un ravageur du maïs, à l’aide d’un autre insecte le trichogramme qui pond dans son œuf, tuant sa larve. https://t.co/iiEdkhWP68 #écologie #bio https://t.co/Ei2d5d0mb9")

08/30/2021 22:27:12 - INFO - root -   Writing example 0 of 1


[('Observation', 0.09977701306343079)]

In [91]:
# Info 2016
predictor.predict("Salon de l'Agriculture 2016 : des drones pour lutter contre la pyrale du maïs - https://t.co/HE5CIAl9Pw #SIA2016 #salondelagriculture")

08/30/2021 22:27:13 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10100332647562027)]

In [92]:
# conseil 
predictor.predict("Principal ravageur du #maïs la #pyrale est en cette période à son pic d’activité. Deux méthodes de lutte existent : un insecticide qui nécessite le passage d’un tracteur, ou une autre moins connue, le lâcher de #trichogrammes par #drone https://t.co/xusE9oUo4f")

08/30/2021 22:27:14 - INFO - root -   Writing example 0 of 1


[('Observation', 0.09980399161577225)]

In [93]:
# conseil 
predictor.predict("le #maïs est l'une des grandes cultures pour laquelle il existe une solution de lutte biologique contre la #pyrale #Biocontrole - retrouvez ces infos en page 3 du suppléments de https://t.co/DXG18mLG89 https://t.co/2cPqxJcYoc")

08/30/2021 22:27:15 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10009700804948807)]

In [94]:
# observation in the training/val set
predictor.predict("Code rouge pour vos #maïs les foreurs #Pyrale et #Sésamie sont bien présents ‼ Mais comment suivre les vols de ces ravageurs pour bien les connaître et positionner au mieux les moyens de lutte On vous dit tout en 3 épisodes #ComPositive #Agriculture https://t.co/oPlpzWd2AA")

08/30/2021 22:27:17 - INFO - root -   Writing example 0 of 1


[('Observation', 0.8559390306472778)]

In [95]:
# observation in the training/val set
predictor.predict("Une parcelle de maïs ensilage defoncée par la #pyrale Je crois que celle-ci détient la palme pour l’instant https://t.co/fqqXujDtgo")

08/30/2021 22:27:19 - INFO - root -   Writing example 0 of 1


[('Observation', 0.8557798266410828)]

In [96]:
#politique
predictor.predict("Quand on voit la convergence des luttes contre le bon sens, je crois que toute les bonnes volontés devront s'unir... Après parler de pyrale du Maïs à un dispatcheur du réseau électrique, et un oncologue , ca demande de la pédagogie.")

08/30/2021 22:27:23 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10003858059644699)]

In [97]:
#politique
predictor.predict("C'est un peu analogue au cas du maïs Bt qui favorise les cultures conventionnelles et bio avoisinantes en diminuant la population des ravageurs (pyrale et sésamie).")

08/30/2021 22:27:24 - INFO - root -   Writing example 0 of 1


[('Observation', 0.09950041770935059)]

In [98]:
#pub
predictor.predict("[Communiqué de presse] #Digital #Biocontrole L’appli mobile #GeoInsecta de permet maintenant de traquer la #pyrale du #maïs pour mieux la contrer cc d’infos https://t.co/pwOxHeYJbD https://t.co/loWUM5LahJ")

08/30/2021 22:27:27 - INFO - root -   Writing example 0 of 1


[('Observation', 0.1023852750658989)]

In [99]:
#info
predictor.predict("Les trichogrammes volent dans la #Limagne. Alexandre Bresson, agriculteur à Entraigues dans le Puy-de-Dôme utilise les #trichogrammes dans la lutte contre la pyrale du maïs... https://t.co/l8CHmpMkzg")

08/30/2021 22:27:29 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10159479826688766)]

In [100]:
# pub
predictor.predict("J'aime une vidéo : \"Utilisation de drones dans la lutte biologique à la pyrale du maïs\" à l'adresse")

08/30/2021 22:27:30 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10015053302049637)]

In [101]:
# pub
predictor.predict("Bioline Agrosciences est le 1er acteur français de Bioprotection contre la pyrale du maïs avec Trichotop Max solution naturelle de lutte biologique 100% produite en France")

08/30/2021 22:27:31 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10013306140899658)]

In [102]:
# scolytes Les scolytes forment une sous-famille d'insectes coléoptères de la famille des Curculionidae.
predictor.predict("Sur notre site du Morvan, grave attaque de scolytes et réflexion de l'ONF pour savoir quoi planter")

08/30/2021 22:27:37 - INFO - root -   Writing example 0 of 1


[('Observation', 0.8535301685333252)]

In [103]:
# pub
predictor.predict("Bioline Agrosciences est le 1er acteur français de Bioprotection contre la pyrale du maïs avec Trichotop Max solution naturelle de lutte biologique 100% produite en France")

08/30/2021 22:27:43 - INFO - root -   Writing example 0 of 1


[('Observation', 0.10013306140899658)]

### Analyse the fine-tuned model

In [5]:
predictor = BertClassificationPredictor(
                model_path='twt_cls_model/model_out',
                label_path='twt_cls_labels',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

08/29/2021 02:24:35 - INFO - transformers.configuration_utils -   loading configuration file finetuned_model/model_out/config.json
08/29/2021 02:24:35 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/29/2021 02:24:35 - INFO - transformers.tokenization_utils_base -   Model name 'finetuned_model/model_out' not found in model shortcut name list (camembert-base). Assuming 'finetuned_model/model_out' is a

In [104]:
#evaluation on validation set
df_val = pd.read_csv('./data/twt_val_set.csv')

#predictor.get_learner()

In [105]:
#df_val.Disease = df_val.Disease.astype(int)
df_val.tail()

Unnamed: 0.1,Unnamed: 0,index,_id,author_id,display_username,username,user_description,time,twt_id,text,...,crop_tags,place_name,place_id,country,place_type,Observation,Pertinence? (/10),Info temporalite,Info localisation,Classification
48,16,16,60f0083fbd62ed8c4e41565c,1199420514818830336,Mandrin Jean-pierre,jpman70,Agriculteur en voie de conservation des sols e...,2020-09-25T18:18:11.000Z,1309557826977660930,"Chrysomèle, pyrale et celle-ci...superbe aveni...",...,['maïs'],,,,,0,0,0,0,conseil
49,10,10,60f0083fbd62ed8c4e415635,1018885295318749184,Serge FOURQUET,sergefourquet,,2020-10-12T18:45:05.000Z,1315725193508278272,Et en France on dit non à un maïs OGM résistan...,...,['maïs'],,,,,0,0,0,0,politique
50,125,126,60f00848bd62ed8c4e416235,3031925218,Tanguy Lozac'h,tgylzh,Agronome 🌽🌱| Chef de marché semence (maïs/soya...,2020-02-09T15:43:02.000Z,1226531962707419136,En attendant grâce au Bt je n’ai jamais vu de ...,...,"['attier', 'maïs', 'maïs grain']",McMasterville,06c4516af7f31c69,Canada,city,0,0,0,0,conseil
51,258,259,60f00858bd62ed8c4e417952,1017769924658417670,TalkAG,talk_ag,TalkAG est un réseau social pour les agriculte...,2019-02-04T16:19:52.000Z,1092457723478900737,"Pyrale du maïs, comment la combattre ? https:/...",...,['maïs'],,,,,0,0,0,0,conseil
52,104,105,60f00845bd62ed8c4e415e02,492112419,AGRI72,AGRI_72,Toute l'information agricole sarthoise au serv...,2020-05-20T16:55:27.000Z,1263151360276680706,"A la Une de votre prochain numéro, un dossier ...",...,['maïs'],,,,,0,0,0,0,politique


In [106]:
batch_predictions = predictor.predict_batch(df_val.text.to_list())

08/30/2021 22:27:57 - INFO - root -   Writing example 0 of 53




In [107]:
batch_predictions[:10]

[[('Observation', 0.09971734136343002)],
 [('Observation', 0.09997959434986115)],
 [('Observation', 0.0997748151421547)],
 [('Observation', 0.11049496382474899)],
 [('Observation', 0.09952470660209656)],
 [('Observation', 0.09886086732149124)],
 [('Observation', 0.09941551834344864)],
 [('Observation', 0.12124031782150269)],
 [('Observation', 0.09907016158103943)],
 [('Observation', 0.855156421661377)]]

In [108]:
#dict(batch_predictions[1]).values()
list_y_pred = [ dict(pred) for pred in batch_predictions]
#list_y_pred[-5:]

In [109]:
df_y_pred = pd.DataFrame(list_y_pred, columns =['Observation']) 
df_y_pred = df_y_pred.rename(columns={"Observation": "pred_O"})
df_y_pred.tail()

Unnamed: 0,pred_O
48,0.100543
49,0.099385
50,0.103938
51,0.099505
52,0.133095


In [110]:
df_y_pred.describe()

Unnamed: 0,pred_O
count,53.0
mean,0.229545
std,0.265801
min,0.098542
25%,0.099775
50%,0.100372
75%,0.114495
max,0.856407


In [111]:
df_y_real = pd.DataFrame(df_val, columns=['Observation'])
df_y_real.tail()

Unnamed: 0,Observation
48,0
49,0
50,0
51,0
52,0


In [112]:
df_y_real.describe()

Unnamed: 0,Observation
count,53.0
mean,0.132075
std,0.341813
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [113]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve

In [114]:
precision, recall, threshold = precision_recall_curve(df_y_real.values, df_y_pred.values)

In [115]:
precision

array([0.18421053, 0.16216216, 0.16666667, 0.17142857, 0.17647059,
       0.18181818, 0.1875    , 0.19354839, 0.2       , 0.20689655,
       0.21428571, 0.22222222, 0.23076923, 0.24      , 0.25      ,
       0.26086957, 0.27272727, 0.28571429, 0.3       , 0.31578947,
       0.33333333, 0.35294118, 0.3125    , 0.33333333, 0.35714286,
       0.38461538, 0.41666667, 0.45454545, 0.4       , 0.44444444,
       0.5       , 0.57142857, 0.66666667, 0.8       , 0.75      ,
       1.        , 1.        , 1.        , 1.        ])

In [116]:
recall

array([1.        , 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.71428571, 0.71428571, 0.71428571,
       0.71428571, 0.71428571, 0.71428571, 0.57142857, 0.57142857,
       0.57142857, 0.57142857, 0.57142857, 0.57142857, 0.42857143,
       0.42857143, 0.28571429, 0.14285714, 0.        ])

In [117]:
threshold

array([0.09989703, 0.09995586, 0.09997497, 0.09997959, 0.10003053,
       0.10004236, 0.10007516, 0.10012134, 0.10017262, 0.10020913,
       0.10021029, 0.10037222, 0.10054343, 0.10059154, 0.10093002,
       0.1014892 , 0.10169376, 0.10178363, 0.1019374 , 0.10239314,
       0.10371134, 0.10376374, 0.10393775, 0.11049496, 0.11449456,
       0.12124032, 0.13309534, 0.3285988 , 0.36732274, 0.63343799,
       0.6881339 , 0.75898892, 0.81807935, 0.85515642, 0.85576648,
       0.85599571, 0.85637265, 0.85640705])

In [121]:
precision_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.85, average='binary')

0.8

In [122]:
recall_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.85,average='binary')

0.5714285714285714

In [123]:
f1_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.85,average='binary')

0.6666666666666666

In [126]:
accuracy_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.85)

0.9245283018867925

In [128]:
df_results = pd.concat([df_val, df_y_pred > 0.85], axis=1, ignore_index=True)

In [129]:
df_results.to_csv('twt_predictions.csv')