### The code and data are adapted from:  https://medium.com/@vitalshchutski/french-nlp-entamez-le-camembert-avec-les-librairies-fast-bert-et-transformers-14e65f84c148

In [None]:
# !conda install torch
# !pip install fast-bert==1.9.1
# !mkdir model
# !mkdir finetuned_model

In [1]:
import torch
from fast_bert.data_cls import BertDataBunch 
from fast_bert.learner_cls import BertLearner
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from fast_bert.metrics import fbeta, roc_auc
from fast_bert.prediction import BertClassificationPredictor
import sys

from pathlib import Path
import pandas as pd
import logging
#create logger
logfile = str('logfile.txt')

logging.basicConfig(
    level=logging.INFO,  #CRITICAL ERROR WARNING  INFO    DEBUG    NOTSET
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile, 'w', 'utf-8'),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()


device_cpu = torch.device("cpu")

In [2]:
DATA_PATH = Path('./data/')
LOG_PATH = Path('./logs/')
MODEL_PATH = Path('./model/')
LABEL_PATH = Path('./labels/')

In [10]:
df = pd.read_csv('./data/pyrale+maïs_tweets_2019-2020_labelled.csv')

In [11]:
val_set = df.sample(frac=0.2, replace=False, random_state=42)
train_set = df.drop(index = val_set.index)
print('Nombre de commentaires dans le val_set:',len(val_set))
print('Nombre de commentaires dans le train_set:', len(train_set))
val_set.to_csv('./data/twt_val_set.csv')
train_set.to_csv('./data/twt_train_set.csv')

Nombre de commentaires dans le val_set: 53
Nombre de commentaires dans le train_set: 213


In [12]:
df.head()

Unnamed: 0,index,_id,author_id,display_username,username,user_description,time,twt_id,text,tag,crop_tags,place_name,place_id,country,place_type,Observation,Pertinence? (/10),Info temporalite,Info localisation,Classification
0,0,60f0083fbd62ed8c4e4155ad,2870270541,Yann Duroc 🌱 #jesoutiensnosagriculteurs,DurocYann,Dr génétique végétale & Ingénieur Agronome - S...,2020-12-20T21:04:28.000Z,1340765032783749121,Les lignées BT sont utiles contre les insectes...,pyrale,['maïs'],,,,,0,0,0,0,conseil
1,1,60f0083fbd62ed8c4e4155af,303818720,Benco,benco_c,"Quest XS pour la ballade #QXS178, R&M Nevo pou...",2020-12-19T21:00:07.000Z,1340401550573879299,Vous êtes au courant que le bt c'est issu dune...,pyrale,['maïs'],,,,,0,0,0,0,politique
2,2,60f0083fbd62ed8c4e4155c0,713466976698560512,julien_54 🌾🌽🚜🏍,BADURAUX_j,#agriculteur #lorrain qui recherche des soluti...,2020-12-07T18:07:26.000Z,1336009437417500673,il y a 2ans jour pour jour j’étais invité à vi...,pyrale,['maïs'],,,,,0,1,1,0,Conseil
3,3,60f0083fbd62ed8c4e4155d5,781129373764247554,Bioline #bioControle 🐝🐞🕷,Bioline_AS,👉 produit & commercialise micro/macro organism...,2020-11-24T11:53:59.000Z,1331204411998285829,Bioline Agrosciences nominé aux #innovation 20...,pyrale,"['lin', 'maïs']",,,,,0,0,0,0,pub
4,4,60f0083fbd62ed8c4e4155d7,1260451087447883776,Wackes Seppi,SeppiWackes,"Agriculture, alimentation, santé publique... s...",2020-11-20T08:06:27.000Z,1329697600535224320,"Oui, pendant ce temps là.. C'est en principe l...",pyrale,['maïs'],,,,,0,0,0,0,politique


In [13]:
labels = ['Observation']
with open('./labels/twt_labels_1.txt', 'w') as f:
    for i in labels:
        f.write(i + "\n")

In [6]:
df_texts = pd.read_csv('./data/raw_xml_bsv_0-200.csv')

### text cleaning

In [None]:
import nltk
import re

# make all elements string
df_texts['report_text'] = df_texts['report_text'].astype(str)
# Remove null fields
df_texts['report_text'] = df_texts['report_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Make all text lowercase
df_texts['report_text'] = df_texts['report_text'].apply(lambda x: x.lower())
# Delete stop-words => to be tesred later
#stopwords = nltk.corpus.stopwords.words('french')


In [8]:
all_texts = df_texts['report_text'].to_list()[500:]
print('Nombre de bloc de texte:', len(all_texts))

Nombre de bloc de texte: 2297


### Création de LMDataBunch

In [9]:
all_texts[:10]

["oïdium-les preüiers symptômes naladie viennent d'apparaître ceps atteints 1966. pour prévenir attaques, premier traitenent devrait être appliqué, semaine prochaine, dans foyers l'an dernier cépages sensibles. bordeaux, avril 1967 contrôleur chargé avertissements roussel l'inspecteur protection végétaux bruneteau",
 'bulletin mars 1979 tous departements ..........................',
 'arbres fruitiers',
 "chancre commun pommier -dans vergers situés dans expositions très favorables cette maladie, variétés sensibles, conseillé d'exécuter pulvérisation cuprique lorsque arbres auront atteint stade fleckinger (premier gonflement bourgeon, écailles s'écartant légèrement).",
 "anthonome pommier -très localement, constate recrudescence l'activité insecte occasionné,en 1978, dégâts, parfois importants, dans quelques rares vergers. dans vergers pommiers l'anthonome s'est manifesté printemps dernier, conseillé d'effectuer traitement insecticide lcrsque arbres auront atteint stade (premier gonflem

In [8]:
databunch_lm = BertLMDataBunch.from_raw_corpus(
                    data_dir=DATA_PATH,
                    text_list=all_texts,
                    tokenizer='camembert-base',
                    batch_size_per_gpu=4, #was 16, even 8 won't do
                    max_seq_length=256, #was 512
                    multi_gpu=False,
                    model_type='camembert-base',
                    logger=logger)

### Création de LMLearner

In [None]:
lm_learner = BertLMLearner.from_pretrained_model(
                            dataBunch=databunch_lm,
                            pretrained_path='camembert-base',
                            output_dir=MODEL_PATH,
                            metrics=[],
                            device=device_cpu,
                            logger=logger,
                            multi_gpu=False,
                            logging_steps=50,
                            is_fp16=False) #was true with gpu

In [11]:
lm_learner.fit(epochs=10, #was 30
            lr=1e-4,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")



(710, 1.9302346756760502)

In [12]:
lm_learner.validate()

{'loss': 0.10752222873270512, 'perplexity': 1.1135156154632568}

In [13]:
lm_learner.save_model()

In [14]:
del lm_learner

### Création de databunch pour la classification

In [16]:
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='camembert-base',
                          train_file='twt_train_set.csv',
                          val_file='twt_val_set.csv',
                          label_file='twt_labels_1.txt',
                          text_col='text',
                          label_col=['Observation'],
                          batch_size_per_gpu=8,
                          max_seq_length=256,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='camembert-base')

### Création de Learner

In [17]:
metrics = [{'name': 'fbeta', 'function': fbeta}, {'name': 'roc_auc', 'function': roc_auc}]
OUTPUT_DIR = Path('./twt_cls_model')
WGTS_PATH = Path('model/model_out/pytorch_model.bin')

In [18]:
# issue fast-bert pos_weight <= downgrade to 1.9.1 solve the prob
cl_learner = BertLearner.from_pretrained_model(
                        databunch,
                        pretrained_path='model/model_out',
                        metrics=metrics,
                        device=device_cpu, #was device_cuda
                        logger=logger,
                        output_dir=OUTPUT_DIR,
                        finetuned_wgts_path=WGTS_PATH,
                        warmup_steps=300,
                        multi_gpu=False,
                        multi_label=True,
                        is_fp16=False,#True when is cuda
                        logging_steps=50)

Some weights of the model checkpoint at model/model_out were not used when initializing CamembertForMultiLabelSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing CamembertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing CamembertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForMultiLabelSequenceClassification were not initialized from the model checkpoint at model/model_out and are newly initialized: ['classifier.dense.w

In [20]:
cl_learner.fit(epochs=10,# was 30
            lr=2e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="adamw")



(270, 0.50403933144278)

In [21]:
cl_learner.validate()

{'loss': 0.3383052535355091,
 'fbeta': 0.07547169923782349,
 'roc_auc': 0.7888198757763975}

In [22]:
cl_learner.save_model()

In [22]:
del cl_learner

In [None]:
cl_learner.model.eval()

### Prédictions

In [3]:
predictor = BertClassificationPredictor(
                model_path='twt_cls_model/model_out',
                label_path='twt_cls_labels',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

09/14/2021 00:20:36 - INFO - transformers.configuration_utils -   loading configuration file twt_cls_model/model_out/config.json
09/14/2021 00:20:36 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

09/14/2021 00:20:36 - INFO - transformers.tokenization_utils_base -   Model name 'twt_cls_model/model_out' not found in model shor

In [26]:
#observation 2018
predictor.predict("[ Broyer les tiges pour lutter contre les pyrales ] &gt;&gt;&gt; https://t.co/sksqutcqEW &gt;&gt;&gt; - Les dégâts causés par la pyrale sont en progression en Bretagne. Les cultures de maïs fourrage, jusqu’ici peu concernées, présentent des ... https://t.co/bZdzvMPVhB")



[('Observation', 0.3916700780391693)]

In [40]:
# Observation 2018 => raté
predictor.predict("Dégâts de pyrale sur un essai variétés maïs chez #trichogrammes #luttebiologique #surleterrain https://t.co/qz4OKVTeRh")

[('Observation', 0.07869870215654373)]

In [36]:
# Observation 2018
predictor.predict("Début des vols de #pyrale en Bretagne : lutte biologique avec des #trichogrammes via #maïs https://t.co/9UvTqRI3jd https://t.co/RcumfTOx7O")

[('Observation', 0.8942203521728516)]

In [37]:
# Observation 2018
predictor.predict("Pyrale du maïs : la pression monte en #Bretagne mais pas seulement !! https://t.co/ERMrVfKEim … via Dès les premiers vols, luttez naturellement avec les #trichogrammes (micro-insectes) plus d'infos https://t.co/yNzPIiusLS https://t.co/hYhikBGsq4")

[('Observation', 0.8958384394645691)]

In [38]:
# Info 2018
predictor.predict("Démonstration lutte alternative contre la pyrale du maïs : sucre et insecticide biologique, piégeage des pyrales pour cibler le pic de vol https://t.co/8Q8jZ9euFo")

[('Observation', 0.07528550922870636)]

In [39]:
# Info 2018
predictor.predict("Rencontre ce matin avec un cultivateur de la #Beauce heureux de m’expliquer comment il lutte de manière écologique contre la pyrale, un ravageur du maïs, à l’aide d’un autre insecte le trichogramme qui pond dans son œuf, tuant sa larve. https://t.co/iiEdkhWP68 #écologie #bio https://t.co/Ei2d5d0mb9")

[('Observation', 0.07484354078769684)]

In [45]:
# Info 2016
predictor.predict("Salon de l'Agriculture 2016 : des drones pour lutter contre la pyrale du maïs - https://t.co/HE5CIAl9Pw #SIA2016 #salondelagriculture")

[('Observation', 0.07513724267482758)]

In [30]:
# conseil 
predictor.predict("Principal ravageur du #maïs la #pyrale est en cette période à son pic d’activité. Deux méthodes de lutte existent : un insecticide qui nécessite le passage d’un tracteur, ou une autre moins connue, le lâcher de #trichogrammes par #drone https://t.co/xusE9oUo4f")

[('Observation', 0.0748760998249054)]

In [44]:
# conseil 
predictor.predict("le #maïs est l'une des grandes cultures pour laquelle il existe une solution de lutte biologique contre la #pyrale #Biocontrole - retrouvez ces infos en page 3 du suppléments de https://t.co/DXG18mLG89 https://t.co/2cPqxJcYoc")

[('Observation', 0.07479020208120346)]

In [31]:
# observation in the training/val set
predictor.predict("Code rouge pour vos #maïs les foreurs #Pyrale et #Sésamie sont bien présents ‼ Mais comment suivre les vols de ces ravageurs pour bien les connaître et positionner au mieux les moyens de lutte On vous dit tout en 3 épisodes #ComPositive #Agriculture https://t.co/oPlpzWd2AA")

[('Observation', 0.8987360596656799)]

In [32]:
# observation in the training/val set
predictor.predict("Une parcelle de maïs ensilage defoncée par la #pyrale Je crois que celle-ci détient la palme pour l’instant https://t.co/fqqXujDtgo")

[('Observation', 0.8991539478302002)]

In [33]:
#politique
predictor.predict("Quand on voit la convergence des luttes contre le bon sens, je crois que toute les bonnes volontés devront s'unir... Après parler de pyrale du Maïs à un dispatcheur du réseau électrique, et un oncologue , ca demande de la pédagogie.")

[('Observation', 0.07583270967006683)]

In [43]:
#politique
predictor.predict("C'est un peu analogue au cas du maïs Bt qui favorise les cultures conventionnelles et bio avoisinantes en diminuant la population des ravageurs (pyrale et sésamie).")

[('Observation', 0.07514072209596634)]

In [34]:
#pub
predictor.predict("[Communiqué de presse] #Digital #Biocontrole L’appli mobile #GeoInsecta de permet maintenant de traquer la #pyrale du #maïs pour mieux la contrer cc d’infos https://t.co/pwOxHeYJbD https://t.co/loWUM5LahJ")

[('Observation', 0.07541383057832718)]

In [35]:
#info
predictor.predict("Les trichogrammes volent dans la #Limagne. Alexandre Bresson, agriculteur à Entraigues dans le Puy-de-Dôme utilise les #trichogrammes dans la lutte contre la pyrale du maïs... https://t.co/l8CHmpMkzg")

[('Observation', 0.07782867550849915)]

In [41]:
# pub
predictor.predict("J'aime une vidéo : \"Utilisation de drones dans la lutte biologique à la pyrale du maïs\" à l'adresse")

[('Observation', 0.07534167915582657)]

In [42]:
# pub
predictor.predict("Bioline Agrosciences est le 1er acteur français de Bioprotection contre la pyrale du maïs avec Trichotop Max solution naturelle de lutte biologique 100% produite en France")

[('Observation', 0.07505712658166885)]

In [46]:
# scolytes Les scolytes forment une sous-famille d'insectes coléoptères de la famille des Curculionidae.
predictor.predict("Sur notre site du Morvan, grave attaque de scolytes et réflexion de l'ONF pour savoir quoi planter")

[('Observation', 0.8964828848838806)]

In [42]:
# pub
predictor.predict("Bioline Agrosciences est le 1er acteur français de Bioprotection contre la pyrale du maïs avec Trichotop Max solution naturelle de lutte biologique 100% produite en France")

[('Observation', 0.07505712658166885)]

### Analyse the fine-tuned model

In [5]:
predictor = BertClassificationPredictor(
                model_path='twt_cls_model/model_out',
                label_path='twt_cls_labels',
                multi_label=True,
                model_type='camembert-base',
                do_lower_case=False)

08/29/2021 02:24:35 - INFO - transformers.configuration_utils -   loading configuration file finetuned_model/model_out/config.json
08/29/2021 02:24:35 - INFO - transformers.configuration_utils -   Model config CamembertConfig {
  "architectures": [
    "CamembertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 32005
}

08/29/2021 02:24:35 - INFO - transformers.tokenization_utils_base -   Model name 'finetuned_model/model_out' not found in model shortcut name list (camembert-base). Assuming 'finetuned_model/model_out' is a

In [4]:
#evaluation on validation set
df_val = pd.read_csv('./data/twt_val_set.csv')

#predictor.get_learner()

In [7]:
#df_val.Disease = df_val.Disease.astype(int)
df_val.tail()

Unnamed: 0.1,Unnamed: 0,index,_id,author_id,display_username,username,user_description,time,twt_id,text,...,crop_tags,place_name,place_id,country,place_type,Observation,Pertinence? (/10),Info temporalite,Info localisation,Classification
48,16,16,60f0083fbd62ed8c4e41565c,1199420514818830336,Mandrin Jean-pierre,jpman70,Agriculteur en voie de conservation des sols e...,2020-09-25T18:18:11.000Z,1309557826977660930,"Chrysomèle, pyrale et celle-ci...superbe aveni...",...,['maïs'],,,,,0,0,0,0,conseil
49,10,10,60f0083fbd62ed8c4e415635,1018885295318749184,Serge FOURQUET,sergefourquet,,2020-10-12T18:45:05.000Z,1315725193508278272,Et en France on dit non à un maïs OGM résistan...,...,['maïs'],,,,,0,0,0,0,politique
50,125,126,60f00848bd62ed8c4e416235,3031925218,Tanguy Lozac'h,tgylzh,Agronome 🌽🌱| Chef de marché semence (maïs/soya...,2020-02-09T15:43:02.000Z,1226531962707419136,En attendant grâce au Bt je n’ai jamais vu de ...,...,"['attier', 'maïs', 'maïs grain']",McMasterville,06c4516af7f31c69,Canada,city,0,0,0,0,conseil
51,258,259,60f00858bd62ed8c4e417952,1017769924658417670,TalkAG,talk_ag,TalkAG est un réseau social pour les agriculte...,2019-02-04T16:19:52.000Z,1092457723478900737,"Pyrale du maïs, comment la combattre ? https:/...",...,['maïs'],,,,,0,0,0,0,conseil
52,104,105,60f00845bd62ed8c4e415e02,492112419,AGRI72,AGRI_72,Toute l'information agricole sarthoise au serv...,2020-05-20T16:55:27.000Z,1263151360276680706,"A la Une de votre prochain numéro, un dossier ...",...,['maïs'],,,,,0,0,0,0,politique


In [8]:
batch_predictions = predictor.predict_batch(df_val.text.to_list())

09/14/2021 00:21:42 - INFO - root -   Writing example 0 of 53




In [48]:
batch_predictions[:10]

[[('Observation', 0.07505868375301361)],
 [('Observation', 0.07539146393537521)],
 [('Observation', 0.07547540217638016)],
 [('Observation', 0.07830528169870377)],
 [('Observation', 0.07431840151548386)],
 [('Observation', 0.07494246959686279)],
 [('Observation', 0.07441007345914841)],
 [('Observation', 0.07901930809020996)],
 [('Observation', 0.07463334500789642)],
 [('Observation', 0.8942217826843262)]]

In [9]:
#dict(batch_predictions[1]).values()
list_y_pred = [ dict(pred) for pred in batch_predictions]
#list_y_pred[-5:]

In [10]:
df_y_pred = pd.DataFrame(list_y_pred, columns =['Observation']) 
df_y_pred = df_y_pred.rename(columns={"Observation": "pred_O"})
df_y_pred.tail()

Unnamed: 0,pred_O
48,0.074614
49,0.078071
50,0.078551
51,0.074924
52,0.076817


In [51]:
df_y_pred.describe()

Unnamed: 0,pred_O
count,53.0
mean,0.21219
std,0.295237
min,0.074305
25%,0.074929
50%,0.075391
75%,0.079309
max,0.897978


In [11]:
df_y_real = pd.DataFrame(df_val, columns=['Observation'])
df_y_real.tail()

Unnamed: 0,Observation
48,0
49,0
50,0
51,0
52,0


In [53]:
df_y_real.describe()

Unnamed: 0,Observation
count,53.0
mean,0.132075
std,0.341813
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [5]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve

In [13]:
#find the best f1
precision, recall, thresholds = precision_recall_curve(df_y_real.values, df_y_pred.values)
numerator = 2 * recall * precision
denom = recall + precision
f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom!=0))
max_f1 = np.max(f1_scores)
max_f1_thresh = thresholds[np.argmax(f1_scores)]

In [14]:
max_f1

0.6666666666666666

In [15]:
max_f1_thresh

0.8942217826843262

In [56]:
precision

array([0.14285714, 0.125     , 0.12765957, 0.13043478, 0.13333333,
       0.13636364, 0.13953488, 0.14285714, 0.14634146, 0.15      ,
       0.15384615, 0.15789474, 0.16216216, 0.16666667, 0.17142857,
       0.17647059, 0.18181818, 0.1875    , 0.19354839, 0.2       ,
       0.20689655, 0.21428571, 0.22222222, 0.23076923, 0.24      ,
       0.25      , 0.26086957, 0.27272727, 0.23809524, 0.25      ,
       0.26315789, 0.27777778, 0.29411765, 0.3125    , 0.33333333,
       0.35714286, 0.38461538, 0.33333333, 0.36363636, 0.4       ,
       0.44444444, 0.5       , 0.57142857, 0.66666667, 0.8       ,
       0.75      , 0.66666667, 1.        , 1.        , 1.        ])

In [57]:
recall

array([1.        , 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.85714286, 0.85714286,
       0.85714286, 0.85714286, 0.85714286, 0.71428571, 0.71428571,
       0.71428571, 0.71428571, 0.71428571, 0.71428571, 0.71428571,
       0.71428571, 0.71428571, 0.57142857, 0.57142857, 0.57142857,
       0.57142857, 0.57142857, 0.57142857, 0.57142857, 0.57142857,
       0.42857143, 0.28571429, 0.28571429, 0.14285714, 0.        ])

In [58]:
thresholds

array([0.07451152, 0.07461438, 0.07463335, 0.07468299, 0.07475414,
       0.07476106, 0.07487386, 0.0748859 , 0.07492432, 0.07492901,
       0.07494247, 0.0749509 , 0.074958  , 0.0749658 , 0.07501642,
       0.07503505, 0.07505214, 0.07505868, 0.07511425, 0.07514347,
       0.07515999, 0.07522195, 0.07539146, 0.07541384, 0.0754754 ,
       0.07574085, 0.07583273, 0.07584672, 0.07681745, 0.07699905,
       0.0780709 , 0.07807524, 0.07830528, 0.07855142, 0.07901931,
       0.07930885, 0.08184381, 0.08188963, 0.16723929, 0.23709509,
       0.5806374 , 0.85153979, 0.86937463, 0.87127566, 0.89422178,
       0.89477533, 0.89642942, 0.89730245, 0.89797831])

In [59]:
precision_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.89, average='binary')

0.8

In [60]:
recall_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.89,average='binary')

0.5714285714285714

In [64]:
f1_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.89,average='binary')

0.6666666666666666

In [62]:
accuracy_score(y_true=df_y_real.values, y_pred=df_y_pred.values > 0.89)

0.9245283018867925

In [26]:
df_results = pd.concat([df_val, df_y_pred > 0.89], axis=1, ignore_index=True)

In [27]:
df_results.to_csv('twt_predictions.csv')