In [None]:
#First of all, install the following software requirements

#!/opt/conda/bin/python3.7 -m pip install --upgrade pip
#!pip install seqeval
#!pip install tensorflow-addons

In [None]:
import sys
sys.path.append('libs/')

import datetime, os
import random
import time

from tqdm import tqdm
from tabulate import tabulate
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from IPython.core.display import display, HTML

# ****** PARAMETER DEFINITIONS ******
MAX_LEN        = 348   # length every encoded sequence is limited / padded to
NUM_LABELS     = 12    # number of label ids passed to the model configuration
WORD_PAD_TOKEN = 0     # value used to pad the input-id sequences

# Disabled label-id constants, kept for reference:
#ESPECIAL_TOKEN = 9
#SEP_TOKEN      = 10
#PAD_TOKEN      = 11

BERT_MODEL = "bert-base-multilingual-cased"
configuration = BertConfig()

# Locations of the training logs and of the exported model.
#MODEL         = 'model'
log_dir  = "saved_model/logs/model/"
save_dir = "saved_model/model/"

# Tag -> label id mapping.
le_dicti = {
    'B-NEG': 0, 'B-NSCO': 1, 'B-UNC': 2, 'B-USCO': 3,
    'I-NEG': 4, 'I-NSCO': 5, 'I-UNC': 6, 'I-USCO': 7,
    'O': 8, '[CLS]': 9, '[SEP]': 10, '[PAD]': 11,
}

# Inverse mapping (label id -> tag), used to decode predictions.
le_dict = {label_id: tag for tag, label_id in le_dicti.items()}

In [None]:
def convert_to_input(sentences, tags, in_ou_put):
    """Turn word-tokenized sentences into padded BERT input arrays.

    Every word is split into sub-word tokens with the module-level
    ``tokenizer``; the tokens of a sentence are encoded with
    ``tokenizer.encode_plus`` (special tokens added, ``max_length=MAX_LEN``)
    and all resulting sequences are right-padded to ``MAX_LEN``.

    Args:
        sentences: sequence of sentences, each one a sequence of words.
        tags: sequence parallel to ``sentences`` (it must support ``len``);
            each item holds one entry per word. Words and tags are paired with
            ``zip``, so a sentence is cut to the shorter of the two. The
            entries are only turned into labels when ``in_ou_put == 1``; in
            that case they are expected to be integer label ids, because they
            are padded into an ``int32`` array.
        in_ou_put: ``1`` to build label ids as well; any other value builds
            the model inputs only.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks, label_ids)``.
        The first three are ``int32`` arrays produced by ``pad_sequences``
        with ``maxlen=MAX_LEN``. ``label_ids`` is an array of the same kind
        when ``in_ou_put == 1`` and the integer ``0`` otherwise.
    """
    build_labels = in_ou_put == 1

    if build_labels:
        # Label ids for positions that carry no real word label. They are read
        # from le_dicti because ESPECIAL_TOKEN / SEP_TOKEN / PAD_TOKEN only
        # exist as commented-out lines in this notebook, which made the
        # labelled mode fail with NameError. The ids are the same (9, 10, 11).
        boundary_label = le_dicti['[CLS]']   # formerly ESPECIAL_TOKEN
        subword_label  = le_dicti['[SEP]']   # formerly SEP_TOKEN
        padding_label  = le_dicti['[PAD]']   # formerly PAD_TOKEN
        special_tokens_count = 2             # one boundary label on each side
        label_id_list = []
    else:
        label_id_list = 0

    input_id_list       = []
    attention_mask_list = []
    token_type_id_list  = []

    for words, word_tags in tqdm(zip(sentences, tags), total=len(tags)):
        tokens    = []
        label_ids = []

        for word, label in zip(words, word_tags):
            word_tokens = tokenizer.tokenize(str(word))
            tokens.extend(word_tokens)
            # The real label goes on the first token of the word and the
            # sub-word label on the remaining ones. A word that yields no
            # token at all gets no label, so labels stay aligned with tokens.
            if build_labels and word_tokens:
                label_ids.extend([label] + [subword_label] * (len(word_tokens) - 1))

        if build_labels:
            # encode_plus is asked to limit the sequence to MAX_LEN ids,
            # special tokens included, so the labels are cut at the end in the
            # same way. Otherwise pad_sequences would drop labels from the
            # front (its default is truncating='pre').
            label_ids = label_ids[:MAX_LEN - special_tokens_count]
            label_ids = [boundary_label] + label_ids + [boundary_label]
            label_id_list.append(label_ids)

        inputs = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_LEN)

        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

    input_id_list       = pad_sequences(maxlen=MAX_LEN, sequences=input_id_list,       dtype="int32", padding="post", value=WORD_PAD_TOKEN)
    token_type_id_list  = pad_sequences(maxlen=MAX_LEN, sequences=token_type_id_list,  dtype="int32", padding="post")
    attention_mask_list = pad_sequences(maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32", padding="post")

    if build_labels:
        label_id_list = pad_sequences(maxlen=MAX_LEN, sequences=label_id_list, dtype="int32", padding="post", value=padding_label)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [None]:

# (config, model, tokenizer) classes per architecture; only "bert" is registered.
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES["bert"]

# Tokenizer of the pretrained checkpoint; do_lower_case=False is passed explicitly.
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

# Configuration of the pretrained checkpoint with num_labels set to NUM_LABELS.
config = config_class.from_pretrained(BERT_MODEL, num_labels=NUM_LABELS)


In [None]:
# Load the SavedModel stored under save_dir
print("\nLoading the model for negation and speculation detection ...")
new_model = tf.saved_model.load(export_dir=save_dir)
print("\nModel loaded ...")

In [None]:
#Note: Sentences must be tokenized beforehand (e.g. using spaCy for Spanish).
#The model receives a tokenized sentence and returns a negation or speculation label for each token.
#The following sentences have been previously tokenized.
#Using the spaCy tokenizer is recommended for the Spanish language.

# Demo sentences, split into words on whitespace.
negation_samples = [
    sentence.split()
    for sentence in (
        "No dolor toráxico.",
        "Paciente con probable carcinoma pulmonar.",
        "inflamacion aguda negativa.",
        "helicobacter pylori negativo.",
        "negativo para malignidad.",
        "No se puede descartar cáncer de pulmón .",
        "helicobacter pylori negativo.",
        "Test negativo para malignidad.",
    )
]

# convert_to_input pairs every word with a tag, so one placeholder tag is
# provided per word; with in_ou_put=0 the tags are not turned into labels.
dummy_y_train = [['-PAD-'] * len(words) for words in negation_samples]

# Encode the demo sentences (pass 1 instead of 0 to build label ids as well).
demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train, label_ids_train = convert_to_input(
    negation_samples, dummy_y_train, 0
)

In [None]:
# Run the loaded model on the encoded demo sentences
demo_prediction = new_model(
    [demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train]
)

# Arg-max over the last axis (presumably the per-label scores — confirm the output shape)
demo_pred_tags = np.argmax(demo_prediction, axis=-1)

In [None]:
# Decode the predicted label ids with le_dict, the label id -> tag mapping.
# Presumably this yields one list of tag strings per sentence — confirm
# against utils.logits_to_tokens.
demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)
# Uncomment to inspect the decoded tags:
#print(demo_y_pred)

In [None]:
for sample_idx, sample_words in enumerate(negation_samples):
    # Rebuild the sub-word tokens of the sentence; they become the table headers.
    wordpieces = [
        piece
        for word in sample_words
        for piece in tokenizer.tokenize(str(word))
    ]

    print(sample_words)

    # Skip the first prediction (the leading special-token position) and keep
    # one predicted tag per sub-word token.
    predicted_row = [demo_y_pred[sample_idx][1:len(wordpieces) + 1]]

    table_html = tabulate(predicted_row, headers=wordpieces, tablefmt="html")
    display(HTML("<div style='overflow-x: auto; white-space: nowrap;'>" + table_html + "</div>"))

    print("\n")