In [1]:
import sys
sys.path.append('../libs')

import datetime, os
import random
import time

from tqdm import tqdm
from tabulate import tabulate
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from IPython.core.display import display, HTML

# ****** DEFINICION DE PARAMETROS *********
MAX_LEN        = 409
NUM_LABELS     = 9 + 3
WORD_PAD_TOKEN = 0

ESPECIAL_TOKEN = 9
SEP_TOKEN      = 10
PAD_TOKEN      = 11

configuration = BertConfig()
BERT_MODEL = "dccuchile/bert-base-spanish-wwm-cased"

save_dir      = "./model"

le_dict = {}

le_dicti = {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8, '[CLS]': 9, '[SEP]': 10, '[PAD]': 11}

for key in le_dicti:
    #print(key, '->', le_dict[key])
    le_dict[le_dicti[key]] = key

In [2]:
def convert_to_input(sentences, tags, in_ou_put):
    input_id_list       = []
    attention_mask_list = [] 
    token_type_id_list  = []
    
    if in_ou_put == 1:
        label_id_list   = []
    else:
        label_id_list   = 0
    
    for x,y in tqdm(zip(sentences,tags),total=len(tags)):
        tokens = []
        
        if in_ou_put == 1:
            label_ids = []
        
        for word, label in zip(x, y):
            word_tokens = tokenizer.tokenize(str(word))
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, 
            # and padding ids for the remaining tokens
            if in_ou_put == 1:
                #label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))
        
        # special_tokens_count =  2
        
        #if len(tokens) > LEN_SENTS - special_tokens_count:
        #    tokens = tokens[: (LEN_SENTS - special_tokens_count)]

        #    if in_ou_put == 1:
        #        label_ids = label_ids[: (LEN_SENTS - special_tokens_count)]
        
        if in_ou_put == 1:
            #label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]
            label_ids = [ESPECIAL_TOKEN] + label_ids + [ESPECIAL_TOKEN]
        
        inputs = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_LEN)
        
        input_ids       = inputs["input_ids"]
        token_type_ids  = inputs["token_type_ids"]
        attention_masks = inputs["attention_mask"]
        
        #print(attention_masks)
        #attention_masks = [17] + [1] * (len(input_ids)-2) + [17]
        #print(attention_masks)
        
        attention_mask_list.append(attention_masks)
        input_id_list.append(input_ids)
        token_type_id_list.append(token_type_ids)
        
        if in_ou_put == 1:
            label_id_list.append(label_ids)

    input_id_list       = pad_sequences(maxlen=MAX_LEN, sequences=input_id_list,       dtype="int32", padding="post", value=WORD_PAD_TOKEN)
    token_type_id_list  = pad_sequences(maxlen=MAX_LEN, sequences=token_type_id_list,  dtype="int32", padding="post")
    attention_mask_list = pad_sequences(maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32", padding="post")
    
    if in_ou_put == 1:
        label_id_list   = pad_sequences(maxlen=MAX_LEN, sequences=label_id_list, dtype="int32", padding="post", value=PAD_TOKEN)
        #label_id_list   = [to_categorical(i, num_classes=num_labels, dtype ="int32") for i in label_id_list]
        #label_id_list   = np.array(label_id_list)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [2]:
MODEL_CLASSES = {"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer)}
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
config = config_class.from_pretrained(BERT_MODEL, num_labels=NUM_LABELS)

tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

In [4]:
word_tokens = tokenizer.tokenize("La biopsia no muestra células de cáncer.")
print(word_tokens)

['La', 'bio', '##psia', 'no', 'muestra', 'células', 'de', 'cáncer', '.']


In [4]:
negation_samples = [
    "Correr en Colombia con James Rodriguez .".split(),
    "Éste gran hombre ganó con el Real Madrid de España y en Alemania con Roler.".split()
]

dummy_y_train = []

for snt in negation_samples:
    senti = []
    for wds in snt:
        senti.append('-PAD-')
    
    dummy_y_train.append(senti)


demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train, label_ids_train = convert_to_input(negation_samples, dummy_y_train, 0)

  0%|          | 0/2 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2/2 [00:00<00:00, 1314.21it/s]


In [5]:
new_model = tf.saved_model.load(save_dir)

In [6]:
demo_prediction = new_model([demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train])

demo_pred_tags = np.argmax(demo_prediction, -1)

In [7]:
demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)

In [8]:
for h, oracc in enumerate(negation_samples):
    #heads = oracc
    #if h == 0:
    tokensito = []
    for wordi in oracc:
        wordi_tokens = tokenizer.tokenize(str(wordi))
        tokensito.extend(wordi_tokens)

    #print(oracc)
    #print(tokensito)
    #print(demo_y_pred[h])
    heads = tokensito
    body  = [demo_y_pred[h][1:len(tokensito)+1]]
    display(HTML("<div style='overflow-x: auto; white-space: nowrap;'>" + 
                 tabulate(body, headers=heads, tablefmt="html") + 
                 "</div>"))

['Correr', 'en', 'Colombia', 'con', 'James', 'Rodriguez', '.']


Corre,##r,en,Colombia,con,J,##ames,Rodri,##guez,.
O,[SEP],O,B-LOC,O,B-PER,[SEP],I-PER,[SEP],O


['Éste', 'gran', 'hombre', 'ganó', 'con', 'el', 'Real', 'Madrid', 'de', 'España', 'y', 'en', 'Alemania', 'con', 'Roler.']


É,##ste,gran,hombre,ganó,con,el,Real,Madrid,de,España,y,en,Alemania,con.1,Rol,##er,.
O,[SEP],O,O,O,O,O,B-ORG,I-ORG,I-ORG,I-ORG,O,O,B-LOC,O,B-PER,[SEP],O
