In [1]:
import sys
#sys.path.append('../libs')
sys.path.append('libs/')

import datetime, os
import random
import time

from tqdm import tqdm
from tabulate import tabulate
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from IPython.core.display import display, HTML

# ****** PARAMETER DEFINITIONS ******
# Vocabulary size; the "+ 2" presumably reserves two extra ids -- TODO confirm.
NUM_WORDS = 12071 + 2
# Presumably the maximum sentence length in words -- TODO confirm; within the
# visible cells it only appears in commented-out code.
LEN_SENTS = 153
# 30 entity tags (B_*/I_* and 'O') plus the 3 marker labels [CLS], [SEP], [PAD].
NUM_LABELS = 30 + 3
# Word-piece sequence length used as tokenizer max_length and as padding length.
MAX_LEN = 203

# Fill value used when padding input-id sequences.
WORD_PAD_TOKEN = 0



# Default-constructed BERT configuration (not referenced again in the visible cells).
configuration = BertConfig()
# Name of the pretrained checkpoint passed to from_pretrained.
BERT_MODEL = "bert-base-multilingual-cased"


# Sub-directory name shared by the log and model-save locations.
MODEL = 'BERT'
logs_base_dir = "./logs"
save_base_dir = './model-save'
log_dir = "/".join((logs_base_dir, MODEL))
save_dir = "/".join((save_base_dir, MODEL))


# Label vocabulary: tag string -> integer class id. Ids 0-29 are the entity
# tags (B_*/I_* and 'O'); ids 30-32 are the marker labels [CLS], [SEP], [PAD].
le_dicti = {
    'B_CANCER_CONCEPT': 0, 'B_CHEMOTHERAPY': 1, 'B_DATE': 2, 'B_DRUG': 3,
    'B_FAMILY': 4, 'B_FREQ': 5, 'B_IMPLICIT_DATE': 6, 'B_INTERVAL': 7,
    'B_METRIC': 8, 'B_OCURRENCE_EVENT': 9, 'B_QUANTITY': 10,
    'B_RADIOTHERAPY': 11, 'B_SMOKER_STATUS': 12, 'B_STAGE': 13,
    'B_SURGERY': 14, 'B_TNM': 15, 'I_CANCER_CONCEPT': 16, 'I_DATE': 17,
    'I_DRUG': 18, 'I_FAMILY': 19, 'I_FREQ': 20, 'I_IMPLICIT_DATE': 21,
    'I_INTERVAL': 22, 'I_METRIC': 23, 'I_OCURRENCE_EVENT': 24,
    'I_SMOKER_STATUS': 25, 'I_STAGE': 26, 'I_SURGERY': 27, 'I_TNM': 28,
    'O': 29, '[CLS]': 30, '[SEP]': 31, '[PAD]': 32,
}

# Reverse lookup: integer class id -> tag string.
le_dict = {label_id: label for label, label_id in le_dicti.items()}

# Label ids that convert_to_input reads from module scope when it encodes
# labels (in_ou_put == 1). Without them that path fails with NameError on a
# fresh kernel.
# NOTE(review): the mapping is inferred, not taken from the training code --
# the demo output tags continuation word pieces as '[SEP]', which suggests
# SEP_TOKEN; ESPECIAL_TOKEN -> '[CLS]' and PAD_TOKEN -> '[PAD]' are
# assumptions. Confirm against the training notebook.
SEP_TOKEN = le_dicti['[SEP]']
ESPECIAL_TOKEN = le_dicti['[CLS]']
PAD_TOKEN = le_dicti['[PAD]']

In [2]:
def convert_to_input(sentences, tags, in_ou_put):
    """Encode pre-split sentences (and optionally their labels) as padded BERT inputs.

    Each word is split into word pieces with the module-level ``tokenizer``.
    The pieces are turned into ids by ``encode_plus`` (which adds the special
    tokens), and every resulting sequence is right-padded to ``MAX_LEN``.

    When ``in_ou_put == 1`` a label sequence is built in parallel: the first
    piece of a word keeps the word's label, the remaining pieces get
    ``SEP_TOKEN``, the sequence is wrapped in ``ESPECIAL_TOKEN`` at both ends
    and padded with ``PAD_TOKEN``.

    Args:
        sentences: iterable of sentences, each an iterable of words.
        tags: iterable parallel to ``sentences`` holding one label per word.
            It is always iterated (and ``len(tags)`` sizes the progress bar),
            but the labels are only used when ``in_ou_put == 1``.
        in_ou_put: ``1`` to also build label ids; any other value skips them.

    Returns:
        ``(input_ids, token_type_ids, attention_masks, label_ids)``. The first
        three are the ``pad_sequences`` results (``maxlen=MAX_LEN``).
        ``label_ids`` is the padded label array when ``in_ou_put == 1`` and
        the integer ``0`` otherwise.

    Notes:
        Reads ``tokenizer``, ``MAX_LEN`` and ``WORD_PAD_TOKEN`` from module
        scope. The labelled path additionally reads ``SEP_TOKEN``,
        ``ESPECIAL_TOKEN`` and ``PAD_TOKEN``, which must be defined before
        calling with ``in_ou_put == 1``.
    """
    with_labels = in_ou_put == 1
    # Word pieces that still fit once encode_plus adds its two special tokens.
    max_pieces = MAX_LEN - 2

    input_id_list = []
    attention_mask_list = []
    token_type_id_list = []
    label_id_list = [] if with_labels else 0

    for x, y in tqdm(zip(sentences, tags), total=len(tags)):
        tokens = []
        label_ids = []

        for word, label in zip(x, y):
            word_tokens = tokenizer.tokenize(str(word))
            tokens.extend(word_tokens)
            # Use the real label id for the first piece of the word and
            # SEP_TOKEN for the remaining pieces. A word that produces no
            # pieces must not contribute a label, otherwise every following
            # label would be shifted by one position.
            if with_labels and word_tokens:
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))

        # Cut tokens and labels at the same point so they stay aligned.
        # Leaving label truncation to pad_sequences would drop labels from
        # the front (its default) while the tokenizer drops tokens from the end.
        if len(tokens) > max_pieces:
            tokens = tokens[:max_pieces]
            label_ids = label_ids[:max_pieces]

        if with_labels:
            # Positions of the two special tokens added by encode_plus.
            label_ids = [ESPECIAL_TOKEN] + label_ids + [ESPECIAL_TOKEN]

        # truncation=True makes the max_length cut explicit and silences the
        # "Truncation was not explicitly activated" warning.
        inputs = tokenizer.encode_plus(
            tokens,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
        )

        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

        if with_labels:
            label_id_list.append(label_ids)

    # Sequences are already at most MAX_LEN long; truncating="post" only
    # documents that any cut must happen at the end, like the tokenizer's.
    input_id_list = pad_sequences(
        maxlen=MAX_LEN, sequences=input_id_list, dtype="int32",
        padding="post", truncating="post", value=WORD_PAD_TOKEN)
    token_type_id_list = pad_sequences(
        maxlen=MAX_LEN, sequences=token_type_id_list, dtype="int32",
        padding="post", truncating="post")
    attention_mask_list = pad_sequences(
        maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32",
        padding="post", truncating="post")

    if with_labels:
        label_id_list = pad_sequences(
            maxlen=MAX_LEN, sequences=label_id_list, dtype="int32",
            padding="post", truncating="post", value=PAD_TOKEN)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [3]:
# Registry of (config, model, tokenizer) classes keyed by architecture name.
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']

# Tokenizer for the checkpoint, loaded with lower-casing disabled.
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

# Checkpoint configuration loaded with num_labels set to NUM_LABELS.
config = config_class.from_pretrained(BERT_MODEL, num_labels=NUM_LABELS)

In [4]:
# Demo sentences, each split on whitespace into a list of words.
ner_samples = [
    sentence.split()
    for sentence in (
        "CICLO 2 CARBOPLATINO / PACLITAXEL. ",
        "En Agosto de 2015 ha recibido 3 ciclos de CISPLATINO / VINORELBINA buena tolerancia clinica .",
        "QT : CISPLATINO 75 mg / m2 DIA 1 IV + VINORELBINA 25 mg / m2 IV DIAS 1,8 - Adenocarcinoma pulmon lobulo superior derecho ",
        "El dia 27 de junio iniciamos tratamiento con quimioterapia segun esquema CARBOPLATINO / PEMETREXED .",
        "CICLO 1 CARBOPLATINO AUC 5 - PEMETREXED 500 mg/m2 IV cada 21 dias..",
        "RT con dosis 50 Gy, se encuentra bien. ",
        "Carcinoma escamoso de pulmón cT3 cN2 cM0 (al menos estadio IIIB de TNM 8ª ed .",
        "Diagnosticado en marzo de 2016 de Adenoca de pulmón cT2cN2cM1a .",
        "Ha sido diagnosticada de cancer de pulmon en marzo de 2019 .",
        "Inicia tratamiento con Cisplatino + Pemetrexed + Bevacizumab (5 ciclos administrados, ultimo en enero de 2014).",
        "Carcinoma escamoso de pulmón intervenido en marzo 2017 .",
        "En 2014, intervenido de carcinoma de pulmón pT2bN1cM0 realizandose nefrectomia derecha .",
        "carcinoma microcitico de pulmon t4n2m0 en tto quimioterapico: carboplatino / etoposido .",
        "Colico renoureteral derecho con fracaso renal obstructivo en Julio de 2015. ",
        "Paciente con diabetes mellitus. ",
    )
]

In [5]:
# Placeholder tags: one '-PAD-' per word, so convert_to_input has a tag
# sequence to zip with each sentence.
dummy_y_train = [['-PAD-' for _ in snt] for snt in ner_samples]


# in_ou_put=0 skips label encoding, so label_ids_train receives the
# placeholder value returned in the label slot.
(
    demo_input_ids_train,
    demo_token_ids_train,
    demo_attention_masks_train,
    label_ids_train,
) = convert_to_input(ner_samples, dummy_y_train, 0)

  0%|          | 0/15 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 15/15 [00:00<00:00, 1389.52it/s]


In [6]:
# Restore the exported SavedModel from save_dir.
new_model = tf.saved_model.load(export_dir=save_dir)
print("BERT model loaded succesfully")


BERT model loaded succesfully


In [7]:
# Run the restored model on the encoded demo sentences; inputs are passed in
# the order input ids, token type ids, attention masks.
demo_prediction = new_model(
    [demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train]
)

# Index of the highest score along the last axis = predicted label id.
demo_pred_tags = np.argmax(demo_prediction, axis=-1)

In [8]:
# Map the predicted label ids to tag strings via le_dict (id -> tag).
# logits_to_tokens comes from the project's utils module; presumably it
# returns one tag list per sentence, which is how the display cell indexes
# it -- TODO confirm against utils.
demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)

In [9]:
# For every demo sentence, print its words and render an HTML table whose
# header row holds the word pieces and whose single body row holds the tag
# predicted at each piece.
for h, oracc in enumerate(ner_samples):
    # Re-tokenize the words to recover the word pieces that were encoded.
    tokensito = [
        piece
        for wordi in oracc
        for piece in tokenizer.tokenize(str(wordi))
    ]

    print(oracc)

    heads = tokensito
    # Index 0 of each prediction is the leading special token, so the tags
    # for the word pieces start at index 1.
    body = [demo_y_pred[h][1:len(tokensito) + 1]]
    table_html = tabulate(body, headers=heads, tablefmt="html")
    display(HTML(
        "<div style='overflow-x: auto; white-space: nowrap;'>"
        + table_html
        + "</div>"
    ))

['CICLO', '2', 'CARBOPLATINO', '/', 'PACLITAXEL.']


C,##IC,##L,##O,2,CA,##R,##B,##OP,##LA,##TI,##NO,/,PA,##CL,##IT,##AX,##EL,.
B_METRIC,[SEP],[SEP],[SEP],B_QUANTITY,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],O


['En', 'Agosto', 'de', '2015', 'ha', 'recibido', '3', 'ciclos', 'de', 'CISPLATINO', '/', 'VINORELBINA', 'buena', 'tolerancia', 'clinica', '.']


En,Agosto,de,2015,ha,recibido,3,ciclo,##s,de.1,C,##IS,##P,##LA,##TI,##NO,/,VI,##NO.1,##RE,##L,##B,##IN,##A,buena,tol,##eran,##cia,c,##lini,##ca,.
O,B_DATE,I_DATE,I_DATE,O,O,B_QUANTITY,B_METRIC,[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],O,O,[SEP],[SEP],O,[SEP],[SEP],O


['QT', ':', 'CISPLATINO', '75', 'mg', '/', 'm2', 'DIA', '1', 'IV', '+', 'VINORELBINA', '25', 'mg', '/', 'm2', 'IV', 'DIAS', '1,8', '-', 'Adenocarcinoma', 'pulmon', 'lobulo', 'superior', 'derecho']


Q,##T,:,C,##IS,##P,##LA,##TI,##NO,75,mg,/,m2,DI,##A,1,IV,+,VI,##NO.1,##RE,##L,##B,##IN,##A.1,25,mg.1,/.1,m2.1,IV.1,DI.1,##AS,1.1,",",8,-,Aden,##oca,##rci,##noma,pu,##lm,##on,lo,##bulo,superior,derecho
B_CHEMOTHERAPY,[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],B_QUANTITY,B_METRIC,O,B_METRIC,B_METRIC,[SEP],B_QUANTITY,O,O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],B_QUANTITY,B_METRIC,O,B_METRIC,O,B_METRIC,[SEP],B_QUANTITY,[SEP],[SEP],O,B_CANCER_CONCEPT,[SEP],[SEP],[SEP],I_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT


['El', 'dia', '27', 'de', 'junio', 'iniciamos', 'tratamiento', 'con', 'quimioterapia', 'segun', 'esquema', 'CARBOPLATINO', '/', 'PEMETREXED', '.']


El,dia,27,de,junio,inicia,##mos,tratamiento,con,qui,##mio,##tera,##pia,segun,esquema,CA,##R,##B,##OP,##LA,##TI,##NO,/,PE,##ME,##T,##RE,##X,##ED,.
O,O,B_DATE,I_DATE,I_DATE,B_OCURRENCE_EVENT,[SEP],I_OCURRENCE_EVENT,O,B_CHEMOTHERAPY,[SEP],[SEP],[SEP],O,O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],O


['CICLO', '1', 'CARBOPLATINO', 'AUC', '5', '-', 'PEMETREXED', '500', 'mg/m2', 'IV', 'cada', '21', 'dias..']


C,##IC,##L,##O,1,CA,##R,##B,##OP,##LA,##TI,##NO,AU,##C,5,-,PE,##ME,##T,##RE,##X,##ED,500,mg,/,m2,IV,cada,21,dias,.,..1
B_METRIC,[SEP],[SEP],[SEP],B_QUANTITY,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],B_METRIC,[SEP],B_QUANTITY,O,B_DRUG,[SEP],[SEP],[SEP],[SEP],[SEP],B_QUANTITY,B_METRIC,O,B_METRIC,O,B_FREQ,I_FREQ,I_FREQ,O,O


['RT', 'con', 'dosis', '50', 'Gy,', 'se', 'encuentra', 'bien.']


RT,con,dos,##is,50,G,##y,",",se,encuentra,bien,.
B_RADIOTHERAPY,O,O,[SEP],B_QUANTITY,B_METRIC,[SEP],O,O,O,O,O


['Carcinoma', 'escamoso', 'de', 'pulmón', 'cT3', 'cN2', 'cM0', '(al', 'menos', 'estadio', 'IIIB', 'de', 'TNM', '8ª', 'ed', '.']


Car,##cino,##ma,es,##cam,##oso,de,pu,##lm,##ón,c,##T,##3,c.1,##N,##2,c.2,##M,##0,(,al,menos,estadio,III,##B,de.1,TN,##M.1,8,##ª,ed,.
B_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],B_TNM,[SEP],[SEP],I_TNM,[SEP],[SEP],I_TNM,[SEP],[SEP],O,O,O,B_STAGE,I_STAGE,[SEP],O,O,[SEP],B_QUANTITY,[SEP],B_METRIC,O


['Diagnosticado', 'en', 'marzo', 'de', '2016', 'de', 'Adenoca', 'de', 'pulmón', 'cT2cN2cM1a', '.']


Dia,##gno,##stica,##do,en,marzo,de,2016,de.1,Aden,##oca,de.2,pu,##lm,##ón,c,##T,##2,##c,##N,##2.1,##c.1,##M,##1,##a,.
B_OCURRENCE_EVENT,[SEP],[SEP],[SEP],O,B_DATE,I_DATE,I_DATE,O,B_CANCER_CONCEPT,[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],B_TNM,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],O


['Ha', 'sido', 'diagnosticada', 'de', 'cancer', 'de', 'pulmon', 'en', 'marzo', 'de', '2019', '.']


Ha,sido,diagnostic,##ada,de,cancer,de.1,pu,##lm,##on,en,marzo,de.2,2019,.
O,O,B_OCURRENCE_EVENT,[SEP],O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],O,B_DATE,I_DATE,I_DATE,O


['Inicia', 'tratamiento', 'con', 'Cisplatino', '+', 'Pemetrexed', '+', 'Bevacizumab', '(5', 'ciclos', 'administrados,', 'ultimo', 'en', 'enero', 'de', '2014).']


Ini,##cia,tratamiento,con,Ci,##sp,##lati,##no,+,Pe,##metre,##xe,##d,+.1,Be,##vac,##iz,##uma,##b,(,5,ciclo,##s,ad,##mini,##strado,##s.1,",",ultimo,en,enero,de,2014,),.
B_OCURRENCE_EVENT,[SEP],I_OCURRENCE_EVENT,O,B_DRUG,[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],[SEP],O,B_QUANTITY,B_METRIC,[SEP],O,[SEP],[SEP],[SEP],O,B_OCURRENCE_EVENT,O,B_DATE,I_DATE,I_DATE,O,O


['Carcinoma', 'escamoso', 'de', 'pulmón', 'intervenido', 'en', 'marzo', '2017', '.']


Car,##cino,##ma,es,##cam,##oso,de,pu,##lm,##ón,inter,##veni,##do,en,marzo,2017,.
B_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],B_OCURRENCE_EVENT,[SEP],[SEP],O,B_DATE,I_DATE,O


['En', '2014,', 'intervenido', 'de', 'carcinoma', 'de', 'pulmón', 'pT2bN1cM0', 'realizandose', 'nefrectomia', 'derecha', '.']


En,2014,",",inter,##veni,##do,de,car,##cino,##ma,de.1,pu,##lm,##ón,p,##T,##2,##b,##N,##1,##c,##M,##0,realizando,##se,nef,##rec,##tomia,derecha,.
O,B_DATE,O,B_OCURRENCE_EVENT,[SEP],[SEP],O,B_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],B_TNM,[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],[SEP],O,[SEP],I_SURGERY,[SEP],[SEP],I_SURGERY,O


['carcinoma', 'microcitico', 'de', 'pulmon', 't4n2m0', 'en', 'tto', 'quimioterapico:', 'carboplatino', '/', 'etoposido', '.']


car,##cino,##ma,micro,##citi,##co,de,pu,##lm,##on,t,##4,##n,##2,##m,##0,en,t.1,##to,qui,##mio,##tera,##pico,:,car.1,##bo,##pla,##tino,/,et,##op,##osi,##do,.
B_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,[SEP],[SEP],I_CANCER_CONCEPT,I_CANCER_CONCEPT,[SEP],[SEP],B_TNM,[SEP],[SEP],[SEP],[SEP],[SEP],O,O,[SEP],B_CHEMOTHERAPY,[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],O,B_DRUG,[SEP],[SEP],[SEP],O


['Colico', 'renoureteral', 'derecho', 'con', 'fracaso', 'renal', 'obstructivo', 'en', 'Julio', 'de', '2015.']


Col,##ico,ren,##our,##etera,##l,derecho,con,frac,##aso,ren.1,##al,ob,##stru,##ctivo,en,Julio,de,2015,.
O,[SEP],O,[SEP],[SEP],[SEP],O,O,O,[SEP],O,[SEP],O,[SEP],[SEP],O,B_DATE,I_DATE,I_DATE,O


['Paciente', 'con', 'diabetes', 'mellitus.']


Pac,##iente,con,diabetes,me,##lli,##tus,.
O,[SEP],O,O,O,[SEP],[SEP],O
