In [None]:
import sys
#sys.path.append('../libs')
sys.path.append('libs/')

import datetime, os
import random
import time

from tqdm import tqdm
from tabulate import tabulate
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from IPython.core.display import display, HTML

# ****** PARAMETER DEFINITIONS ******
# NOTE(review): the original line carried a trailing "13705" — presumably an
# earlier vocabulary size; the meaning of the "+ 2" is not visible here.
NUM_WORDS = 14485 + 2
LEN_SENTS = 306
MAX_LEN = 596  # fixed sequence length every encoded sample is padded to
NUM_TAGS = 37 + 3
NUM_LABELS = NUM_TAGS  # same value as NUM_TAGS (37 + 3)

# ****** HYPERPARAMETER DEFINITIONS ******
_EPOCHS = 15
_DROPOUT = 0.4
_BACH_SIZE = 64
VAL_SPLIT = 0.3
NUM_FOLDS = 10

# Label ids reserved for the special positions and for padding.
ESPECIAL_TOKEN = 37
SEP_TOKEN = 38
PAD_TOKEN = 39
WORD_PAD_TOKEN = 0  # value used to pad the input-id sequences

# Default BERT configuration object plus the name of the pretrained checkpoint.
configuration = BertConfig()
BERT_MODEL = "bert-base-multilingual-cased"

# Log and model directories; the derived variants end with a slash.
logs_base_dir = "./logs"
log_dir = f"{logs_base_dir}/"
save_base_dir = "./model"
save_dir = f"{save_base_dir}/"


# Tag string -> integer label id.
le_dicti = {
    'B-BIOMARKER': 0, 'B-BIOMARKER_STATUS': 1, 'B-CANCER_CONCEPT': 2, 'B-CHEMOTHERAPY': 3,
    'B-CLINICAL_SERVICE': 4, 'B-COMORBIDITY': 5, 'B-DRUG': 6, 'B-EXPLICIT_DATE': 7,
    'B-FAMILY': 8, 'B-FREQ': 9, 'B-IMPLICIT_DATE': 10, 'B-METRIC': 11,
    'B-OCURRENCE_EVENT': 12, 'B-QUANTITY': 13, 'B-RADIOTHERAPY': 14, 'B-STAGE': 15,
    'B-SURGERY': 16, 'B-TNM': 17, 'B-TOXIC_HABIT': 18,
    'I-BIOMARKER': 19, 'I-BIOMARKER_STATUS': 20, 'I-CANCER_CONCEPT': 21,
    'I-CLINICAL_SERVICE': 22, 'I-COMORBIDITY': 23, 'I-DRUG': 24, 'I-EXPLICIT_DATE': 25,
    'I-FAMILY': 26, 'I-FREQ': 27, 'I-IMPLICIT_DATE': 28, 'I-METRIC': 29,
    'I-OCURRENCE_EVENT': 30, 'I-QUANTITY': 31, 'I-STAGE': 32, 'I-SURGERY': 33,
    'I-TNM': 34, 'I-TOXIC_HABIT': 35,
    'O': 36, '[CLS]': 37, '[SEP]': 38, '[PAD]': 39,
}

# Inverse lookup: integer label id -> tag string.
le_dict = {label_id: tag for tag, label_id in le_dicti.items()}
    
    



In [None]:
# (config, model, tokenizer) classes per architecture key; only "bert" is registered.
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES["bert"]

# Checkpoint configuration with the classification head sized to NUM_LABELS.
config = config_class.from_pretrained(BERT_MODEL, num_labels=NUM_LABELS)

# The checkpoint name is a cased one, so lower-casing is switched off.
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

In [None]:
# Word-piece rows collected by convert_to_input for calls made with testy == 1.
# Kept at module level so other cells can read it; re-running this cell resets it.
il_tokens = []


def convert_to_input(sentences, tags, in_ou_put, testy):
    """Convert word-level sentences into fixed-length BERT input arrays.

    Every word is split into word pieces with the module-level ``tokenizer``.
    The pieces of a sentence are encoded with ``tokenizer.encode_plus``
    (special tokens added, ``max_length=MAX_LEN``) and all resulting sequences
    are padded or truncated at the end to ``MAX_LEN``.

    Label scheme (only built when ``in_ou_put == 1``): the first word piece of
    a word receives the word's label, every further piece of the same word
    receives ``SEP_TOKEN``, the first and last positions receive
    ``ESPECIAL_TOKEN`` and padding positions receive ``PAD_TOKEN``.
    NOTE(review): continuation pieces use ``SEP_TOKEN`` rather than
    ``PAD_TOKEN``; this is left untouched because changing it would alter the
    label encoding — confirm it matches what the model was trained with.

    Args:
        sentences: Sequence of sentences, each a sequence of words.
        tags: Sized sequence parallel to ``sentences`` holding one label per
            word (presumably integer label ids when labels are requested).
            It is zipped with the words even when labels are not built, so it
            must have the same lengths as ``sentences``.
        in_ou_put: ``1`` to build label ids as well; any other value skips them.
        testy: ``1`` to append ``['[CLS]'] + pieces + '[PAD]'`` filler for each
            sentence to the module-level ``il_tokens`` list.

    Returns:
        Tuple ``(input_id_list, token_type_id_list, attention_mask_list,
        label_id_list)``. The first three are the int32 arrays produced by
        ``pad_sequences``. ``label_id_list`` is such an array as well when
        ``in_ou_put == 1``; otherwise it is the integer ``0``.
    """
    with_labels = in_ou_put == 1

    input_id_list = []
    attention_mask_list = []
    token_type_id_list = []
    len_tokens = []
    len_sentences = []
    # Existing contract: callers get the integer 0 when labels are not requested.
    label_id_list = [] if with_labels else 0

    def _pad(sequences, value=0):
        # truncating="post" is explicit because the Keras default ("pre") would
        # cut the *start* of an over-long sequence, dropping [CLS] and shifting
        # labels against the inputs, which are truncated at the end.
        return pad_sequences(maxlen=MAX_LEN, sequences=sequences, dtype="int32",
                             padding="post", truncating="post", value=value)

    for words, word_labels in tqdm(zip(sentences, tags), total=len(tags)):
        tokens = []
        label_ids = []

        for word, label in zip(words, word_labels):
            word_tokens = tokenizer.tokenize(str(word))
            if not word_tokens:
                # The tokenizer can return nothing for some inputs (e.g. bare
                # whitespace). Emitting a label for such a word would shift
                # every following label by one position.
                continue
            tokens.extend(word_tokens)
            if with_labels:
                # Real label on the first piece, SEP_TOKEN on the remaining pieces.
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))

        if testy == 1:
            # One leading '[CLS]' plus filler up to MAX_LEN entries in total.
            il_tokens.append(['[CLS]'] + tokens + ['[PAD]'] * (MAX_LEN - 1 - len(tokens)))

        if with_labels:
            # Diagnostics report the untruncated length, including the two
            # special positions.
            len_tokens.append(len(label_ids) + 2)
            len_sentences.append(len(words))
            # encode_plus gets max_length=MAX_LEN including both special tokens,
            # so keep at most MAX_LEN - 2 piece labels between the special labels.
            label_id_list.append(
                [ESPECIAL_TOKEN] + label_ids[:MAX_LEN - 2] + [ESPECIAL_TOKEN]
            )

        inputs = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_LEN)
        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

    input_id_list = _pad(input_id_list, value=WORD_PAD_TOKEN)
    token_type_id_list = _pad(token_type_id_list)
    attention_mask_list = _pad(attention_mask_list)

    if with_labels:
        # default=0 keeps an empty batch from raising in max().
        print(">>> :", max(len_tokens, default=0))
        print(">>>> :", max(len_sentences, default=0))
        label_id_list = _pad(label_id_list, value=PAD_TOKEN)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [None]:
# Demo sentences (Spanish clinical notes), each split on whitespace into a list of words.
ner_samples = [
    sentence.split()
    for sentence in (
        "CICLO 2 CARBOPLATINO / PACLITAXEL. ",
        "En Agosto de 2015 ha recibido 3 ciclos de CISPLATINO / VINORELBINA buena tolerancia clinica .",
        "QT : CISPLATINO 75 mg / m2 DIA 1 IV + VINORELBINA 25 mg / m2 IV DIAS 1,8 - Adenocarcinoma pulmon lobulo superior derecho ",
        "El dia 27 de junio iniciamos tratamiento con quimioterapia segun esquema CARBOPLATINO / PEMETREXED .",
        "CICLO 1 CARBOPLATINO AUC 5 - PEMETREXED 500 mg/m2 IV cada 21 dias..",
        "RT con dosis 50 Gy, se encuentra bien. ",
        "Carcinoma escamoso de pulmón cT3 cN2 cM0 (al menos estadio IIIB de TNM 8ª ed .",
        "Diagnosticado en marzo de 2016 de Adenoca de pulmón cT2cN2cM1a .",
        "Ha sido diagnosticada de cancer de pulmon en marzo de 2019 .",
        "Inicia tratamiento con Cisplatino + Pemetrexed + Bevacizumab (5 ciclos administrados, ultimo en enero de 2014).",
        "Carcinoma escamoso de pulmón intervenido en marzo 2017 .",
        "En 2014, intervenido de carcinoma de pulmón pT2bN1cM0 realizandose nefrectomia derecha .",
        "carcinoma microcitico de pulmon t4n2m0 en tto quimioterapico: carboplatino / etoposido .",
        "Colico renoureteral derecho con fracaso renal obstructivo en Julio de 2015. ",
    )
]

In [None]:
# convert_to_input zips every sentence with a tag list, so supply one
# placeholder tag per word; the call below passes in_ou_put=0, which skips
# label building.
dummy_y_train = [['-PAD-'] * len(sample_words) for sample_words in ner_samples]

(
    demo_input_ids_train,
    demo_token_ids_train,
    demo_attention_masks_train,
    label_ids_train,
) = convert_to_input(ner_samples, dummy_y_train, 0, 0)

In [None]:
# Sanity check: row length of each encoded array for the first sample, followed
# by the label placeholder returned by convert_to_input.
print(
    len(demo_input_ids_train[0]),
    len(demo_token_ids_train[0]),
    len(demo_attention_masks_train[0]),
    label_ids_train,
)


In [None]:
# Load the SavedModel stored under `save_dir`.
# (The original also kept `tf.keras.models.load_model(save_dir)` as a commented-out alternative.)
new_model = tf.saved_model.load(export_dir=save_dir)
print("BERT model loaded succesfully")


In [None]:
# Run the loaded model on the encoded demo batch.
# NOTE(review): the inputs are passed as [input ids, token type ids, attention masks];
# this order has to match the call signature the SavedModel was exported with — confirm.
demo_prediction = new_model([demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train])

In [None]:
# Index of the highest-scoring entry along the last axis of the model output.
demo_pred_tags = np.argmax(demo_prediction, axis=-1)

In [None]:
# Translate the predicted label ids through the id -> tag dictionary `le_dict`.
# NOTE(review): `logits_to_tokens` lives in the project's `utils` module; presumably it
# returns one list of tag strings per sample — confirm against its definition.
demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)

In [None]:
# For every demo sentence, render an HTML table with the word pieces as headers
# and the predicted tags as the single row.
for sample_idx, sample_words in enumerate(ner_samples):
    # Re-tokenize the sentence word by word, the same way convert_to_input does.
    word_pieces = [
        piece
        for word in sample_words
        for piece in tokenizer.tokenize(str(word))
    ]

    print(sample_words)

    # Skip index 0 (the leading special-token position added during encoding)
    # so the tags line up with the word pieces.
    tag_row = [demo_y_pred[sample_idx][1:len(word_pieces) + 1]]
    table_html = tabulate(tag_row, headers=word_pieces, tablefmt="html")
    display(HTML("<div style='overflow-x: auto; white-space: nowrap;'>" +
                 table_html +
                 "</div>"))