In [12]:
import tensorflow as tf

In [1]:
#Python 3.6

In [1]:
import sys
sys.path.append('./libs')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules

from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

# Fix every random source used by the notebook to the same seed so that
# runs are repeatable.
SEED = 42

# Environment flags are written before the seeding calls below.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here may only affect child processes - confirm if it matters.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',  # TF 2.1+
})

# Seed Python's `random`, NumPy and TensorFlow, in that order.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Name of the model; it is also the name of the sub-directory used for its
# logs and for its saved weights.
MODEL         = 'BiLSTM_clinical_embeddings'

# Base directories and the per-model directories derived from them.
logs_base_dir = "./logs"
save_base_dir = './model-save'
log_dir       = f"{logs_base_dir}/{MODEL}"
save_dir      = f"{save_base_dir}/{MODEL}"

# Create each directory if it does not exist yet (no error if it does).
for directory in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(directory, exist_ok=True)

# IPython line magic: loads the `tensorboard` notebook extension.
# Only valid inside an IPython/Jupyter kernel, not in a plain Python script.
%load_ext tensorboard

# ---------------------------------------------------------------------------
# PARAMETER DEFINITIONS
# ---------------------------------------------------------------------------

# Vocabulary size: 12071 entries plus 2 extra ids
# (presumably -PAD- and -OOV-, as in the tag map - TODO confirm).
NUM_WORDS = 12071 + 2
# Fixed sentence length used when padding the input sequences.
LEN_SENTS = 153
# Tag-set size: 30 entity tags plus the -PAD- and -OOV- entries.
NUM_TAGS = 30 + 2

# ---------------------------------------------------------------------------
# HYPERPARAMETER DEFINITIONS
# ---------------------------------------------------------------------------
_EPOCHS = 60
EMBED_DIM = 300  # the original inline note mentions 350 as an alternative
CHAR_EMBEDD = 50
_DROPOUT = 0.5
REC_DROPOUT = 0.1
LEARN_RATE = 0.001
N_TRAIN = 10_000
EP_DECAY = 1e-8  # presumably the optimizer epsilon - TODO confirm
BETA_1 = 0.9
BETA_2 = 0.999
_BACH_SIZE = 500
VAL_SPLIT = 0.1
# Whole number of batches that fit into N_TRAIN samples.
STEPS_PER_EPOCH = N_TRAIN // _BACH_SIZE
# ---------------------------------------------------------------------------


# Enable on-demand ("memory growth") allocation on every visible GPU.
# When no GPU is listed, this whole section is a no-op.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            
        # Report how many physical and logical GPU devices are available.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # The error is only printed, so the notebook keeps running.
        print(e)

In [2]:
# Load the three lookup tables stored as pickled objects inside .npy files;
# `.item()` unwraps the 0-d object array into the stored Python object.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading,
# so these files must come from a trusted source.
word2idx, tag2idx, idx2tag = (
    np.load(vector_path, allow_pickle=True).item()
    for vector_path in (
        'vectors/word2index.npy',
        'vectors/tag2index.npy',
        'vectors/index2tag.npy',
    )
)

In [3]:
# Show the tag -> index mapping (entity tags plus the -PAD- and -OOV- entries).
print(tag2idx)

{'B_FAMILY': 2, 'I_FREQ': 3, 'I_DATE': 4, 'B_SMOKER_STATUS': 5, 'B_RADIOTHERAPY': 6, 'B_DATE': 7, 'B_FREQ': 8, 'I_STAGE': 9, 'I_SMOKER_STATUS': 10, 'I_CANCER_CONCEPT': 11, 'I_DRUG': 12, 'B_SURGERY': 13, 'I_IMPLICIT_DATE': 14, 'I_OCURRENCE_EVENT': 15, 'B_STAGE': 16, 'I_TNM': 17, 'I_SURGERY': 18, 'B_CHEMOTHERAPY': 19, 'I_METRIC': 20, 'B_METRIC': 21, 'B_CANCER_CONCEPT': 22, 'B_QUANTITY': 23, 'B_OCURRENCE_EVENT': 24, 'B_TNM': 25, 'I_FAMILY': 26, 'O': 27, 'B_DRUG': 28, 'B_IMPLICIT_DATE': 29, 'B_INTERVAL': 30, 'I_INTERVAL': 31, '-PAD-': 0, '-OOV-': 1}


In [4]:
# Load the NER model from the SavedModel directory `save_dir`,
# printing a progress message before and after.
print("\nLoading the model NER ...")
model = tf.saved_model.load(f"{save_dir}/")
print("\nModel loaded ...")


Loading the model NER ...

Model loaded ...


In [5]:
#Note: Sentences must be tokenized beforehand (e.g. with NLTK or spaCy for Spanish).
#The model receives a tokenized sentence and returns a named-entity (NER) tag for each token.
#The following sentences have already been tokenized.
#It is recommended to use the NLTK or spaCy tokenizer for the Spanish language.
#NER = Named Entity Recognition
# Raw example sentences (Spanish clinical text), already separated into
# tokens by single spaces.
_NER_SENTENCES = (
    "CICLO 2 CARBOPLATINO / PACLITAXEL . ",
    "En Agosto de 2015 ha recibido 3 ciclos de CISPLATINO / VINORELBINA buena tolerancia clinica .",
    "QT : CISPLATINO 75 mg / m2 DIA 1 IV + VINORELBINA 25 mg / m2 IV DIAS 1,8 - Adenocarcinoma pulmon lobulo superior derecho ",
    "El dia 27 de junio iniciamos tratamiento con quimioterapia segun esquema CARBOPLATINO / PEMETREXED .",
    "CICLO 1 CARBOPLATINO AUC 5 - PEMETREXED 500 mg/m2 IV cada 21 dias..",
    "RT con dosis 50 Gy, se encuentra bien . ",
    "Carcinoma escamoso de pulmón cT3 cN2 cM0 (al menos estadio IIIB de TNM 8ª ed .",
    "Diagnosticado en marzo de 2016 de Adenoca de pulmón cT2cN2cM1a .",
    "Ha sido diagnosticada de cancer de pulmon en marzo de 2019 .",
    "Inicia tratamiento con Cisplatino + Pemetrexed + Bevacizumab (5 ciclos administrados, ultimo en enero de 2014).",
    "Carcinoma escamoso de pulmón intervenido en marzo 2017 .",
    "En 2014, intervenido de carcinoma de pulmón pT2bN1cM0 realizandose nefrectomia derecha .",
    "Carcinoma microcitico de pulmon t4n2m0 en tto quimioterapico: carboplatino / etoposido .",
    "Carcinoma de pulmón SEGÚN PROTOCOLO DEL COLEGIO AMERICANO DE PATOLOGOS . ",
    "Biopsia hepatica 07/05/2019 : bajo control ecografico se localiza masa homogenea tenuemente hipoecogenica de 4,8 x 2,9 x 2,3 cm en segmento ii y se procede a realizar bag ecoguiada de la misma . ",
)

# One token list per sentence; whitespace splitting drops the trailing blanks.
NER_samples = [sentence.split() for sentence in _NER_SENTENCES]
    


#NER_samples =["biopsia hepatica 07/05/2019 : bajo control ecografico se localiza masa homogenea tenuemente hipoecogenica de 4,8 x 2,9 x 2,3 cm en segmento ii y se procede a realizar bag ecoguiada de la misma . ".split()]

In [6]:
# Encode every tokenized sentence as a list of vocabulary ids.
# Tokens are lower-cased before the lookup; any token missing from
# `word2idx` is mapped to the id of the '-OOV-' entry.
oov_id = word2idx['-OOV-']
NER_samples_X = [
    [word2idx.get(token.lower(), oov_id) for token in sentence]
    for sentence in NER_samples
]

# Bring every sequence to exactly LEN_SENTS ids, padding at the end with
# pad_sequences' default value (0).
# truncating='post' makes over-long sentences lose their *last* tokens, so
# position i of the model output always corresponds to token i of the input
# sentence (the default 'pre' truncation would shift the tags relative to
# the tokens when the results are displayed).
NER_samples_X = pad_sequences(
    NER_samples_X,
    maxlen=LEN_SENTS,
    padding='post',
    truncating='post',
)

# The label used to read "Negation:", which is wrong for these NER inputs.
print("Encoded NER samples: \n", NER_samples_X)

Negation: 
 [[ 2926  3770  4158 ...     0     0     0]
 [ 1044  8808  2706 ...     0     0     0]
 [    1  4616  7093 ...     0     0     0]
 ...
 [10469   531  2706 ...     0     0     0]
 [10469  2706   682 ...     0     0     0]
 [ 8187  9923 11041 ...     0     0     0]]


In [7]:
# Run the loaded model on the padded id matrix, then convert the returned
# tensor into a NumPy array.
# NOTE(review): the result is passed on without an argmax, so it presumably
# already holds tag indices - confirm against the saved model's signature.
model_output = model(NER_samples_X)
predictions1 = model_output.numpy()

In [8]:
# Convert the model output into tag strings using the index -> tag table.
# `predictions1` is passed as-is (no argmax is applied here).
# NOTE(review): `logits_to_tokens` comes from the local `utils` module and is
# not shown here; it presumably returns one list of tags per sentence.
log_tokens1 = logits_to_tokens(predictions1, idx2tag)

# Show the first encoded sentence and the tags predicted for it.
# The second label used to read "Negation:", which is wrong for NER output.
print ("Sentence " , NER_samples_X[0])
print("NER tags: \n", log_tokens1[0])

Sentence  [2926 3770 4158 4313 2954  735    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
Negation: 
 ['B_METRIC', 'B_QUANTITY', 'B_DRUG', 'O', 'B_DRUG', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [9]:
# Render one horizontally scrollable HTML table per sample: the header row
# holds the sentence tokens and the single body row holds the predicted tags,
# cut to the sentence length so the padding positions are not shown.
for sample_idx, sample_tokens in enumerate(NER_samples):
    predicted_tags = log_tokens1[sample_idx][:len(sample_tokens)]
    table_html = tabulate([predicted_tags], headers=sample_tokens, tablefmt="html")
    display(HTML(
        "<div style='overflow-x: auto; white-space: nowrap;'>"
        + table_html
        + "</div>"
    ))

CICLO,2,CARBOPLATINO,/,PACLITAXEL,.
B_METRIC,B_QUANTITY,B_DRUG,O,B_DRUG,O


En,Agosto,de,2015,ha,recibido,3,ciclos,de.1,CISPLATINO,/,VINORELBINA,buena,tolerancia,clinica,.
O,B_DATE,I_DATE,I_DATE,O,O,B_QUANTITY,B_METRIC,O,B_DRUG,O,B_DRUG,O,O,O,O


QT,:,CISPLATINO,75,mg,/,m2,DIA,1,IV,+,VINORELBINA,25,mg.1,/.1,m2.1,IV.1,DIAS,"1,8",-,Adenocarcinoma,pulmon,lobulo,superior,derecho
B_DATE,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_METRIC,B_METRIC,B_QUANTITY,O,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_METRIC,O,O,B_QUANTITY,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT


El,dia,27,de,junio,iniciamos,tratamiento,con,quimioterapia,segun,esquema,CARBOPLATINO,/,PEMETREXED,.
O,O,B_DATE,I_DATE,I_DATE,B_OCURRENCE_EVENT,I_OCURRENCE_EVENT,O,B_CHEMOTHERAPY,O,O,B_DRUG,O,B_DRUG,O


CICLO,1,CARBOPLATINO,AUC,5,-,PEMETREXED,500,mg/m2,IV,cada,21,dias..
B_METRIC,B_QUANTITY,B_DRUG,B_DRUG,B_QUANTITY,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_FREQ,I_FREQ,-PAD-


RT,con,dosis,50,"Gy,",se,encuentra,bien,.
O,O,O,B_QUANTITY,B_METRIC,O,O,O,O


Carcinoma,escamoso,de,pulmón,cT3,cN2,cM0,(al,menos,estadio,IIIB,de.1,TNM,8ª,ed,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_TNM,I_TNM,B_METRIC,O,O,B_STAGE,I_STAGE,O,O,B_QUANTITY,B_METRIC,O


Diagnosticado,en,marzo,de,2016,de.1,Adenoca,de.2,pulmón,cT2cN2cM1a,.
B_OCURRENCE_EVENT,O,B_DATE,I_DATE,I_DATE,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_TNM,O


Ha,sido,diagnosticada,de,cancer,de.1,pulmon,en,marzo,de.2,2019,.
O,O,B_OCURRENCE_EVENT,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,B_DATE,I_DATE,I_DATE,O


Inicia,tratamiento,con,Cisplatino,+,Pemetrexed,+.1,Bevacizumab,(5,ciclos,"administrados,",ultimo,en,enero,de,2014).
B_OCURRENCE_EVENT,I_OCURRENCE_EVENT,O,B_DRUG,O,B_DRUG,O,B_DATE,O,B_METRIC,O,B_OCURRENCE_EVENT,O,B_DATE,I_DATE,-PAD-


Carcinoma,escamoso,de,pulmón,intervenido,en,marzo,2017,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_OCURRENCE_EVENT,O,B_DATE,I_DATE,O


En,"2014,",intervenido,de,carcinoma,de.1,pulmón,pT2bN1cM0,realizandose,nefrectomia,derecha,.
O,B_DATE,B_OCURRENCE_EVENT,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O,B_SURGERY,I_SURGERY,O


Carcinoma,microcitico,de,pulmon,t4n2m0,en,tto,quimioterapico:,carboplatino,/,etoposido,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O,O,O,B_DRUG,O,B_DRUG,O


Carcinoma,de,pulmón,SEGÚN,PROTOCOLO,DEL,COLEGIO,AMERICANO,DE,PATOLOGOS,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O,O,-PAD-,O,O,B_DATE,O


Biopsia,hepatica,07/05/2019,:,bajo,control,ecografico,se,localiza,masa,homogenea,tenuemente,hipoecogenica,de,"4,8",x,"2,9",x.1,"2,3",cm,en,segmento,ii,y,se.1,procede,a,realizar,bag,ecoguiada,de.1,la,misma,.
O,O,B_DATE,O,O,O,O,O,O,O,O,O,O,O,B_QUANTITY,O,B_QUANTITY,O,B_QUANTITY,B_METRIC,O,O,O,O,O,O,O,O,O,O,O,O,O,O
