In [None]:
# First, install the following software requirements (uncomment and run once):

#!/opt/conda/bin/python3.7 -m pip install --upgrade pip
#!pip install seqeval
#!pip install tensorflow-addons


In [42]:
# Testing NER
import sys
#sys.path.append('../libs')
sys.path.append('libs/')
import tensorflow as tf
#import keras as k
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential, model_from_json
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules

from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

# ---------------------------------------------------------------------------
# Data parameters
# ---------------------------------------------------------------------------
# NOTE(review): the "+ 2" terms presumably reserve extra indices (e.g. padding
# and out-of-vocabulary) -- confirm against the training code.
NUM_WORDS = 12071 + 2  # input dimension of the word Embedding layer
LEN_SENTS = 153        # fixed sequence length of the model input / padding
NUM_TAGS = 30 + 2      # units of the last Dense layer and size of the CRF layer

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
# Only EMBED_DIM and _DROPOUT are referenced by the cells visible in this
# notebook; the remaining values are presumably training-time settings.
_EPOCHS = 50
EMBED_DIM = 350  # an alternative value of 300 was noted next to this setting
CHAR_EMBEDD = 50
_DROPOUT = 0.5
REC_DROPOUT = 0.1
LEARN_RATE = 0.001
N_TRAIN = 10_000
EP_DECAY = 1e-8
BETA_1 = 0.9
BETA_2 = 0.999
_BACH_SIZE = 500
VAL_SPLIT = 0.1
STEPS_PER_EPOCH = N_TRAIN // _BACH_SIZE
###

from IPython.core.display import display, HTML

import datetime, os
import random


In [43]:
# Alternative locations (disabled), one directory level up:
#word2idx = np.load('../vectors/word2index.npy', allow_pickle=True).item()
#idx2tag  = np.load('../vectors/index2tag.npy', allow_pickle=True).item()

# Load the two lookup objects stored as pickled NumPy files; .item() unwraps
# the single Python object held by each loaded array.
#   word2idx: indexed with lower-cased tokens when the samples are encoded.
#   idx2tag:  passed to logits_to_tokens to turn predicted ids into labels.
# NOTE(security): allow_pickle=True unpickles the file contents, which can
# execute arbitrary code -- only load files from a trusted source.
word2idx = np.load('vectors/word2index.npy', allow_pickle=True).item()
idx2tag  = np.load('vectors/index2tag.npy', allow_pickle=True).item()

In [44]:
# Build the tagging network layer by layer. The trained weights are loaded
# into this model in a later cell, so the layers, their order and their sizes
# must not be changed here without producing a matching weights file.

# One int32 id per token position; every sentence has LEN_SENTS positions.
input1 = Input(shape=(LEN_SENTS,), dtype='int32')

# Word embedding layer. mask_zero=False: id 0 is NOT masked, so every one of
# the LEN_SENTS positions (padding included) flows through the network.
sentences = Embedding(NUM_WORDS,
                      EMBED_DIM,
                      input_length=LEN_SENTS,
                      mask_zero=False)(input1)

# Dropout applied to the embedded sentence
drp_sentences = Dropout(_DROPOUT, name='dropout_sentences')(sentences)

# Bidirectional LSTM with EMBED_DIM units per direction; return_sequences=True
# keeps one output vector per token position.
myModel = Bidirectional(LSTM(EMBED_DIM, 
                             return_sequences=True
                            ),
                        name='bilstm1')(drp_sentences)

# Per-position head: dropout, a ReLU dense layer of width EMBED_DIM * 2 and a
# ReLU dense layer with one unit per tag.
# NOTE(review): ReLU on the layer feeding the CRF restricts its inputs to
# non-negative values -- confirm this matches the training setup.
myModel  = TimeDistributed(Dropout(_DROPOUT))(myModel)
myModel  = TimeDistributed(Dense(units=EMBED_DIM * 2, activation='relu'))(myModel)
myModel  = TimeDistributed(Dense(units=NUM_TAGS, activation='relu'))(myModel)

# CRF layer (project-local class imported from crfta as crf4).
# NOTE(review): sparse_target=True presumably means integer tag ids as
# targets -- verify against crfta.
crf= crf4(NUM_TAGS,  sparse_target=True, name='crf_layer')

merged_chain = crf(myModel)

# Single-input, single-output model ending in the CRF layer
model = Model(inputs=input1, outputs=merged_chain)

# Loss and accuracy metric are the ones exposed by the CRF layer object
model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])

In [45]:
# Alternative (disabled): rebuild the model from a saved JSON description
# under ../model-save instead of defining the layers in code.
#json_file = open('../model-save/model-00/mb-00.json', 'r')
#loaded_model_json = json_file.read()
#json_file.close()
#loaded_model = model_from_json(loaded_model_json)

# Load the saved weights (HDF5 file) into the model defined in the previous cell
model.load_weights("saved_models/BiLSTM/saved/model04/model04.h5")
print("Loaded model from disk")

Loaded model from disk


In [53]:
# Note: sentences must be tokenized beforehand (e.g. with spaCy for Spanish).
# The model receives a tokenized sentence and returns one NER label per token.
# The sentences below are written with whitespace between tokens, so each one
# is turned into its token list with str.split().
# Using the spaCy tokenizer for Spanish is recommended.
# NOTE(review): a few samples keep punctuation attached to a word (e.g. "Gy,",
# "dias..", "2014).") -- confirm whether that is intentional.

ner_samples = [
    sentence.split()
    for sentence in (
        "CICLO 2 CARBOPLATINO / PACLITAXEL . ",
        "En Agosto de 2015 ha recibido 3 ciclos de CISPLATINO / VINORELBINA buena tolerancia clinica .",
        "QT : CISPLATINO 75 mg / m2 DIA 1 IV + VINORELBINA 25  mg / m2 IV DIAS 1,8 - Adenocarcinoma pulmon lobulo superior derecho ",
        "El dia 27 de junio iniciamos tratamiento con quimioterapia segun esquema CARBOPLATINO / PEMETREXED .",
        "CICLO 1 CARBOPLATINO AUC 5 - PEMETREXED 500 mg/m2 IV cada 21 dias..",
        "RT con dosis 50 Gy, se encuentra bien. ",
        "Carcinoma escamoso de pulmón cT3 cN2 cM0 (al menos estadio IIIB de TNM 8ª ed .",
        "Diagnosticado en marzo de 2016 de Adenoca de pulmón cT2cN2cM1a .",
        "Ha sido diagnosticada de cancer de pulmon en marzo de 2019 .",
        "Inicia tratamiento con Cisplatino + Pemetrexed + Bevacizumab (5 ciclos administrados, ultimo en enero de 2014).",
        "Carcinoma escamoso de pulmón intervenido en marzo 2017 .",
        "En 2014, intervenido de carcinoma de pulmón pT2bN1cM0 realizandose nefrectomia derecha .",
        "carcinoma microcitico de pulmon t4n2m0 en tto quimioterapico: carboplatino / etoposido .",
        "Colico renoureteral derecho con fracaso renal obstructivo en Julio de 2015 . ",
    )
]

In [None]:
# Optional spaCy tokenizer helper (disabled): returns the tokens of a sentence joined by single spaces
#import spacy
#import es_core_news_md
#spacy_tokenizer = es_core_news_md.load()
#
#def word_tokenizer_spacy(textSentence):
#	sentence=""
#	doc =spacy_tokenizer(textSentence)
 
#	for token in doc:
#		sentence= sentence + token.text + " "
#	return sentence


In [54]:
# Encode every tokenized sample as a list of vocabulary ids.
# Tokens are lower-cased before the lookup; tokens missing from word2idx are
# replaced by the id of the '-OOV-' entry. The OOV id is looked up once, so a
# vocabulary without that entry fails immediately instead of mid-loop.
oov_index = word2idx['-OOV-']
ner_samples_ids = [
    [word2idx.get(token.lower(), oov_index) for token in sentence]
    for sentence in ner_samples
]

# Pad (with 0) or truncate every sequence to exactly LEN_SENTS ids.
# Both operations act on the END of the sequence so that position i of a row
# always corresponds to token i of the sample. With the default
# truncating='pre', a sentence longer than LEN_SENTS would lose its first
# tokens and the predicted labels would no longer line up with the tokens.
ner_samples_X = pad_sequences(ner_samples_ids,
                              maxlen=LEN_SENTS,
                              padding='post',
                              truncating='post')


print("Examples: \n", ner_samples_X)



Examples: 
 [[ 2926  3770  4158 ...     0     0     0]
 [ 1044  8808  2706 ...     0     0     0]
 [    1  4616  7093 ...     0     0     0]
 ...
 [ 1044     1  2607 ...     0     0     0]
 [10469   531  2706 ...     0     0     0]
 [ 1628 10337  7713 ...     0     0     0]]


In [50]:
# Run the tagger on the encoded samples.
# NOTE(review): presumably returns one score vector per token position, since
# the next step takes an argmax over the last axis -- TODO confirm the shape.
# NOTE(review): re-run this cell whenever ner_samples_X is rebuilt; the
# recorded execution counts show it last ran before the encoding cell.
predictions1 = model.predict(ner_samples_X)
# Debug helper (disabled): print the raw predictions and their shape
#print("Examples: \n", predictions1, predictions1.shape)


In [55]:
# Pick the highest-scoring tag id at every position (argmax over the last
# axis), then map the ids to tag labels with the idx2tag lookup.
predicted_tag_ids = np.argmax(predictions1, axis=-1)
log_tokens1 = logits_to_tokens(predicted_tag_ids, idx2tag)

# Labels of the first sample. The row covers every position of the padded
# sequence, so entries beyond the sentence length belong to padding.
print("Ner: \n", log_tokens1[0])

Ner: 
 ['B_METRIC', 'B_QUANTITY', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_DRUG', 'B_DRUG', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'B_FREQ', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I_OCURRENCE_EVENT']


In [56]:
# Show one horizontally scrollable HTML table per sample: the tokens are the
# column headers and the single row holds the predicted label of each token.
for sample_idx, tokens in enumerate(ner_samples):
    # Keep only the labels of real tokens; the rest of the row is padding.
    predicted_tags = log_tokens1[sample_idx][:len(tokens)]
    table_html = tabulate([predicted_tags], headers=tokens, tablefmt="html")
    wrapped_html = "".join((
        "<div style='overflow-x: auto; white-space: nowrap;'>",
        table_html,
        "</div>",
    ))
    display(HTML(wrapped_html))

CICLO,2,CARBOPLATINO,/,PACLITAXEL,.
B_METRIC,B_QUANTITY,B_DRUG,O,B_DRUG,O


En,Agosto,de,2015,ha,recibido,3,ciclos,de.1,CISPLATINO,/,VINORELBINA,buena,tolerancia,clinica,.
O,B_DATE,I_DATE,I_DATE,O,O,B_QUANTITY,B_METRIC,O,B_DRUG,O,B_DRUG,O,O,O,O


QT,:,CISPLATINO,75,mg,/,m2,DIA,1,IV,+,VINORELBINA,25,mg.1,/.1,m2.1,IV.1,DIAS,"1,8",-,Adenocarcinoma,pulmon,lobulo,superior,derecho
B_QUANTITY,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_METRIC,B_METRIC,B_QUANTITY,O,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_METRIC,O,B_METRIC,B_QUANTITY,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O,O


El,dia,27,de,junio,iniciamos,tratamiento,con,quimioterapia,segun,esquema,CARBOPLATINO,/,PEMETREXED,.
O,O,B_DATE,I_DATE,I_DATE,B_OCURRENCE_EVENT,I_OCURRENCE_EVENT,O,B_CHEMOTHERAPY,O,O,B_DRUG,O,B_DRUG,O


CICLO,1,CARBOPLATINO,AUC,5,-,PEMETREXED,500,mg/m2,IV,cada,21,dias..
B_METRIC,B_QUANTITY,B_DRUG,O,B_QUANTITY,O,B_DRUG,B_QUANTITY,B_METRIC,O,B_FREQ,I_FREQ,O


RT,con,dosis,50,"Gy,",se,encuentra,bien.
O,O,O,B_QUANTITY,O,O,O,O


Carcinoma,escamoso,de,pulmón,cT3,cN2,cM0,(al,menos,estadio,IIIB,de.1,TNM,8ª,ed,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_TNM,O,B_TNM,O,O,B_STAGE,I_STAGE,O,O,B_QUANTITY,B_METRIC,O


Diagnosticado,en,marzo,de,2016,de.1,Adenoca,de.2,pulmón,cT2cN2cM1a,.
B_OCURRENCE_EVENT,I_OCURRENCE_EVENT,B_DATE,I_DATE,I_DATE,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O


Ha,sido,diagnosticada,de,cancer,de.1,pulmon,en,marzo,de.2,2019,.
B_OCURRENCE_EVENT,O,B_OCURRENCE_EVENT,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,B_DATE,I_DATE,I_DATE,O


Inicia,tratamiento,con,Cisplatino,+,Pemetrexed,+.1,Bevacizumab,(5,ciclos,"administrados,",ultimo,en,enero,de,2014).
B_OCURRENCE_EVENT,I_OCURRENCE_EVENT,O,B_DRUG,O,B_DRUG,O,O,B_QUANTITY,B_METRIC,O,B_OCURRENCE_EVENT,O,B_DATE,I_DATE,I_DATE


Carcinoma,escamoso,de,pulmón,intervenido,en,marzo,2017,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_OCURRENCE_EVENT,O,B_DATE,I_DATE,O


En,"2014,",intervenido,de,carcinoma,de.1,pulmón,pT2bN1cM0,realizandose,nefrectomia,derecha,.
O,B_DATE,B_OCURRENCE_EVENT,O,B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,O,O,B_SURGERY,I_SURGERY,O


carcinoma,microcitico,de,pulmon,t4n2m0,en,tto,quimioterapico:,carboplatino,/,etoposido,.
B_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,I_CANCER_CONCEPT,B_TNM,I_TNM,I_TNM,O,O,O,B_DRUG,O


Colico,renoureteral,derecho,con,fracaso,renal,obstructivo,en,Julio,de,2015,.
O,O,O,O,O,O,O,O,B_DATE,I_DATE,I_DATE,O
