In [None]:
!/opt/conda/bin/python3.7 -m pip install --upgrade pip
!pip install seqeval

### Modelo Deep Learning NER con BiLSTM y CRF - model-04-tfa-fastText-(Scielo+wiki-cased-cbow)+(char-50)

### Definición de Parámetros e Hiperparámetros del Modelo

In [None]:
import sys
sys.path.append('../input/libsutils')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules
from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

# Reproducibility: every random source used by the notebook shares one seed.
SEED = 42

# Environment flags are written first, exactly as before.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',  # TF 2.1+
})

# Seed the Python, NumPy and TensorFlow generators, in that order.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Run identifier; it is also used as the sub-directory / file name for logs and saved models.
MODEL         = 'model-04-tfa-fastText-(Scielo+wiki-cased-cbow)+(char-50)'

# Base output locations and the per-model directories derived from them.
logs_base_dir = "./logs"
save_base_dir = './model-save'
log_dir       = f"{logs_base_dir}/{MODEL}"
save_dir      = f"{save_base_dir}/{MODEL}"

# Create the output directories (no error if they already exist).
for output_dir in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(output_dir, exist_ok=True)

# Load the `tensorboard` IPython extension (line magic; only valid inside IPython/Jupyter).
%load_ext tensorboard

# ****** PARAMETER DEFINITIONS *********
# Sizes of an earlier configuration, kept for reference:
#   NUM_WORDS = 12154 + 2, LEN_SENTS = 153, NUM_TAGS = 19 + 2
# NOTE(review): the "+ 2" presumably reserves two special entries (e.g. padding
# and out-of-vocabulary) - confirm against the vocabulary/tag index files.
LEN_SENTS = 153          # sentence length used for the model input and for padding
NUM_WORDS = 12071 + 2    # size of the embedding table
NUM_TAGS  = 30 + 2       # number of output classes of the tagger

# ****** HYPERPARAMETER DEFINITIONS *********
# Embedding widths
EMBED_DIM   = 300
CHAR_EMBEDD = 50

# Training loop
_EPOCHS    = 50
_BACH_SIZE = 500
_DROPOUT   = 0.5

# NOTE(review): the values below are not referenced by any other visible cell of
# this notebook (the model is compiled with the 'Nadam' defaults) - confirm
# whether they are still needed.
REC_DROPOUT = 0.1
LEARN_RATE  = 1e-3
EP_DECAY    = 1e-8
BETA_1      = 0.9
BETA_2      = 0.999
VAL_SPLIT   = 0.1
N_TRAIN     = 10_000
STEPS_PER_EPOCH = N_TRAIN // _BACH_SIZE


# Turn on memory growth for every visible GPU and report how many devices exist.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Memory growth must be set before GPUs have been initialized
        print(err)

# Input directories: vectorised corpus files and pretrained embedding files.
path_embeddings1 = '../input/embedding/'
path_vectors     = '../input/new-lung-vectors/'

## Se cargan los datos de entrenamiento, validación y prueba

In [None]:
def _load_lookup(stem):
    """Load a pickled lookup object stored as `<path_vectors><stem>.npy`."""
    return np.load(path_vectors + stem + '.npy', allow_pickle=True).item()


def _load_array(stem):
    """Load a plain NumPy array stored as `<path_vectors><stem>.npy`."""
    return np.load(path_vectors + stem + '.npy')


# SECURITY NOTE: pickle.load / allow_pickle=True can execute arbitrary code;
# only use them on files from a trusted source.
test_sentences = []
print(f"path: {path_vectors}")
with open(f"{path_vectors}sentences_test.txt", "rb") as sentences_file:
    test_sentences = pickle.load(sentences_file)

print(test_sentences[0])

## ********** Sentences ********** ##
# Lookup tables between words/tags and their integer ids.
word2idx = _load_lookup('word2index')
tag2idx  = _load_lookup('tag2index')
idx2tag  = _load_lookup('index2tag')

# Model inputs for the train / test / dev splits.
X_train = _load_array('X_train')
X_test  = _load_array('X_test')
X_dev   = _load_array('X_dev')

# Targets for the train / test / dev splits.
y_train = _load_array('y_train')
y_test  = _load_array('y_test')
y_dev   = _load_array('y_dev')

## Pruebas de carga de datos

In [None]:
# Quick sanity check after loading: number of entries in the training targets.
# (The other ad-hoc checks of this cell - vocabulary preview, first training
# sample, sizes of the test/dev targets - were already commented out.)
n_train_targets = len(y_train)
print(n_train_targets)

### Se carga el embedding de palabras

In [None]:
# Embedding sources: pretrained word vectors and the character-based vectors.
file1 = path_embeddings1 + 'ScieloWiki_cbow_cased.vec'
#file2 ='../input/new-lung-vectors/char_embedding_lung.txt'
file2 = '../input/new-lung-vectors/char_embedding_new.txt'

# One matrix per source, built by the project helper `build_matrix_embeddings`.
# NOTE(review): each call presumably returns a (NUM_WORDS, dim) array - confirm in utils.
word_level_matrix = bme(file1, NUM_WORDS, EMBED_DIM, word2idx)
char_level_matrix = bme(file2, NUM_WORDS, CHAR_EMBEDD, word2idx)

# Join both matrices along axis 1.
embedding_matrix = np.concatenate((word_level_matrix, char_level_matrix), axis=1)

## Definición del Modelo

In [None]:
# Multi-GPU variant (left disabled):
#strategy = tf.distribute.MirroredStrategy()
#with strategy.scope():

# Width of one token vector: word embedding + character embedding.
token_dim = EMBED_DIM + CHAR_EMBEDD

# Input layer: LEN_SENTS int32 ids per sample
input1 = Input(shape=(LEN_SENTS,), dtype='int32')

# Embedding layer: initialised from embedding_matrix and kept frozen
# (trainable=False); mask_zero=True is passed so id 0 is masked.
embedding_layer = Embedding(
    NUM_WORDS,
    token_dim,
    input_length=LEN_SENTS,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True,
)
embedded_sentences = embedding_layer(input1)

# Bi-LSTM layer returning the full output sequence
bilstm_layer = Bidirectional(LSTM(token_dim, return_sequences=True), name='bilstm1')
bilstm_out = bilstm_layer(embedded_sentences)

# Time-distributed head: dropout -> dense (ReLU) -> dense with NUM_TAGS units (ReLU)
head_dropout = TimeDistributed(Dropout(_DROPOUT))(bilstm_out)
head_hidden  = TimeDistributed(Dense(units=token_dim * 2, activation='relu'))(head_dropout)
tag_scores   = TimeDistributed(Dense(units=NUM_TAGS, activation='relu'))(head_hidden)

# CRF layer on top of the per-step scores
crf = crf4(NUM_TAGS, sparse_target=True, name='crf_layer')
crf_output = crf(tag_scores)

# The CRF layer object supplies both the loss and the accuracy metric.
model = Model(inputs=input1, outputs=crf_output)
model.compile(optimizer='Nadam', loss=crf.loss, metrics=[crf.accuracy])

model.summary()

## Entrenamiento del modelo

In [None]:
# TensorBoard callback writing into this run's log directory (histogram_freq=1).
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Training options; the dev split is passed as validation data.
fit_options = dict(
    validation_data=(X_dev, y_dev),
    batch_size=_BACH_SIZE,
    epochs=_EPOCHS,
    verbose=1,
    callbacks=[tensorboard_callback],
)

# Fit the model and keep the returned History object for the plots below.
history = model.fit(X_train, y_train, **fit_options)

### se almacena el modelo

In [None]:
# Architecture -> JSON file named after the run.
model_json = model.to_json()
json_path = f"{save_dir}/{MODEL}.json"
with open(json_path, "w") as json_file:
    json_file.write(model_json)

# Weights -> HDF5 file next to the JSON file.
weights_path = f"{save_dir}/{MODEL}.h5"
model.save_weights(weights_path)
print("Saved model to disk")

### Evaluamos el modelo y calculamos su exactitud (accuracy) con respecto a los datos de prueba

In [None]:
# Peek at the test targets: first element of the first row, then the number of rows.
first_test_target = y_test[0][0]
print(first_test_target, "\n")

n_test_rows = len(y_test)
print(n_test_rows)

In [None]:
# Evaluate on the test split; position 1 of the result pairs with the metric
# named at position 1 of model.metrics_names.
scores = model.evaluate(X_test, y_test)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print(f"{metric_name}: {metric_percent}")

### Graficamos el comportamiento del entrenamiento, tanto en el conjunto de entrenamiento como en el de validación, con respecto a la cantidad de épocas

In [None]:
# Per-epoch curves recorded by Keras; a missing key falls back to an empty list.
training_log = history.history
curves = {
    'train_loss':     training_log.get('loss', []),
    'train_acc':      training_log.get('viterbi_accuracy', []),
    'train_val_loss': training_log.get('val_loss', []),
    'train_val_acc':  training_log.get('val_viterbi_accuracy', []),
}
plot_model_performance(**curves)

### Hacemos la predicción sobre el conjunto de pruebas

In [None]:
# Show the tag stored under id 6 as a quick check of the id -> tag table.
probe_tag_id = 6
print(idx2tag[probe_tag_id])

# Run the trained model on the whole test split.
prediction = model.predict(X_test)

In [None]:
# Take the arg-max over the last axis of the model output, then map the ids to
# tag names with the project helper `logits_to_tokens`.
predicted_tag_ids = np.argmax(prediction, axis=-1)
y_pred = logits_to_tokens(predicted_tag_ids, idx2tag)

In [None]:
# Compare by eye: predicted tags of the first test sample vs. the first raw test sentence.
first_predicted_tags = y_pred[0]
print(first_predicted_tags, "\n")

first_test_sentence = test_sentences[0]
print(first_test_sentence)

### Hallamos los valores de F1 score, recall, precision

In [None]:
# Build the gold tag sequences for the test sentences, right-padded with
# '-PAD-' to exactly LEN_SENTS entries so every row has the same length.
# Position 1 of each token entry is used as its tag.
#
# A sentence longer than LEN_SENTS used to make its row silently grow beyond
# LEN_SENTS (slice assignment extends the list), which misaligns the gold rows
# with the predictions further down; it is now rejected with a clear message.
# NOTE(review): if over-long sentences must be supported, truncate here on the
# same side that was used when X_test was built - confirm in the preprocessing.
y_true = []
for sent_idx, oracion in enumerate(test_sentences):
    gold_tags = [token[1] for token in oracion]

    if len(gold_tags) > LEN_SENTS:
        raise ValueError(
            "test sentence {} has {} tokens, more than LEN_SENTS={}".format(
                sent_idx, len(gold_tags), LEN_SENTS))

    y_true.append(gold_tags + ['-PAD-'] * (LEN_SENTS - len(gold_tags)))

In [None]:
# Arg-max of the test targets over the last axis (kept for manual inspection),
# followed by the full id -> tag table.
prub = y_test.argmax(axis=-1)
print(idx2tag)
# Further manual checks (first rows / lengths of prub and y_true) were already disabled.

In [None]:
# Flatten the per-sentence tag lists into one token-level list each.
# chain.from_iterable runs in linear time, whereas sum(list_of_lists, [])
# copies the accumulated list on every step (quadratic in the token count).
li1 = list(chain.from_iterable(y_true))
li2 = list(chain.from_iterable(y_pred))

# Token-level table of expected vs. predicted tags, used by the per-tag report
# and the confusion matrix below. A length mismatch raises ValueError.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2},
                       columns=['Expected', 'Predicted'])

In [None]:
# seqeval metrics computed on the sentence-structured tag lists.
# (The seqeval classification report on the flattened columns stays disabled:)
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)

# Output template paired with the metric function that fills it, in print order.
metric_rows = (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
)
for template, metric_fn in metric_rows:
    print(template.format(metric_fn(y_true, y_pred)))

### Calculamos las métricas de cada una de las etiquetas por separado

In [None]:
# Per-tag report from scikit-learn on the flattened token-level columns,
# converted to a table by the project helper `report_to_df`.
expected_tags = results['Expected']
predicted_tags = results['Predicted']
report = eskclarep(expected_tags, predicted_tags)

report_table = report_to_df(report)
print(report_table)

In [None]:
# pandas (pd) and matplotlib.pyplot (plt) are already imported in the setup cell.
import seaborn as sn

# Token-level confusion table (rows: expected tag, columns: predicted tag);
# margins=True adds the row/column totals.
# The table is named `confusion_df` so it no longer shadows the
# `confusion_matrix` function imported from sklearn.metrics.
confusion_df = pd.crosstab(results['Expected'], results['Predicted'],
                           rownames=['Actual'], colnames=['Predicted'],
                           margins=True)

# Size the figure BEFORE drawing: the previous code changed
# plt.rcParams["figure.figsize"] after the heatmap was drawn, so the size only
# took effect when the cell was run a second time.
fig, ax = plt.subplots(figsize=(20, 10))

# fmt='d' prints the counts as plain integers instead of the default short
# float format.
sn.heatmap(confusion_df, annot=True, fmt='d', ax=ax)

plt.show()

## Testing

### Creamos un pequeño Ejemplo

In [None]:
# Hand-written example sentences used for a qualitative check of the tagger.
# Tokens are separated by whitespace in the raw text.
ner_sample_texts = (
    "CICLO 2 CARBOPLATINO / PACLITAXEL . ",
    "En Agosto de 2015 ha recibido 3 ciclos de CISPLATINO / VINORELBINA buena tolerancia clinica .",
    "QT : CISPLATINO 75 mg / m2 DIA 1 IV + VINORELBINA 25 mg / m2 IV DIAS 1,8 - Adenocarcinoma pulmon lobulo superior derecho ",
    "El dia 27 de junio iniciamos tratamiento con quimioterapia segun esquema CARBOPLATINO / PEMETREXED .",
    "CICLO 1 CARBOPLATINO AUC 5 - PEMETREXED 500 mg/m2 IV cada 21 dias..",
    "RT con dosis 50 Gy, se encuentra bien. .",
    "Carcinoma escamoso de pulmón cT3 cN2 cM0 (al menos estadio IIIB de TNM 8ª ed .",
    "Diagnosticado en marzo de 2016 de Adenoca de pulmón cT2cN2cM1a .",
    "Ha sido diagnosticada de cancer de pulmon en marzo de 2019 .",
    "Inicia tratamiento con Cisplatino + Pemetrexed + Bevacizumab (5 ciclos administrados, ultimo en enero de 2014).",
    "Carcinoma escamoso de pulmón intervenido en marzo 2017 .",
    "En 2014, intervenido de carcinoma de pulmón pT2bN1cM0 realizandose nefrectomia derecha .",
    "carcinoma microcitico de pulmon t4 n2 m0 en tto quimioterapico: carboplatino / etoposido .",
    "Colico renoureteral derecho con fracaso renal obstructivo en Julio de 2015 . ",
)

# One token list per sentence (split on whitespace).
ner_samples = [text.split() for text in ner_sample_texts]

print("Ner: \n", ner_samples)



### Convertimos las Entradas del modelo

In [None]:
def _word_id(token):
    """Return the id of the lower-cased token, or the '-OOV-' id when it is unknown."""
    try:
        return word2idx[token.lower()]
    except KeyError:
        return word2idx['-OOV-']


# Encode every sample sentence as a list of word ids.
ner_samples_X = [[_word_id(token) for token in sentence] for sentence in ner_samples]

# Pad at the end so each row has LEN_SENTS entries, as expected by the model input.
ner_samples_X = pad_sequences(ner_samples_X, maxlen=LEN_SENTS, padding='post')


print("Examples: \n", ner_samples_X)



### Se ejecuta la predicción con la entrada de ejemplo en el modelo entrenado

In [None]:
# Run the trained model on the encoded example sentences.
predictions1 = model.predict(ner_samples_X)

# Show the raw model output together with its shape.
predictions1_shape = predictions1.shape
print("Examples: \n", predictions1, predictions1_shape)

### Conversión de la salida del modelo a una lista de índices de tags

In [None]:
# Arg-max over the last axis of the model output, then id -> tag conversion
# with the project helper `logits_to_tokens`.
sample_tag_ids = np.argmax(predictions1, axis=-1)
log_tokens1 = logits_to_tokens(sample_tag_ids, idx2tag)


# Tags predicted for the first example sentence.
print("Ner: \n", log_tokens1[0])

## Resultado de Ner: 

In [None]:
# Render one HTML table per example: the tokens are the column headers and the
# predicted tags (cut to the sentence length, dropping the padding) are the row.
for sample_idx in range(len(ner_samples)):
    tokens = ner_samples[sample_idx]
    tag_row = [log_tokens1[sample_idx][:len(tokens)]]
    table_html = tabulate(tag_row, headers=tokens, tablefmt="html")

    # Horizontally scrollable wrapper so long sentences stay on one line.
    wrapped_html = "".join((
        "<div style='overflow-x: auto; white-space: nowrap;'>",
        table_html,
        "</div>",
    ))
    display(HTML(wrapped_html))