# Modelo Deep learning NER con bilstm y crf - clinicalEmb-300-skyp
## NewDataset Lung Cancer

### Definición de Parámetros e Hiperparámetros del Modelo

In [None]:
import sys
sys.path.append('./libs')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import seaborn as sn
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from sklearn.model_selection import KFold

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules

from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df2

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

In [None]:
# Fix every source of randomness to one seed so runs are repeatable.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'  # TF 2.1+
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# Per-model output locations for logs and saved models.
MODEL         = 'new-clinical-base_model_epoch_60'
logs_base_dir = "./logs"
log_dir       = f"{logs_base_dir}/{MODEL}"
save_base_dir = './model-save'
save_dir      = f"{save_base_dir}/{MODEL}"

for _out_dir in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(_out_dir, exist_ok=True)

#%load_ext tensorboard

# Let TensorFlow allocate GPU memory on demand rather than all at once.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth currently has to be configured identically on every GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is set after the GPUs were initialised.
        print(err)

In [None]:
# ---------------- Dataset parameters ----------------
# The "+ 2" terms presumably reserve slots for padding / out-of-vocabulary
# entries -- TODO confirm against the preprocessing that built the vectors.
NUM_WORDS = 13705 + 2
LEN_SENTS = 306
NUM_TAGS = 37 + 2

# ---------------- Hyperparameters ----------------
_EPOCHS = 50
EMBED_DIM = 300
_DROPOUT = 0.5
_BACH_SIZE = 512 + 256 + 128
_LEARN_RATE = 5e-3
NUM_FOLDS = 7
VAL_SPLIT = 0.2

# Per-fold results, filled in by the training loop and pickled afterwards.
prime_data_fold, total_data_fold = [], []

## Se cargan los datos de Entrenamiento

In [None]:
total_sentences = []

# NOTE(review): both pickle.load and np.load(allow_pickle=True) execute
# arbitrary code embedded in the file -- only load artefacts you produced.
with open("./vectors/sentences_train.txt", "rb") as sentences_file:
    total_sentences = pickle.load(sentences_file)


## ********** Sentences ********** ##
word2idx = np.load('./vectors/word2index.npy', allow_pickle=True).item()
# Reverse lookup: index -> word.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

idx2tag = np.load('./vectors/index2tag.npy', allow_pickle=True).item()

x_inputs = np.load('./vectors/X_train.npy')
print(x_inputs.shape)

## ********** Outputs ********** ##
y_output = np.load('./vectors/y_train.npy')
print(y_output.shape)

In [None]:
# Leftover debugging statements, kept disabled for reference:
#print(type(y_train))
#print(y_train[0])
# Array shapes noted during an earlier run (which split each belongs to is not
# recorded here):
#(11773, 310)
#(2790, 310)
#(3480, 310)
# Quick sanity check: show the first loaded training sentence.
print(total_sentences[0])

## Pruebas de carga de datos

In [None]:
#print('**** Diccionario de palabras: ****\n')
#for key, value in feature_lemma_to_idx.items():
#    if value == 10:
#        break
#    else:
#        print(key, ' : ', value)
        
#print(X_train[0], "\n")
#print(len(X_train))

#print(y_train[0])
#print(len(y_train))
#print(len(y_test))
#print(len(y_dev))

### Se carga el embedding

In [None]:
# Location of the pre-trained word vectors (the path suggests Spanish clinical,
# cased, skip-gram, 300 dimensions -- confirm against the file itself).
file = '../oswaldo-nubes/embedding/Biomed/clinical/cased/skipgram/d300/clinic_es.vec'
# Build the matrix that get_model() passes as the frozen Embedding layer weights.
embedding_matrix = bme(file, NUM_WORDS, EMBED_DIM, word2idx)

## Definición del Modelo

In [None]:
def get_model(print_sumary=0):
    """Build and compile the BiLSTM + CRF sequence tagger.

    The network is: frozen pre-trained Embedding -> Dropout -> bidirectional
    LSTM -> Dropout -> two ReLU Dense layers -> Masking -> CRF. It is created
    under a fresh ``tf.distribute.MirroredStrategy`` scope and wrapped in
    ``ModelWithCRFLoss``.

    Args:
        print_sumary: When equal to 1, print the summary of the underlying
            (unwrapped) Keras model. The training loop passes the fold number
            here, so the summary is shown for the first fold only.

    Returns:
        The compiled ``ModelWithCRFLoss`` wrapper around the base model.
    """
    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        loss      = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras optimizers.
        optimizer = keras.optimizers.Adam(learning_rate=_LEARN_RATE)

        # Input layer: one fixed-length sequence of word indices per sentence.
        input1 = Input(shape=(LEN_SENTS,), dtype='int32')

        # Embedding layer: pre-trained vectors, kept frozen during training.
        sentences = Embedding(NUM_WORDS,
                              EMBED_DIM,
                              input_length=LEN_SENTS,
                              weights=[embedding_matrix],
                              trainable=False,
                              mask_zero=False)(input1)

        drp_sentences = Dropout(_DROPOUT, name='dropout_sentences')(sentences)

        # BiLSTM layer: returns one output vector per time step.
        myModel = Bidirectional(LSTM(EMBED_DIM,
                                     return_sequences=True
                                    ),
                                name='bilstm1')(drp_sentences)

        myModel  = Dropout(_DROPOUT, name='dropout_lstm')(myModel)
        myModel  = Dense(units=EMBED_DIM * 2, activation='relu')(myModel)
        myModel  = Dense(units=NUM_TAGS, activation='relu')(myModel)

        # Masking is applied to the Dense output, not to the embedding
        # (mask_zero=False above): a time step is masked only when all of its
        # NUM_TAGS activations are exactly 0.
        myModel  = Masking(mask_value=0.,input_shape=(LEN_SENTS, NUM_TAGS))(myModel)

        crf = crf6(units=NUM_TAGS, name="ner_crf")
        predictions = crf(myModel)

        base_model = Model(inputs=input1, outputs=predictions)
        model = ModelWithCRFLoss(base_model, sparse_target=True)

        #model.compile(optimizer='adam')
        model.compile(optimizer=optimizer, loss=loss)

    if print_sumary == 1:
        base_model.summary()

    return model

## Entrenamiento del modelo

In [None]:
# K-fold cross-validation: for every fold train a fresh model, evaluate it on
# the held-out part, plot/print the metrics and store the per-fold results.

# Seed the splitter explicitly so fold membership does not depend on how much
# of NumPy's global random state earlier cells have already consumed.
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# np.save does not create missing directories, so make sure the target exists.
dirName = './NER-scoring-data/'
os.makedirs(dirName, exist_ok=True)

fold_no = 1
for train_index, test_index in kfold.split(x_inputs):

    model = get_model(fold_no)

    # Train on this fold's training part; Keras holds out VAL_SPLIT of it
    # for validation.
    history = model.fit([x_inputs[train_index]], y_output[train_index],
                        validation_split = VAL_SPLIT,
                        batch_size       = _BACH_SIZE,
                        epochs           = _EPOCHS,
                        verbose          = 2)

    # Evaluate on the held-out part of the fold.
    scores = model.evaluate([x_inputs[test_index]], y_output[test_index])
    print(f"{model.metrics_names[1]}: {scores[1] * 100}")

    # Plot training vs. validation curves per epoch. The validation key names
    # are presumably those emitted by the ModelWithCRFLoss wrapper -- confirm
    # against mwrapper if the curves come out empty.
    plot_model_performance(
        train_loss     = history.history.get('loss', []),
        train_acc      = history.history.get('accuracy', []),
        train_val_loss = history.history.get('val_loss_val', []),
        train_val_acc  = history.history.get('val_val_accuracy', [])
    )

    # Predict on the held-out sentences and decode to tag strings.
    prediction = model.predict([x_inputs[test_index]])
    y_pred = logits_to_tokens(prediction, idx2tag)

    # Gold tags for the same sentences, padded with '-PAD-' up to LEN_SENTS.
    y_true = []
    for index in test_index:
        # Element 1 of every entry of a sentence is used as its tag.
        row_sent = [lista_palabras[1] for lista_palabras in total_sentences[index]]
        qekk = ['-PAD-'] * LEN_SENTS
        qekk[:len(row_sent)] = row_sent
        y_true.append(qekk)

    # Flatten to token level in linear time (sum(lists, []) is quadratic).
    li1 = list(chain.from_iterable(y_true))
    li2 = list(chain.from_iterable(y_pred))

    results = pd.DataFrame(columns=['Expected', 'Predicted'])
    results['Expected'] = li1
    results['Predicted'] = li2

    #print('\nclassification_report:\n', seqclarep(y_true, y_pred))

    # Entity-level scores: compute each once, then both print and store them.
    info = [
        "precision: {:.1%}".format(precision_score(y_true, y_pred)),
        "   recall: {:.1%}".format(recall_score(y_true,    y_pred)),
        " accuracy: {:.1%}".format(accuracy_score(y_true,  y_pred)),
        " F1-score: {:.1%}".format(f1_score(y_true,        y_pred)),
    ]
    for metric_line in info:
        print(metric_line)

    prime_data_fold.append(info)

    # Per-tag (token-level) report.
    report = eskclarep(results['Expected'], results['Predicted'])
    #print('\nclassification_report:\n', report)

    data = {'y_Actual':    results['Expected'],
            'y_Predicted': results['Predicted']
            }

    df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
    # Named confusion_df so it does not shadow sklearn's confusion_matrix import.
    confusion_df = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins = True)

    # figure.figsize only affects figures created afterwards, so set it and
    # open a new figure *before* drawing the heatmap.
    plt.rcParams["figure.figsize"] = (20,10)
    plt.figure()
    sn.heatmap(confusion_df, annot=True)
    plt.show()

    rep_kfl = report_to_df2(report, '-fol' + str(fold_no))

    total_data_fold.append(rep_kfl)

    print(rep_kfl)

    # Save (y_true, y_pred) of this fold; used later to compute NER Eval scores.
    array_result = []
    array_result.append((y_true, y_pred))
    arrayNER = np.array(array_result)

    fileName =  dirName + "fold-" + str(fold_no) + '.npy'
    np.save(fileName, arrayNER)

    # Increase fold number
    fold_no = fold_no + 1
    
    
    

### se almacena el resultado del modelo

In [None]:
# Persist the per-fold results. The context managers guarantee the files are
# closed (and flushed) even if pickling raises.
with open('prime_data_' + MODEL + '.pkl', 'wb') as output1:
    pickle.dump(prime_data_fold, output1)

with open('total_data_' + MODEL + '.pkl', 'wb') as output2:
    pickle.dump(total_data_fold, output2)

## Ejemplos

In [None]:
# Hand-written example sentences, tokenised on whitespace.
NER_samples = [
    sentence.split()
    for sentence in (
        "Paciente diagnosticado con cáncer de pulmón en marzo de 2019. ",
        "No clínica urinaria ni respiratoria aguda .",
        "Niega HTA, DM y DM .",
        "No se observan signos de insuficiencia cardíaca .",
        "Paciente tratado con 3 ciclos de de carboplatino .",
    )
]

print("Examples: \n", NER_samples)



### Convertimos las Entradas del modelo

In [None]:
# Map every lower-cased token to its vocabulary index; tokens missing from
# word2idx fall back to the '-OOV-' entry (looked up only when needed).
NER_samples_X = [
    [
        word2idx[token.lower()] if token.lower() in word2idx else word2idx['-OOV-']
        for token in sentence
    ]
    for sentence in NER_samples
]

# Right-pad every sentence to the fixed model input length.
NER_samples_X = pad_sequences(NER_samples_X, maxlen=LEN_SENTS, padding='post')

print("Examples: \n", NER_samples_X)



### Se ejecuta la predicción con la entrada de ejemplo en el modelo entrenado

In [None]:
# `model` is whatever the training loop bound last, i.e. the model trained on
# the final fold.
predictions1 = model.predict(NER_samples_X)
print("NER: \n", predictions1, predictions1.shape)


### Conversión de la salida del modelo a una lista de índices de tags

In [None]:
# Decode the raw predictions into tag labels through idx2tag (the helper lives
# in utils; its exact decoding rule is not visible here).
log_tokens1 = logits_to_tokens(predictions1, idx2tag)


# Show the decoded tags of the first example sentence only.
print("NER: \n", log_tokens1[0])


## Resultado de NER: 

In [None]:
# Render one single-row HTML table per example: tokens as column headers,
# predicted tags (cut to the sentence length, dropping padding) underneath.
for tokens, tags in zip(NER_samples, log_tokens1):
    table_html = tabulate([tags[:len(tokens)]], headers=tokens, tablefmt="html")
    wrapped = "<div style='overflow-x: auto; white-space: nowrap;'>" + table_html + "</div>"
    display(HTML(wrapped))