## Modelo Deep learning NER con bilstm y crf - bioEmb_300_skyp

### Definición de Parámetros e Hiperparámetros del Modelo

In [None]:
import sys
sys.path.append('./libs')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules

from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

# Pin every random number generator used by the notebook to one seed so that
# runs are repeatable.
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects child processes - confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',  # TF 2.1+
})

# Seed the Python, NumPy and TensorFlow generators, in that order.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Name of this experiment; used as the sub-directory for logs and saved models.
MODEL = 'base_model_bioEmb_300_skyp'

# Base directories and the per-model directories derived from them.
logs_base_dir = "./logs"
save_base_dir = './model-save'
log_dir = logs_base_dir + "/" + MODEL
save_dir = save_base_dir + "/" + MODEL

# Create every output directory up front; existing directories are left alone.
for directory in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(directory, exist_ok=True)

%load_ext tensorboard

# ****** PARAMETER DEFINITIONS *********
# Size of the embedding vocabulary: 25331 plus 2 extra indices
# (presumably reserved for padding / unknown words - TODO confirm against word2idx).
NUM_WORDS       = 25331 + 2
# Fixed number of tokens per sentence fed to the model.
LEN_SENTS       = 190
# Number of output tags: 9 plus 2 extra indices
# (presumably reserved for special tags such as padding - TODO confirm against tag2idx).
NUM_TAGS        = 9 + 2

# ****** HYPERPARAMETER DEFINITIONS *********
# Number of training epochs.
_EPOCHS         = 75
# Embedding dimension; also reused as the LSTM unit count in the model cell.
EMBED_DIM       = 300
# Dropout rate applied after the embedding and after the BiLSTM.
_DROPOUT        = 0.5
# Training batch size (768).
_BACH_SIZE      = 512 + 256
# Learning rate handed to the Adam optimizer.
_LEARN_RATE     = 1e-2


# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# at start-up. Nothing happens when no GPU is visible.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth flag has to be identical on every GPU,
        # so it is applied to each visible device.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is configured after the GPUs
        # have already been initialized; report it and carry on.
        print(err)

## Se cargan los datos de entrenamiento, validación y prueba

In [None]:
# NOTE(review): pickle.load and np.load(allow_pickle=True) can execute
# arbitrary code while deserializing; only use files from a trusted source.

# Raw test sentences, later used to rebuild the gold tag sequences.
test_sentences = []
with open("./vectors/sentences_test.txt", "rb") as sentences_file:
    test_sentences = pickle.load(sentences_file)

## ********** Lookup dictionaries ********** ##
# Each file holds a pickled object inside an array; .item() unwraps it.
word2idx, tag2idx, idx2tag = (
    np.load(dict_path, allow_pickle=True).item()
    for dict_path in (
        './vectors/word2index.npy',
        './vectors/tag2index.npy',
        './vectors/index2tag.npy',
    )
)

## ********** Sentences (model inputs) ********** ##
X_train, X_test, X_dev = (
    np.load(array_path)
    for array_path in (
        './vectors/X_train.npy',
        './vectors/X_test.npy',
        './vectors/X_dev.npy',
    )
)

## ********** Outputs (tag targets) ********** ##
y_train, y_test, y_dev = (
    np.load(array_path)
    for array_path in (
        './vectors/y_train.npy',
        './vectors/y_test.npy',
        './vectors/y_dev.npy',
    )
)

## Pruebas de carga de datos

In [None]:
# Optional sanity checks on the loaded data. They are kept commented out;
# uncomment individual lines to inspect the vocabulary or the array sizes.
#print('**** Diccionario de palabras: ****\n')
#for key, value in word2idx.items():
#    if value == 10:
#        break
#    else:
#        print(key, ' : ', value)
        
#print(X_train[0])
#print(len(X_train))

#print(y_train[0])
#print(len(y_train))
#print(len(y_test))
#print(len(y_dev))

# Show the tag -> index mapping loaded from disk.
print(tag2idx)

### Se carga el embedding

In [None]:
# Path to the pretrained vectors (per the path: Spanish biomedical, cased, skip-gram, 300-d).
file = './embedding/Biomed/biomedical/cased/skipgram/d300/bio_es.vec'
# Build the embedding weight matrix later passed to the Embedding layer.
# NOTE(review): presumably shaped (NUM_WORDS, EMBED_DIM) with rows ordered by
# word2idx - confirm in utils.build_matrix_embeddings.
embedding_matrix = bme(file, NUM_WORDS, EMBED_DIM, word2idx)

## Definición del Modelo

In [None]:
# Loss object handed to model.compile() in the model cell.
# NOTE(review): the model is wrapped in ModelWithCRFLoss, which presumably
# computes its own CRF loss - confirm whether this object is actually used.
loss      = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Adam optimizer. `learning_rate` is the supported keyword; the former `lr`
# alias is deprecated and is rejected by recent Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=_LEARN_RATE)

In [None]:
# Build the BiLSTM + CRF tagger. Everything that creates model variables is
# placed inside the MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Input Layer: one integer word index per token, LEN_SENTS tokens per sentence.
    input1 = Input(shape=(LEN_SENTS,), dtype='int32')

    # Embedding Layer: initialised from the pretrained matrix and frozen
    # (trainable=False). Index 0 is NOT masked here (mask_zero=False).
    sentences = Embedding(NUM_WORDS,
                        EMBED_DIM,
                        input_length=LEN_SENTS,  
                        weights=[embedding_matrix],
                        trainable=False,
                        mask_zero=False)(input1)

    drp_sentences = Dropout(_DROPOUT, name='dropout_sentences')(sentences)

    # BI-LSTM Layer: returns the full sequence so every token gets an output.
    myModel = Bidirectional(LSTM(EMBED_DIM, 
                                 return_sequences=True
                                ),
                            name='bilstm1')(drp_sentences)
    

    myModel  = Dropout(_DROPOUT, name='dropout_lstm')(myModel)
    # Two per-token dense projections; the second one has NUM_TAGS units.
    myModel  = Dense(units=EMBED_DIM * 2, activation='relu')(myModel)
    myModel  = Dense(units=NUM_TAGS, activation='relu')(myModel)
    
    # NOTE(review): this masks timesteps whose NUM_TAGS ReLU activations are
    # all exactly 0, not timesteps whose input token is padding. Presumably
    # meant to hide padding from the CRF - confirm this is the intended mask.
    myModel  = Masking(mask_value=0.,input_shape=(LEN_SENTS, NUM_TAGS))(myModel)
    
    # CRF output layer over the per-token scores.
    crf = crf6(units=NUM_TAGS, name="ner_crf")
    predictions = crf(myModel)

    base_model = Model(inputs=input1, outputs=predictions)
    # Wrapper from mwrapper; sparse_target=True presumably means integer tag
    # ids rather than one-hot targets - TODO confirm in mwrapper.
    model = ModelWithCRFLoss(base_model, sparse_target=True)
    
    # `optimizer` and `loss` come from the previous cell.
    # NOTE(review): whether the wrapper uses `loss` is not visible here.
    model.compile(optimizer=optimizer, loss=loss)

#model.summary()

## Entrenamiento del modelo

In [None]:
# Log training curves and per-epoch weight histograms to the model's log dir.
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

# Train on the training split, validating on the dev split after each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=_EPOCHS,
    batch_size=_BACH_SIZE,
    validation_data=(X_dev, y_dev),
    callbacks=[tensorboard_callback],
    verbose=1,
)

### se almacena el modelo

In [None]:
# Earlier approach, kept for reference: architecture as JSON plus weights as HDF5.
# serialize model to JSON
#model_json = model.to_json()
#with open(save_dir + "/" + MODEL + ".json", "w") as json_file:
#    json_file.write(model_json)

# serialize weights to HDF5
#model.save_weights(save_dir + "/" + MODEL + ".h5")
#print("Saved model to disk")

# Current approach: save the whole wrapped model into the per-model directory.
model.save(save_dir + "/")
print("Saved model to disk")

### Evaluamos el modelo y calculamos el valor de precision con respecto a los datos de prueba

In [None]:
# Evaluate on the held-out test split and report the second metric as a percentage.
# NOTE(review): index 1 is presumably the accuracy reported by the wrapper - confirm.
scores = model.evaluate(X_test, y_test)
metric_name, metric_value = model.metrics_names[1], scores[1]
print(f"{metric_name}: {metric_value * 100}")

### Procedemos a graficar el comportamiento del entrenamiento, tanto del conjunto de entrenamiento como del de validación, con respecto a la cantidad de épocas

In [None]:
# Plot loss and accuracy per epoch for the training and validation splits.
# Missing keys fall back to an empty list.
# NOTE(review): the validation keys carry a doubled prefix/suffix
# ('val_loss_val', 'val_val_accuracy'); presumably the wrapper's test step
# already names its metrics 'loss_val' / 'val_accuracy' - confirm in mwrapper.
training_log = history.history
plot_model_performance(
    train_loss=training_log.get('loss', []),
    train_acc=training_log.get('accuracy', []),
    train_val_loss=training_log.get('val_loss_val', []),
    train_val_acc=training_log.get('val_val_accuracy', []),
)

### Hacemos la prediccion sobre el conjunto de pruebas

In [None]:
#print(idx2tag[6])
# Run the trained model over the whole test split.
# NOTE(review): the output format (tag ids vs. scores) depends on the CRF
# wrapper - confirm before consuming `prediction` directly.
prediction = model.predict(X_test)

In [None]:
# Map the raw model output to tag strings through idx2tag.
# NOTE(review): presumably yields one list of tag strings per sentence,
# since it is later flattened list-by-list - confirm in utils.logits_to_tokens.
y_pred = logits_to_tokens(prediction, idx2tag)
#print(y_pred[0])

In [None]:
#print(y_pred[0], "\n")
#print(test_sentences[0])

### Hallamos los valores de F1 score, recall, precision

In [None]:
# Rebuild the gold tag sequence of every test sentence and right-pad it with
# '-PAD-' up to LEN_SENTS so it lines up with the model output.
# Each token entry is indexed at position 1 for its tag
# (presumably (word, tag) pairs - confirm against sentences_test.txt).
y_true = []
for sentence in test_sentences:
    gold_tags = [token[1] for token in sentence]
    # A sentence longer than LEN_SENTS gets no padding and keeps its full length.
    padding = ['-PAD-'] * (LEN_SENTS - len(gold_tags))
    y_true.append(gold_tags + padding)

In [None]:
#prub = np.argmax(y_test, axis=-1)
#print(idx2tag)
#print(prub[0])
#print(y_true[0])
#print(len(y_true))
#print(len(y_true[0]))
#print(len(y_true[1]), "\n")

In [None]:
# Flatten the per-sentence tag lists into one token-level list each.
# chain.from_iterable is linear in the number of tokens, whereas
# sum(list_of_lists, []) copies the growing result on every step (quadratic).
li1 = list(chain.from_iterable(y_true))
li2 = list(chain.from_iterable(y_pred))

# Side-by-side table of gold vs. predicted tag for every token position.
# Both lists must have the same length; pandas raises ValueError otherwise.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2},
                       columns=['Expected', 'Predicted'])

In [None]:
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)


# Scores computed by seqeval over the padded tag sequences,
# printed as percentages with one decimal.
for template, metric in (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
):
    print(template.format(metric(y_true, y_pred)))

### Hallamos el calculo de cada una de las etiquetas por separado

In [None]:
# Per-label precision / recall / F1 from scikit-learn at token level,
# converted by report_to_df before printing.
report = eskclarep(results['Expected'], results['Predicted'])
report_table = report_to_df(report)
print(report_table)

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Token-level confusion matrix of gold vs. predicted tags, drawn as a heatmap.
data = {'y_Actual':    results['Expected'],
        'y_Predicted': results['Predicted']
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

# Stored under its own name so the `confusion_matrix` function imported from
# sklearn.metrics is not overwritten by a DataFrame.
# margins=True adds the 'All' row/column with the totals.
confusion_df = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins=True)

# The figure size has to be set BEFORE drawing; setting it afterwards only
# affects later figures. The rcParams default is kept for subsequent cells.
plt.rcParams["figure.figsize"] = (20,10)
fig, ax = plt.subplots(figsize=(20, 10))

# fmt="d" prints the integer counts in full instead of scientific notation.
sn.heatmap(confusion_df, annot=True, fmt="d", ax=ax)

plt.show()

### Conversión de la salida del modelo a una lista de índices de tags

In [None]:
# Convert three sets of raw model outputs into tag strings and show the first
# sentence of each.
# NOTE(review): predictions1/2/3 are not defined in the visible part of this
# notebook; presumably a cell running model.predict on the negation /
# uncertainty / both samples is missing - confirm, otherwise this raises NameError.
log_tokens1, log_tokens2, log_tokens3 = (
    logits_to_tokens(raw_output, idx2tag)
    for raw_output in (predictions1, predictions2, predictions3)
)

print("Negation: \n", log_tokens1[0])
print("\nUncertainty: \n", log_tokens2[0])
print("\nBoth: \n", log_tokens3[0])

## Resultado de Negations: 

In [None]:
# Render one HTML table per negation sample: the sample's tokens are the
# column headers and the predicted tags (cut to the sample length) the only row.
# NOTE(review): negation_samples is not defined in the visible notebook - confirm.
for sample_idx, sample_tokens in enumerate(negation_samples):
    predicted_tags = log_tokens1[sample_idx][:len(sample_tokens)]
    table_html = tabulate([predicted_tags], headers=sample_tokens, tablefmt="html")
    # Wrap the table in a horizontally scrollable container for long sentences.
    display(HTML(
        "<div style='overflow-x: auto; white-space: nowrap;'>" + table_html + "</div>"
    ))

## Resultado de Uncertainty:

In [None]:
# Render one HTML table per uncertainty sample: the sample's tokens are the
# column headers and the predicted tags (cut to the sample length) the only row.
# NOTE(review): uncertainty_samples is not defined in the visible notebook - confirm.
for sample_idx, sample_tokens in enumerate(uncertainty_samples):
    predicted_tags = log_tokens2[sample_idx][:len(sample_tokens)]
    table_html = tabulate([predicted_tags], headers=sample_tokens, tablefmt="html")
    # Wrap the table in a horizontally scrollable container for long sentences.
    display(HTML(
        "<div style='overflow-x: auto; white-space: nowrap;'>" + table_html + "</div>"
    ))

## Resultado de Both:

In [None]:
# Render one HTML table per "both" sample: the sample's tokens are the column
# headers and the predicted tags (cut to the sample length) the only row.
# NOTE(review): both_samples is not defined in the visible notebook - confirm.
for sample_idx, sample_tokens in enumerate(both_samples):
    predicted_tags = log_tokens3[sample_idx][:len(sample_tokens)]
    table_html = tabulate([predicted_tags], headers=sample_tokens, tablefmt="html")
    # Wrap the table in a horizontally scrollable container for long sentences.
    display(HTML(
        "<div style='overflow-x: auto; white-space: nowrap;'>" + table_html + "</div>"
    ))
    # Blank lines between consecutive tables.
    print("\n")