### Modelo de Deep Learning para NER con BiLSTM y CRF: model-03-tfa-fastText-cbow-(scielo-wiki-cased)-lemma-postag

### Definición de Parámetros e Hiperparámetros del Modelo

In [None]:
import sys
sys.path.append('../libs')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules
from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

# Seed every random source this file touches (`random`, NumPy, TensorFlow) with
# the same value so repeated runs start from the same random state.
SEED = 42
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here does not change hashing in the already-running process.
os.environ['PYTHONHASHSEED']=str(SEED)
# Presumably requests deterministic cuDNN kernels — TODO confirm for the TF version in use.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'  # TF 2.1+
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Name of this experiment; it is reused as the sub-directory name for both the
# TensorBoard logs and the saved model artefacts.
MODEL = 'model-03-tfa-fastText-cbow-(scielo-wiki-cased)-lemma-postag'

logs_base_dir = "../logs"
log_dir = f"{logs_base_dir}/{MODEL}"

save_base_dir = '../model-save'
save_dir = f"{save_base_dir}/{MODEL}"

# Create the output directories up front; existing ones are left untouched.
for _directory in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(_directory, exist_ok=True)

%load_ext tensorboard

# ****** PARAMETER DEFINITIONS ******
# Vocabulary-style sizes are written as "<count> + 2"; the two extra slots are
# presumably reserved indices (e.g. padding / unknown) — TODO confirm.
NUM_WORDS = 12154 + 2
LEN_SENTS = 153
NUM_TAGS = 19 + 2
NUM_LEMA = 9189 + 2
NUM_POSTAG = 16 + 2

# ****** HYPERPARAMETER DEFINITIONS ******
_EPOCHS = 30
EMBED_DIM = 300
CHAR_EMBEDD = 50
_DROPOUT = 0.5
REC_DROPOUT = 0.1
LEARN_RATE = 0.001
N_TRAIN = 10_000
EP_DECAY = 1e-8
BETA_1 = 0.9
BETA_2 = 0.999
_BACH_SIZE = 500
VAL_SPLIT = 0.1
# Whole number of batches that fit in N_TRAIN samples.
STEPS_PER_EPOCH = N_TRAIN // _BACH_SIZE


# Enable on-demand ("growth") memory allocation on every visible GPU, then
# report how many physical and logical GPUs TensorFlow sees.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # The error is only printed, so execution continues without memory growth.
        print(e)

In [None]:
# Load the pickled test sentences and print the first one as a quick check.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artefacts produced by a trusted pipeline.
test_sentences = []
with open("../vectors/sentences_test.txt", "rb") as fp:
    test_sentences = pickle.load(fp)

print(test_sentences[0])

## ********** Sentences ********** ##
# The index maps are stored as pickled objects inside .npy files; `.item()`
# unwraps the single stored object (presumably a dict — TODO confirm).
# NOTE(review): allow_pickle=True unpickles the file content; load trusted files only.
word2idx = np.load('../vectors/word2index.npy', allow_pickle=True).item()
idx2tag  = np.load('../vectors/index2tag.npy', allow_pickle=True).item()

X_train = np.load('../vectors/X_train.npy')
X_test  = np.load('../vectors/X_test.npy')
X_dev   = np.load('../vectors/X_dev.npy')


## ********** Lemma ********** ##
feature_lemma_to_idx = np.load('../vectors/features/index/lema_to_index.npy', allow_pickle=True).item()
#feature[lemma]_to_tag  = np.load('../vectors/features index2tag.npy', allow_pickle=True).item()

X_lemma_train = np.load('../vectors/features/input_train/X_lema_train.npy')
X_lemma_test  = np.load('../vectors/features/input_test/X_lema_test.npy')
X_lemma_dev   = np.load('../vectors/features/input_dev/X_lema_dev.npy')


## ********** POS tag ********** ##
feature_pos_to_idx = np.load('../vectors/features/index/postag_to_index.npy', allow_pickle=True).item()
#feature[pos]_to_tag  = np.load('../vectors/features index2tag.npy', allow_pickle=True).item()

X_pos_train = np.load('../vectors/features/input_train/X_postag_train.npy')
X_pos_test  = np.load('../vectors/features/input_test/X_postag_test.npy')
X_pos_dev   = np.load('../vectors/features/input_dev/X_postag_dev.npy')


## ********** Targets ********** ##
y_train  = np.load('../vectors/y_train.npy')
y_test   = np.load('../vectors/y_test.npy')
y_dev    = np.load('../vectors/y_dev.npy')

## Pruebas de carga de datos

In [None]:
#print('**** Diccionario de palabras: ****\n')
#for key, value in feature_lemma_to_idx.items():
#    if value == 10:
#        break
#    else:
#        print(key, ' : ', value)
        
#print(X_train[0], "\n")
#print(X_lemma_train[0], "\n")
#print(X_pos_train[0], "\n")
# Print the length of the first sample of each model input (words, lemmas,
# POS tags); the three values are expected to match.
for _features in (X_train, X_lemma_train, X_pos_train):
    print(len(_features[0]))


#print(y_train[0])
#print(len(y_train))
#print(len(y_test))
#print(len(y_dev))

### Se carga el embedding de Palabras

In [None]:
# Build the word embedding matrix from the Scielo+Wikipedia CBOW (cased) vectors.
# `bme` is the project helper `utils.build_matrix_embeddings`; presumably it
# returns a (NUM_WORDS, EMBED_DIM) matrix indexed through word2idx — TODO confirm.
file1 = '../../../embedding/others/biomed_emb/scielo_wikipedia/cbow/cased/Scielo+Wiki_cbow_cased.vec'
embedding_matrix = bme(file1, NUM_WORDS, EMBED_DIM, word2idx)

### Se carga el embedding de Lemmas

In [None]:
# Build the lemma embedding matrix from the corpus-specific fastText CBOW vectors.
# `bme` is the project helper `utils.build_matrix_embeddings`; presumably it
# returns a (NUM_LEMA, EMBED_DIM) matrix indexed through feature_lemma_to_idx — TODO confirm.
file2 = '../../../embedding/lung_cancer_corpus/lemma_fasttext_cbow_model_300.txt'
lemma_embedding_matrix = bme(file2, NUM_LEMA, EMBED_DIM, feature_lemma_to_idx)

### Se carga el embedding de Postag

In [None]:
# Build the POS-tag embedding matrix from the corpus-specific fastText CBOW vectors.
# `bme` is the project helper `utils.build_matrix_embeddings`; presumably it
# returns a (NUM_POSTAG, EMBED_DIM) matrix indexed through feature_pos_to_idx — TODO confirm.
file3 = '../../../embedding/lung_cancer_corpus/postag_fasttext_cbow_model_300.txt'
postag_embedding_matrix = bme(file3, NUM_POSTAG, EMBED_DIM, feature_pos_to_idx)

In [None]:
#print(embedding_matrix[4], "\n")

In [None]:
#print(postag_embedding_matrix[4], "\n")

In [None]:
#print(lemma_embedding_matrix[4], "\n")

## Definición del Modelo

In [None]:
# Build and compile the three-input (words, lemmas, POS tags) BiLSTM-CRF tagger.
# Everything is created inside a MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # --- Branch 1: word indices ---
    # Input sentences layer: one integer index per token position.
    input1 = Input(shape=(LEN_SENTS,), dtype='int32')

    # Word embedding layer: initialised from `embedding_matrix`, frozen
    # (trainable=False), with index 0 treated as padding (mask_zero=True).
    sentences = Embedding(NUM_WORDS,
                          EMBED_DIM,
                          input_length=LEN_SENTS,
                          weights=[embedding_matrix],
                          trainable=False,
                          mask_zero=True)(input1)

    drp_sentences = Dropout(_DROPOUT, name='dropout_sentences')(sentences)

    # --- Branch 2: lemma indices ---
    input2 = Input(shape=(LEN_SENTS,), dtype='int32')

    # Lemma embedding layer: same configuration as the word embedding, but
    # initialised from `lemma_embedding_matrix`.
    lemma = Embedding(NUM_LEMA,
                      EMBED_DIM,
                      input_length=LEN_SENTS,
                      weights=[lemma_embedding_matrix],
                      trainable=False,
                      mask_zero=True,
                      name='lemma_embedding')(input2)

    drp_lemma = Dropout(_DROPOUT, name='dropout_lemmas')(lemma)

    # --- Branch 3: POS-tag indices ---
    input3 = Input(shape=(LEN_SENTS,), dtype='int32')

    # POS-tag embedding layer.
    # NOTE(review): unlike the two embeddings above, `trainable=False` is not
    # passed and `mask_zero` is commented out — confirm this is intentional.
    postag = Embedding(NUM_POSTAG,
                       EMBED_DIM,
                       input_length=LEN_SENTS,
                       weights=[postag_embedding_matrix],
                       #mask_zero=True,
                       name='postag_embedding')(input3)

    drp_postag = Dropout(_DROPOUT, name='dropout_postag')(postag)

    # Merge the three branches by concatenating along axis 2 (the feature axis).
    mrg_cncat = Concatenate(axis=2)([drp_sentences, drp_lemma, drp_postag])

    # BI-LSTM layer: EMBED_DIM * 3 units per direction, returning the full
    # sequence so every token position gets an output.
    myModel = Bidirectional(LSTM(EMBED_DIM * 3, 
                                 return_sequences=True
                                ),
                            name='bilstm1')(mrg_cncat)

    # Per-timestep dropout followed by two dense projections; the second one
    # reduces to NUM_TAGS values per token.
    # NOTE(review): the final projection uses 'relu', so the values passed to
    # the CRF are non-negative — confirm this is intended.
    myModel  = TimeDistributed(Dropout(_DROPOUT))(myModel)
    myModel  = TimeDistributed(Dense(units=EMBED_DIM * 3, activation='relu'))(myModel)
    myModel  = TimeDistributed(Dense(units=NUM_TAGS, activation='relu'))(myModel)
    
    # CRF layer (project implementation imported from `crfta`); it also
    # supplies the loss and accuracy used at compile time.
    crf= crf4(NUM_TAGS, sparse_target=True, name='crf_layer')

    merged_chain = crf(myModel)

    #model = Model(inputs=input1, outputs=merged_chain)
    model = Model(inputs=[input1, input2, input3], outputs=merged_chain)
    
    # The optimizer is given by name, so LEARN_RATE / BETA_1 / BETA_2 / EP_DECAY
    # defined earlier in this file are not applied here.
    model.compile(optimizer='Nadam', loss=crf.loss, metrics=[crf.accuracy])

model.summary()

## Entrenamiento del modelo

In [None]:
# Log training to TensorBoard under `log_dir`, writing histograms every epoch.
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train on the three parallel inputs and validate on the dev split after each
# epoch; `history` keeps the per-epoch metrics that are plotted afterwards.
history = model.fit([X_train, X_lemma_train, X_pos_train], y_train,
                      validation_data  = ([X_dev, X_lemma_dev, X_pos_dev], y_dev),
                      batch_size       = _BACH_SIZE, 
                      epochs           = _EPOCHS,
                      verbose          = 2, 
                      callbacks        = [tensorboard_callback])

### Se almacena el modelo

In [None]:
# Persist the trained model as two files sharing the prefix
# "<save_dir>/<MODEL>": the architecture as JSON and the weights as HDF5.
model_json = model.to_json()
_artifact_prefix = f"{save_dir}/{MODEL}"

with open(f"{_artifact_prefix}.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights(f"{_artifact_prefix}.h5")
print("Saved model to disk")

### Evaluamos el modelo y calculamos la exactitud (accuracy) con respecto a los datos de prueba

In [None]:
# Evaluate on the test split. `scores[1]` is printed as a percentage under the
# name Keras reports at index 1 of `model.metrics_names`
# (index 0 is presumably the loss — TODO confirm).
scores = model.evaluate([X_test, X_lemma_test, X_pos_test], y_test)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

### Definimos la función que nos servirá para graficar el comportamiento del modelo en cada época del entrenamiento

In [None]:
def plot_model_performance(train_loss, train_acc, train_val_loss, train_val_acc):
    """Draw loss and accuracy curves side by side, one point per epoch.

    Args:
        train_loss: per-epoch training loss values.
        train_acc: per-epoch training accuracy values.
        train_val_loss: per-epoch validation loss values.
        train_val_acc: per-epoch validation accuracy values.

    The figure is drawn on the current pyplot state; nothing is returned and
    nothing is saved to disk.
    """
    training_color = '#34495E'
    validation_color = '#2ECC71'
    title_color = '#E23B13'

    # One entry per panel:
    # (y-axis label, training series, validation series, legend corner, title)
    panels = (
        ('loss', train_loss, train_val_loss,
         'upper right', 'Model loss through #epochs'),
        ('accuracy', train_acc, train_val_acc,
         'lower right', 'Model accuracy through #epochs'),
    )

    plt.figure(figsize=(18, 6))
    for position, panel in enumerate(panels, start=1):
        metric_label, training_series, validation_series, legend_corner, title = panel

        plt.subplot(1, 2, position)
        # Epochs are numbered from 1 on the x axis.
        plt.plot(range(1, len(training_series) + 1), training_series,
                 training_color, linewidth=5, label='training')
        plt.plot(range(1, len(validation_series) + 1), validation_series,
                 validation_color, linewidth=5, label='validation')
        plt.xlabel('# epoch')
        plt.ylabel(metric_label)
        plt.tick_params('y')
        plt.legend(loc=legend_corner, shadow=False)
        plt.title(title, color=title_color, fontweight='bold')

### Procedemos a graficar el comportamiento del entrenamiento, tanto del conjunto de entrenamiento como del de validación, con respecto a la cantidad de épocas

In [None]:
# Plot the per-epoch curves recorded by `model.fit`.
# NOTE(review): a missing history key silently falls back to an empty list
# (an empty curve); verify that 'viterbi_accuracy' is the name the CRF
# accuracy metric is actually logged under.
plot_model_performance(
    train_loss     = history.history.get('loss', []),
    train_acc      = history.history.get('viterbi_accuracy', []),
    train_val_loss = history.history.get('val_loss', []),
    train_val_acc  = history.history.get('val_viterbi_accuracy', [])
)

### Función que permite convertir índices en tags

In [None]:
def logits_to_tokens(sequences, indexa):
    """Map every index in every sequence to its entry in `indexa`.

    Args:
        sequences: iterable of sequences of indices.
        indexa: lookup supporting `indexa[index]` (e.g. an index -> tag mapping).

    Returns:
        A list with one list per input sequence, holding the looked-up values
        in the same order.
    """
    return [
        [indexa[index] for index in sequence]
        for sequence in sequences
    ]

### Hacemos la predicción sobre el conjunto de pruebas

In [None]:
#print(idx2tag[6])
# Run the trained model on the three test inputs (words, lemmas, POS tags).
prediction = model.predict([X_test, X_lemma_test, X_pos_test])

In [None]:
# Take the arg-max over the last axis of the prediction and map each resulting
# index to its tag string through `idx2tag`.
# NOTE(review): assumes the model output carries one score per tag on its last
# axis — confirm against the CRF layer's inference output.
y_pred = logits_to_tokens(np.argmax(prediction, -1), idx2tag)
#print(y_pred[0])

### Calculamos los valores de precisión, recall, exactitud (accuracy) y F1-score

In [None]:
# Build the gold tag sequences for the test sentences, right-padded with
# '-PAD-' up to LEN_SENTS so they line up position by position with `y_pred`.
# The tag is read from index 1 of each token entry (presumably a
# (word, tag, ...) tuple — verify against the code that pickled the sentences).
# LEN_SENTS replaces the previously hard-coded 153 so the padding length cannot
# drift away from the model's input length. A sentence longer than LEN_SENTS is
# kept at its full length, exactly as before.
y_true = []
for oracion in test_sentences:
    gold_tags = [token[1] for token in oracion]
    padding = ['-PAD-'] * (LEN_SENTS - len(gold_tags))
    y_true.append(gold_tags + padding)

In [None]:
#prub = np.argmax(y_test, axis=-1)
#print(idx2tag)
#print(prub[0])
#print(y_true[0])
#print(len(y_true))
#print(len(y_true[0]))
#print(len(y_true[1]), "\n")

In [None]:
# Flatten the per-sentence tag lists into one token-level list each.
# chain.from_iterable runs in linear time; the previous `sum(lists, [])`
# re-copied the accumulated list on every addition (quadratic in token count).
li1 = list(chain.from_iterable(y_true))
li2 = list(chain.from_iterable(y_pred))

# One row per token position: gold tag next to predicted tag.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2},
                       columns=['Expected', 'Predicted'])

In [None]:
#print('\nclassification_report:\n', seqclarep(y_true, y_pred))

# Overall scores computed by seqeval on the tag sequences.
# NOTE(review): y_true / y_pred still contain the padding positions — confirm
# this is the intended basis for the reported accuracy.
print(f"precision: {precision_score(y_true, y_pred):.1%}")
print(f"   recall: {recall_score(y_true, y_pred):.1%}")
print(f" accuracy: {accuracy_score(y_true, y_pred):.1%}")
print(f" F1-score: {f1_score(y_true, y_pred):.1%}")

### Calculamos las métricas de cada una de las etiquetas por separado

In [None]:
# Per-label text report from scikit-learn's classification_report (imported as
# `eskclarep`), computed on the flattened token-level columns of `results`.
report = eskclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', report)

In [None]:
def report_to_df(report):
    """Parse a text classification report into a DataFrame of per-class rows.

    Args:
        report: multi-line report string whose first line holds the metric
            column names and whose last five lines are not class rows.

    Returns:
        A DataFrame with a 'Class Name' column followed by the metric columns
        from the header line; every cell is kept as the string found in the
        report. Blank lines are skipped.
    """
    tokenized_lines = [line.split(' ') for line in report.split('\n')]

    # Splitting on single spaces leaves empty strings between aligned columns;
    # they are dropped everywhere below.
    columns = ['Class Name'] + [token for token in tokenized_lines[0] if token != '']

    class_rows = []
    for tokens in tokenized_lines[1:-5]:
        cells = [token for token in tokens if token != '']
        if cells:
            class_rows.append(cells)

    return pd.DataFrame(data=class_rows, columns=columns)

# Show the per-class rows parsed from the text report.
print(report_to_df(report))

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Token-level confusion matrix of gold vs. predicted tags, drawn as a heatmap.
data = {'y_Actual':    results['Expected'],
        'y_Predicted': results['Predicted']
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
# margins=True appends an 'All' row and column holding the totals.
# NOTE(review): this rebinds `confusion_matrix`, hiding the scikit-learn
# function imported under the same name at the top of the file.
confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins = True)

# The figure size must be configured BEFORE the figure is created; it was
# previously set after the heatmap had been drawn, so it never applied to this
# plot. The rcParams assignment is kept (it remains the default for later
# figures) and a fresh figure is opened so the size takes effect here.
plt.rcParams["figure.figsize"] = (20,10)
plt.figure()

sn.heatmap(confusion_matrix, annot=True)

plt.show()

In [None]:
# Call reset() on every CUDA device that numba lists.
# Presumably meant to release the GPU memory held by this session once the
# experiment is finished — TODO confirm.
from numba import cuda 
devices = cuda.list_devices()
for device in devices:
    device.reset()