## Modelo de Deep Learning para NER con BiLSTM y CRF - bioEmb_300_skyp

### Definición de Parámetros e Hiperparámetros del Modelo

In [None]:
import sys
sys.path.append('./libs')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules

from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random

# Reproducibility: a single seed shared by every random number generator used.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here is expected to affect only child processes - confirm.
SEED = 42
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',  # TF 2.1+
})
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Output locations; logs and saved models each get a sub-directory named after
# the model.
MODEL = 'base_model_bioEmb_300_skyp'
logs_base_dir = "./logs"
save_base_dir = './model-save'
log_dir = f"{logs_base_dir}/{MODEL}"
save_dir = f"{save_base_dir}/{MODEL}"

# Create the directories up front; exist_ok makes re-running the cell harmless.
for directory in (logs_base_dir, log_dir, save_base_dir, save_dir):
    os.makedirs(directory, exist_ok=True)

%load_ext tensorboard

# ****** PARAMETER DEFINITIONS *********
# NOTE(review): the "+ 2" terms presumably reserve slots for special entries
# such as the '-PAD-' and '-OOV-' keys used further down - confirm against the
# training notebook.
NUM_WORDS       = 25331 + 2   # presumably the vocabulary size - TODO confirm
LEN_SENTS       = 190         # maxlen passed to pad_sequences for words and tags
NUM_TAGS        = 9 + 2       # presumably the tag-set size - TODO confirm

# ****** HYPERPARAMETER DEFINITIONS *********
# None of these is referenced by the cells visible in this notebook section;
# they look like training-time settings kept for reference.
_EPOCHS         = 75
EMBED_DIM       = 300
_DROPOUT        = 0.5
_BACH_SIZE      = 512 + 256
_LEARN_RATE     = 1e-2


# Turn on memory growth for every physical GPU TensorFlow can see, then report
# how many physical and logical GPUs are available. Nothing happens (and
# nothing is printed) when no GPU is found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            
        # Queried only after memory growth has been set on every GPU.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # The error is only printed, so the notebook keeps running.
        print(e)

In [None]:
#Cargar CSV
import pandas as pd
from sklearn import preprocessing

def process_csv(data_path):
    """Read a token-per-row CSV and group it into sentences.

    The file must provide the columns ``Sentence #``, ``Word`` and ``Tag``.
    ``Sentence #`` may be filled in only on the first row of each sentence;
    blank cells inherit the last value seen above them.

    Args:
        data_path: Path (or any other source accepted by ``pandas.read_csv``)
            of the UTF-8 encoded CSV file.

    Returns:
        A ``(sentences, tags)`` tuple of parallel lists: ``sentences[i]`` is
        the list of words of one sentence and ``tags[i]`` the list of their
        tags. Sentences come out in the sorted order of their ``Sentence #``
        key (the ``groupby`` default), which for string keys is lexicographic
        and therefore not necessarily the order of the file.
    """
    df = pd.read_csv(data_path, encoding="utf-8")
    # Series.ffill() replaces fillna(method="ffill"), which has been
    # deprecated since pandas 2.1.
    df["Sentence #"] = df["Sentence #"].ffill()
    # Group once and reuse the grouping for both columns so that words and
    # tags are guaranteed to follow the same sentence order.
    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).tolist()
    tag = by_sentence["Tag"].apply(list).tolist()
    return sentences, tag

## Se cargan los vocabularios y los datos de prueba

In [None]:
## ********** Word and tag lookup tables ********** ##
def _load_mapping(npy_path):
    """Return the Python object stored in a 0-d ``.npy`` array file.

    ``allow_pickle=True`` makes NumPy unpickle the file contents, so this must
    only be pointed at trusted files.
    """
    stored = np.load(npy_path, allow_pickle=True)
    return stored.item()


word2idx = _load_mapping('vectors/word2index.npy')
tag2idx = _load_mapping('vectors/tag2index.npy')
idx2tag = _load_mapping('vectors/index2tag.npy')

In [None]:
# CSV evaluated by this notebook; the commented-out assignments are
# alternative test files that can be swapped in.
test_data_csv = "sentences_test_bio.csv"
#test_data_csv  = "sentences_test.csv"
#test_data_csv = "sentences_test_700.csv"
# Parallel lists: the words of each sentence and the tags of those words.
X_oraciones, y_tags = process_csv(test_data_csv)

In [None]:
# Quick visual check: the tag-to-index table and the first sentence with its tags.
print(tag2idx)
print(X_oraciones[0])
print(y_tags[0])

In [None]:
def _word_index(token):
    """Return the vocabulary index of a lower-cased token, or the '-OOV-' index."""
    key = token.lower()
    return word2idx[key] if key in word2idx else word2idx['-OOV-']


# One list of word indices per sentence, still of variable length.
X_test_unleng = [[_word_index(token) for token in sentence]
                 for sentence in X_oraciones]

# Pad (or truncate) every sentence to LEN_SENTS, adding the padding at the end.
X_test = pad_sequences(X_test_unleng, maxlen=LEN_SENTS, padding='post')

In [None]:
# Gold tags as indices, one list per sentence.
tag_ids = [[tag2idx[tag] for tag in sentence_tags] for sentence_tags in y_tags]

# Same length as the word matrix; padded positions carry the '-PAD-' tag index.
y_test = pad_sequences(
    sequences=tag_ids,
    maxlen=LEN_SENTS,
    padding="post",
    value=tag2idx["-PAD-"],
)

# Back to tag strings, in the form expected by the metric functions below.
y_true = logits_to_tokens(y_test, idx2tag)

In [None]:
#print(X_test[0])
#print(y_test[0])
#print(y_true[0])

## Carga del Modelo

In [None]:
# Load the trained model from the directory built in the first cell
# (save_dir), printing a progress message before and after.
print("\nLoading the model for negation and speculation detection ...")
new_model = tf.saved_model.load(save_dir + "/")
print ("\nModel loaded ...")

In [None]:
#print(idx2tag[6])

# Call the loaded model on the whole padded test matrix in a single pass and
# convert the returned tensor to a NumPy array.
# NOTE(review): presumably one tag index per token - confirm against the
# exported model's call signature.
prediction = new_model(X_test).numpy()

In [None]:
# Convert the model output to tag strings with the same helper used for y_true.
y_pred = logits_to_tokens(prediction, idx2tag)
# Print both lengths so they can be compared by eye; they are presumably
# expected to match (one entry per test sentence) - TODO confirm.
print(len(y_pred))
print(len(y_true))

### Hallamos los valores de F1-score, recall, precisión y accuracy

In [None]:
# Flatten the per-sentence tag lists into one token-level list each.
# chain.from_iterable runs in linear time, whereas sum(list_of_lists, [])
# copies the accumulated list at every step (quadratic in the token count).
li1 = list(chain.from_iterable(y_true))
li2 = list(chain.from_iterable(y_pred))

# One row per token: gold tag next to predicted tag. The constructor raises
# ValueError if the two flattened lists differ in length.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2})

In [None]:
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)


# Overall scores computed from the per-sentence tag lists; each row pairs the
# output template with the metric function that fills it.
metric_rows = (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
)
for template, metric_fn in metric_rows:
    print(template.format(metric_fn(y_true, y_pred)))

### Hallamos el cálculo de cada una de las etiquetas por separado

In [None]:
# Per-tag report from scikit-learn's classification_report (imported as
# eskclarep), computed on the flattened token-level columns.
report = eskclarep(results['Expected'], results['Predicted'])
# report_to_df is a project helper; presumably it parses the text report into
# a DataFrame - TODO confirm.
print(report_to_df(report))

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Token-level confusion matrix of gold vs. predicted tags, drawn as a heatmap.
data = {'y_Actual':    results['Expected'],
        'y_Predicted': results['Predicted']
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
# margins=True appends an "All" row and column holding the totals.
# NOTE: this variable shadows sklearn's confusion_matrix function imported at
# the top of the notebook; the name is kept so that any later cell relying on
# it still works.
confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'], margins = True)

# The figure size has to be configured before the figure is created; setting
# rcParams after drawing (as was done originally) only affects later figures.
# The rcParams assignment is kept so later plots still get the same default.
plt.rcParams["figure.figsize"] = (20,10)
fig, ax = plt.subplots(figsize=(20, 10))

# fmt="d" prints the counts as plain integers; the default annotation format
# (".2g") would render larger counts in scientific notation.
sn.heatmap(confusion_matrix, annot=True, fmt="d", ax=ax)

plt.show()