
# Este cuaderno entrena una red BiLSTM + CRF para el reconocimiento de entidades nombradas (NER) sobre el corpus CoNLL-2002. La red recibe como entrada las oraciones del conjunto X de entrenamiento codificadas como índices enteros, y como objetivo los vectores de etiquetas codificados como índices enteros y convertidos a categorías one-hot.

In [1]:
try:
    import seqeval
except ModuleNotFoundError as err:
    !pip install seqeval

In [2]:

import tensorflow as tf
#matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os
# List every file available under the Kaggle input directory, one full path per line.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

In [3]:
import sys
sys.path.append('/kaggle/input/libs2021')
sys.path.append('/kaggle/input/embedding')
sys.path.append('/kaggle/input/libscrf4')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
#from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
#from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules
#from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random



# Descarga del corpus CoNLL-2002 mediante el paquete nltk

In [4]:
import nltk
# Fetch the 'conll2002' resource through nltk's downloader.
nltk.download('conll2002')
# Cell output: the file ids exposed by the corpus reader.
nltk.corpus.conll2002.fileids()

In [5]:
%%time
# Materialise the three Spanish splits as lists of IOB-tagged sentences.
# 'esp.testb' is used as the test set and 'esp.testa' as the evaluation set.
train_sents, test_sents, eval_sents = (
    list(nltk.corpus.conll2002.iob_sents(fileid))
    for fileid in ('esp.train', 'esp.testb', 'esp.testa')
)

# For each split: number of sentences and length of its longest sentence.
for split_sents in (train_sents, test_sents, eval_sents):
    longest = max(split_sents, key=len)
    print(len(split_sents), len(longest))

In [6]:
# Inspect the raw form of the first training sentence.
print(train_sents[0])

# PARTE  1. PREPROCESAMIENTO DE LOS DATOS

In [7]:
def sent2labels(sent):
    """Return the label (third element) of every 3-item entry in *sent*, in order."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every 3-item entry in *sent*, in order."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [8]:
# Cell output: the first token of the first training sentence.
sent2tokens(train_sents[0])[0]
#sent2labels(train_sents[0])[0]

In [9]:
%%time
# Split every sentence of each corpus split into two parallel sequences:
# X_* holds the tokens, y_* holds the labels.
X_train = list(map(sent2tokens, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2tokens, test_sents))
y_test = list(map(sent2labels, test_sents))

X_eval = list(map(sent2tokens, eval_sents))
y_eval = list(map(sent2labels, eval_sents))

In [10]:
# Show the tokens and the labels of the third training sentence.
print(X_train[2])
print(y_train[2])

In [11]:
import numpy as np

# Collect the vocabulary (lower-cased word forms) and the tag inventory over
# all three splits. Because eval and test words are included, every corpus
# word has an id and the '-OOV-' id is only reached by text from outside it.
words = {w.lower() for s in (X_train + X_eval + X_test) for w in s}
tagsss = {t for ts in (y_train + y_eval + y_test) for t in ts}

# Ids 0 and 1 are reserved, real entries start at 2. The entries are sorted
# before numbering so the mapping is reproducible: iterating a set of strings
# directly can yield a different order in every interpreter session, which
# would make the ids saved below inconsistent with a mapping rebuilt later.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 2 for i, t in enumerate(sorted(tagsss))}
tag2index['-PAD-'] = 0  # The special value used for padding
tag2index['-OOV-'] = 1  # The special value used for unknown tags

print(len(word2index))
print(len(tag2index))

# Persist both mappings next to the notebook output.
# NOTE(review): the saved objects are dicts, so reloading them presumably
# needs np.load(..., allow_pickle=True).item() — confirm in the consumer.
np.save("/kaggle/working/word2index.npy", word2index)
np.save("/kaggle/working/tag2index.npy", tag2index)
print(tagsss)

In [12]:
def _encode_sequences(sequences, index, lowercase=False):
    """Map every item of every sequence to its integer id in *index*.

    Args:
        sequences: iterable of sequences of strings (tokens or tags).
        index: dict from string to integer id; must contain the key '-OOV-'.
        lowercase: when true, each item is lower-cased before the lookup
            (the word vocabulary is keyed by lower-cased forms).

    Returns:
        A list of lists of integer ids, parallel to *sequences*. Items that
        are missing from *index* are replaced by the '-OOV-' id.
    """
    oov_id = index['-OOV-']
    return [
        [index.get(item.lower() if lowercase else item, oov_id) for item in sequence]
        for sequence in sequences
    ]


# Words are looked up lower-cased; tags are looked up verbatim.
train_sentences_X = _encode_sequences(X_train, word2index, lowercase=True)
eval_sentences_X = _encode_sequences(X_eval, word2index, lowercase=True)
test_sentences_X = _encode_sequences(X_test, word2index, lowercase=True)

train_tags_y = _encode_sequences(y_train, tag2index)
eval_tags_y = _encode_sequences(y_eval, tag2index)
test_tags_y = _encode_sequences(y_test, tag2index)


# Las matrices de etiquetas contienen índices pequeños porque solo hay 11 etiquetas: las 9 del corpus ({PER, ORG, LOC, MISC} × {B, I}, más O) y las 2 especiales (-PAD- y -OOV-).

In [13]:
# Report, in a fixed order (sentences first, then tags; train/eval/test),
# the number of rows of every encoded matrix and then its first row.
print("Longitudes de las Matrices:")
for matrix in (train_sentences_X, eval_sentences_X, test_sentences_X,
               train_tags_y, eval_tags_y, test_tags_y):
    print(len(matrix))

print("\nMuestra de Datos presentes en las Matrices con las transformaciones:\n")

for matrix in (train_sentences_X, eval_sentences_X, test_sentences_X,
               train_tags_y, eval_tags_y, test_tags_y):
    print(matrix[0])


# Se normalizan las matrices a un ancho fijo de MAX_LENGTH columnas para que todas tengan el mismo número de columnas; MAX_LENGTH corresponde a la longitud máxima de oración encontrada anteriormente, y a las oraciones más cortas se les agregan ceros a la derecha en las posiciones faltantes.

In [14]:

# Fixed number of columns of every padded matrix.
# NOTE(review): pad_sequences also truncates rows longer than MAX_LENGTH —
# confirm 202 is at least the longest sentence length printed at load time.
MAX_LENGTH = 202

# Right-pad ('post') every id sequence with zeros up to MAX_LENGTH columns.
(train_sentences_X, eval_sentences_X, test_sentences_X,
 train_tags_y, eval_tags_y, test_tags_y) = (
    pad_sequences(rows, maxlen=MAX_LENGTH, padding='post')
    for rows in (train_sentences_X, eval_sentences_X, test_sentences_X,
                 train_tags_y, eval_tags_y, test_tags_y)
)

# Show the first row and the shape of each padded matrix.
for padded in (train_sentences_X, eval_sentences_X, test_sentences_X,
               train_tags_y, eval_tags_y, test_tags_y):
    print(padded[0])
    print(padded.shape)

# Persist the padded matrices so later runs can skip the preprocessing.
for target, padded in (
    ("/kaggle/working/train_sentences_X.npy", train_sentences_X),
    ("/kaggle/working/eval_sentences_X.npy", eval_sentences_X),
    ("/kaggle/working/test_sentences_X.npy", test_sentences_X),
    ("/kaggle/working/train_tags_y.npy", train_tags_y),
    ("/kaggle/working/eval_tags_y.npy", eval_tags_y),
    ("/kaggle/working/test_tags_y.npy", test_tags_y),
):
    np.save(target, padded)

In [15]:
def to_categoricals(sequences, categories):
    """One-hot encode integer id sequences.

    Args:
        sequences: iterable of sequences of integer ids.
        categories: length of each one-hot vector.

    Returns:
        ``np.array`` built from one list per sequence, each holding one
        vector of length *categories* per id, with 1.0 at the id's position
        and 0.0 elsewhere.
    """
    encoded = []
    for sequence in sequences:
        rows = []
        for class_id in sequence:
            one_hot = np.zeros(categories)
            one_hot[class_id] = 1.0
            rows.append(one_hot)
        encoded.append(rows)
    return np.array(encoded)

In [16]:


def encode(data):
    """One-hot encode an array of integer class ids, logging shapes before and after.

    The number of classes is inferred from the data as ``max(data) + 1`` and
    the one-hot axis is appended as the last dimension.

    Args:
        data: array-like of non-negative integer class ids, any shape.

    Returns:
        float32 ``np.ndarray`` of shape ``data.shape + (num_classes,)``.
    """
    labels = np.asarray(data)
    print('Shape of data (BEFORE encode): %s' % str(labels.shape))
    # Empty input has no maximum; encode it with zero classes instead of failing.
    num_classes = int(labels.max()) + 1 if labels.size else 0
    # Row i of the identity matrix is the one-hot vector of class i, so
    # indexing it with the id array encodes every position at once.
    encoded = np.eye(num_classes, dtype='float32')[labels.astype('int64')]
    print('Shape of data (AFTER  encode): %s\n' % str(encoded.shape))
    return encoded

# Se realiza la categorización one-hot de las etiquetas o labels de entrenamiento, testeo y validación

In [17]:
# One-hot encode the padded tag ids of each split; the vector width is the
# size of the tag index (special '-PAD-' / '-OOV-' entries included).
n_tag_classes = len(tag2index)
cat_train_tags_y, cat_eval_tags_y, cat_test_tags_y = (
    to_categoricals(split_tags, n_tag_classes)
    for split_tags in (train_tags_y, eval_tags_y, test_tags_y)
)

# Sanity output: one encoded sentence and the number of train / test rows.
print(cat_train_tags_y[1])
for encoded_split in (cat_train_tags_y, cat_test_tags_y):
    print(len(encoded_split))

# PARTE 2. ENTRENAMIENTO DEL MODELO DE RED.

In [18]:

# Width of each word vector; also used below as the Embedding output size.
EMBED_DIM=300
# Path of the vector file (.vec) read by the helper below.
file = '/kaggle/input/fasttext-spanish/cc.es.300.vec/cc.es.300.vec'
# bme is utils.build_matrix_embeddings, a project helper not visible here.
# NOTE(review): presumably returns a (len(word2index), EMBED_DIM) matrix whose
# row i is the vector of the word with id i — confirm against utils.
embedding_matrix = bme(file, len(word2index), EMBED_DIM, word2index)

In [19]:
from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules
# Build the BiLSTM + CRF tagger: frozen pre-trained embeddings -> BiLSTM ->
# dropout -> two dense layers -> CRF, then wrap it so the CRF loss is used.
# One integer word id per position; note that `input` shadows the builtin
# input() for the rest of the session.
input = Input(shape=(MAX_LENGTH,))
# Used below as the number of LSTM units per direction.
word_embedding_size = 300

# Embedding Layer
# (An earlier variant, since disabled, used an embedding without pre-trained
# weights and with mask_zero=False.)

# Initialised from embedding_matrix and frozen (trainable=False);
# mask_zero=True marks id 0 (the '-PAD-' id) as padding.
model = Embedding(len(word2index),
                        EMBED_DIM,
                        input_length=MAX_LENGTH,  
                        weights=[embedding_matrix],
                        trainable=False,
                        mask_zero=True)(input)

# BI-LSTM Layer
# return_sequences=True keeps one output vector per position.
model = Bidirectional(LSTM(units=word_embedding_size, 
                     return_sequences=True, 
                     dropout=0.5, 
                     recurrent_dropout=0.5))(model)
model  = Dropout(0.5, name='dropout_lstm')(model)
model  = Dense(units=EMBED_DIM * 2, activation='relu')(model)
# One value per tag and position, fed to the CRF layer below.
model  = Dense(units=len(tag2index), activation='relu')(model)

# NOTE(review): this Masking layer masks positions whose Dense outputs are all
# 0., which is not the same condition as "word id is padding"; confirm it does
# not replace or conflict with the mask produced by mask_zero=True above.
model  = Masking(mask_value=0.,input_shape=(MAX_LENGTH, len(tag2index)))(model)

# crf6 is tf2crf.CRF (imported in this cell).
crf = crf6(units=len(tag2index), name="ner_crf")
predictions = crf(model)

base_model = Model(inputs=input, outputs=predictions)
# From here on `model` is the wrapped Keras model, no longer a layer output.
# NOTE(review): sparse_target=True is set here while the training cell passes
# the one-hot cat_*_tags_y arrays — confirm which target format
# ModelWithCRFLoss (project module mwrapper) expects.
model = ModelWithCRFLoss(base_model, sparse_target=True)

# No loss is passed; presumably the wrapper supplies the CRF loss — TODO confirm.
model.compile(optimizer='adam')

In [20]:
# Train for 20 epochs on the padded training ids, validating on the eval split.
# NOTE(review): the targets are the one-hot cat_*_tags_y arrays while the model
# wrapper was created with sparse_target=True — confirm the expected format.
history= model.fit(train_sentences_X, cat_train_tags_y,
                       validation_data=(eval_sentences_X, cat_eval_tags_y),
                       batch_size=128, 
                       epochs=20,
                       verbose=2)

In [21]:
# Show the tag -> id mapping for reference when reading the predictions.
print(tag2index)
# Run the trained model on the padded test sentences.
y_pred= model.predict(test_sentences_X)
print(y_pred.shape)

In [22]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
# Invert tag2index so ids can be turned back into tag strings.
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
# logits_to_tokens is a project helper from utils (not visible here);
# presumably maps each predicted position to its tag string — TODO confirm.
y1_pred = logits_to_tokens(y_pred, index2tag)
# Spot-check the decoded tags of one test sentence.
print(y1_pred[10])

In [23]:
#print(Y_test[4])
# Shape of the padded gold tag-id matrix for the test split.
print(test_tags_y.shape)

In [24]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
# Invert tag2index so ids can be turned back into tag strings.
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
# NOTE(review): test_tags_y holds padded integer tag ids, not per-tag scores;
# confirm that logits_to_tokens (project helper in utils) handles id input.
y1_true = logits_to_tokens(test_tags_y, index2tag)
# Spot-check the gold tags of the same sentence shown for the predictions.
print(y1_true[10])

In [25]:
!pip install seqeval

In [26]:
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)
from seqeval.metrics import classification_report as seqclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
# Print each seqeval metric for gold vs. predicted tag sequences as a percentage.
# NOTE(review): if y1_true / y1_pred still contain the padded positions, the
# scores (accuracy in particular) include padding — confirm.
for template, metric in (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
):
    print(template.format(metric(y1_true, y1_pred)))

In [27]:
import pandas as pd
from itertools import chain

# Flatten the per-sentence tag lists into one token-level list each.
# chain.from_iterable is linear in the number of tokens, whereas
# sum(list_of_lists, []) re-copies the growing result for every sentence.
li1 = list(chain.from_iterable(y1_true))
li2 = list(chain.from_iterable(y1_pred))

# One row per token: gold tag in 'Expected', predicted tag in 'Predicted'.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2})

In [28]:
from sklearn.metrics import classification_report as eskclarep
# Token-level classification report (eskclarep is sklearn's classification_report).
report = eskclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', report)

# report_to_df is a project helper from utils (not visible here); presumably
# converts the text report into a DataFrame — TODO confirm.
print(report_to_df(report))

In [29]:
# Two hand-written Spanish sentences, split on whitespace, used to try the
# trained model on text from outside the corpus.
raw_samples = (
    "James Rodriguez es el jugador colombiano más importante con Radamel Falcao.",
    " Jugadores de la selección Colombia que juegan en el Reino Unido",
)
test_samples = [sentence.split() for sentence in raw_samples]
#print(max(test_samples))
print(test_samples)

In [30]:
def _sample_word_id(token):
    """Return the vocabulary id of *token* (lower-cased), or the '-OOV-' id when unseen."""
    key = token.lower()
    return word2index[key] if key in word2index else word2index['-OOV-']


# Encode every sample sentence as a list of word ids.
test_samples_X = [[_sample_word_id(token) for token in sample] for sample in test_samples]

# Right-pad with zeros to the width used for the training matrices.
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)
print(test_samples_X.shape)

In [31]:
# Run the trained model on the padded sample sentences. This rebinds
# `predictions`, which earlier held the CRF layer output used to build the model.
predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

In [32]:
#print(len(predictions))
# Decode the sample predictions with an id -> tag mapping built inline from
# tag2index (logits_to_tokens is a project helper from utils, not visible here).
log_tokens = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})
print(log_tokens)

In [33]:
#!pip install tabulate
from tabulate import tabulate

# For each sample sentence: its words are the table header and the single
# table row holds its decoded tags, cut to the sentence length so the padded
# positions are not shown.
heads1, body1 = test_samples[0], [log_tokens[0][:len(test_samples[0])]]
heads2, body2 = test_samples[1], [log_tokens[1][:len(test_samples[1])]]

for position, (header, rows) in enumerate(((heads1, body1), (heads2, body2))):
    if position:
        # Blank separator between the two tables.
        print ("\n")
    print(tabulate(rows, headers=header))


## postagging Freeling 4.1

## El      hombre   bajo     corre    bajo  el      puente   con  bajo  índice   de  adrenalina  .
## DA0MS0  NCMS000  AQ0MS00  VMIP3S0  SP    DA0MS0  NCMS000  SP   SP    NCMS000  SP  NCFS000     Fp


## pos tagger Stanford NLP

## El      hombre   bajo     corre    bajo  el      puente   con    bajo   índice  de    adrenalina  .
## da0000  nc0s000  aq0000   vmip000  sp000 da0000  nc0s000  sp000  aq0000 nc0s000 sp000 nc0s000     fp