# Text Preprocessing

generate integer-indexed sentences, pos-tags and named entity tags, dictionaries for converting, etc, and save as `npy` binaries.

In [1]:
import pandas as pd
import numpy as np
from preprocessing import get_vocab, index_sents
from embedding import create_embeddings
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
!pip3 install keras-tqdm
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import concatenate, Input, LSTM, Dropout, Embedding
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from gensim.models import Word2Vec
from keras_tqdm import TQDMNotebookCallback
from embedding import load_vocab



In [3]:
# set maximum network vocabulary, test set size
MAX_VOCAB = 25000
TEST_SIZE = 0.15

### read ConLL2002 NER corpus from csv (first save as utf-8!)

In [4]:
data1 = pd.read_csv('data/ner_esp_train_dataset_utf8.csv')
data2=pd.read_csv('data/ner_esp_testa_dataset_utf8.csv')
#data.append(data2)
data4 = pd.concat([data1, data2], ignore_index=True, sort=False)
#print(data4)
data3=pd.read_csv('data/ner_esp_testb_dataset_utf8.csv')
data = pd.concat([data3, data4], ignore_index=True, sort=False)
print(data)
print(len(data))

         Sentence #           Word  POS    Tag
0       Sentence: 1             La   DA  B-LOC
1               NaN         Coruña   NC  I-LOC
2               NaN              ,   Fc      O
3               NaN             23    Z      O
4               NaN            may   NC      O
...             ...            ...  ...    ...
369166          NaN  Río-Santander  VMI  I-ORG
369167          NaN           6,18    Z      O
369168          NaN           +1,1    Z      O
369169          NaN         Dycasa   NC  B-ORG
369170          NaN              -   Fg      O

[369171 rows x 4 columns]
369171


In [5]:
sentmarks = data["Sentence #"].tolist()
sentmarks = [str(s) for s in sentmarks]
sentmarks[:5]

['Sentence: 1', 'nan', 'nan', 'nan', 'nan']

In [6]:
words = data["Word"].tolist()
postags = data["POS"].tolist()
nertags = data["Tag"].tolist()
#print(words[2],postags[2])

# Sentencias del conjunto de entrenamiento

In [7]:
sentence_text = []
sentence_post = []
sentence_ners = []

vocab = []

this_snt = []
this_pos = []
this_ner = []
#print(sentmarks[:10])
for idx, s in enumerate(sentmarks):
    # reset if new sent
    if s != 'nan':
        # edit: ONLY IF HAS TAG!
        #print(len(this_snt))
        if len(this_snt) > 0 and this_snt[-1] == '.':
            if list(set(this_ner)) != ['O']:
                sentence_text.append(this_snt[:-1])
                #print(this_snt[:-1])
                sentence_post.append(this_pos[:-1])
                sentence_ners.append(this_ner[:-1])
        this_snt = []
        this_pos = []
        this_ner = []
    
    # add to lists 
    this_snt.append(words[idx].lower())
    this_pos.append(postags[idx])
    this_ner.append(nertags[idx])
    vocab.append(words[idx].lower())
   
    #print(this_snt)

In [8]:
print(sentence_text[:2])
for idx, sent in enumerate(sentence_text[:2]):
    print(sent)
    print(sentence_post[idx])
    print(sentence_ners[idx])
    print('')

[['la', 'coruña', ',', '23', 'may', '(', 'efecom', ')'], ['las', 'reservas', '"', 'on', 'line', '"', 'de', 'billetes', 'aéreos', 'a', 'través', 'de', 'internet', 'aumentaron', 'en', 'españa', 'un', '300', 'por', 'ciento', 'en', 'el', 'primer', 'trimestre', 'de', 'este', 'año', 'con', 'respecto', 'al', 'mismo', 'período', 'de', '1999', ',', 'aseguró', 'hoy', 'iñigo', 'garcía', 'aranda', ',', 'responsable', 'de', 'comunicación', 'de', 'savia', 'amadeus']]
['la', 'coruña', ',', '23', 'may', '(', 'efecom', ')']
['DA', 'NC', 'Fc', 'Z', 'NC', 'Fpa', 'NP', 'Fpt']
['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'O']

['las', 'reservas', '"', 'on', 'line', '"', 'de', 'billetes', 'aéreos', 'a', 'través', 'de', 'internet', 'aumentaron', 'en', 'españa', 'un', '300', 'por', 'ciento', 'en', 'el', 'primer', 'trimestre', 'de', 'este', 'año', 'con', 'respecto', 'al', 'mismo', 'período', 'de', '1999', ',', 'aseguró', 'hoy', 'iñigo', 'garcía', 'aranda', ',', 'responsable', 'de', 'comunicación', 'de', 'sa

# Sentencias del conjunto de testeo

## Obtener vocabulario y entradas de índice
necesitamos convertir la entrada de cadena a vectores enteros para la red de keras (la red pycrfsuite necesita cadenas, ya que extraerá vectores de características de las propias palabras).

indexaremos cada palabra desde 1 de acuerdo con la frecuencia inversa (la palabra más común es 1, etc.) hasta el tamaño máximo de vocabulario. Reservaremos dos espacios, 0 para el índice PAD y MAX_VOCAB-1 para palabras fuera de vocabulario o desconocidas (OOV / UNK). Como esto es algo aburrido, lo puse en funciones externas. Los paquetes como keras y sklearn tienen herramientas más robustas para esto, pero una palabra simple: el diccionario índice funcionará bien para este experimento

In [9]:
# text vocab dicts
# subtract 2 for UNK, PAD
word2idx, idx2word = get_vocab(sentence_text, MAX_VOCAB-2)
print(len(idx2word))

24998


In [10]:
# POS and NER tag vocab dicts
pos2idx, idx2pos = get_vocab(sentence_post, len(set(postags)))
ner2idx, idx2ner = get_vocab(sentence_ners, len(set(nertags))+2)
print(len(ner2idx))


11


In [11]:
# index
sentence_text_idx = index_sents(sentence_text, word2idx)
sentence_post_idx = index_sents(sentence_post, pos2idx)
sentence_ners_idx = index_sents(sentence_ners, ner2idx)
#print(sentence_post_idx)
print(len(sentence_post_idx))


8781


## División  de los conjuntos de prueba y entrenamiento.
Dividimos los datos de entrenamiento en datos de entrenamiento y datos de prueba. los datos de prueba se usan solo para verificar el rendimiento del modelo. Un tercer conjunto, el conjunto de validación, puede separarse de nuestros datos de entrenamiento para el ajuste de hiperparámetros, aunque si utilizamos la validación cruzada k-fold, nuestro conjunto de validación cambiará cada vez.

In [12]:
indices = [i for i in range(len(sentence_text))]
#print(sentence_post_idx)

#print(train_idx)
#test_size=TEST_SIZE
train_idx, test_idx, X_train_pos, X_test_pos = train_test_split(indices,sentence_post_idx ,test_size=TEST_SIZE)
#X_train_pos,X_test_pos=train_test_split(indices, sentence_post_idx1 ,test_size=0.0001)


def get_sublist(lst, indices):
    result = []
    for idx in indices:
        result.append(lst[idx])
    return result

X_train_sents = get_sublist(sentence_text_idx, train_idx)
X_test_sents = get_sublist(sentence_text_idx, test_idx)
y_train_ner = get_sublist(sentence_ners_idx, train_idx)
y_test_ner = get_sublist(sentence_ners_idx, test_idx)
#print(X_test_sents,len(X_test_sents))
#print(sentence_ners_idx)
#print('****************************')
#print(y_test_ner,len(y_test_ner))







In [13]:
#print(test_idx)

## Creamos word2vec embeddings para  palabras, pos-tags

Se ha demostrado que el uso de vectores de incrustación pre-entrenados para inicializar la capa de incrustación ayuda a la capacitación para diversas tareas de etiquetado de secuencias, como el etiquetado de POS (Huang, Xu & Yu 2015; Ma & Hovy 2016) y el Reconocimiento de entidades con nombre para inglés (Ma & Hovy 2016 ; Lee Changki 2017) y japonés (Misawa, Taniguchi, Miura y Ohkuma 2017).

Como estamos usando las etiquetas POS como entrada secundaria, también entrenaremos un espacio de incrustación para estas. utilizaremos solo los datos de entrenamiento para crear las incrustaciones. Estoy usando Gensim para esta tarea, y estoy usando una función auxiliar para ajustar el Word2Vec que guarda la incrustación y también el diccionario de vocabulario. Se vectorizan 6185  sentencias y solo una de testeo que no se usa.

In [14]:
# sentence embeddings

train_sent_texts = [sentence_text[idx] for idx in train_idx]
        
w2v_vocab, w2v_model = create_embeddings(train_sent_texts,
                       embeddings_path='embedding/text_embeddings.gensimmodel',
                       vocab_path='embedding/text_mapping.json',
                       size=300,
                       workers=4,
                       iter=20)

In [15]:
# pos embeddings
train_post_texts = [sentence_post[idx] for idx in train_idx]

w2v_pvocab, w2v_pmodel = create_embeddings(train_post_texts,
                         embeddings_path='embedding/pos_embeddings.gensimmodel',
                         vocab_path='embedding/pos_mapping.json',
                         size=300,
                         workers=4,
                         iter=20)

## save everything to numpy binaries for loading

granted, `pickle` would probably be more suitable for a lot of these things. but over-reliance on `numpy` binaries is a bad habit i've picked up.

In [16]:
def numpy_save(saves, names):
    for idx, item in enumerate(saves):
        np.save('encoded/{0}.npy'.format(names[idx]), item)
    return

saves = [
vocab,
sentence_text_idx,
sentence_post_idx,
sentence_ners_idx,
word2idx, idx2word,
pos2idx, idx2pos,
ner2idx, idx2ner,
train_idx,
test_idx,
X_train_sents,
X_test_sents,
X_train_pos,
X_test_pos,
y_train_ner,
y_test_ner,
sentence_text,
sentence_post,
sentence_ners]

names = [
'vocab',
'sentence_text_idx',
'sentence_post_idx',
'sentence_ners_idx',
'word2idx', 'idx2word',
'pos2idx', 'idx2pos',
'ner2idx', 'idx2ner',
'train_idx',
'test_idx',
'X_train_sents',
'X_test_sents',
'X_train_pos',
'X_test_pos',
'y_train_ner',
'y_test_ner',
'sentence_text',
'sentence_post',
'sentence_ners']

numpy_save(saves, names)

In [17]:
# network hyperparameters
MAX_LENGTH = 30
MAX_VOCAB = 25000    # see preprocessing.ipynb
WORDEMBED_SIZE = 300 # see data_preprocessing.ipynb
POS_EMBED_SIZE = 300 # see data_preprocessing.ipynb
HIDDEN_SIZE = 400    # LSTM Nodes/Features/Dimension
BATCH_SIZE = 64
DROPOUTRATE = 0.25
MAX_EPOCHS = 8       # max iterations, early stop condition below
#print(y_test_ner)

In [18]:
# load data from npys (see preprocessing.ipynb)
print("loading data...\n")
vocab = list(np.load('encoded/vocab.npy',allow_pickle=True))
sentence_text = list(np.load('encoded/sentence_text.npy',allow_pickle=True))
sentence_post = list(np.load('encoded/sentence_post.npy',allow_pickle=True))
sentence_ners = list(np.load('encoded/sentence_ners.npy',allow_pickle=True))
sentence_text_idx = np.load('encoded/sentence_text_idx.npy',allow_pickle=True)
sentence_post_idx = np.load('encoded/sentence_post_idx.npy',allow_pickle=True)
sentence_ners_idx = np.load('encoded/sentence_ners_idx.npy',allow_pickle=True)
word2idx = np.load('encoded/word2idx.npy',allow_pickle=True).item()
idx2word = np.load('encoded/idx2word.npy',allow_pickle=True).item()
pos2idx = np.load('encoded/pos2idx.npy',allow_pickle=True).item()
idx2pos = np.load('encoded/idx2pos.npy',allow_pickle=True).item()
ner2idx = np.load('encoded/ner2idx.npy',allow_pickle=True).item()
idx2ner = np.load('encoded/idx2ner.npy',allow_pickle=True).item()
train_idx = np.load('encoded/train_idx.npy',allow_pickle=True)
test_idx = np.load('encoded/test_idx.npy',allow_pickle=True)
X_train_sents = np.load('encoded/X_train_sents.npy',allow_pickle=True)
X_test_sents = np.load('encoded/X_test_sents.npy',allow_pickle=True)
X_train_pos = np.load('encoded/X_train_pos.npy',allow_pickle=True)
X_test_pos = np.load('encoded/X_test_pos.npy',allow_pickle=True)
y_train_ner = np.load('encoded/y_train_ner.npy',allow_pickle=True)
y_test_ner = np.load('encoded/y_test_ner.npy',allow_pickle=True)
print(y_test_ner)

loading data...

[array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1])
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1])
 array([5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) ...
 array([1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 9, 9,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])
 array([2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 4, 9, 1, 1, 1,
       1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 9, 9, 1, 4, 9,
       1, 1, 1, 1, 2, 3, 3, 3, 1, 2, 1, 1, 1, 4, 9, 9, 9, 1, 1, 1, 1, 1,
       1, 2, 3, 1, 1, 1, 1, 1])
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1])]


In [19]:
#load embedding data
from embedding import load_vocab
w2v_vocab, _ = load_vocab('embedding/text_mapping.json')
w2v_model = Word2Vec.load('embedding/text_embeddings.gensimmodel')
w2v_pvocab, _ = load_vocab('embedding/pos_mapping.json')
w2v_pmodel = Word2Vec.load('embedding/pos_embeddings.gensimmodel')

# Secuencias de pad
debemos 'rellenar' nuestras secuencias de entrada y salida a una longitud fija debido a la representación de gráfico fijo de Tensorflow.

In [20]:
# zero-pad the sequences to max length
print("zero-padding sequences...\n")
X_train_sents = sequence.pad_sequences(X_train_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_sents = sequence.pad_sequences(X_test_sents, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_train_pos = sequence.pad_sequences(X_train_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
X_test_pos = sequence.pad_sequences(X_test_pos, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_train_ner = sequence.pad_sequences(y_train_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
y_test_ner = sequence.pad_sequences(y_test_ner, maxlen=MAX_LENGTH, truncating='post', padding='post')
print(len(X_train_sents))
print(len(X_test_sents))
print(len(X_train_pos))
print(len(X_test_pos))
print(len(y_train_ner))
print(len(y_test_ner))

#print(y_train_ner)
print(y_test_ner)

zero-padding sequences...

7463
1318
7463
1318
7463
1318
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 0 0 0]
 [5 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 1 1 1]
 [2 1 1 ... 3 1 1]
 [1 1 1 ... 1 1 1]]


In [21]:
y_ner=y_test_ner
print(y_ner)

[[1 1 1 ... 1 1 1]
 [1 1 1 ... 0 0 0]
 [5 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 1 1 1]
 [2 1 1 ... 3 1 1]
 [1 1 1 ... 1 1 1]]


In [22]:
# get the size of pos-tags, ner tags
TAG_VOCAB = len(list(idx2pos.keys()))
NER_VOCAB = len(list(idx2ner.keys()))

In [23]:
# reshape data for CRF
y_train_ner = y_train_ner[:, :, np.newaxis]
y_test_ner = y_test_ner[:, :, np.newaxis]

# Precargar las incrustaciones pre-entrenadas
Como se vio en estudios previos como Ma & Hovy 2016, se ha demostrado que cargar la capa de embeddings con vectores de embeddings preentrenados mejora el rendimiento de la red. Aquí inicializamos un embeddings en ceros y luego cargamos  el embeddings desde el modelo previamente entrenado (si existe; puede que no se deba a los parámetros de Word2Vec).

In [24]:
# create embedding matrices from custom pretrained word2vec embeddings
word_embedding_matrix = np.zeros((MAX_VOCAB, WORDEMBED_SIZE))
c = 0
for word in word2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_vocab:
        c += 1
        # get the word vector
        word_vector = w2v_model[word]
        # slot it in at the proper index
        word_embedding_matrix[word2idx[word]] = word_vector
print("adicionados", c, "vectores")

adicionados 5279 vectores


  word_vector = w2v_model[word]


In [25]:
pos_embedding_matrix = np.zeros((TAG_VOCAB, POS_EMBED_SIZE))
c = 0
for word in pos2idx.keys():
    # get the word vector from the embedding model
    # if it's there (check against vocab list)
    if word in w2v_pvocab:
        c += 1
        # get the word vector
        word_vector = w2v_pmodel[word]
        # slot it in at the proper index
        pos_embedding_matrix[pos2idx[word]] = word_vector
print("adicionamos", c, "vectores")

adicionamos 51 vectores


  word_vector = w2v_pmodel[word]


In [26]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17591291961020332071
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 11916696664112116047
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 8300269692827185077
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_GPU:1"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 6883249637752363091
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5919389792
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2202175774815677818
physical_device_desc: "device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:01:00.0, compute capability: 6.1"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 5920683456
locality {
  bus_id: 1
  links {
  }
}
incarnation: 773

In [27]:
import tensorflow as tf
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


# Modelo BiLSTM+ CRF

In [28]:
# define model

# text layers : dense embedding > dropout > bi-LSTM
txt_input = Input(shape=(MAX_LENGTH,), name='txt_input')
txt_embed = Embedding(MAX_VOCAB, WORDEMBED_SIZE, input_length=MAX_LENGTH,
                      weights=[word_embedding_matrix],
                      name='txt_embedding', trainable=True)(txt_input)
txt_drpot = Dropout(DROPOUTRATE, name='txt_dropout')(txt_embed)

# pos layers : dense embedding > dropout > bi-LSTM
pos_input = Input(shape=(MAX_LENGTH,), name='pos_input')
pos_embed = Embedding(TAG_VOCAB, POS_EMBED_SIZE, input_length=MAX_LENGTH,
                      weights=[pos_embedding_matrix],
                      name='pos_embedding', trainable=True)(pos_input)
pos_drpot = Dropout(DROPOUTRATE, name='pos_dropout')(pos_embed)

# merged layers : merge (concat, average...) word and pos > bi-LSTM > bi-LSTM
mrg_cncat = concatenate([txt_drpot, pos_drpot], axis=2)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_1')(mrg_cncat)

# extra LSTM layer, if wanted
mrg_drpot = Dropout(DROPOUTRATE, name='mrg_dropout')(mrg_lstml)
mrg_lstml = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True),
                          name='mrg_bidirectional_2')(mrg_lstml)


# final linear chain CRF layer
crf = CRF(NER_VOCAB, sparse_target=True)
mrg_chain = crf(mrg_lstml)

model = Model(inputs=[txt_input, pos_input], outputs=mrg_chain)

model.compile(optimizer='adam',
              loss=crf.loss_function,
              metrics=[crf.accuracy])



In [29]:
# load model
#save_load_utils.load_all_weights(model,'model/crf_model.h5')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
txt_input (InputLayer)          (None, 30)           0                                            
__________________________________________________________________________________________________
pos_input (InputLayer)          (None, 30)           0                                            
__________________________________________________________________________________________________
txt_embedding (Embedding)       (None, 30, 300)      7500000     txt_input[0][0]                  
__________________________________________________________________________________________________
pos_embedding (Embedding)       (None, 30, 300)      18000       pos_input[0][0]                  
____________________________________________________________________________________________

In [30]:
print(len(X_train_sents),len(X_train_pos),len(y_train_ner))
history = model.fit([X_train_sents, X_train_pos], y_train_ner,
                    batch_size=BATCH_SIZE,
                    epochs=30,

                    verbose=2)

7463 7463 7463




Epoch 1/30
 - 31s - loss: 0.4239 - crf_viterbi_accuracy: 0.8878
Epoch 2/30
 - 29s - loss: 0.1871 - crf_viterbi_accuracy: 0.9414
Epoch 3/30
 - 29s - loss: 0.1218 - crf_viterbi_accuracy: 0.9581
Epoch 4/30
 - 30s - loss: 0.0783 - crf_viterbi_accuracy: 0.9704
Epoch 5/30
 - 29s - loss: 0.0518 - crf_viterbi_accuracy: 0.9782
Epoch 6/30
 - 30s - loss: 0.0321 - crf_viterbi_accuracy: 0.9838
Epoch 7/30
 - 30s - loss: 0.0162 - crf_viterbi_accuracy: 0.9884
Epoch 8/30
 - 30s - loss: 0.0046 - crf_viterbi_accuracy: 0.9914
Epoch 9/30
 - 30s - loss: -3.8245e-03 - crf_viterbi_accuracy: 0.9933
Epoch 10/30
 - 30s - loss: -1.0681e-02 - crf_viterbi_accuracy: 0.9946
Epoch 11/30
 - 30s - loss: -1.8007e-02 - crf_viterbi_accuracy: 0.9958
Epoch 12/30
 - 30s - loss: -2.4284e-02 - crf_viterbi_accuracy: 0.9969
Epoch 13/30
 - 30s - loss: -2.8867e-02 - crf_viterbi_accuracy: 0.9971
Epoch 14/30
 - 30s - loss: -3.3870e-02 - crf_viterbi_accuracy: 0.9978
Epoch 15/30
 - 30s - loss: -3.8610e-02 - crf_viterbi_accuracy: 0.9981

In [31]:
hist_dict = history.history

In [32]:
# save the model
# because we are using keras-contrib, we must save weights like this, and load into network
# (see decoding.ipynb)
save_load_utils.save_all_weights(model, 'model/crf_model.h5')
np.save('model/hist_dict.npy', hist_dict)
print("models saved!\n")

models saved!



In [33]:
print(len(X_test_sents),len(X_test_pos))
print(X_test_sents)
print(X_test_pos)

1318 1318
[[   37     3  1183 ...  2801     1     3]
 [   11   833     5 ...     0     0     0]
 [24496   581   858 ...     0     0     0]
 ...
 [    3   653     1 ...    18   276     5]
 [  321    30     3 ...  3561     2     5]
 [   32   854  5786 ...   272     1 19071]]
[[ 2  3  1 ...  1  2  3]
 [ 3  1 12 ...  0  0  0]
 [ 1 28  6 ...  0  0  0]
 ...
 [ 3  1  2 ... 10  1 12]
 [ 4 22  3 ...  4  5 12]
 [21  1  6 ...  4  2  7]]


In [34]:

preds = model.predict([X_test_sents, X_test_pos])

In [35]:
preds = np.argmax(preds, axis=-1)
preds.shape

(1318, 30)

In [36]:
trues = np.squeeze(y_test_ner, axis=-1)
trues.shape

(1318, 30)

In [37]:
s_preds = [[idx2ner[t] for t in s] for s in preds]

In [38]:
s_trues = [[idx2ner[t] for t in s] for s in trues]

In [39]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(s_preds, s_trues)))

F1-score: 80.5%


In [40]:
print(classification_report(s_trues, s_preds))

           precision    recall  f1-score   support

      ORG       0.77      0.80      0.78      1150
     MISC       0.41      0.52      0.46       317
      PER       0.89      0.90      0.89       745
      LOC       0.85      0.78      0.82       762
      PAD       1.00      1.00      1.00       406

micro avg       0.80      0.81      0.81      3380
macro avg       0.81      0.81      0.81      3380



In [41]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
def bio_classification_report(y_true, y_pred):
    """
    from scrapinghub's python-crfsuite example
    
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O', 'PAD'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [42]:
print(bio_classification_report(s_trues, s_preds))

              precision    recall  f1-score   support

       B-LOC       0.87      0.79      0.83       762
       I-LOC       0.79      0.70      0.74       246
      B-MISC       0.48      0.59      0.53       317
      I-MISC       0.64      0.52      0.57       474
       B-ORG       0.81      0.83      0.82      1150
       I-ORG       0.79      0.74      0.77       891
       B-PER       0.91      0.90      0.91       745
       I-PER       0.94      0.93      0.94       658

   micro avg       0.81      0.78      0.80      5243
   macro avg       0.78      0.75      0.76      5243
weighted avg       0.81      0.78      0.80      5243
 samples avg       0.10      0.10      0.10      5243



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
print(X_test_sents[:500])

[[   37     3  1183 ...  2801     1     3]
 [   11   833     5 ...     0     0     0]
 [24496   581   858 ...     0     0     0]
 ...
 [   12    20    54 ...  6311  2788    14]
 [   15   785     1 ...   645     1   297]
 [   18   272     1 ...    21   132  1784]]


In [44]:
##### hash(tuple(np.array([1,2,3,4])))
#print(y_ner)
print(X_test_sents[:500])
print(len(X_test_sents[:500]))
decoded = []
for sent_idx in range(len(X_test_sents[:500])):
    
    this_txt = sequence.pad_sequences([X_test_sents[sent_idx]], maxlen=MAX_LENGTH, truncating='post', padding='post')
    this_pos = sequence.pad_sequences([X_test_pos[sent_idx]], maxlen=MAX_LENGTH, truncating='post', padding='post')
    this_pred = model.predict([this_txt, this_pos])
    this_pred = [np.argmax(p) for p in this_pred[0]]
    np.shape(this_pred)
    #print(this_pred)
    # for each word in the sentence...
    word, pos, tru, prd = [], [], [], []
    for idx, wordid in enumerate(X_test_sents[sent_idx][:len(this_pred)]):
        # decode word
        word.append(idx2word[wordid])
        # decode pos
        #print(X_test_pos[sent_idx][idx])
        pos.append(idx2pos[X_test_pos[sent_idx][idx]])
        # decode true NER tag
        #print(pos)
        #print(y_ner[sent_idx][idx])
        tru.append(idx2ner[y_ner[sent_idx][idx]])
        # decode prediction
        #print(tru)
        prd.append(idx2ner[this_pred[idx]])
        #print(prd)
    
    answ = pd.DataFrame(
    {
        'word': word,
        'pos': pos,
        'true': tru,
        'pred': prd,
        'skip' : [' ' for s in word]
    })
    answ = answ[['word', 'pos', 'true', 'pred', 'skip']]
    answ = answ.T
    decoded.append(answ)

[[   37     3  1183 ...  2801     1     3]
 [   11   833     5 ...     0     0     0]
 [24496   581   858 ...     0     0     0]
 ...
 [   12    20    54 ...  6311  2788    14]
 [   15   785     1 ...   645     1   297]
 [   18   272     1 ...    21   132  1784]]
500


In [47]:
result = pd.concat(decoded)

In [48]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
word,según,la,acusación,",",esos,albaneses,fueron,miembros,de,la,...,),",",disuelta,y,desmilitarizada,tras,el,despliegue,de,la
pos,SP,DA,NC,Fc,DD,NC,VSI,NC,SP,DA,...,Fpt,Fc,AQ,CC,VMP,SP,DA,NC,SP,DA
true,O,O,O,O,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O
pred,O,O,O,O,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O
skip,,,,,,,,,,,...,,,,,,,,,,
