In [71]:
import os
import zipfile
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub


In [14]:
# Unpack the MSRA archive into the data directory. The later cells read
# from `data_root + 'msra/'`, which is presumably created by this extraction
# (the archive layout is not visible from this notebook).
data_root = '../../data/'
local_zip = data_root + 'msra.zip'
# The context manager closes the archive even when extraction raises, and the
# target directory is taken from `data_root` instead of repeating the literal.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(data_root)


In [15]:
def get_data_path(data_dir=data_root + 'msra/', type='train'):
    """Return the sentence and tag file paths of one dataset split.

    Args:
        data_dir: Directory that holds one sub-directory per split.
        type: Name of the split; one of 'train', 'val' or 'test'.

    Returns:
        A ``(sentences_path, tags_path)`` tuple pointing at ``sentences.txt``
        and ``tags.txt`` inside ``<data_dir>/<type>/``.

    Raises:
        ValueError: If ``type`` is not one of the three known split names.
    """
    # Reject unknown splits up front so the happy path reads straight through.
    if type not in ('train', 'val', 'test'):
        raise ValueError("data type not in ['train', 'val', 'test']")

    split_dir = os.path.join(data_dir, type)
    return (
        os.path.join(split_dir, 'sentences.txt'),
        os.path.join(split_dir, 'tags.txt'),
    )


In [16]:
# Fixed sequence length: used as `seq_length` when packing BERT inputs and as
# the length every label row is padded/truncated to.
max_len = 128
# Mini-batch size passed to `model.fit`.
batch_size = 32
# Path the fine-tuned model is saved to and reloaded from.
saved_model = 'hub_ner_bert'

# TF-Hub handles of the Chinese BERT encoder and its matching preprocessor.
bert_zh = 'https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4'
bert_zh_preprocess = 'https://tfhub.dev/tensorflow/bert_zh_preprocess/3'


In [77]:
# Preload both TF-Hub modules.
# NOTE(review): `create_model` and `Tokenizer` call `hub.load` /
# `hub.KerasLayer` themselves and no later visible cell reads these two
# globals, so this cell presumably only warms the TF-Hub download cache —
# confirm before removing.
preprocessor = hub.load(bert_zh_preprocess)
encoder = hub.load(bert_zh)

In [18]:
class Label_Tokenizer(object):
    """Convert whitespace-separated tag strings into fixed-length id rows.

    The label vocabulary is given at construction time; a label's id is its
    position in that list. The vocabulary must contain 'O' and '[PAD]', which
    `_tokenize` uses as filler ids.
    """

    def __init__(self, labels, max_length):
        """Build the label <-> id lookup tables.

        Args:
            labels: Ordered list of label strings.
            max_length: Length of every row produced by `tokenize`.
        """
        super().__init__()
        self.size = len(labels)
        self.labels_to_ids = {label: idx for idx, label in enumerate(labels)}
        self.ids_to_labels = dict(enumerate(labels))
        self.max_length = max_length

    def tokenize(self, labels):
        """Encode every tag line in `labels`; returns a list of id lists."""
        return [self._tokenize(label) for label in labels]

    def _tokenize(self, label):
        """Encode one tag line (str or bytes) into `max_length` ids.

        With tags t0..tn-1 the row is built as
        ``[O, O, t0, ..., tn-1, PAD, O, PAD, PAD, ...]``; when the line is too
        long the inner part is cut so the row is exactly `max_length` long.

        NOTE(review): relative to a ``[CLS] chars [SEP]`` input this puts tag k
        at position k + 2, one slot later than its character.
        `predict_from_text` reads ``x_text[idx - 1]``, which matches this
        layout, so the shift must not be changed on one side only.

        Raises:
            KeyError: If the line contains a tag missing from the vocabulary.
        """
        label = label.decode('utf-8') if hasattr(label, 'decode') else label
        # split() without a separator tolerates empty lines, tabs and repeated
        # spaces; split(' ') produced '' entries for those and then failed
        # with KeyError('') in encode().
        tags = label.split()
        pad_token = self.labels_to_ids['[PAD]']
        special_token = self.labels_to_ids['O']

        tokens = [special_token] + self.encode(tags) + [pad_token]
        # Leave room for the two filler ids added around the sequence below.
        tokens = tokens[:self.max_length - 2]
        tokens = [special_token] + tokens + [special_token]
        # Right-pad with PAD ids up to the fixed row length.
        tokens.extend([pad_token] * (self.max_length - len(tokens)))
        return tokens

    def encode(self, labels):
        """Map label strings to ids; raises KeyError for unknown labels."""
        return [self.labels_to_ids[label] for label in labels]

    def decode(self, ids):
        """Map ids back to label strings; raises KeyError for unknown ids."""
        return [self.ids_to_labels[id] for id in ids]


In [19]:
# Label vocabulary; a label's id is its index in this list, so the order must
# stay stable for any model trained with it. '[PAD]' is the id the loss masks.
labels = ['O', 'B-ORG', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-LOC', '[PAD]']
label_tokenizer = Label_Tokenizer(labels, max_length=max_len)
# Number of output classes of the tagging head (includes '[PAD]').
labels_num = label_tokenizer.size


In [116]:
# Training split: only the first 1600 lines of each file are used.
sentences_file, tags_file = get_data_path()

# Sentences are kept as raw byte strings; the model has a string input and
# tokenizes them itself.
sentences_dataset = tf.data.TextLineDataset(sentences_file).take(1600)
x_train = tf.constant(list(sentences_dataset.as_numpy_iterator()))

# Tag lines are decoded to str and turned into fixed-length id rows.
label_dataset = tf.data.TextLineDataset(tags_file).take(1600)
label_dataset = [
    raw_line.decode('utf-8') for raw_line in label_dataset.as_numpy_iterator()
]
y_train = tf.constant(label_tokenizer.tokenize(label_dataset))


In [117]:
# Validation split: only the first 320 lines of each file are used.
sentences_file, tags_file = get_data_path(type='val')

# Raw byte-string sentences, one tensor element per line.
sentences_dataset = tf.data.TextLineDataset(sentences_file).take(320)
x_val = tf.constant(list(sentences_dataset.as_numpy_iterator()))

# Matching tag lines, decoded and encoded as fixed-length id rows.
label_dataset = tf.data.TextLineDataset(tags_file).take(320)
label_dataset = [
    raw_line.decode('utf-8') for raw_line in label_dataset.as_numpy_iterator()
]
y_val = tf.constant(label_tokenizer.tokenize(label_dataset))

In [21]:
# Per-position cross-entropy: `Reduction.NONE` keeps one loss value per label
# position so padding can be masked out afterwards. `from_logits=False`
# because the model's output layer already applies softmax.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

# Id of the '[PAD]' label; positions carrying this id are excluded from the loss.
pad_token = label_tokenizer.encode(['[PAD]'])[0]

# Every label row is padded with the '[PAD]' id up to the fixed sequence
# length. During loss calculation, the loss at positions whose target is
# '[PAD]' is ignored.


def masked_ce_loss(real, pred):
    """Cross-entropy averaged over the non-padding label positions only.

    Args:
        real: Integer label ids; positions equal to the module-level
            `pad_token` are treated as padding.
        pred: Per-position class probabilities, as expected by the
            module-level `loss_object`.

    Returns:
        A scalar tensor: the summed loss of the non-padding positions divided
        by the number of such positions (0 when every position is padding).
    """
    per_position_loss = loss_object(real, pred)

    # 1.0 where the target is a real label, 0.0 where it is '[PAD]'.
    mask = tf.cast(tf.math.not_equal(real, pad_token),
                   dtype=per_position_loss.dtype)
    per_position_loss *= mask

    # Normalise by the number of unmasked positions. A plain reduce_mean would
    # also divide by the zeroed padding positions, so the loss value would
    # shrink with the amount of padding in the batch.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_position_loss), tf.reduce_sum(mask))


In [22]:
def create_model(labels_num, seq_length):
    """Build and compile the BERT-based token tagging model.

    The model takes a batch of raw strings, tokenizes and packs them with the
    TF-Hub preprocessor to `seq_length` positions, runs the trainable BERT
    encoder and classifies every position into one of `labels_num` labels.

    Args:
        labels_num: Number of output classes per position.
        seq_length: Sequence length the preprocessor packs inputs to.

    Returns:
        A compiled `keras.Model` whose output holds per-position softmax
        probabilities.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string)

    # Preprocessing: tokenize the strings, then pack them to a fixed length.
    preprocess_module = hub.load(bert_zh_preprocess)
    token_ids = hub.KerasLayer(preprocess_module.tokenize)(raw_text)
    packed_inputs = hub.KerasLayer(
        preprocess_module.bert_pack_inputs,
        arguments={'seq_length': seq_length},
    )([token_ids])

    # Fine-tuned encoder; only the per-position output is used.
    bert_outputs = hub.KerasLayer(
        bert_zh, trainable=True, name='BERT_encoder')(packed_inputs)
    features = keras.layers.Dropout(0.3)(bert_outputs["sequence_output"])

    # Softmax head, matching `from_logits=False` in the loss.
    tag_probabilities = keras.layers.Dense(
        labels_num, activation='softmax', name='NER')(features)

    ner_model = keras.Model(inputs=[raw_text], outputs=[tag_probabilities])
    ner_model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-5),
        loss=masked_ce_loss,
        metrics=['accuracy'],
    )
    return ner_model


In [23]:
# Build the tagger with the shared label count and sequence length, then
# print its layer overview.
model = create_model(labels_num, seq_length=max_len)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       (None, None, None)   0           ['input_1[0][0]']                
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_mask': (Non  0           ['keras_layer[0][0]']            
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                  

In [118]:
# Fine-tune for three epochs on the training subset and evaluate on the
# validation subset after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=3,
    validation_data=(x_val, y_val),
    verbose=1,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [119]:
class Tokenizer(object):
    """Element-wise id lookup built on the TF-Hub BERT preprocessor.

    Every element of the input text is encoded on its own and the
    id -> element pair is remembered in `tokens2words` so ids can be mapped
    back later.

    NOTE(review): `tokens2words` keeps only the most recent element seen for
    an id, so two different elements that receive the same id decode to the
    later one.
    """

    def __init__(self, preprocessor_name, max_length):
        """Load the preprocessor from `preprocessor_name` via `hub.load`."""
        super().__init__()
        self.preprocessor = hub.load(preprocessor_name)
        self.max_length = max_length
        self.tokens2words = {}

    def tokenize(self, text):
        """Encode `text` element by element into `max_length` ids.

        Returns:
            A ``(ids, n_length)`` tuple: `ids` is a tensor holding the id of
            '[CLS]', the element ids (cut to ``max_length - 2``), the id of
            '[SEP]' and then '[PAD]' ids; `n_length` counts the ids before
            the padding.
        """
        # Every element is encoded (and recorded) before the cut is applied.
        element_ids = [self.encode(element) for element in text]
        element_ids = element_ids[:self.max_length - 2]

        framed_ids = [self.encode('[CLS]'), *element_ids, self.encode('[SEP]')]
        n_length = len(framed_ids)

        pad_id = self.encode('[PAD]')
        framed_ids.extend([pad_id] * (self.max_length - n_length))
        return tf.constant(framed_ids), n_length

    def encode(self, word):
        """Return the first id the preprocessor yields for `word`.

        The id -> `word` pair is stored in `tokens2words`, replacing any
        earlier entry for the same id.
        """
        tokenized = self.preprocessor.tokenize([word])
        token = tokenized[0][0][0].numpy()
        self.tokens2words[token] = word
        return token

    def decode(self, tokens):
        """Map ids back to the elements last recorded for them.

        Raises:
            KeyError: If an id was never produced by `encode`.
        """
        recorded = self.tokens2words
        return [recorded[token] for token in tokens]
    
# Shared instance used by `predict_from_text` to map positions back to text.
tokenizer = Tokenizer(bert_zh_preprocess, max_len)

In [123]:
def predict_from_text(text, model):
    """Tag `text` with `model` and return the recognised entities.

    Args:
        text: The sentence to analyse, as a single string.
        model: Either an object with a `predict` method (the Keras model) or
            a callable (the reloaded SavedModel); both receive ``[text]``.

    Returns:
        A `pandas.DataFrame` with the columns 'word' and 'tag', one row per
        run of consecutive positions predicted with the same entity type.
        The frame is empty (but keeps both columns) when nothing is found.

    NOTE(review): positions are mapped back to characters one by one, so this
    assumes the model's own tokenization yields one token per character of
    `text` — confirm for inputs containing Latin words or multi-digit numbers.
    """
    x_text, n_length = tokenizer.tokenize(text)

    pred_test = model.predict([text]) if hasattr(
        model, 'predict') else model([text])
    # ignore predictions of padding tokens
    pred_tags = np.argmax(pred_test, 2)[0][:n_length]
    tags = label_tokenizer.decode(pred_tags)

    res = []
    current = None  # entity being assembled, or None between entities
    for idx, tag in enumerate(tags):
        entity_type = None
        # The tag at position idx belongs to the token at idx - 1 (the label
        # rows are built one slot late). For idx < 2 that index is '[CLS]' or
        # wraps around to -1, so such positions can never be part of the text.
        if idx >= 2 and tag != 'O' and tag != '[PAD]':
            _, entity_type = tag.split('-')

        if entity_type is None:
            if current is not None:
                res.append(current)
                current = None
            continue

        # A change of entity type ends the running entity; previously the two
        # were glued together and reported under the later type. Runs of one
        # type are still merged regardless of B-/I- prefixes.
        if current is not None and current['tag'] != entity_type:
            res.append(current)
            current = None

        word = tokenizer.decode([x_text[idx - 1].numpy()])[0]
        if current is None:
            current = {'word': word, 'tag': entity_type}
        else:
            current['word'] += word

    # Flush an entity that runs up to the last position; it used to be dropped
    # because entities were only stored when a following non-entity tag was seen.
    if current is not None:
        res.append(current)

    return pd.DataFrame(res, columns=['word', 'tag'])


In [121]:
# Sample sentence for a manual check; it contains a person name and several
# place names.
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'


In [124]:
# Entities found by the freshly trained in-memory model.
print(predict_from_text(test_inputs, model))


              word  tag
0               李华  PER
1  朝阳区香河园街道西坝河北里社区  LOC
2            天安门广场  LOC
3         太阳宫凯德茂商场  LOC


In [125]:
# Export the trained model to the `saved_model` path; the optimizer state is
# left out (`include_optimizer=False`).
model.save(saved_model, include_optimizer=False)




INFO:tensorflow:Assets written to: hub_ner_bert/assets


INFO:tensorflow:Assets written to: hub_ner_bert/assets


In [126]:
# Load the export back through the low-level SavedModel API.
# NOTE(review): the returned object presumably has no `predict` method, which
# is why `predict_from_text` falls back to calling the model directly — confirm.
reload_model = tf.saved_model.load(saved_model)


In [127]:
# Same sentence through the reloaded model; the printed table should match
# the one produced by the in-memory model.
print(predict_from_text(test_inputs, reload_model))


              word  tag
0               李华  PER
1  朝阳区香河园街道西坝河北里社区  LOC
2            天安门广场  LOC
3         太阳宫凯德茂商场  LOC
