In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import tensorflow_hub as hub


In [None]:
# Unpack the MSRA NER corpus archive. The context manager guarantees the
# archive handle is closed even if extraction fails part-way through.
# NOTE(review): the archive is extracted into './data/', while get_data_path()
# reads from '../../data/msra/' by default — confirm the two locations are
# meant to differ.
local_zip = '../../data/msra.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('./data/')


In [None]:
def get_data_path(data_dir='../../data/msra/', type='train'):
    """Return the sentence and tag file paths for one dataset split.

    Args:
        data_dir: Directory that contains one sub-directory per split.
        type: Split name; one of 'train', 'val' or 'test'.

    Returns:
        A ``(sentences_path, tags_path)`` tuple pointing at ``sentences.txt``
        and ``tags.txt`` inside ``data_dir/<type>``.

    Raises:
        ValueError: If ``type`` is not one of the three known splits.
    """
    # Guard clause: reject unknown splits before building any path.
    if type not in ('train', 'val', 'test'):
        raise ValueError("data type not in ['train', 'val', 'test']")

    split_dir = os.path.join(data_dir, type)
    return (os.path.join(split_dir, 'sentences.txt'),
            os.path.join(split_dir, 'tags.txt'))

In [None]:
# TF Hub handles: the Chinese BERT encoder and its matching preprocessing model.
bert_zh_preprocess = 'https://tfhub.dev/tensorflow/bert_zh_preprocess/3'
bert_zh = 'https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4'

# Fixed sequence length shared by the sentence and label tokenizers and by the
# model's input layers.
max_len = 128

# Mini-batch size passed to model.fit().
batch_size = 32


In [None]:
# Preload both TF Hub models as frozen (non-trainable) Keras layers.
# NOTE(review): no live code visible in this notebook reads `preprocessor`, and
# create_model() builds its own encoder layer, so these two objects look
# unused — confirm before relying on or removing them.
preprocessor = hub.KerasLayer(
    bert_zh_preprocess,
    trainable=False,
    name='preprocessing',
)

encoder = hub.KerasLayer(
    bert_zh,
    name='BERT_encoder',
    trainable=False,
)

In [None]:
class Label_Tokenizer(object):
    """Convert space-separated NER tag strings to fixed-length id sequences.

    Every encoded sequence is laid out as ``[O] + tag ids + [O]`` followed by
    ``[PAD]`` ids up to ``max_length``. The two extra ``O`` ids reserve one
    position before and one after the sentence, so tag ``i`` of the input
    line ends up at position ``i + 1``.
    NOTE(review): this assumes the sentence side places exactly one special
    token before and one after each sentence (as BERT input packing is
    documented to do) — confirm against the preprocessing model.

    Args:
        labels: Ordered tag vocabulary; a tag's list index is its id. It must
            contain ``'O'`` and ``'[PAD]'`` for ``tokenize`` to work.
        max_length: Length of every sequence returned by ``tokenize``.
    """

    def __init__(self, labels, max_length):
        super().__init__()
        self.size = len(labels)
        self.labels_to_ids = {label: idx for idx, label in enumerate(labels)}
        self.ids_to_labels = dict(enumerate(labels))
        self.max_length = max_length

    def tokenize(self, labels):
        """Encode a batch of tag lines.

        Args:
            labels: Iterable of tag lines (``str`` or UTF-8 ``bytes``), each a
                whitespace-separated sequence of tags.

        Returns:
            A list with one ``max_length``-long list of ids per input line.
        """
        return [self._tokenize(label) for label in labels]

    def _tokenize(self, label):
        """Encode a single tag line into exactly ``max_length`` ids.

        Raises:
            KeyError: If the line contains a tag that is not in the vocabulary.
        """
        if hasattr(label, 'decode'):
            label = label.decode('utf-8')
        pad_id = self.labels_to_ids['[PAD]']
        outside_id = self.labels_to_ids['O']

        # split() without an argument tolerates repeated or trailing
        # whitespace and yields [] for a blank line, where split(' ') would
        # produce '' entries that are not in the vocabulary.
        tag_ids = self.encode(label.split())

        # Keep room for the two boundary positions, then wrap exactly once.
        # (Wrapping twice would shift every tag by one position and leave a
        # stray [PAD] id in the middle of the sequence.)
        tag_ids = tag_ids[:self.max_length - 2]
        ids = [outside_id] + tag_ids + [outside_id]

        # Fill the remainder with the padding id.
        return ids + [pad_id] * (self.max_length - len(ids))

    def encode(self, labels):
        """Map an iterable of tag strings to their integer ids."""
        return [self.labels_to_ids[label] for label in labels]

    def decode(self, ids):
        """Map an iterable of integer ids back to their tag strings.

        Ids are passed through ``int()`` so NumPy integer scalars (for
        example the output of ``np.argmax``) are accepted as well.
        """
        return [self.ids_to_labels[int(label_id)] for label_id in ids]


In [None]:
# Tag vocabulary. A tag's position in this list is its integer id, so the
# order must stay fixed. '[PAD]' is the id used to fill label sequences up to
# max_len.
labels = [
    'O',
    'B-ORG', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-LOC',
    '[PAD]',
]
label_tokenizer = Label_Tokenizer(labels, max_length=max_len)

# Number of classes (vocabulary size), later handed to create_model().
labels_num = label_tokenizer.size

In [None]:
class Sentence_Tokenizer(object):
    """Thin wrapper around a preprocessing model loaded with ``hub.load``.

    Args:
        preprocessor: Handle passed straight to ``hub.load``.
        max_length: Value passed as ``seq_length`` to ``bert_pack_inputs``.
    """

    def __init__(self, preprocessor, max_length):
        super().__init__()
        self.preprocessor = hub.load(preprocessor)
        self.max_length = max_length

    def tokenize(self, sentences):
        """Tokenize ``sentences`` and pack them into the model input tensors.

        Returns:
            A list ``[input_word_ids, input_mask, input_type_ids]`` built from
            the dictionary returned by ``bert_pack_inputs``.
        """
        packed = self.preprocessor.bert_pack_inputs(
            [self._tokenize(sentences)], seq_length=self.max_length)
        ordered_keys = ('input_word_ids', 'input_mask', 'input_type_ids')
        return [tf.constant(packed[key]) for key in ordered_keys]

    def _tokenize(self, sentence):
        """Forward ``sentence`` to the preprocessing model's ``tokenize``."""
        return self.preprocessor.tokenize(sentence)

    def encode(self, word):
        """Forward ``word`` to the preprocessing model's ``decode`` attribute.

        NOTE(review): the method is called ``encode`` but delegates to
        ``decode``; confirm that the loaded model really exposes ``decode``
        and that this naming is intended.
        """
        return self.preprocessor.decode(word)

In [None]:
# Shared sentence tokenizer; constructing it loads the preprocessing model
# via hub.load() and fixes the packed sequence length to max_len.
tokenizer = Sentence_Tokenizer(bert_zh_preprocess, max_length=max_len)

In [None]:
sentences_file, tags_file = get_data_path()

# Inputs: read the first 1600 lines of the sentence file as raw byte strings
# and pack them into the three model input tensors.
sentences_dataset = list(
    tf.data.TextLineDataset(sentences_file).take(1600).as_numpy_iterator())
x_train = tokenizer.tokenize(sentences_dataset)

# Targets: read the matching first 1600 tag lines, decode them to str and
# convert them to fixed-length id sequences.
label_dataset = [
    raw_line.decode('utf-8')
    for raw_line in tf.data.TextLineDataset(tags_file).take(1600).as_numpy_iterator()
]
y_train = tf.constant(label_tokenizer.tokenize(label_dataset))

In [None]:
# Per-position cross-entropy. Reduction is disabled so that padded positions
# can be masked out before averaging; from_logits=False because the loss is
# fed the softmax output produced by the model's final Dense layer.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

# Id that marks padded positions in the label sequences.
pad_token = label_tokenizer.encode(['[PAD]'])[0]


def masked_ce_loss(real, pred):
    """Cross-entropy averaged over the non-padding label positions only.

    Args:
        real: Integer label ids; positions equal to ``pad_token`` are ignored.
        pred: Per-position class probabilities predicted by the model.

    Returns:
        Scalar tensor: the summed loss of the non-padding positions divided
        by the number of such positions (0 when there are none).
    """
    loss_ = loss_object(real, pred)

    # 1 where the target is a real label, 0 where it is padding.
    mask = tf.cast(tf.math.not_equal(real, pad_token), dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of real positions instead of the full padded
    # length: a plain mean over every position shrinks the loss (and its
    # gradients) as the share of padding grows. divide_no_nan returns 0 for a
    # batch that contains only padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


In [None]:
def create_model(labels_num):
    """Build and compile the tagging model on top of a frozen hub encoder.

    Args:
        labels_num: Number of units of the final softmax layer, i.e. the
            number of tag classes predicted at every sequence position.

    Returns:
        A compiled ``keras.Model`` whose inputs are
        ``[input_word_ids, input_mask, input_type_ids]`` and whose single
        output is the softmax over tags applied to the encoder's
        ``sequence_output``.
    """
    input_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    encoder_inputs = {
        name: tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
        for name in input_names
    }

    # Non-trainable encoder layer; only the layers added below are trained.
    bert_layer = hub.KerasLayer(bert_zh, trainable=False, name='BERT_encoder')
    sequence_output = bert_layer(encoder_inputs)["sequence_output"]

    regularized = keras.layers.Dropout(0.3)(sequence_output)
    # The head emits probabilities (softmax), which is what the loss expects
    # because it is configured with from_logits=False.
    tag_probabilities = keras.layers.Dense(
        labels_num, activation='softmax', name='NER')(regularized)

    model = keras.Model(
        inputs=[encoder_inputs[name] for name in input_names],
        outputs=[tag_probabilities],
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-5),
        loss=masked_ce_loss,
        metrics=['accuracy'],
    )
    return model


In [None]:
# Build the compiled tagger and print its layer summary.
model = create_model(labels_num)
model.summary()


In [None]:
# Train on the tokenized sentences (x_train) against the encoded tag
# sequences (y_train) for 3 epochs; the returned History object is kept in
# `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=3,
    verbose=1,
    batch_size=batch_size
)


In [None]:
def predict_from_text(text, model):
    """Predict one NER tag per sequence position for a single sentence.

    Args:
        text: The sentence to tag. Inside this function the parameter name
            shadows the module-level ``tensorflow_text`` alias ``text``.
        model: Object used for inference. Its ``predict`` method is used when
            it has one; otherwise the object itself is called.

    Returns:
        A list of tag strings, one per position of the model output for the
        sentence. Padding positions are included, not filtered out.
    """
    # tokenizer.tokenize() already returns the list
    # [input_word_ids, input_mask, input_type_ids] in the order the model
    # expects, so it is passed through unchanged. (Indexing that list with
    # string keys raises TypeError.)
    x_test = tokenizer.tokenize([text])

    if hasattr(model, 'predict'):
        pred_test = model.predict(x_test)
    else:
        pred_test = model(x_test)

    # Highest-scoring class per position, for the only sentence in the batch.
    pred_tags = np.argmax(pred_test, 2)[0]
    return label_tokenizer.decode(pred_tags)


In [None]:
# Sample sentence passed to predict_from_text() in the cells below.
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'


In [None]:
# Tag the sample sentence with the in-memory model returned by create_model().
print(predict_from_text(test_inputs, model))


In [None]:
# Destination handed to model.save() and later to tf.saved_model.load().
# NOTE(review): '.hf' is not one of the usual Keras file extensions
# ('.h5' / '.keras') — confirm the suffix is intentional.
saved_model_path = 'ner_chinese.hf'


In [None]:
# Persist the trained model at saved_model_path.
model.save(saved_model_path)


In [None]:
# Load the exported model back with the low-level SavedModel loader.
# NOTE(review): this assumes model.save() wrote the TensorFlow SavedModel
# format at this path — confirm for the Keras version in use.
reload_model = tf.saved_model.load(saved_model_path)

In [None]:
# Repeat the prediction with the reloaded object; predict_from_text() calls
# it directly when it has no `predict` attribute.
print(predict_from_text(test_inputs, reload_model))
