In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text
import tensorflow_hub as hub


In [2]:
data_root = '../../data/'
local_zip = data_root + 'msra.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('../../data/')
zip_ref.close()


In [3]:
def get_data_path(data_dir=data_root + 'msra/', type='train'):
    if type in ['train', 'val', 'test']:
        sentences_path = os.path.join(data_dir, type, 'sentences.txt')
        tags_path = os.path.join(data_dir, type, 'tags.txt')
        return sentences_path, tags_path
    else:
        raise ValueError("data type not in ['train', 'val', 'test']")


In [16]:
max_len = 128
batch_size = 32

bert_zh = 'https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4'
bert_zh_preprocess = 'https://tfhub.dev/tensorflow/bert_zh_preprocess/3'
saved_model = 'hub_ner_bert'


In [6]:
# preload the model
preprocessor = hub.KerasLayer(
    bert_zh_preprocess, name='preprocessing', trainable=False)

encoder = hub.KerasLayer(bert_zh, trainable=False, name='BERT_encoder')


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-06-13 20:25:04.107897: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-13 20:25:04.108213: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-06-13 20:25:07.380758: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-06-13 20:25:07.408270: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [7]:
class Label_Tokenizer(object):
    def __init__(self, labels, max_length):
        super().__init__()
        self.size = len(labels)
        labels_to_ids = {k: v for v, k in enumerate(labels)}
        ids_to_labels = {v: k for v, k in enumerate(labels)}
        self.labels_to_ids = labels_to_ids
        self.ids_to_labels = ids_to_labels
        self.max_length = max_length

    def tokenize(self, labels):
        tokens = [self._tokenize(label) for label in labels]
        return tokens

    def _tokenize(self, label):
        label = label.decode('utf-8') if hasattr(label, 'decode') else label
        labels = [le for le in label.strip().split(' ')]
        pad_token = self.encode(['[PAD]'])[0]
        special_token = self.encode(['O'])[0]

        tokens = self.encode(labels)
        tokens = [special_token] + tokens + [pad_token]
        tokens = tokens[:self.max_length - 2]
        tokens = [special_token] + tokens + [special_token]
        # Add padded TAG tokens
        padding_len = self.max_length - len(tokens)
        tokens = tokens + ([pad_token] * padding_len)
        return tokens

    def encode(self, labels):
        return [self.labels_to_ids[label] for label in labels]

    def decode(self, ids):
        return [self.ids_to_labels[id] for id in ids]


In [8]:
labels = ['O', 'B-ORG', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-LOC', '[PAD]']
label_tokenizer = Label_Tokenizer(labels, max_length=max_len)
labels_num = label_tokenizer.size


In [9]:
class Sentence_Tokenizer(object):
    def __init__(self, preprocessor, max_length):
        super().__init__()
        self.max_length = max_length
        self.preprocessor = hub.load(preprocessor)

    def tokenize(self, sentences):
        tokenized = self._tokenize(sentences)
        encoder_inputs = self.preprocessor.bert_pack_inputs(
            [tokenized], seq_length=self.max_length)
        return [tf.constant(encoder_inputs['input_word_ids']), tf.constant(encoder_inputs['input_mask']), tf.constant(encoder_inputs['input_type_ids'])]

    def _tokenize(self, sentence):
        encoder_inputs = self.preprocessor.tokenize(sentence)
        return encoder_inputs

    def encode(self, word):
        encoder_inputs = self.preprocessor.decode(word)
        return encoder_inputs


In [10]:
tokenizer = Sentence_Tokenizer(bert_zh_preprocess, max_length=max_len)


2022-06-13 20:33:00.676876: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [11]:
sentences_file, tags_file = get_data_path()
sentences_dataset = tf.data.TextLineDataset(sentences_file).take(160)
sentences_dataset = [
    sentence for sentence in sentences_dataset.as_numpy_iterator()]
x_train = tokenizer.tokenize(sentences_dataset)

label_dataset = tf.data.TextLineDataset(tags_file).take(160)
label_dataset = [label.decode('utf-8')
                 for label in label_dataset.as_numpy_iterator()]
y_train = label_tokenizer.tokenize(label_dataset)
y_train = tf.constant(y_train)


2022-06-13 20:33:11.823172: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-06-13 20:33:11.893474: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [12]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

pad_token = label_tokenizer.encode(['[PAD]'])[0]

# Each batch of data will consist of variable sized sentence tokens with
# appropriate padding in both input and target.
# During loss calculation, we ignore the loss corresponding padding tokens
# in the target.


def masked_ce_loss(real, pred):
    loss_ = loss_object(real, pred)

    # tf.math.equal([pad], pad_token) => True
    # logical_not(True)-> False, cast(False) -> 0
    # loss_ *= mask, ignore the loss corresponding padding tokens
    mask = tf.math.logical_not(tf.math.equal(real, pad_token))
    mask = tf.cast(mask, dtype=loss_.dtype)

    loss_ *= mask

    return tf.reduce_mean(loss_)


In [25]:
def create_model(labels_num):
    encoder_inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32),
    )

    encoder = hub.KerasLayer(bert_zh, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    embedding = outputs["sequence_output"]
    embedding = keras.layers.Dropout(0.3)(embedding)
    tag_logits = keras.layers.Dense(
        labels_num, activation='softmax', name='NER')(embedding)

    model = keras.Model(
        inputs=[encoder_inputs['input_word_ids'],
                encoder_inputs['input_mask'], encoder_inputs['input_type_ids']],
        outputs=[tag_logits],
    )
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss,
                  metrics=['accuracy'])
    return model


In [26]:
model = create_model(labels_num)
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 BERT_encoder (KerasLayer)      {'default': (None,   102267649   ['input_5[0][0]',                
                                768),                             'input_6[0][0]',          

In [27]:
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=3,
    verbose=1,
    batch_size=batch_size
)


Epoch 1/3


2022-06-13 20:40:51.861831: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


In [28]:
def predict_from_text(text, model):
    tokens = tokenizer.tokenize([text])
    x_test = tokens
    pred_test = model.predict(x_test) if hasattr(
        model, 'predict') else model(x_test)
    # ignore predictions of padding tokens
    pred_tags = np.argmax(pred_test, 2)[0]

    tags = label_tokenizer.decode(pred_tags)
    return tags
    # res = []
    # words = {
    #     'word': '',
    #     'tag': None
    # }
    # for idx, tag in enumerate(tags):
    #     if(tag != 'O' and tag != '[PAD]'):
    #         pre, suf = tag.split('-')
    #         words['tag'] = suf
    #         word = x_test[idx]
    #         word = preprocessor.decode(word)
    #         words['word'] = words['word'] + word if words['word'] else word
    #     else:
    #         if(words['tag']):
    #             res.append(words)
    #         words = {
    #             'word': '',
    #             'tag': None
    #         }
    # return pd.DataFrame(res)


In [29]:
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'


In [30]:
print(predict_from_text(test_inputs, model))


2022-06-13 20:42:48.837402: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [31]:
model.save(saved_model)




INFO:tensorflow:Assets written to: hub_ner_bert/assets


INFO:tensorflow:Assets written to: hub_ner_bert/assets


In [32]:
reload_model = tf.saved_model.load(saved_model)


In [None]:
print(predict_from_text(test_inputs, reload_model))
