In [14]:
import os
import zipfile
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_models as tfm
import tensorflow_hub as hub
from tensorflow import keras
from transformers import TFAlbertModel, BertTokenizer

In [2]:
# Root directory that holds the zipped corpus and receives the extracted files.
data_root = '../../data/'
local_zip = os.path.join(data_root, 'msra.zip')
# The context manager closes the archive even when extraction raises, and the
# extraction target reuses data_root instead of repeating the literal path.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(data_root)

In [3]:
def get_data_path(data_dir=data_root + 'msra/', type='train'):
    """Return the sentence and tag file paths for one dataset split.

    Args:
        data_dir: Directory that contains one sub-directory per split.
        type: Split name; must be 'train', 'val' or 'test'.

    Returns:
        A ``(sentences_path, tags_path)`` tuple pointing at ``sentences.txt``
        and ``tags.txt`` inside ``data_dir/<type>``.

    Raises:
        ValueError: If ``type`` is not one of the three known splits.
    """
    # Reject unknown splits up front so the happy path stays flat.
    if type not in ('train', 'val', 'test'):
        raise ValueError("data type not in ['train', 'val', 'test']")

    split_dir = os.path.join(data_dir, type)
    return (
        os.path.join(split_dir, 'sentences.txt'),
        os.path.join(split_dir, 'tags.txt'),
    )


In [15]:
# Notebook-wide configuration.
# max_len is the token sequence length handed to both preprocessing pipelines;
# model_name is the checkpoint identifier passed to from_pretrained().
max_len, batch_size = 70, 32
model_name = 'clue/albert_chinese_tiny'
saved_model = 'ner_albert_tiny_text_input'

In [20]:
class Sentence_Tokenizer(object):
    """Turn raw sentences into encoder inputs with a Hugging Face tokenizer.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertTokenizer.from_pretrained``.
        max_length: Maximum sequence length; longer inputs are truncated.
        padded_token: When true, every sequence is padded to ``max_length``.
            Otherwise sequences are padded only to the longest sequence in
            the same tokenizer call.
    """

    def __init__(self, model_name, max_length=128, padded_token=True):
        super().__init__()
        self.max_length = max_length
        self.padded_token = padded_token
        # NOTE(review): loading this checkpoint logs a tokenizer-class mismatch
        # warning (checkpoint declares AlbertTokenizer). BertTokenizer is kept
        # as in the original - confirm against the checkpoint's model card.
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def _encode(self, text, padded_token):
        """Run the wrapped tokenizer on one string or a list of strings."""
        padding = 'max_length' if padded_token else True
        return self.tokenizer(
            text=text,
            max_length=self.max_length,
            truncation=True,
            padding=padding,
            add_special_tokens=True,
        )

    def bert_pack_inputs(self, sentences):
        """Encode a batch of sentences into three tensors.

        Args:
            sentences: Iterable of strings; each one is stripped of
                surrounding whitespace before tokenization.

        Returns:
            ``[input_ids, token_type_ids, attention_mask]`` as ``tf.constant``
            values, in that order.
        """
        cleaned = [sentence.strip() for sentence in sentences]
        if not cleaned:
            # Same result the per-sentence loop produced for an empty batch.
            return [tf.constant([]), tf.constant([]), tf.constant([])]
        # Encode the whole batch in one call so that, when padded_token is
        # False, rows are padded to a common length. Encoding sentence by
        # sentence left rows ragged, which tf.constant cannot convert.
        tokens = self._encode(cleaned, self.padded_token)
        return [
            tf.constant(tokens['input_ids']),
            tf.constant(tokens['token_type_ids']),
            tf.constant(tokens['attention_mask']),
        ]

    def tokenize(self, sentence, padded_token=True):
        """Tokenize a single sentence.

        Args:
            sentence: Input string; surrounding whitespace is stripped.
            padded_token: Pad to ``max_length`` when true. Note that this
                argument, not ``self.padded_token``, decides the padding.

        Returns:
            The tokenizer output for the sentence (provides ``input_ids``,
            ``attention_mask`` and ``token_type_ids``).
        """
        return self._encode(sentence.strip(), padded_token)

    def decode(self, tokens):
        """Convert token ids back to text using the wrapped tokenizer."""
        return self.tokenizer.decode(tokens)

# Hugging Face based tokenizer, using the shared checkpoint name and sequence length.
tokenizer = Sentence_Tokenizer(model_name=model_name, max_length=max_len)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [16]:
class Preprocessor(object):
    """Bundle the TF Model Garden tokenizer and input-packing layers.

    Args:
        vocab_path: Vocabulary file given to ``FastWordpieceBertTokenizer``.
        seq_length: Sequence length given to ``BertPackInputs``.

    Attributes are ``tokenize`` (the tokenizer layer) and
    ``bert_pack_inputs`` (the packing layer).
    """

    def __init__(self, vocab_path, seq_length):
        super().__init__()
        layers = tfm.nlp.layers
        wordpiece = layers.FastWordpieceBertTokenizer(vocab_file=vocab_path, lower_case=True)
        # The packer takes its special-token ids from the tokenizer built above.
        special_tokens = wordpiece.get_special_tokens_dict()
        packer = layers.BertPackInputs(seq_length=seq_length, special_tokens_dict=special_tokens)
        self.tokenize = wordpiece
        self.bert_pack_inputs = packer

# Vocabulary file for the TF Model Garden tokenizer (path relative to the working directory).
vocab_path = './vocab.txt'
preprocessor = Preprocessor(vocab_path=vocab_path, seq_length=max_len)


2022-06-21 13:37:46.560123: W tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc:362] The empty string is found in the vocabulary, which takes place in the token id space but will never be used in the result. Consider cleaning it from the vocabulary.
2022-06-21 13:37:46.560765: W tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc:374] The empty suffix token is found in the vocabulary, which takes place in token id space but will (almost) never be used in the result. Consider cleaning it from the vocabulary.


In [22]:
# Load the ALBERT encoder from the checkpoint named in model_name;
# from_pt=True asks transformers to convert the PyTorch weights.
encoder = TFAlbertModel.from_pretrained(
    model_name,
    from_pt=True,
    trainable=False,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


In [12]:
# Sample sentence fed to both preprocessing pipelines below.
text_input = (
    '李华住在朝阳区香河园街道西坝河北里社区，'
    '在5月4号去过天安门广场，'
    '5号下午去了太阳宫凯德茂商场。'
)

In [24]:

# Pipeline A: preprocess the sample sentence with the TF Model Garden layers.
text_inputs = tf.constant([text_input])
# The preprocessing layers have no trainable weights. Wrapping them with
# trainable=True only made hub.KerasLayer log
# "is trainable but has zero trainable weights", so the default is used.
tokenize = hub.KerasLayer(preprocessor.tokenize)
bert_pack_inputs = hub.KerasLayer(preprocessor.bert_pack_inputs)
encoder_inputs = bert_pack_inputs([tokenize(text_inputs)])

outputs = encoder(
    input_ids=encoder_inputs['input_word_ids'],
    token_type_ids=encoder_inputs['input_type_ids'],
    attention_mask=encoder_inputs['input_mask'],
)
# NOTE(review): argmax runs over axis 1 of the encoder's first output and the
# first batch row is printed. This presumably serves as a fingerprint for
# comparing the two pipelines rather than as tag predictions - confirm.
outputs = np.argmax(outputs[0], 1)[0]
print(outputs)

ERROR:absl:hub.KerasLayer is trainable but has zero trainable weights.
ERROR:absl:hub.KerasLayer is trainable but has zero trainable weights.


[37 34 65 38 24 29 16 32 50  5 48 27 13  4  7 37  3 17  2 19 37 12 16 16
 38 28 25 24 28 31  6 24 49 46  3 10 47 37 12 40 39 13 23 12 29 44 47 36
 23 20 48 16 24  5 21 36 16 48  6 24 46  7 10 36 14 30 11  5 41 27  1  0
 18 47 23 48 37  4 10 19 45 34 54 31 34 55  9 32 36 42  5 38 14 38  1 49
 36 26 37  3 13  2 46 30 23 11 31 25  0  7 22 46  4 42 25 49 35 47 29  6
 36 40  7 45 23 24 29 37 29  2 23 30 49 34 23 42 58 13 25 12  2 24 32 16
 53 47 46 24 65  2 28 49 12 16 12 18  5 38 38  6 32 37 12  8 35  9 25 43
 40 14 27 12 11 16  9 12 65 18 42 16 15 43 49 37  5 21 11  4 22 15 24 35
 32 26 30 39 25 36 40 38 35  5 25  2 32 40 19 13 38 11 24 45 12 40  5 12
 34 13 23 37  3 39  6 17 25 26 43  2  6 14 23  5 27 18 19  2 37  4 28 24
 47  2 32 37 36 49  7 44 38 25 24  1 27 25 44 37 46 19 22 19 36 28 24 22
 25 23 44 48 16  2 26 17  6  4  1  1 41 39 17  1 22 11 30  6 32 24 16 16
 44 23  4 47 43 23 24 44 34 22 35 23 16 28 39  2 14  3 32 31  2 41 32 24]


In [25]:

# Pipeline B: preprocess the same sentence with the Hugging Face tokenizer.
# bert_pack_inputs returns [input_ids, token_type_ids, attention_mask].
encoder_inputs = tokenizer.bert_pack_inputs([text_input])

outputs = encoder(
    input_ids=encoder_inputs[0],
    token_type_ids=encoder_inputs[1],
    attention_mask=encoder_inputs[2],
)
# Same reduction as the other pipeline so the two printouts can be compared.
outputs = np.argmax(outputs[0], axis=1)[0]
print(outputs)

[37 34 68 38  2 35 16 34 43  5 23 27 13  4  7 37  3 30 20 14 37 12 16 16
 38 28 35 34 28 51  6 24 49 19  7 10 52 37 12 40 39 13 23 12 29 44 47 36
 23 46 20 20 24  5 21 36 16 48 34 24 46  7 10 34 14 25 19  5 41 26  1  0
 18 47 23 48 38  4 23 19 45 34 53 31 34 48  9 29 36 42 28 38  9  4  1 49
 34 26 37  3 13  2 46 30 23 11 31 25  0  7 12 35  4 42 46 49 10 47 43 19
 36 40 19 45 23 24 18 37 29  2 23 30 49 11 23 42 58 10 37 12 23 39  7 16
 11 47 46 23 64  2 28 49 20 16 12 18  5 38 38  6 32 37 12  8 35  9 35 22
 40 14 39 12 11 24 39 16 64  8 42 16 35 43 49 37  5 21 22  4 29 15 24 35
 47 26 30 24 23 46 40 38 25  5 15  8 33  5 19 13 38 11 25 45 12 40  5 12
 34 13 23 37  3 39  6 35 37 26 43  2  6 18 23  5 27 11 19  2 37  4 28 24
 47  2 32 37  5 49  7 44 38 25 24  1 27 17 44 37 46 24 22 19 36 28 24 44
 25 23 44 48 16  2 26 17  6  4 36  1 41 27 30 37 34 11 30  6 32 24 16 16
 44 23 17 45 43  7 24 44 34  7 35 29 16 28 39  2 19  3 39 31 35 41 32 24]
