In [21]:
import os
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFAlbertModel

In [3]:
# Unpack the dataset archive into the data directory.
# NOTE(review): the archive presumably contains a top-level `msra/` folder,
# which is the default directory read by get_data_path — confirm.
data_root = '../../data/'
local_zip = data_root + 'msra.zip'
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(data_root)

In [4]:
def get_data_path(data_dir=data_root + 'msra/', type='train'):
    """Return the sentence and tag file paths for one dataset split.

    Args:
        data_dir: directory that holds one sub-directory per split.
        type: split name; must be 'train', 'val' or 'test'.

    Returns:
        Tuple ``(sentences_path, tags_path)`` pointing at ``sentences.txt``
        and ``tags.txt`` inside ``data_dir/type``.

    Raises:
        ValueError: if ``type`` is not one of the three known splits.
    """
    # Reject unknown splits up front so the happy path stays unindented.
    if type not in ['train', 'val', 'test']:
        raise ValueError("data type not in ['train', 'val', 'test']")

    split_dir = os.path.join(data_dir, type)
    return (
        os.path.join(split_dir, 'sentences.txt'),
        os.path.join(split_dir, 'tags.txt'),
    )


In [5]:
# Settings shared by the cells below.
max_len = 128  # fixed sequence length used by both tokenizers and the model inputs
batch_size = 32  # mini-batch size passed to model.fit
model_name = 'clue/albert_chinese_tiny'  # checkpoint id given to from_pretrained (tokenizer and encoder)
saved_model = 'tr_ner_albert'  # directory used by model.save and tf.saved_model.load

In [6]:
class Label_Tokenizer(object):
    """Convert whitespace-separated tag strings to fixed-length id lists and back.

    The id of a tag is its position in the ``labels`` list given to the
    constructor. The vocabulary must contain ``'O'`` and ``'[PAD]'`` because
    ``_tokenize`` uses both.
    """

    def __init__(self, labels, max_length):
        """Build the tag <-> id lookup tables.

        Args:
            labels: ordered list of tag names; index in the list becomes the id.
            max_length: length of every id sequence produced by ``tokenize``.
        """
        super().__init__()
        self.size = len(labels)
        self.labels_to_ids = {label: idx for idx, label in enumerate(labels)}
        self.ids_to_labels = {idx: label for idx, label in enumerate(labels)}
        self.max_length = max_length

    def tokenize(self, labels):
        """Encode a batch of tag strings; returns one id list per input string."""
        return [self._tokenize(label) for label in labels]

    def _tokenize(self, label):
        """Encode one tag string (``str`` or ``bytes``) into ``max_length`` ids.

        The encoded tags are truncated to ``max_length - 2``, wrapped with the
        id of ``'O'`` on both sides and right-padded with the id of ``'[PAD]'``.
        NOTE(review): the two wrapping ids presumably line up with the special
        tokens the sentence tokenizer adds at both ends — confirm.

        Raises:
            KeyError: if a tag is not part of the vocabulary.
        """
        label = label.decode('utf-8') if hasattr(label, 'decode') else label
        # split() without an argument drops empty fields, so blank lines and
        # repeated separators no longer produce an unknown '' tag (KeyError).
        tags = label.split()
        pad_token = self.encode(['[PAD]'])[0]
        special_token = self.encode(['O'])[0]

        tokens = self.encode(tags)[:self.max_length - 2]
        tokens = [special_token] + tokens + [special_token]
        # Right-pad up to the fixed length.
        padding_len = self.max_length - len(tokens)
        return tokens + [pad_token] * padding_len

    def encode(self, labels):
        """Map tag names to ids; raises ``KeyError`` on an unknown tag."""
        return [self.labels_to_ids[label] for label in labels]

    def decode(self, ids):
        """Map ids back to tag names; raises ``KeyError`` on an unknown id."""
        return [self.ids_to_labels[id] for id in ids]


In [7]:
# Tag vocabulary: a tag's position in this list is its integer id, so the
# order must stay exactly as written.
labels = [
    'O',
    'B-ORG', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-LOC',
    '[PAD]',
]
label_tokenizer = Label_Tokenizer(labels=labels, max_length=max_len)
# Number of classes the tagging head has to predict.
labels_num = label_tokenizer.size

In [11]:
class Sentence_Tokenizer(object):
    """Thin wrapper around a ``BertTokenizer`` that produces model-ready inputs."""

    def __init__(self, model_name, max_length=128, padded_token=True):
        """Load the tokenizer for ``model_name``.

        Args:
            model_name: checkpoint id handed to ``BertTokenizer.from_pretrained``.
            max_length: truncation length (and padding length when padding is on).
            padded_token: whether ``bert_pack_inputs`` pads every sentence to
                ``max_length``.
        """
        super().__init__()
        self.max_length = max_length
        self.padded_token = padded_token
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def bert_pack_inputs(self, sentences):
        """Tokenize ``sentences`` and stack the three input features.

        Returns:
            ``[input_ids, token_type_ids, attention_mask]`` as constant tensors,
            in that order.
        """
        encoded = [self.tokenize(text, self.padded_token) for text in sentences]
        input_ids = [enc['input_ids'] for enc in encoded]
        attention_masks = [enc['attention_mask'] for enc in encoded]
        token_type_ids = [enc['token_type_ids'] for enc in encoded]
        return [
            tf.constant(input_ids),
            tf.constant(token_type_ids),
            tf.constant(attention_masks),
        ]

    def tokenize(self, sentence, padded_token=True):
        """Tokenize one sentence, truncated to ``max_length`` with special tokens.

        When ``padded_token`` is truthy the padding strategy is ``'max_length'``;
        otherwise ``True`` is forwarded as the padding argument.
        """
        if padded_token:
            padding = 'max_length'
        else:
            padding = True
        return self.tokenizer(
            text=sentence.strip(),
            max_length=self.max_length,
            truncation=True,
            padding=padding,
            add_special_tokens=True,
        )

    def decode(self, tokens):
        """Turn token id(s) back into text using the wrapped tokenizer."""
        return self.tokenizer.decode(tokens)

In [12]:
# Shared sentence tokenizer; sequences are truncated/padded to max_len.
# Loading prints a warning that the checkpoint declares 'AlbertTokenizer'
# while 'BertTokenizer' is the class being used (see the output below).
tokenizer = Sentence_Tokenizer(model_name, max_length=max_len)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [38]:
# Training split: read the first 1600 sentence/tag lines and encode them.
sentences_file, tags_file = get_data_path()
sentences = [
    line.decode('utf-8')
    for line in tf.data.TextLineDataset(sentences_file).take(1600).as_numpy_iterator()
]
x_train = tokenizer.bert_pack_inputs(sentences)

# Kept under its own name so the module-level `labels` vocabulary (used to
# build label_tokenizer) is not overwritten by raw tag lines.
tag_lines = [
    line.decode('utf-8')
    for line in tf.data.TextLineDataset(tags_file).take(1600).as_numpy_iterator()
]
# Every sentence needs exactly one tag line; fail here rather than inside fit().
assert len(tag_lines) == len(sentences), (
    'sentences.txt and tags.txt yielded a different number of lines'
)
y_train = tf.constant(label_tokenizer.tokenize(tag_lines))

In [39]:
# Validation split: read the first 160 sentence/tag lines and encode them.
sentences_file, tags_file = get_data_path(type='val')
sentences = [
    line.decode('utf-8')
    for line in tf.data.TextLineDataset(sentences_file).take(160).as_numpy_iterator()
]
x_val = tokenizer.bert_pack_inputs(sentences)

# Kept under its own name so the module-level `labels` vocabulary (used to
# build label_tokenizer) is not overwritten by raw tag lines.
tag_lines = [
    line.decode('utf-8')
    for line in tf.data.TextLineDataset(tags_file).take(160).as_numpy_iterator()
]
# Every sentence needs exactly one tag line; fail here rather than inside fit().
assert len(tag_lines) == len(sentences), (
    'sentences.txt and tags.txt yielded a different number of lines'
)
y_val = tf.constant(label_tokenizer.tokenize(tag_lines))

In [16]:
# Per-position cross-entropy on probabilities (from_logits=False); no reduction
# is applied here so the values can be masked before being aggregated.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
    from_logits=False,
)

# Integer id of the padding tag, looked up directly in the tokenizer's table.
pad_token = label_tokenizer.labels_to_ids['[PAD]']

# Each batch of data consists of variable-sized sentence tokens with
# appropriate padding in both the input and the target.
# During loss calculation, we ignore the loss corresponding to padding tokens
# in the target.


def masked_ce_loss(real, pred):
    """Cross-entropy averaged over the non-padding target positions only.

    Args:
        real: integer tag ids; positions equal to ``pad_token`` are ignored.
        pred: per-position class probabilities (``loss_object`` is created
            with ``from_logits=False``).

    Returns:
        Scalar tensor: the sum of the per-position losses at non-padding
        positions divided by the number of such positions (0 if there are none).
    """
    loss_ = loss_object(real, pred)

    # 1.0 where the target is a real tag, 0.0 where it is the padding tag.
    mask = tf.math.logical_not(tf.math.equal(real, pad_token))
    mask = tf.cast(mask, dtype=loss_.dtype)

    # Zero out the contribution of padding positions.
    loss_ *= mask

    # Divide by the number of real positions instead of taking a plain mean:
    # a mean over every position would shrink the loss in proportion to how
    # much padding a batch happens to contain. divide_no_nan yields 0 for an
    # all-padding batch rather than NaN.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


In [26]:
def create_model(labels_num, max_lenth):
    """Build and compile the ALBERT token-classification model.

    Args:
        labels_num: number of output classes of the tagging head.
        max_lenth: fixed sequence length of the three integer inputs.

    Returns:
        A compiled ``keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` and emitting per-position softmax probabilities.
    """
    # Three int32 inputs of identical shape, created in the order
    # input_ids -> token_type_ids -> attention_mask.
    input_ids, token_type_ids, attention_mask = [
        keras.layers.Input(shape=(max_lenth,), dtype=tf.int32) for _ in range(3)
    ]

    encoder = TFAlbertModel.from_pretrained(model_name, from_pt=True)
    # First element of the encoder output is used as the per-token features.
    sequence_output = encoder(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
    )[0]

    regularized = keras.layers.Dropout(0.3)(sequence_output)
    # Softmax head, so the outputs are probabilities (matches from_logits=False
    # in the loss).
    tag_probs = keras.layers.Dense(
        labels_num, activation='softmax', name='NER')(regularized)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    # NOTE(review): 'accuracy' is presumably computed over every position,
    # padding included, unlike the masked loss — confirm before relying on it.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-5),
        loss=masked_ce_loss,
        metrics=['accuracy'],
    )
    return model


In [27]:
# Build and compile the tagger for the label vocabulary, then print its layers.
model = create_model(labels_num, max_lenth=max_len)
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.dense.bias']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 tf_albert_model_1 (TFAlbertMod  TFBaseModelOutputWi  4080520    ['input_13[0][0]',               
 el)                            thPooling(last_hidd               'input_15[0][0]',           

In [40]:
# Fine-tune for three epochs, evaluating on the validation tensors after each.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=3,
    validation_data=(x_val, y_val),
    verbose=1,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [41]:
def predict_from_text(text, model):
    """Tag ``text`` with ``model`` and return the recognised entities.

    Args:
        text: raw sentence to analyse.
        model: object with a ``predict`` method, or a plain callable that is
            invoked directly when it has no ``predict`` attribute.

    Returns:
        ``pandas.DataFrame`` with columns ``word`` and ``tag``, one row per
        entity in order of appearance. The columns are present even when no
        entity is found.
    """
    # Length of the unpadded encoding, used to drop predictions made on padding.
    n_tokens = len(tokenizer.tokenize(text, padded_token=False)['input_ids'])
    x_test = tokenizer.bert_pack_inputs([text])
    pred_test = model.predict(x_test) if hasattr(model, 'predict') else model(x_test)
    pred_tags = np.argmax(pred_test, 2)[0][:n_tokens]
    tags = label_tokenizer.decode(pred_tags)

    res = []
    current = None  # entity being assembled, or None between entities
    for idx, tag in enumerate(tags):
        if tag == 'O' or tag == '[PAD]':
            if current is not None:
                res.append(current)
                current = None
            continue

        prefix, entity_type = tag.split('-', 1)
        word = tokenizer.decode(x_test[0][0][idx].numpy())
        # A 'B' prefix or a change of entity type starts a new entity, so two
        # adjacent entities are not glued together under the last tag seen.
        if current is None or prefix == 'B' or current['tag'] != entity_type:
            if current is not None:
                res.append(current)
            current = {'word': word, 'tag': entity_type}
        else:
            current['word'] += word

    # Flush an entity that runs up to the final position.
    if current is not None:
        res.append(current)

    return pd.DataFrame(res, columns=['word', 'tag'])


In [42]:
# Sample Chinese sentence used to try out predict_from_text below.
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'

In [43]:
# Entities found by the freshly trained in-memory model.
print(predict_from_text(test_inputs, model))


             word  tag
0              李华  PER
1  朝阳区香河园街道西坝河北里社  LOC
2           天安门广场  ORG
3          太阳宫凯德茂  PER


In [45]:
# Export the trained model to the `saved_model` directory; include_optimizer=False
# leaves the optimizer out of the export.
model.save(saved_model, include_optimizer=False)




INFO:tensorflow:Assets written to: tr_ner_albert/assets


INFO:tensorflow:Assets written to: tr_ner_albert/assets


In [46]:
# Load the export back from disk. predict_from_text calls the loaded object
# directly if it has no `predict` method.
reload_model = tf.saved_model.load(saved_model)


In [47]:
# Same sentence through the reloaded model, for comparison with the in-memory result above.
print(predict_from_text(test_inputs, reload_model))


             word  tag
0              李华  PER
1  朝阳区香河园街道西坝河北里社  LOC
2           天安门广场  ORG
3          太阳宫凯德茂  PER
