In [1]:
import os
import zipfile
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import TFBertModel

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-06-13 23:13:22.958424: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-13 23:13:22.958621: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Root folder holding the dataset archive; the archive is unpacked in place.
data_root = '../../data/'
local_zip = data_root + 'msra.zip'
# The context manager closes the archive even if extraction raises, and the
# target directory reuses `data_root` instead of repeating the literal path.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(data_root)

In [4]:
def get_data_path(data_dir=data_root + 'msra/', type='train'):
    """Return the sentence and tag file paths for one dataset split.

    Args:
        data_dir: Directory that contains one sub-directory per split.
        type: Split name; must be 'train', 'val' or 'test'.

    Returns:
        A ``(sentences_path, tags_path)`` tuple pointing at ``sentences.txt``
        and ``tags.txt`` inside ``data_dir/type``.

    Raises:
        ValueError: If ``type`` is not one of the three supported splits.
    """
    # Reject unknown splits up front so the happy path stays flat.
    if type not in ('train', 'val', 'test'):
        raise ValueError("data type not in ['train', 'val', 'test']")

    split_dir = os.path.join(data_dir, type)
    return (
        os.path.join(split_dir, 'sentences.txt'),
        os.path.join(split_dir, 'tags.txt'),
    )


In [14]:
# Pretrained checkpoint name, and the directory the trained model is saved to.
model_name = 'bert-base-chinese'
saved_model = 'tr_ner_bert_data_loader'

# Sequence length fed to the network and mini-batch size used for training.
max_len, batch_size = 180, 32

# Options handed to DataLoader. 'max_lines' is overridden before loading;
# presumably None means "no limit" -- confirm against data_loader.py.
params = dict(max_len=max_len, padded_token=True, max_lines=None)

In [6]:
from data_loader import DataLoader

data_dir = data_root + 'msra/'
# Keep the run small while testing; 42000 is the alternative value noted
# for a larger run.
params.update(max_lines=160)  # 42000
data_loader = DataLoader(data_dir, model_name, params)

In [7]:
# Load the training split and arrange the three encoder inputs in the order
# the model's input list expects.
data = data_loader.load_data('train')
num_tags = len(data_loader.idx2tag)
x_train = [
    data[key] for key in ("input_ids", "token_type_ids", "attention_mask")
]
y_train, size = data['tags'], data['size']
print(size)

160


In [12]:
# Load the validation split and pack it as the (inputs, targets) pair that
# `model.fit` accepts for `validation_data`.
val_data = data_loader.load_data('val')
x_val = [
    val_data[key] for key in ("input_ids", "token_type_ids", "attention_mask")
]
y_val = val_data['tags']
validation_data = (x_val, y_val)
print(val_data['size'])

160


In [8]:
# Per-position cross-entropy on probabilities (the model ends in a softmax).
# Reduction is disabled so padding positions can be masked out before the
# values are averaged.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)


def masked_ce_loss(real, pred):
    """Cross-entropy averaged over non-padding target positions only.

    Each batch holds variable-length sentences padded in both input and
    target. Positions whose target equals ``data_loader.tag_pad_idx`` are
    excluded from the loss.

    Args:
        real: Target tag indices.
        pred: Predicted class probabilities for every position.

    Returns:
        A scalar tensor: the summed loss of the non-padding positions divided
        by the number of non-padding positions (0 when every position is
        padding).
    """
    loss_ = loss_object(real, pred)

    # mask is 1.0 where the target is a real tag and 0.0 where it is padding.
    mask = tf.math.logical_not(tf.math.equal(real, data_loader.tag_pad_idx))
    mask = tf.cast(mask, dtype=loss_.dtype)

    loss_ *= mask

    # Normalise by the number of real tokens. A plain mean over every position
    # would also count the zeroed padding entries in the denominator, so the
    # loss would shrink as the amount of padding in a batch grows.
    # divide_no_nan returns 0 for a batch that contains only padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


def create_model(num_tags, dropout_rate=0.3, learning_rate=3e-5):
    """Build and compile the BERT-based token classification model.

    The pretrained encoder named by ``model_name`` is followed by dropout and
    a per-token softmax layer. The model takes three integer inputs of length
    ``max_len`` (input ids, token type ids, attention mask).

    Args:
        num_tags: Number of tags; the output layer has ``num_tags + 1`` units.
        dropout_rate: Dropout applied to the encoder output before the
            classifier. Defaults to the previously hard-coded 0.3.
        learning_rate: Adam learning rate. Defaults to the previously
            hard-coded 3e-5.

    Returns:
        A compiled ``keras.Model`` using ``masked_ce_loss`` and the
        ``'accuracy'`` metric.
    """
    # BERT encoder
    encoder = TFBertModel.from_pretrained(model_name)

    # NER model: Input() instantiates a symbolic Keras tensor per BERT input.
    input_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = keras.layers.Input(shape=(max_len,), dtype=tf.int32)

    outputs = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )
    # outputs[0] is last_hidden_state; see
    # https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
    embedding = outputs[0]
    embedding = keras.layers.Dropout(dropout_rate)(embedding)
    # The softmax activation makes these probabilities, not logits, which
    # matches from_logits=False in the loss.
    # NOTE(review): the extra "+ 1" unit is presumably reserved for the
    # padding tag -- confirm against DataLoader's tag vocabulary.
    tag_probs = keras.layers.Dense(num_tags + 1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
    return model

In [9]:
# Build and compile the tagger for `num_tags` tags, then print its
# layer/parameter summary.
model = create_model(num_tags)
model.summary()

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 180)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 180)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 180)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  102267648   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            

In [13]:
# Train for three epochs, evaluating on the validation split after each one;
# the returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=3,
    verbose=1,
    validation_data=validation_data,
)

Epoch 1/3

2022-06-13 23:18:42.200577: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


In [15]:
# Write the trained model to the directory named by `saved_model`.
model.save(saved_model)



INFO:tensorflow:Assets written to: tr_ner_bert_data_loader/assets


INFO:tensorflow:Assets written to: tr_ner_bert_data_loader/assets


In [None]:
# Reload the saved model from disk with the low-level SavedModel loader;
# the returned object is what the inference cell below calls.
imported = tf.saved_model.load(saved_model)

In [None]:
def predict_from_text(text, model):
    """Tag ``text`` with ``model`` and return the recognised entities.

    The text is tokenised through ``data_loader.get_text_tokens``. The model
    is run via ``model.predict`` when that method exists, otherwise by calling
    the model directly. Consecutive tokens that belong to the same entity are
    joined into one row.

    Args:
        text: Raw input string.
        model: Object exposing ``predict`` or callable on the tokenised
            inputs; its output is arg-maxed over axis 2.

    Returns:
        A ``pandas.DataFrame`` with columns ``word`` and ``tag``, one row per
        entity in order of appearance. It is empty, but keeps both columns,
        when no entity is found.
    """
    # Vocabulary ids skipped as special tokens -- presumably [CLS] and [SEP]
    # for the BERT vocabulary in use; confirm if the checkpoint changes.
    cls_id, sep_id = 101, 102

    x_test, original_len = data_loader.get_text_tokens(text)
    pred_test = model.predict(x_test) if hasattr(model, 'predict') else model(x_test)
    # Ignore predictions of padding tokens.
    pred_tags = np.argmax(pred_test, 2)[0][:original_len]
    tags = [data_loader.idx2tag[tag_idx] for tag_idx in pred_tags]

    entities = []
    current = None  # entity being assembled: {'word': str, 'tag': str}

    for idx, tag in enumerate(tags):
        token = x_test[0][0][idx]
        token = token.numpy()
        if token == cls_id or token == sep_id:
            continue

        if tag == 'O' or tag == '[pad]':
            # A non-entity token closes whatever entity was open.
            if current is not None:
                entities.append(current)
                current = None
            continue

        # partition() tolerates tags with no hyphen or with several, where a
        # two-way unpack of split('-') would raise.
        prefix, sep, suffix = tag.partition('-')
        if not sep:
            prefix, suffix = '', tag

        # A begin/single marker or a change of entity type starts a new
        # entity, so adjacent entities are not merged under the last tag.
        if current is not None and (prefix in ('B', 'S') or suffix != current['tag']):
            entities.append(current)
            current = None

        piece = data_loader.tokenizer.decode([token])
        if current is None:
            current = {'word': piece, 'tag': suffix}
        else:
            current['word'] += piece

    # Flush an entity that runs to the end of the text; without this the last
    # entity is lost when nothing follows it except a skipped special token.
    if current is not None:
        entities.append(current)

    return pd.DataFrame(entities, columns=['word', 'tag'])


In [None]:
# Sample sentence for a quick end-to-end check of the reloaded model.
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'
# Run inference with the model restored from disk rather than the in-memory one.
trained_model = imported
print(predict_from_text(test_inputs, trained_model))