简单的tokenizer

英文也会被拆成单个字符（char）

基本相当于 list(inputs_str.lower().replace(' ', ''))（实际会去掉所有空白字符），但 [pad]、[unk]、[cls]、[sep]、[mask] 这几个特殊 token 会保持完整，不会被拆开

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import re
import tensorflow as tf
import unicodedata
from collections import Counter
import random

In [2]:
# Build the token -> id lookup from the vocabulary file: one token per line,
# and the zero-based line number is the token id. Bracketed entries such as
# "[pad]" are also collected separately as the special tokens.
word_index = {}
special = []
# The vocabulary contains non-ASCII tokens (the cells below look up Chinese
# characters), so decode it explicitly as UTF-8 instead of relying on the
# platform's default locale encoding.
with open('./vocab.txt', encoding='utf-8') as fp:
    for i, line in enumerate(fp):
        # Keys are lower-cased; if two lines differ only by case, the later
        # line's id overwrites the earlier one.
        line = line.strip().lower()
        word_index[line] = i
        if line.startswith('[') and line.endswith(']'):
            special.append(line)

print(len(special))

104


In [3]:
# Peek at the first few bracketed special tokens collected from the vocabulary.
special[:3]

['[pad]', '[unused1]', '[unused2]']

In [4]:
class BertTokenizer(tf.keras.models.Model):
    """Character-level pre-tokenizer built from TensorFlow string ops.

    Each input string is lower-cased, stripped of all whitespace and split
    into single Unicode characters, which are re-joined with one space between
    them. The special tokens [pad], [unk], [cls], [sep] and [mask] (matched
    after lower-casing) are stitched back together so they stay whole.

    Args:
        word_index: Accepted so the constructor mirrors ``BertIds``; it is not
            used by this model.
        **args: Forwarded to ``tf.keras.models.Model`` (e.g. ``name``).
    """

    def __init__(self, word_index, **args):
        super(BertTokenizer, self).__init__(**args)

    @tf.function(experimental_relax_shapes=True)
    def call(self, inputs):
        """Turn a string batch into space-separated characters.

        Args:
            inputs: String tensor; the notebook calls this with shape
                (batch, 1).

        Returns:
            A dense string tensor with the same shape as ``inputs``.
        """
        x = inputs
        # Lower-case with Unicode awareness. The default (ASCII-only) mode
        # leaves upper-case non-ASCII letters untouched, which would then miss
        # vocabulary keys that were lower-cased with Python's str.lower().
        x = tf.strings.lower(x, encoding='utf-8')
        x = tf.strings.regex_replace(x, r'\s+', '')
        x = tf.strings.unicode_split(x, 'UTF-8')
        x = tf.strings.reduce_join(x, separator=' ', axis=-1)
        # The per-character split also broke up the special tokens
        # ("[mask]" -> "[ m a s k ]"); glue each of them back together.
        # This plain Python loop is unrolled when the function is traced.
        for token in ('pad', 'unk', 'cls', 'sep', 'mask'):
            spaced = r'\[ ' + ' '.join(token) + r' \]'
            x = tf.strings.regex_replace(x, spaced, ' [' + token + '] ')
        # reduce_join over the ragged character axis leaves a RaggedTensor;
        # convert it back to a dense tensor.
        x = x.to_tensor()
        return x

    def compute_output_shape(self, input_shape):
        """The output keeps the input's shape (one string per input string)."""
        return input_shape

In [5]:
# Instantiate the character-level tokenizer (it accepts word_index but does not use it).
t = BertTokenizer(word_index)

In [6]:
# Smoke test: each (batch, 1) string comes back with its characters separated by spaces.
t([['我爱你'], ['我']])

<tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'\xe6\x88\x91 \xe7\x88\xb1 \xe4\xbd\xa0'],
       [b'\xe6\x88\x91']], dtype=object)>

In [7]:
class BertIds(tf.keras.models.Model):
    """Map space-separated tokens to BERT input ids: [CLS] tokens [SEP] pad...

    Args:
        word_index: Dict mapping token string -> integer id. Must contain
            '[unk]' (used for out-of-vocabulary tokens). '[pad]', '[cls]' and
            '[sep]' are read from it when present and otherwise fall back to
            0, 101 and 102.
        max_len: Maximum output length including [CLS] and [SEP]; longer
            inputs are truncated. Defaults to 512 (i.e. at most 510 tokens).
        **args: Forwarded to ``tf.keras.models.Model`` (e.g. ``name``).

    Raises:
        ValueError: If ``max_len`` leaves no room for [CLS] and [SEP].
    """

    def __init__(self, word_index, max_len=512, **args):
        super(BertIds, self).__init__(**args)
        if max_len < 2:
            raise ValueError('max_len must be at least 2 to hold [cls] and [sep]')
        self.max_len = max_len
        self.construct(word_index)

    def construct(self, word_index):
        """Build the static token -> id table and record the special ids."""
        keys = tf.constant(list(word_index.keys()), dtype=tf.string)
        values = tf.constant(list(word_index.values()), dtype=tf.int32)
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, values),
            tf.constant(word_index['[unk]'])) # default value
        # Take the special ids from the vocabulary instead of hard-coding
        # them; the fallbacks are the values this model used previously.
        self.pad_id = word_index.get('[pad]', 0)
        self.cls_id = word_index.get('[cls]', 101)
        self.sep_id = word_index.get('[sep]', 102)

    @tf.function(experimental_relax_shapes=True)
    def call(self, inputs):
        """Convert a (batch, 1) tensor of space-separated tokens to ids.

        Args:
            inputs: String tensor of shape (batch, 1), as produced by
                ``BertTokenizer``.

        Returns:
            int32 tensor of shape (batch, L), where L is the longest
            (truncated) token count in the batch plus 2. Each row is
            [CLS], the token ids, [SEP], then padding.
        """
        max_tokens = self.max_len - 2

        # (batch, 1) -> (batch,) so the split yields one ragged row per example.
        text = tf.squeeze(inputs, axis=1)
        tokens = tf.strings.split(text)

        # The true token count per row decides where [SEP] goes. Counting
        # non-zero ids instead would misplace [SEP] whenever the text itself
        # contains a [pad] token, because that token also has the pad id.
        lengths = tf.cast(tokens.row_lengths(), tf.int32)
        lengths = tf.minimum(lengths, max_tokens)

        # Look up only real tokens, then pad the ragged rows with the pad id.
        ids = tf.ragged.map_flat_values(self.table.lookup, tokens)
        ids = ids.to_tensor(self.pad_id)
        ids = ids[:, :max_tokens]

        batch = tf.shape(ids)[0]
        cls = tf.fill([batch, 1], self.cls_id)
        pad = tf.fill([batch, 1], self.pad_id)  # guarantees a slot for [SEP]
        x = tf.concat([cls, ids, pad], axis=1)

        # [SEP] sits right after the last real token: column length + 1,
        # the +1 accounting for the leading [CLS].
        positions = tf.range(tf.shape(x)[1])
        sep_mask = tf.equal(positions[tf.newaxis, :],
                            (lengths + 1)[:, tf.newaxis])
        sep_mask = tf.cast(sep_mask, tf.int32)
        # Overwrite (rather than add to) whatever is at the [SEP] position.
        x = x * (1 - sep_mask) + sep_mask * self.sep_id

        return x

    def compute_output_shape(self, input_shape):
        """Output is (batch, L); L depends on the batch contents, hence None."""
        batch = tf.TensorShape(input_shape)[0]
        return tf.TensorShape([batch, None])

In [8]:
# Rebind t to the token-to-id model built from the same vocabulary.
t = BertIds(word_index)

In [9]:
# Smoke test on pre-tokenized text: rows become [CLS] ids [SEP], padded to the longest row.
t([['我 爱 你'], ['我']])

<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[ 101, 2769, 4263,  872,  102],
       [ 101, 2769,  102,    0,    0]], dtype=int32)>

In [10]:
# Benchmark batch of 30 rows (3 examples repeated 10 times): embedded whitespace,
# upper-case [MASK] tokens with full-width punctuation, and a 500-character row.
text = tf.constant([
    ['我爱  你'],
    ['我，你[MASK][MASK]有'],
    ['我' * 500]
] * 10)

In [11]:
# Tokenize the benchmark batch; y is reused as the input of the id-conversion step.
bt = BertTokenizer(word_index)
y = bt(text)

In [12]:
# Convert the tokenized batch to ids; the row length follows the longest example.
bid = BertIds(word_index)
bid(y)

<tf.Tensor: shape=(30, 502), dtype=int32, numpy=
array([[ 101, 2769, 4263, ...,    0,    0,    0],
       [ 101, 2769, 8024, ...,    0,    0,    0],
       [ 101, 2769, 2769, ..., 2769, 2769,  102],
       ...,
       [ 101, 2769, 4263, ...,    0,    0,    0],
       [ 101, 2769, 8024, ...,    0,    0,    0],
       [ 101, 2769, 2769, ..., 2769, 2769,  102]], dtype=int32)>

In [13]:
%%timeit
# Time the tokenization step alone on the 30-row batch.
bt(text)

908 µs ± 1.97 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit
# Time the token-to-id step alone on the already tokenized batch.
bid(y)

913 µs ± 13.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# End-to-end check: inline [cls]/[mask]/[unk]/[sep] tokens must map to their own ids
# rather than being split into characters.
bid(bt([['我'], ['我爱你[cls]额额[mask]额[unk]哦[sep]']]))

<tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[ 101, 2769,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [ 101, 2769, 4263,  872,  101, 7583, 7583,  103, 7583,  100, 1521,
         102,  102]], dtype=int32)>

In [16]:
# Chain tokenizer -> id lookup into one model and export it as a SavedModel.
save_path = 'bert_tokenizer'

tokenize_stage = BertTokenizer(word_index, name='bert_tokenizer')
ids_stage = BertIds(word_index, name='bert_token_to_ids')
model = tf.keras.Sequential([tokenize_stage, ids_stage])

# Declare the input signature before saving: a (batch, 1) string tensor.
# NOTE(review): _set_inputs is a private Keras API; confirm it still exists
# when upgrading TensorFlow.
string_input = tf.keras.backend.placeholder((None, 1), dtype='string')
model._set_inputs(string_input)
model.save(save_path, include_optimizer=False)

INFO:tensorflow:Assets written to: bert_tokenizer/assets


In [17]:
# Show the two stages of the exported pipeline (neither has parameters).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert_tokenizer (BertTokenize multiple                  0         
_________________________________________________________________
bert_token_to_ids (BertIds)  multiple                  0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Reload the exported pipeline from disk to check that the SavedModel round-trips.
m2 = tf.keras.models.load_model(save_path)

In [19]:
# Inputs for the reloaded model: mixed-case special tokens, embedded whitespace
# and a plain English word. Note that this rebinds `text` (previously a tensor)
# to a Python list; `vec` is the tensor form that is fed to the model.
text = [
    ['我爱你[mask] [mask]哦'],
    ['我[unk][MASK]'],
    ['important']
]
vec = tf.constant(text)

In [20]:
# Run the reloaded model end to end: raw strings in, padded id matrix out.
m2(vec)

<tf.Tensor: shape=(3, 11), dtype=int32, numpy=
array([[ 101, 2769, 4263,  872,  103,  103, 1521,  102,    0,    0,    0],
       [ 101, 2769,  100,  103,  102,    0,    0,    0,    0,    0,    0],
       [ 101,  151,  155,  158,  157,  160,  162,  143,  156,  162,  102]],
      dtype=int32)>