In [1]:
# Install each package only when `pip list | grep <name>` finds no match;
# when it does match, the matching line is echoed instead.
!(pip list | grep tqdm) || pip install tqdm
!(pip list | grep tensorflow-hub) || pip install tensorflow-hub
!(pip list | grep tokenizers) || pip install tokenizers

tqdm                     4.41.1         
tensorflow-hub           0.8.0          
tokenizers               0.7.0          


In [2]:
import os
import json

os.environ['TFHUB_DOWNLOAD_PROGRESS'] = '1'

from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from tokenizers import BertWordPieceTokenizer

In [3]:
!curl --output train.json https://code.aliyun.com/qhduan/dataset/raw/88b3182c9f9d6185935d4484dfefefc23f50eaa7/LCQMC/train.json
!curl --output dev.json https://code.aliyun.com/qhduan/dataset/raw/88b3182c9f9d6185935d4484dfefefc23f50eaa7/LCQMC/dev.json
!curl --output vocab.txt https://code.aliyun.com/qhduan/zh-bert/raw/0fb1d96ec2133fe25e66bee12fe387cbe1e52938/vocab.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25.9M    0 25.9M    0     0  4113k      0 --:--:--  0:00:06 --:--:-- 6947k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1060k    0 1060k    0     0   422k      0 --:--:--  0:00:02 --:--:--  422k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  106k    0  106k    0     0  62630      0 --:--:--  0:00:01 --:--:-- 62630


In [4]:
def _read_json_lines(path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    The file is opened explicitly as UTF-8 (the records contain Chinese text,
    so relying on the platform default encoding is fragile) and is closed as
    soon as parsing finishes. Blank lines are skipped rather than passed to
    the JSON parser.
    """
    with open(path, encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


train = _read_json_lines('train.json')
dev = _read_json_lines('dev.json')

In [5]:
# Show the first record of each split to confirm the schema
# (the output below shows 'sentence1', 'sentence2' and a string 'label').
print(train[0], dev[0])

{'sentence1': '喜欢打篮球的男生喜欢什么样的女生', 'sentence2': '爱打篮球的男生喜欢什么样的女生', 'label': '1'} {'sentence1': '开初婚未育证明怎么弄？', 'sentence2': '初婚未育情况证明怎么开？', 'label': '1'}


In [6]:
# WordPiece tokenizer built from the downloaded vocab.txt; compose_data reads
# this module-level object.
tokenizer = BertWordPieceTokenizer("vocab.txt")

In [7]:
def compose_data(data, batch_size=32):
    """Turn a list of sentence-pair records into a batched `tf.data.Dataset`.

    Args:
        data: sequence of dict-like records providing 'sentence1',
            'sentence2' and 'label' entries.
        batch_size: number of examples per batch.

    Returns:
        A dataset yielding `(tokens, labels)` per batch, where `tokens` is the
        string token batch converted to a dense tensor with
        `RaggedTensor.to_tensor()` and `labels` is an int32 tensor.

    Relies on the module-level `tokenizer` to encode each pair.
    """
    # Encode every (sentence1, sentence2) pair and keep the string tokens.
    token_rows = []
    for record in tqdm(data):
        encoding = tokenizer.encode(
            record.get('sentence1'), record.get('sentence2'))
        token_rows.append(encoding.tokens)
    labels = [int(record.get('label')) for record in data]

    # Token rows have different lengths, hence a ragged tensor.
    features = tf.ragged.constant(token_rows, tf.string)
    targets = tf.constant(labels, tf.int32)

    @tf.autograph.experimental.do_not_convert
    def _densify(tokens, label):
        # Runs after batching, so each batch is made dense on its own.
        return tokens.to_tensor(), label

    feature_ds = tf.data.Dataset.from_tensor_slices(features)
    label_ds = tf.data.Dataset.from_tensor_slices(targets)
    paired = tf.data.Dataset.zip((feature_ds, label_ds))
    return paired.batch(batch_size).map(_densify)

In [8]:
# Build the batched datasets (default batch_size=32) for both splits.
data_train = compose_data(train)
data_dev = compose_data(dev)

100%|██████████| 238766/238766 [00:10<00:00, 23469.84it/s]
100%|██████████| 8802/8802 [00:00<00:00, 22555.26it/s]


In [9]:
# Pull a single batch to sanity-check the shapes.
# Note: `x` and `y` stay bound after the loop, and later cells reuse `x` as a
# sample input batch.
for x, y in data_train.take(1):
  print(x.shape, y.shape)

(32, 52) (32,)


In [10]:
# Wrap the hub module at this URL as a Keras layer.
# `output_key='pooled_output'` selects that entry from the module's outputs,
# and `trainable=True` lets its weights be updated during fitting.
bert = hub.KerasLayer(
    'https://code.aliyun.com/qhduan/zh-roberta-wwm/raw/2c0d7fd709e4719a9ab2ca297f51b24e20586dbe/zh-roberta-wwm-L12.tar.gz',
    output_key='pooled_output',
    trainable=True)

In [11]:
# Run the encoder on the sample batch `x` left over from the shape-check loop.
pred_y = bert(x)

In [18]:
# Check the encoder's output shape for the sample batch.
print(pred_y.shape)

(32, 768)


In [21]:
# Using tf.keras.Sequential here may make the saved model impossible to load,
# so the classifier is assembled with the functional API instead:
# string token input -> encoder layer -> 2-way softmax head.
inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.string)
m = inputs
m = bert(m)
m = tf.keras.layers.Dense(2, activation='softmax')(m)
model = tf.keras.Model(inputs=inputs, outputs=m)

In [22]:
# Sparse categorical cross-entropy matches the integer labels built in
# compose_data and the 2-unit softmax output; Adam runs at a 5e-6 learning
# rate, and accuracy is tracked as 'acc'.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-6),
    metrics=['acc']
)

In [23]:
# Smoke-test the classifier on the sample batch `x` before any fitting.
model.predict(x)

array([[0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.94146293],
       [0.05853711, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853709, 0.94146293],
       [0.05853709, 0.94146293],
       [0.05853709, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.94146293],
       [0.05853711, 0.94146293],
       [0.05853711, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853709, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.94146293],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.05853711, 0.9414628 ],
       [0.

In [24]:
# Print the layer-by-layer summary with parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
keras_layer (KerasLayer)     (None, 768)               102880904 
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538      
Total params: 102,882,442
Trainable params: 102,882,442
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Fine-tune for one epoch on the training batches, validating on the dev set.
model.fit(data_train, epochs=1, validation_data=data_dev)



<tensorflow.python.keras.callbacks.History at 0x7f3e68eb8e10>

In [26]:
# Evaluate on the dev set; the result lists the loss followed by the 'acc'
# metric configured in compile.
model.evaluate(data_dev)



[0.5099160671234131, 0.7627812027931213]

In [27]:
# Export the fine-tuned model to the ./test_model directory.
model.save('./test_model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: ./test_model/assets


INFO:tensorflow:Assets written to: ./test_model/assets


In [28]:
# Reload the export to confirm the saved model can be loaded back.
tf.keras.models.load_model('./test_model')

<tensorflow.python.keras.engine.training.Model at 0x7f3e81173e48>