In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.1.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.1
sklearn 0.22.1
tensorflow 2.1.0
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf


In [3]:
import tensorflow_datasets as tfds
examples, info = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
train_data, val_data = examples['train'], examples['validation']



In [4]:
pt_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_data), target_vocab_size = 2 ** 13 
)

en_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_data), target_vocab_size = 2 ** 13 
)


In [122]:
buffer_size = 20000
batch_size = 64
max_length = 40

def encode_to_subword(pt_sentence, en_sentence):
    pt_sequence = [pt_tokenizer.vocab_size] + pt_tokenizer.encode(pt_sentence.numpy()) + [pt_tokenizer.vocab_size + 1]
    en_sequence = [en_tokenizer.vocab_size] + en_tokenizer.encode(en_sentence.numpy()) + [en_tokenizer.vocab_size + 1]
    return pt_sequence, en_sequence

#def encode_to_subword(pt_sentence, en_sentence):
# <start> sentence<end>
#    pt_sequence = [pt_tokenizer.vocab_size] + pt_tokenizer.encode(pt_sentence.numpy()) + [pt_tokenizer.vocab_size + 1]
#    en_sequence = [en_tokenizer.vocab_size] + en_tokenizer.encode(en_sentence.numpy()) + [en_tokenizer.vocab_size + 1]
#    return pt_sequence, en_sequence

def filter_by_max_length(pt_sequence, en_sequence):
    return tf.logical_and(tf.size(pt_sequence) < max_length, tf.size(en_sequence) < max_length)

def tf_func_encode_to_subword(pt_sentence, en_sentence):
    return tf.py_function(encode_to_subword, [pt_sentence, en_sentence], [tf.int64, tf.int64])

#train_dataset = train_data.map(tf_func_encode_to_subword)
#train_dataset = train_dataset.filter(filter_by_max_length)
# [-1], [-1]表示当前函数有两个维度，每个维度都在当前维度扩展到最大值
#train_dataset.shuffle(buffer_size).padded_batch(batch_size, padded_shapes=([-1], [-1]))

train_dataset = train_data.map(tf_func_encode_to_subword).filter(filter_by_max_length).shuffle(buffer_size).padded_batch(batch_size, padded_shapes=([-1], [-1]))
val_dataset = val_data.map(tf_func_encode_to_subword).filter(filter_by_max_length).shuffle(buffer_size).padded_batch(batch_size, padded_shapes=([-1], [-1]))

In [123]:
for pt_batch, en_batch in train_dataset.take(5):
    print(pt_batch.shape, en_batch.shape)

(64, 37) (64, 39)
(64, 37) (64, 38)
(64, 39) (64, 39)
(64, 38) (64, 37)
(64, 38) (64, 38)


In [99]:
# PE(pos, 2i) = sin(pos /10000^(2i/d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

# pos.shape: [sentence_length, 1]
# i.shape: [1, d_model]
# result.shape: [sentence_length, d_model]
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i / 2)) / np.float32(d_model))
    return pos * angle_rates
                               
def get_position_embedding(sequence_length, d_model):
    angles = get_angles(np.arange(sequence_length)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
    sines = np.sin(angles[:, 0::2])
    cosines = np.cos(angles[:, 1::2])
    position_embedding = np.concatenate([sines, cosines], axis=-1)
    position_embedding = position_embedding[np.newaxis, :]   # 为了后续使用方便
    return tf.cast(position_embedding, dtype=tf.float32)

position_embedding = get_position_embedding(40, 512)
print(position_embedding.shape)

(1, 40, 512)


In [114]:
# batch_data.shape: [batch_size, seq_len]
def create_padding_mask(batch_data):
    print(batch_data.shape)
    mask = tf.cast(tf.math.equal(batch_data, 0), tf.float32)
    # [batch_size, 1, 1, seq_len]
    return mask[:, np.newaxis, np.newaxis, :]
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

create_look_ahead_mask(5)

(3, 5)


<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0.]], dtype=float32)>

In [101]:
def scaled_dot_product_attention(q, k, v, mask):
    # q.shape: (..., seq_len_q, depth)
    # k.shape: (..., seq_len, depth)
    # v.shape: (..., seq_len, depth_v)
    
    # matmul_qk.shape: (batch_size, seq_len_q, seq_len)
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(k.shape[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += mask * (-1e9)
    
    # shape: (..., seq_len_q, seq_len)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    # shape: (..., seq_len_q, depth_v)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights
    
def print_scaled_dot_product_attention(q, k, v):
    temp_out, temp_att = scaled_dot_product_attention(q, k, v, None)
    print("Attention weights are:")
    print(temp_att)
    print("Output is:")
    print(temp_out)
    
temp_k = tf.constant([[10, 0, 0], 
                                   [0, 10, 0],
                                   [0, 0, 10],
                                   [0, 0, 10]], dtype=tf.float32)  # (4, 3)
temp_v = tf.constant([[10, 0], [10, 0], [100, 5], [1000, 6]], dtype=tf.float32) # (4, 2)

temp_q1 = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
np.set_printoptions(suppress=True)
print_scaled_dot_product_attention(temp_q1, temp_k, temp_v)
# q * k -> 只有[0, 10, 0]行的结果非0, 所以是　[0, 1, 0, 0]
# q * k * v -> [0, 1, 0, 0]与v相乘只有第二行有结果, 即[10, 0]

Attention weights are:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)


In [102]:
class MultiHeadAttention(keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads
        self.WQ = keras.layers.Dense(d_model)
        self.WK = keras.layers.Dense(d_model)
        self.WV = keras.layers.Dense(d_model)
        self.dense = keras.layers.Dense(d_model)

    def reshape_heads(self, x, batch_size):
        # x.shape: (batch_size, seq_len, d_model)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, q, k, v, mask):
        batch_size = q.shape[0]
        q = self.reshape_heads(self.WQ(q), batch_size)
        k = self.reshape_heads(self.WK(k), batch_size)
        v = self.reshape_heads(self.WQ(v), batch_size)
        
        scaled_attention_output, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # scaled_attention_output.shape: (batch_size, num_heads, seq_len, depth)
        
        # shape: (batch_size, seq_len, d_model)
        scaled_attention_outputs = tf.reshape(tf.transpose(scaled_attention_output, [0, 2, 1, 3]),
                                                                  (batch_size, -1, self.d_model))
        return self.dense(scaled_attention_outputs), attention_weights
        
temp_mha = MultiHeadAttention(d_model=512, num_heads=8) # dk = 64
y = tf.random.uniform((1, 60, 256)) #(batch_size, seq_len_q, dim)
output, attn = temp_mha(y, y, y, mask=None)
print(output.shape)
print(attn.shape)

(1, 60, 512)
(1, 8, 60, 60)


In [103]:
def feed_forward_network(d_model, dff):
    return keras.models.Sequential([
        keras.layers.Dense(dff, activation='relu'),
        keras.layers.Dense(d_model)
    ])

sample_ffn = feed_forward_network(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

TensorShape([64, 50, 512])

In [104]:
class EncoderLayer(keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = feed_forward_network(d_model, dff)
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.drop1 = keras.layers.Dropout(rate)
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.drop2 = keras.layers.Dropout(rate)
        
    def call(self, x, is_training, mask):
        # shape: (batch_size, seq_len, d_model)
        attention_out, _ = self.mha(x, x, x, mask)
        attention_out = self.drop1(attention_out, training=is_training)
        out1 = self.norm1(x + attention_out)
        
        ffn_out = self.ffn(out1)
        ffn_out = self.drop2(ffn_out, training=is_training)
        out2 = self.norm2(out1 + ffn_out)
        return out2
    
sample_encoder_layer = EncoderLayer(512, 8, 2048)
sampe_input = tf.random.uniform((64, 50, 512))
sample_output = sample_encoder_layer(sampe_input, False, None)
print(sample_output.shape)

(64, 50, 512)


In [105]:
class DecoderLayer(keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = feed_forward_network(d_model, dff)
        
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm3 = keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.drop1 = keras.layers.Dropout(rate)
        self.drop2 = keras.layers.Dropout(rate)
        self.drop3 = keras.layers.Dropout(rate)
        
    def call(self, x, encoder_outputs, is_training, look_ahead_mask, padding_mask):
        # shape: (batch_size, seq_len, d_model)
        atten_out1, atten_weight1 = self.mha1(x, x, x, look_ahead_mask)
        atten_out1 = self.drop1(atten_out1, training=is_training)
        out1 = self.norm1(x + atten_out1)
        
        atten_out2, atten_weight2 = self.mha2(x, encoder_outputs, encoder_outputs, padding_mask)
        atten_out2 = self.drop2(atten_out2, training=is_training)
        out2 = self.norm2(out1 + atten_out2)
        
        ffn_out = self.ffn(out2)
        ffn_out = self.drop3(ffn_out, training=is_training)
        out3 = self.norm3(ffn_out + out2)
        return out3, atten_weight1, atten_weight2
    
sample_decoder_layer = DecoderLayer(512, 8, 2048)
sample_decoder_input = tf.random.uniform((64, 60, 512))
sample_decoder_output, sample_decoder_attn_weights1, sample_deocder_attn_weights2 = sample_decoder_layer(sample_decoder_input, sample_output, False, None, None)

print(sample_decoder_output.shape)
print(sample_decoder_attn_weights1.shape)
print(sample_deocder_attn_weights2.shape)

(64, 60, 512)
(64, 8, 60, 60)
(64, 8, 60, 50)


In [106]:
class EncoderModel(keras.layers.Layer):
    def __init__(self, num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.max_length = max_length
        self.embedding = keras.layers.Embedding(input_vocab_size, d_model)
        self.position_embedding = get_position_embedding(max_length, d_model)
        self.dropout = keras.layers.Dropout(rate)
        self.encoder_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        
    def call(self, x, is_training, mask):
        # x.shape: (batch_size, input_seq_len)
        input_seq_len = x.shape[1]
        tf.debugging.assert_less_equal(input_seq_len, self.max_length, "input_seq_len should be less or equal to self.max_length")
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.position_embedding[:, :input_seq_len, :]
        
        x = self.dropout(x, training=is_training)
        for i in range(self.num_layers):
            encoder_layer = self.encoder_layers[i]
            x = encoder_layer(x, is_training, mask)
        return x
    
sample_encoder_model = EncoderModel(2, 8500, max_length, 512, 8, 2048)
sample_encoder_model_input = tf.random.uniform((64, 37))
sample_encoder_model_output = sample_encoder_model(sample_encoder_model_input, False, mask=None)

print(sample_encoder_model_output.shape)

(64, 37, 512)


In [126]:
class DecoderModel(keras.layers.Layer):
    def __init__(self, num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.max_length = max_length
        self.embedding = keras.layers.Embedding(target_vocab_size, d_model)
        self.position_embedding = get_position_embedding(max_length, d_model)
        self.dropout = keras.layers.Dropout(rate)
        self.decoder_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        
    def call(self, x, encoding_outputs, training, look_ahead_mask, padding_mask):
        out_seq_len = tf.shape(x)[1]
        tf.debugging.assert_less_equal(out_seq_len, self.max_length, "output_seq_len should be less or equal to self.max_length")
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        pos_embedding = self.position_embedding[:, :out_seq_len, :]
        print(pos_embedding.shape)
        x += pos_embedding
        x = self.dropout(x, training=training)
        
        attention_weights = {}
        for i in range(self.num_layers):
            x, attn1, attn2 = self.decoder_layers[i](x, encoding_outputs, training, look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_att1'.format(i+1)] = attn1
            attention_weights['decoder_layer{}_att2'.format(i+1)] = attn2
        return x, attention_weights
    
sample_decoder_model = DecoderModel(2, 8000, max_length, 512, 8, 2048)
sample_decoder_model_input = tf.random.uniform((64, 35))
sample_decoder_model_output, sample_decoder_model_att = sample_decoder_model(sample_decoder_model_input, sample_encoder_model_output, training=False, look_ahead_mask=None, padding_mask=None)
print(sample_encoder_model_output.shape)
for key in sample_decoder_model_att:
    print(sample_decoder_model_att[key].shape)

(1, 35, 512)
(64, 37, 512)
(64, 8, 35, 35)
(64, 8, 35, 37)
(64, 8, 35, 35)
(64, 8, 35, 37)


In [108]:
class TransformerModel(keras.models.Model):
    def __init__(self, num_layers, input_vocab_size, target_vocab_size, max_length, d_model, num_heads, dff, rate):
        super().__init__()
        self.encoder_model = EncoderModel(num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate)
        self.decoder_model = DecoderModel(num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate)
        self.final_layer = keras.layers.Dense(target_vocab_size)
        
    def call(self, inp, tar, training, encoder_padding_mask, look_ahead_mask, decoder_padding_mask):
        encoder_output = self.encoder_model(inp, training, encoder_padding_mask)
        decoder_output, attention_weights = self.decoder_model(tar, encoder_output, training, look_ahead_mask, decoder_padding_mask)
        prediction = self.final_layer(decoder_output)
        return prediction, attention_weights

sample_transformer = TransformerModel(2, 8500, 8000, 40, 512, 8, 2048, 0.1)
temp_input = tf.random.uniform((64, 26))
temp_target = tf.random.uniform((64, 31))

predictions, attention_weights = sample_transformer(temp_input, temp_target, training=False, encoder_padding_mask=None, look_ahead_mask=None, decoder_padding_mask =None)
print(predictions.shape)
for key in attention_weights:
    print(key, attention_weights[key].shape)

(64, 31, 512)
(1, 31, 512)
(64, 31, 8000)
decoder_layer1_att1 (64, 8, 31, 31)
decoder_layer1_att2 (64, 8, 31, 26)
decoder_layer2_att1 (64, 8, 31, 31)
decoder_layer2_att2 (64, 8, 31, 26)


In [109]:
# 1. initialize model
# 2. define loss, optimizer, learning_rate schedule
# 3. train_step
# 4. train process
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
input_vocab_size = pt_tokenizer.vocab_size + 2
target_vocab_size = en_tokenizer.vocab_size + 2
dropout_rate = 0.1
transformer = Transformer(num_layers, input_vocab_size, target_vocab_size, max_length, d_model, num_heads, dff, dropout_rate)

In [110]:
# learning_rate调整策略
# lrate = (d_model ** -0.5) * min(step_num ** (-0.5), step_num * warm_up_steps ** (-1.5))

class CustomizedSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self, step_num):
        arg1 = tf.math.rsqrt(self.d_model)
        arg2 = tf.math.rsqrt(step_num)
        arg3 = step_num * (self.warmup_steps ** (-1.5))
        return arg1 * tf.math.minimum(arg2, arg3)
    
learning_rate = CustomizedSchedule(d_model)
optimizer = keras.optimizers.Adam(learning_rate, beta_1 = 0.9, beta_2=0.98, epsilon=1e-9)
temp_learning_rate_schedule = CustomizedSchedule(d_model)
#plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
#plt.ylabel("Learning rate")
#plt.xlabel("Train Step")

In [111]:
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(real, pred):
    mask = tf.math.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [128]:
def create_masks(inp, tar):
    encoder_padding_mask = create_padding_mask(inp)
    encoder_decoder_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    decoder_padding_mask = create_padding_mask(tar)
    
    decoder_mask = tf.maximum(look_ahead_mask, decoder_padding_mask)
    return encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask

temp_inp, temp_tar = iter(train_dataset.take(1)).next()
create_masks(temp_inp, temp_tar)

(64, 39)
(64, 39)
(64, 36)


(<tf.Tensor: shape=(64, 1, 1, 39), dtype=float32, numpy=
 array([[[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        ...,
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]]], dtype=float32)>,
 <tf.Tensor: shape=(64, 1, 36, 36), dtype=float32, numpy=
 array([[[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.

In [129]:
train_loss = keras.metrics.Mean(name='train_loss')
train_accuracy = keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_out = tar[:, 1:]
    encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        pred, _ = transformer(inp, tar_inp, True, encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask)
        loss_ = loss_func(tar_out, pred)
    gradients = tape.gradient(loss_, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss_)
    
epochs = 10
for epoch in range(epochs):
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy{:.4f}'.format(epoch+1, batch, train_loss.result(), train_accuracy.result()))
    print('Epoch{} Loss{:.4f} Accuracy {:.4f}'.format(epoch+1, train_loss.result(), train_accuracy.result()))
    print('Time take for 1 epoch: {} secs\n'.format(time.time() - start))

(64, 38)
(64, 38)
(64, 37)
(64, 37, 128)
(1, 37, 128)


TypeError: in converted code:

    <ipython-input-129-fedf814bbc48>:13 train_step  *
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:440 apply_gradients
        apply_state = self._prepare(var_list)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:626 _prepare
        self._prepare_local(var_device, var_dtype, apply_state)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/adam.py:157 _prepare_local
        super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:632 _prepare_local
        lr_t = array_ops.identity(self._decayed_lr(var_dtype))
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:684 _decayed_lr
        lr_t = math_ops.cast(lr_t(local_step), var_dtype)
    <ipython-input-110-4e9898586df5>:11 __call__
        arg1 = tf.math.rsqrt(self.d_model)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_math_ops.py:7981 rsqrt
        "Rsqrt", x=x, name=name)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py:576 _apply_op_helper
        param_name=input_name)
    /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py:61 _SatisfiesTypeConstraint
        ", ".join(dtypes.as_dtype(x).name for x in allowed_list)))

    TypeError: Value passed to parameter 'x' has DataType int32 not in list of allowed values: bfloat16, float16, float32, float64, complex64, complex128
