In [25]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"



In [32]:
import os
import shutil
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [27]:
bert_model_name = "bert_multi_cased_L-12_H-768_A-12"

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]


print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3


In [37]:
raw_text = ['在這美麗的春天裡，一群小鳥在樹林中歡快地歌唱著，迎接新生命的到來。陽光明媚，微風拂過，樹葉輕輕地搖曳著舞動著。遠處的山峰在藍天的映襯下顯得格外妖娆。小溪清澈見底，不時傳來悅耳的水聲。這是一個充滿生機和希望的季節，萬物都在沐浴著春天的陽光，展現著勃勃生機。在古老的村落中，傳承著世代相傳的文化和傳統。古老的建築依然屹立，見證著歷史的變遷。村民們勤勞樸實，過著簡樸而幸福的生活。森林深處，叢林茂密，鳥語花香，猴子在樹上嬉戲玩耍，熊貓悠閒地躺在竹林間。這裡是大自然的樂園，充滿了神奇和生機，讓人心曠神怡，彷彿置身於童話般的世界中。在這個繁華的都市中，高樓大廈林立，車水馬龍，人來人往，燈紅酒綠，熙熙攘攘的街道上充斥著各種各樣的人群和商店。這裡是現代文明的象徵，每一個人都在為了自己的夢想和目標而努力奮鬥，生活充滿了無限可能性。'
]

In [29]:
# preprocessor = hub.load(
#     "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/multi-cased-preprocess/3"
#     )

# # Step 1: tokenize batches of text inputs.
# text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string),
#                ...] # This SavedModel accepts up to 2 text inputs.
# tokenize = hub.KerasLayer(preprocessor.tokenize)
# tokenized_inputs = [tokenize(segment) for segment in text_inputs]

# # Step 2 (optional): modify tokenized inputs.
# pass

# # Step 3: pack input sequences for the Transformer encoder.
# seq_length = 128  # Your choice here.
# bert_pack_inputs = hub.KerasLayer(
#     preprocessor.bert_pack_inputs,
#     arguments=dict(seq_length=seq_length))  # Optional argument.
# encoder_inputs = bert_pack_inputs(tokenized_inputs)

ValueError: Exception encountered when calling layer 'keras_layer_10' (type KerasLayer).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * Ellipsis
    * False
    * None
  Keyword arguments: {}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
    * False
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
    * False
    * None
  Keyword arguments: {}

Option 3:
  Positional arguments (3 total):
    * TensorSpec(shape=(None,), dtype=tf.string, name='sentences')
    * True
    * None
  Keyword arguments: {}

Option 4:
  Positional arguments (3 total):
    * TensorSpec(shape=(None,), dtype=tf.string, name='inputs')
    * True
    * None
  Keyword arguments: {}

Call arguments received by layer 'keras_layer_10' (type KerasLayer):
  • inputs=Ellipsis
  • training=None

In [33]:
def make_bert_preprocess_model(sentence_features, seq_length=128):
  """Returns Model mapping string features to BERT inputs.

  Args:
    sentence_features: a list with the names of string-valued features.
    seq_length: an integer that defines the sequence length of BERT inputs.

  Returns:
    A Keras Model that can be called on a list or dict of string Tensors
    (with the order or names, resp., given by sentence_features) and
    returns a dict of tensors for input to BERT.
  """

  input_segments = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in sentence_features]

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load(tfhub_handle_preprocess)
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  segments = [tokenizer(s) for s in input_segments]

  # Optional: Trim segments in a smart way to fit seq_length.
  # Simple cases (like this example) can skip this step and let
  # the next step apply a default truncation to approximately equal lengths.
  truncated_segments = segments

  # Pack inputs. The details (start/end token ids, dict of output tensors)
  # are model-dependent, so this gets loaded from the SavedModel.
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  model_inputs = packer(truncated_segments)
  return tf.keras.Model(input_segments, model_inputs)

In [40]:
test_preprocess_model = make_bert_preprocess_model(['my_input1'], 512)
test_text = [np.array(raw_text)]
text_preprocessed = test_preprocess_model(test_text)

print('Keys           : ', list(text_preprocessed.keys()))
print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
print('Word Ids       : ', text_preprocessed['input_word_ids'][0, :])
print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
print('Input Mask     : ', text_preprocessed['input_mask'][0, :])
print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
print('Type Ids       : ', text_preprocessed['input_type_ids'][0, :])

Keys           :  ['input_type_ids', 'input_word_ids', 'input_mask']
Shape Word Ids :  (1, 512)
Word Ids       :  tf.Tensor(
[  101  3031  7734  6417  8782  5718  4376  3198  7113 10064  2072  6424
  3459  8709  3031  4731  4520  2104  4789  3806  3035  4784  2886  6827
 10064  7695  4161  4333  5600  2827  5718  2555  2278  1882  8237  2432
  4368  3322 10064  3785  8445  4056  7758 10064  4731  6826  7617  7617
  3035  4199  4448  6827  6646  2621  6827  1882  7768  6946  5718  3504
  3539  3031  6916  3198  5718  4375  7136  2079  8412  3775  4580  3189
  3251   100  1882  3459  5118  5061  5201  7143  3662 10064  2080  4388
  2387  2278  3858  6468  5718  4867  6488  1882  7734  4380  2072  2333
  2428  5154  5600  4741  2833  3605  4468  5718  3363  6104 10064  6820
  5407  7838  3031  4909  5004  6827  4376  3198  5718  8237  2432 10064
  3495  5512  6827  2614  2614  5600  4741  1882  3031  2747  6455  5718
  4492  6824  2104 10064  2387  4029  6827  2087  2202  5760  2387  5718

TypeError: count_nonzero(): argument 'input' (position 1) must be Tensor, not tensorflow.python.framework.ops.EagerTensor

In [41]:
test_preprocess_model = make_bert_preprocess_model(['my_input1'], 256)
test_text = [np.array(raw_text)]
text_preprocessed = test_preprocess_model(test_text)

print('Keys           : ', list(text_preprocessed.keys()))
print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
print('Word Ids       : ', text_preprocessed['input_word_ids'][0, :])
print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
print('Input Mask     : ', text_preprocessed['input_mask'][0, :])
print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
print('Type Ids       : ', text_preprocessed['input_type_ids'][0, :])

Keys           :  ['input_type_ids', 'input_word_ids', 'input_mask']
Shape Word Ids :  (1, 256)
Word Ids       :  tf.Tensor(
[  101  3031  7734  6417  8782  5718  4376  3198  7113 10064  2072  6424
  3459  8709  3031  4731  4520  2104  4789  3806  3035  4784  2886  6827
 10064  7695  4161  4333  5600  2827  5718  2555  2278  1882  8237  2432
  4368  3322 10064  3785  8445  4056  7758 10064  4731  6826  7617  7617
  3035  4199  4448  6827  6646  2621  6827  1882  7768  6946  5718  3504
  3539  3031  6916  3198  5718  4375  7136  2079  8412  3775  4580  3189
  3251   100  1882  3459  5118  5061  5201  7143  3662 10064  2080  4388
  2387  2278  3858  6468  5718  4867  6488  1882  7734  4380  2072  2333
  2428  5154  5600  4741  2833  3605  4468  5718  3363  6104 10064  6820
  5407  7838  3031  4909  5004  6827  4376  3198  5718  8237  2432 10064
  3495  5512  6827  2614  2614  5600  4741  1882  3031  2747  6455  5718
  4492  6824  2104 10064  2387  4029  6827  2087  2202  5760  2387  5718

In [20]:
print(f'Keys       : {list(encoder_inputs.keys())}')
print(f'Shape      : {encoder_inputs["input_word_ids"].shape}')
print(f'Word Ids   : {encoder_inputs["input_word_ids"]}')
print(f'Input Mask : {encoder_inputs["input_mask"][0, :]}')
print(f'Type Ids   : {encoder_inputs["input_type_ids"][0, :]}')

Keys       : ['input_type_ids', 'input_mask', 'input_word_ids']
Shape      : (None, 512)
Word Ids   : KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), name='keras_layer_5/PartitionedCall:2', description="created by layer 'keras_layer_5'")
Input Mask : KerasTensor(type_spec=TensorSpec(shape=(512,), dtype=tf.int32, name=None), name='tf.__operators__.getitem_3/strided_slice:0', description="created by layer 'tf.__operators__.getitem_3'")
Type Ids   : KerasTensor(type_spec=TensorSpec(shape=(512,), dtype=tf.int32, name=None), name='tf.__operators__.getitem_4/strided_slice:0', description="created by layer 'tf.__operators__.getitem_4'")


In [14]:
# bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
# text_preprocessed = bert_preprocess_model(test_text)
# print(f'Keys       : {list(text_preprocessed.keys())}')
# print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
# print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :]}')
# print(f'Input Mask : {text_preprocessed["input_mask"][0, :]}')
# print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [  101  3031  7734  6417  8782  5718  4376  3198  7113 10064  2072  6424
  3459  8709  3031  4731  4520  2104  4789  3806  3035  4784  2886  6827
 10064  7695  4161  4333  5600  2827  5718  2555  2278  1882  8237  2432
  4368  3322 10064  3785  8445  4056  7758 10064  4731  6826  7617  7617
  3035  4199  4448  6827  6646  2621  6827  1882  7768  6946  5718  3504
  3539  3031  6916  3198  5718  4375  7136  2079  8412  3775  4580  3189
  3251   100  1882  3459  5118  5061  5201  7143  3662 10064  2080  4388
  2387  2278  3858  6468  5718  4867  6488  1882  7734  4380  2072  2333
  2428  5154  5600  4741  2833  3605  4468  5718  3363  6104 10064  6820
  5407  7838  3031  4909  5004  6827  4376  3198  5718  8237  2432 10064
  3495  5512  6827  2614  2614  5600  4741   102]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [46]:
# using the bert model
bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_results = bert_model(text_preprocessed)

In [47]:
print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print("\n")
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3
Pooled Outputs Shape:(1, 768)
Pooled Outputs Values:[ 0.25509596 -0.1346429   0.17952994 -0.29601434 -0.18753952  0.5486122
  0.18474422  0.06925251 -0.35944673  0.4102258   0.02746217 -0.27535892]


Sequence Outputs Shape:(1, 256, 768)
Sequence Outputs Values:[[ 0.41470388  0.3225596   0.12216708 ...  0.46025234  0.19598807
  -0.04283614]
 [-0.08378892  0.0581618   1.0475366  ...  0.49799496  0.42801988
  -0.1365274 ]
 [ 0.52187616  0.17823525  0.23103118 ...  0.04721265  0.59018356
  -0.00605807]
 ...
 [-0.3425064   0.8184923   0.31473142 ...  0.07185332 -0.04120203
   0.35766715]
 [ 0.11016785 -0.0575197   1.5136073  ... -0.03025512  0.30609918
  -0.10369077]
 [ 0.7740867  -0.09425966  1.2691633  ... -0.07537282  1.0354899
   0.4844886 ]]
