In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"



In [2]:
import os
import shutil
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Raise the TensorFlow logger threshold to ERROR so lower-severity messages are not shown.
tf.get_logger().setLevel('ERROR')

In [13]:
def make_bert_preprocess_model(tfhub_handle_preprocess, input_names, seq_length=128):
    """Build a Keras model that turns raw string features into BERT inputs.

    Args:
      tfhub_handle_preprocess: handle of the preprocessing SavedModel; it is
        passed to `hub.load`, and the loaded object's `tokenize` and
        `bert_pack_inputs` members are wrapped as Keras layers.
      input_names: names of the string-valued input features. One scalar
        string `Input` is created per name, in the given order.
      seq_length: sequence length forwarded to `bert_pack_inputs`.

    Returns:
      A `tf.keras.Model` whose inputs are the string features named by
      `input_names` and whose output is the result of `bert_pack_inputs`
      applied to the tokenized segments.
    """
    # One string-typed input placeholder per feature name.
    string_inputs = []
    for feature_name in input_names:
        string_inputs.append(
            tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name))

    # The preprocessing SavedModel supplies both the tokenizer and the packer.
    preprocessor = hub.load(tfhub_handle_preprocess)

    # Split every input text into word pieces.
    tokenize_layer = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
    token_segments = list(map(tokenize_layer, string_inputs))

    # No explicit trimming step: the segments go to the packer unchanged and
    # the packer is left to apply its own default truncation to seq_length.
    # How the packed output is laid out is decided by the SavedModel itself.
    pack_layer = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments={'seq_length': seq_length},
        name='packer',
    )
    packed_inputs = pack_layer(token_segments)

    return tf.keras.Model(string_inputs, packed_inputs)

def get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, seq_length=256,
                        raw_text_list=None, verbose=True):
    """Preprocess raw texts and run them through a BERT encoder from TF Hub.

    Args:
      tfhub_handle_preprocess: handle of the preprocessing model, forwarded to
        `make_bert_preprocess_model`.
      tfhub_handle_encoder: handle of the encoder, loaded with `hub.KerasLayer`.
      seq_length: sequence length forwarded to the preprocessing model.
      raw_text_list: the texts to embed. A single `str`/`bytes` value is
        treated as a batch of one text. `None` is treated as empty.
      verbose: when True (the default), print the preprocessed inputs and the
        encoder outputs, including the full tensors. Pass False to run
        silently.

    Returns:
      The value returned by the encoder layer for the preprocessed batch.
      This function reads its "pooled_output" and "sequence_output" entries
      when printing.

    Raises:
      ValueError: if no text is supplied. This is checked before any model is
        loaded, so an empty call fails fast instead of inside the model.
    """
    # Normalise the input. `None` replaces the former mutable `[]` default,
    # and a bare string must be wrapped or it would become a 0-d array.
    if raw_text_list is None:
        raw_text_list = []
    elif isinstance(raw_text_list, (str, bytes)):
        raw_text_list = [raw_text_list]
    text_batch = np.array(raw_text_list)
    if text_batch.size == 0:
        # An empty list converts to a float64 array, not a string batch.
        raise ValueError('raw_text_list must contain at least one text')

    # Text preprocessing: a single string feature named 'my_input'.
    preprocess_model = make_bert_preprocess_model(tfhub_handle_preprocess, ['my_input'], seq_length)
    text_preprocessed = preprocess_model([text_batch])
    if verbose:
        print('Preprocessing!')
        print('Keys           : ', list(text_preprocessed.keys()))
        print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
        print('Word Ids       : ', text_preprocessed['input_word_ids'])
        print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
        print('Input Mask     : ', text_preprocessed['input_mask'])
        print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
        print('Type Ids       : ', text_preprocessed['input_type_ids'])

    # Run the encoder on the preprocessed batch.
    if verbose:
        print('Using Bert Model!')
    bert_model = hub.KerasLayer(tfhub_handle_encoder)
    bert_results = bert_model(text_preprocessed)
    if verbose:
        print(f'Loaded BERT: {tfhub_handle_encoder}')
        print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
        print(f'Pooled Outputs Values:{bert_results["pooled_output"]}')
        print("\n")
        print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
        print(f'Sequence Outputs Values:{bert_results["sequence_output"]}')

    return bert_results

In [10]:
raw_text_list = ['在這美麗的春天裡，一群小鳥在樹林中歡快地歌唱著，迎接新生命的到來。陽光明媚，微風拂過，樹葉輕輕地搖曳著舞動著。遠處的山峰在藍天的映襯下顯得格外妖娆。小溪清澈見底，不時傳來悅耳的水聲。這是一個充滿生機和希望的季節，萬物都在沐浴著春天的陽光，展現著勃勃生機。在古老的村落中，傳承著世代相傳的文化和傳統。古老的建築依然屹立，見證著歷史的變遷。村民們勤勞樸實，過著簡樸而幸福的生活。森林深處，叢林茂密，鳥語花香，猴子在樹上嬉戲玩耍，熊貓悠閒地躺在竹林間。這裡是大自然的樂園，充滿了神奇和生機，讓人心曠神怡，彷彿置身於童話般的世界中。在這個繁華的都市中，高樓大廈林立，車水馬龍，人來人往，燈紅酒綠，熙熙攘攘的街道上充斥著各種各樣的人群和商店。這裡是現代文明的象徵，每一個人都在為了自己的夢想和目標而努力奮鬥，生活充滿了無限可能性。',
            '當夜幕低垂，星光點點，月色漫洒，一個小村莊在遠處靜靜地沉睡。村莊的古老寺廟在月光下顯得格外神秘，廟宇前的石階上鋪滿了青苔，散發出淡淡的清香。村民們相信，這座寺廟是神明的居所，保佑著這個村莊的和平與安寧。在這樣的夜晚，人們在信仰中感受著神秘與安詳，期盼著明天的美好。'
]

In [11]:
# TF Hub handles used below: the bert_multi_cased encoder (L-12, H-768, A-12)
# and the bert_multi_cased preprocessing model.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'

In [14]:
# Run the whole pipeline on the sample texts. `seq_length` is forwarded to the
# preprocessing model, and `eb` holds the value returned by the encoder.
seq_length = 256
eb = get_embbedings_bert(tfhub_handle_preprocess, tfhub_handle_encoder, seq_length, raw_text_list)

Preprocessing!
Keys           :  ['input_word_ids', 'input_mask', 'input_type_ids']
Shape Word Ids :  (2, 256)
Word Ids       :  tf.Tensor(
[[  101  3031  7734  6417  8782  5718  4376  3198  7113 10064  2072  6424
   3459  8709  3031  4731  4520  2104  4789  3806  3035  4784  2886  6827
  10064  7695  4161  4333  5600  2827  5718  2555  2278  1882  8237  2432
   4368  3322 10064  3785  8445  4056  7758 10064  4731  6826  7617  7617
   3035  4199  4448  6827  6646  2621  6827  1882  7768  6946  5718  3504
   3539  3031  6916  3198  5718  4375  7136  2079  8412  3775  4580  3189
   3251   100  1882  3459  5118  5061  5201  7143  3662 10064  2080  4388
   2387  2278  3858  6468  5718  4867  6488  1882  7734  4380  2072  2333
   2428  5154  5600  4741  2833  3605  4468  5718  3363  6104 10064  6820
   5407  7838  3031  4909  5004  6827  4376  3198  5718  8237  2432 10064
   3495  5512  6827  2614  2614  5600  4741  1882  3031  2747  6455  5718
   4492  6824  2104 10064  2387  4029  6827  2

In [18]:
# Display the encoder's pooled output; the cell output below shows shape (2, 768),
# i.e. one row per input text.
eb['pooled_output']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.25509596, -0.13464287,  0.17952994, ..., -0.43196055,
         0.15226448,  0.35424572],
       [ 0.24260353, -0.10071942,  0.11895233, ..., -0.30264026,
         0.18844548,  0.30124283]], dtype=float32)>