# Word representation using TensorFlow-BERT


Reference: [BERT model collection on TensorFlow Hub](https://tfhub.dev/google/collections/bert/1)

In [None]:
# Install the packages this notebook depends on into the kernel's environment.
# NOTE(review): no versions are pinned; tensorflow and tensorflow_text
# presumably need matching releases — consider pinning both.
%pip install tensorflow
# tensorflow_text is imported in the preprocessing cell.
%pip install tensorflow_text
# tensorflow_hub supplies hub.load / hub.KerasLayer used to fetch the models.
%pip install tensorflow_hub
# NOTE(review): tf_models_official is not imported by any cell visible here —
# confirm it is still required.
%pip install tf_models_official

In [1]:
import numpy as np

# Toy corpus: four short lowercase English sentences, stored as a 1-D NumPy
# string array so the whole batch can be handed to the preprocessor at once.
data = np.asarray((
    "this is the first document",
    "this document is the second document",
    "this is the third one not the first nor the third",
    "is this the first document or is is another document",
))

# Display the corpus as the cell output.
data

array(['this is the first document',
       'this document is the second document',
       'this is the third one not the first nor the third',
       'is this the first document or is is another document'],
      dtype='<U52')

## I. Text preprocessing

In [29]:
import tensorflow_hub as hub
# `text` is never referenced directly in this cell.
# NOTE(review): presumably imported for its side effect of registering the
# ops the preprocessing model needs — confirm before removing.
import tensorflow_text as text

# Load the uncased-English BERT *preprocessing* model from TF Hub
# (the encoder itself is loaded separately, later in the notebook).
preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

# Run the raw sentences through the preprocessor; the result is a dict of
# tensors keyed by input name.
text_preprocessed = preprocess(data)

# Summarise the result: the available keys, the batch shape, and the first
# 12 positions of each tensor for sentence 1.
print(
    f'Keys       : {list(text_preprocessed.keys())}',
    f'Shape      : {text_preprocessed["input_word_ids"].shape}',
    f'Word Ids (sentence 1)   : {text_preprocessed["input_word_ids"][0, :12]}',
    f'Input Mask (sentence 1) : {text_preprocessed["input_mask"][0, :12]}',
    f'Type Ids (sentence 1)   : {text_preprocessed["input_type_ids"][0, :12]}',
    sep='\n',
)

Keys       : ['input_type_ids', 'input_word_ids', 'input_mask']
Shape      : (4, 128)
Word Ids (sentence 1)   : [ 101 2023 2003 1996 2034 6254  102    0    0    0    0    0]
Input Mask (sentence 1) : [1 1 1 1 1 1 1 0 0 0 0 0]
Type Ids (sentence 1)   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [50]:
# Look up the ids the tokenizer reserves for special tokens
# (mask, end-of-segment, start-of-sequence, padding) plus the vocabulary size.
preprocess.tokenize.get_special_tokens_dict()

{'mask_id': <tf.Tensor: shape=(), dtype=int32, numpy=103>,
 'end_of_segment_id': <tf.Tensor: shape=(), dtype=int32, numpy=102>,
 'start_of_sequence_id': <tf.Tensor: shape=(), dtype=int32, numpy=101>,
 'vocab_size': <tf.Tensor: shape=(), dtype=int32, numpy=30522>,
 'padding_id': <tf.Tensor: shape=(), dtype=int32, numpy=0>}

In [72]:
# Tokenize two standalone sentences, each wrapped in its own one-element
# batch. At this stage the ids contain no special tokens.
sent_tok1, sent_tok2 = [
    preprocess.tokenize([sentence])
    for sentence in ('The document is fine', 'It is not small nor big')
]

# Display both tokenized results side by side.
sent_tok1, sent_tok2

([[[1996], [6254], [2003], [2986]]],
 <tf.RaggedTensor [[[2009], [2003], [2025], [2235], [4496], [2502]]]>)

In [71]:
import tensorflow as tf

# Pack the two tokenized sentences into a single model input of length 16.
# The printed type ids show how the two segments are told apart.
text_preprocessed2 = preprocess.bert_pack_inputs(
    [sent_tok1, sent_tok2],
    tf.constant(16),
)

# Show the full packed row for the (only) example in the batch.
print(
    f'Word Ids   : {text_preprocessed2["input_word_ids"][0, :]}',
    f'Input Mask : {text_preprocessed2["input_mask"][0, :]}',
    f'Type Ids   : {text_preprocessed2["input_type_ids"][0, :]}',
    sep='\n',
)

Word Ids   : [ 101 1996 6254 2003 2986  102 2009 2003 2025 2235 4496 2502  102    0
    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0]


## II. Text encoding

In [6]:
# Wrap the small BERT encoder (L-2, H-128, A-2 per its TF Hub handle) as a
# Keras layer.
encoder = hub.KerasLayer(
    handle='https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2',
)

# Encode the already-preprocessed batch (not raw text): the encoder consumes
# the dict produced by the preprocessing model.
text_encoded = encoder(text_preprocessed)

# Display the full encoder output dict.
text_encoded

{'encoder_outputs': [<tf.Tensor: shape=(4, 128, 128), dtype=float32, numpy=
  array([[[-0.30272555,  0.7067623 , -6.452263  , ..., -0.9379624 ,
           -0.4914864 ,  0.06854436],
          [-2.7249696 ,  1.2982738 ,  0.16811511, ..., -4.148181  ,
           -2.2010918 , -2.388429  ],
          [-2.5665681 ,  0.82445073,  0.22696826, ..., -3.4811137 ,
           -1.1981452 , -1.280134  ],
          ...,
          [-1.0882303 ,  0.7663802 , -0.36233592, ..., -2.0777793 ,
           -0.02864565, -0.09961006],
          [-0.7831269 ,  1.0778471 , -0.39730912, ..., -1.8431315 ,
           -0.15982047, -0.17599884],
          [-0.44202378,  1.2962248 , -0.36632612, ..., -1.5936308 ,
           -0.46940017, -0.29314977]],
  
         [[-0.35713828,  0.73916215, -6.4168353 , ..., -1.0037444 ,
           -0.52118576, -0.04826893],
          [-2.620458  ,  1.4798343 ,  0.38576162, ..., -4.192346  ,
           -2.3510869 , -2.5234041 ],
          [-2.6625538 ,  0.29404777,  0.3326159 , ..., -4

In [5]:
# Pull the pooled output out of the encoder's result dict.
pooled_output = text_encoded["pooled_output"]

# Print row 0 of the batch, i.e. the embedding of the first sentence.
print(pooled_output[0])

tf.Tensor(
[-0.9999978   0.09469287 -0.99929047  0.27537918 -0.999115   -0.5971148
 -0.99546176 -0.95303    -0.0701344  -0.07040184 -0.7565991  -0.03649563
 -0.04945032  1.         -0.5973235  -0.48945084  0.48514897  0.17088273
 -0.6533267   0.6814318   0.9876508  -0.09426827  0.8053285   0.4527445
 -0.9999795  -0.02340178 -0.9997462   0.7623185   0.98504996  0.05111226
  0.05511493  0.07216851 -0.9665822  -0.72186553  0.44624218  0.9998264
 -0.965401   -0.01046555  0.9973469  -0.9993376   0.9916014   0.9237551
 -0.9996473   0.99621475 -0.99998873 -0.17873257 -0.9989677   0.9990991
  0.9779341   0.99672234  0.9947251   0.12321524 -0.05816822  0.96800035
  0.9946639   0.9997741  -0.99689597 -0.6901617   0.9964953  -0.97409534
  0.00603457  0.87430596 -0.887609    0.9208319   0.02739939 -0.9999991
 -0.81231576  0.033195    0.8571852   0.8376998   0.9984142   0.02803145
 -0.9992058   0.03042576  0.8151939  -0.9977297   0.5357129  -0.00716372
 -0.9199667   0.04325847 -0.7394107  -0.124342

## III. Fine-Tuning

In [None]:
### III.1. Classification

In [None]:
# Placeholder, still empty.
# NOTE(review): presumably meant to hold the sentences for the classification
# fine-tuning example — confirm and fill in.
sent = []