In [7]:
# Prerequisite: the `transformers` package must be installed in the kernel's environment.
# pip install transformers

In [1]:
import tensorflow as tf

2023-04-01 08:39:49.990757: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
import numpy as np

In [9]:
import transformers
from transformers import BertTokenizer

In [2]:
# Sanity check: show the name of the GPU device that TensorFlow can see.
tf.test.gpu_device_name()

2023-04-01 08:39:53.334886: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-01 08:39:53.352834: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-01 08:39:53.353020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-01 08:39:54.409287: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-01 08:39:54.409627: I tensorflow/compile

'/device:GPU:0'

In [4]:
# List every physical device (CPU and GPU) visible to TensorFlow.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

The first step is to create an embedding layer that translates raw token IDs into K-dimensional embeddings.

In [11]:
# Tokenizer: reuse the vocabulary of the pre-trained "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dimensions of the embedding table.
vocab_size = tokenizer.vocab_size  # one row per entry in the BERT vocabulary
embedding_dim = 768  # width of each embedding vector (the BERT-base size)

# Trainable lookup table from token ID to embedding vector. Only the tokenizer
# is loaded from the checkpoint; this layer is a new Keras layer created here.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    trainable=True,
)

# Example sentence to encode.
input_text = "John is the best basketball player I know"

# Run the tokenizer and ask for TensorFlow tensors back.
input_tokens = tokenizer(
    input_text,
    return_tensors="tf",
    padding=True,
    truncation=True,
)

# The token IDs are what the embedding layer consumes.
input_ids = input_tokens["input_ids"]

# Look up one embedding vector per token ID.
token_embeddings = embedding_layer(input_ids)

print(token_embeddings)

tf.Tensor(
[[[-0.0491559   0.04126722 -0.04697653 ... -0.0068565  -0.03183677
   -0.03195263]
  [ 0.01929628 -0.01089764  0.01000128 ... -0.03021339 -0.01418394
   -0.00332562]
  [-0.01274476  0.04068748  0.04470846 ... -0.04423631  0.04037298
    0.04162011]
  ...
  [-0.02374609  0.03028352 -0.032427   ...  0.0421473   0.04265909
   -0.04501848]
  [-0.00558231  0.00323743 -0.03707803 ... -0.02362527 -0.04564954
    0.0056197 ]
  [-0.03696796 -0.02114703 -0.03012    ... -0.03253096 -0.00911947
    0.00583986]]], shape=(1, 10, 768), dtype=float32)


# Positional Encodings

In the paper "Attention is All You Need", they define the positional encodings as follows:

```
PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
```

The first thing that we want to do is break down the denominator which is the same in both cases.

```
10000^(2i/d_model) = exp(2i * (1/d_model) * log(10000))
```

Thus we can remove the exponent, which should promote numerical stability throughout the computations. 

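We know that `i` indexes the sine/cosine pairs within the embedding (`i = 0, 1, ..., d_model/2 - 1`), so the first thing that we can do is create the values `f(i) = 2i`: the even numbers from 0 up to (but not including) `d_model`, which gives a range of length `d_model / 2`. We do that here: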

In [17]:
# Number of positions to build encodings for (one row of the encoding matrix each).
max_position = 10

In [18]:
# Column vector of position indices 0 .. max_position - 1, shape (max_position, 1),
# so that it can broadcast against a row of per-dimension divisors.
position = np.expand_dims(np.arange(max_position), axis=1)

In [19]:
# Display the position column vector.
position

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [20]:
# Width of each positional-encoding vector; 768 matches the `embedding_dim`
# used for the token embeddings so the two can be added together.
d_model = 768

In [26]:
# The even indices 0, 2, 4, ... below d_model: these are the `2i` values in the formula.
two_i = np.arange(start=0, stop=d_model, step=2)

In [27]:
# Display the `2i` values.
two_i

array([  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,
        26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,
        52,  54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
        78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98, 100, 102,
       104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128,
       130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154,
       156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180,
       182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206,
       208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232,
       234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258,
       260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284,
       286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310,
       312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336,
       338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 36

In [28]:
# Half of d_model: one entry per sine/cosine pair of columns.
len(two_i)

384

We only need the divisors to be of length 384 because the divisors are the same for the even and the odd columns. We can create the array once and then use it for both the even and the odd assignment, which we'll move to next.

In [34]:
# 10000^(2i / d_model), evaluated as exp((2i / d_model) * ln(10000)).
denominator = np.exp((two_i / d_model) * np.log(10000))

Now we can go ahead and build the matrix.

In [35]:
# Number of rows in the positional-encoding matrix. It is derived from
# `max_position` rather than hard-coded a second time: the `position` column
# vector has `max_position` rows, and the sine/cosine assignment into the
# matrix only broadcasts correctly when both row counts agree.
num_tokens = max_position

# `d_model` is intentionally not re-declared here. `two_i` and `denominator`
# were already derived from the earlier `d_model`, so a second definition could
# silently diverge from them and break the column assignment.

In [36]:
# Zero-filled (num_tokens, d_model) matrix to hold the encodings; filled in below.
pos_matrix = np.zeros(shape=(num_tokens, d_model))

In [37]:
# Confirm the matrix is (num_tokens, d_model).
pos_matrix.shape

(10, 768)

Looking good so far. 

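You can see now why we only needed `d_model / 2` elements in our array `two_i` -- we do two sets of assignments, one that fills the even-indexed columns with sines and one that fills the odd-indexed columns with cosines, and each set covers `d_model / 2` columns.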

In [40]:
# Angle for every (position, sine/cosine pair) combination. Broadcasting the
# (rows, 1) position column against the 1-D divisor vector yields one column
# per entry of `denominator`.
angles = position / denominator

# Even-indexed columns take the sine, odd-indexed columns take the cosine.
pos_matrix[:, 0::2] = np.sin(angles)
pos_matrix[:, 1::2] = np.cos(angles)

In [41]:
# Peek at the top-left 10x10 corner: sine and cosine values alternate across columns.
pos_matrix[:10, :10]

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  1.        ],
       [ 0.84147098,  0.54030231,  0.82843076,  0.56009149,  0.81525065,
         0.57910826,  0.8019618 ,  0.59737533,  0.78859304,  0.61491545],
       [ 0.90929743, -0.41614684,  0.92799403, -0.37259506,  0.94423677,
        -0.32926724,  0.95814438, -0.28628544,  0.9698361 , -0.24375797],
       [ 0.14112001, -0.9899925 ,  0.21109235, -0.97746612,  0.27837998,
        -0.96047102,  0.34278182, -0.93941504,  0.40414137, -0.91469654],
       [-0.7568025 , -0.65364362, -0.69153198, -0.72234585, -0.62181248,
        -0.78316616, -0.54860557, -0.83608129, -0.47281055, -0.88116411],
       [-0.95892427,  0.28366219, -0.98573469,  0.1683066 , -0.99857347,
         0.05339503, -0.99822869, -0.05949362, -0.9856184 , -0.16898632],
       [-0.2794155 ,  0.96017029, -0.41267124,  0.91088004, -0.53475181,
         0.84500917, -0.6440288 ,  0.76500125

Notice how the broadcasting works -- we used a matrix of shape (num_tokens, 1) to represent the different token positions, and we used a vector of length d_model / 2 (which broadcasts as shape (1, d_model / 2)) twice -- once for the sine columns and once for the cosine columns -- to represent the latent dimensions. The result is an output with shape (num_tokens, d_model), which is our original matrix shape.

In [44]:
# Convert the NumPy matrix to a float32 TensorFlow tensor so it can be added
# to the (float32) token embeddings.
pos_matrix_tf = tf.constant(value=pos_matrix, dtype=tf.float32)

In [45]:
# Display the positional-encoding tensor.
pos_matrix_tf

<tf.Tensor: shape=(10, 768), dtype=float32, numpy=
array([[ 0.0000000e+00,  1.0000000e+00,  0.0000000e+00, ...,
         1.0000000e+00,  0.0000000e+00,  1.0000000e+00],
       [ 8.4147096e-01,  5.4030228e-01,  8.2843077e-01, ...,
         1.0000000e+00,  1.0242752e-04,  1.0000000e+00],
       [ 9.0929741e-01, -4.1614684e-01,  9.2799401e-01, ...,
         1.0000000e+00,  2.0485504e-04,  1.0000000e+00],
       ...,
       [ 6.5698659e-01,  7.5390226e-01,  5.2346742e-01, ...,
         9.9999970e-01,  7.1699260e-04,  9.9999976e-01],
       [ 9.8935825e-01, -1.4550003e-01,  9.9905050e-01, ...,
         9.9999964e-01,  8.1942009e-04,  9.9999964e-01],
       [ 4.1211849e-01, -9.1113025e-01,  5.9565198e-01, ...,
         9.9999958e-01,  9.2184759e-04,  9.9999958e-01]], dtype=float32)>

In [47]:
# The encoding table has one row per precomputed position, while
# `token_embeddings` has shape (batch, seq_len, d_model). Adding them directly
# only works when seq_len happens to equal the number of table rows, so keep
# just the first `seq_len` rows and fail loudly if the table is too short.
seq_len = token_embeddings.shape[1]
if seq_len > pos_matrix_tf.shape[0]:
    raise ValueError(
        f"Sequence has {seq_len} tokens but positional encodings were only "
        f"built for {pos_matrix_tf.shape[0]} positions"
    )

# The (seq_len, d_model) slice broadcasts across the batch dimension.
token_embeddings_with_pos_encoding = token_embeddings + pos_matrix_tf[:seq_len]