# Transformers

**Read the input text file**

In [1]:
# Load the whole corpus into memory as a single string.
# The file handle is deliberately NOT called `input`: that would shadow the
# built-in `input()` for every later cell in this kernel session.
with open("./tiny-shakespeare.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Report the corpus size in characters.
print(f"Input length: {len(text)}")

Input length: 1115394


**Tokenize the corpus of text**

In [2]:
import tiktoken

# Look up the tiktoken encoding registered for the "gpt-4" model name.
tkn = tiktoken.encoding_for_model("gpt-4")

# Encode the full corpus; `encoding` is reused below to build the tensor dataset.
encoding = tkn.encode(text)
print(f"Encoding size: {len(encoding)}")

Encoding size: 301829


**Transform the encoded text into a tensor dataset**

In [3]:
import tensorflow as tf

# Wrap the encoded corpus in a constant tensor so it can be sliced below.
data = tf.constant(encoding)

2023-09-29 16:55:23.079703: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Split the dataset into train (90%) and test (10%)**

In [4]:
# Index of the first test element: the leading 90% of `data` is used for
# training and the remaining tail for testing (contiguous split, no shuffling).
split = int(0.9 * len(data))
train, test = data[:split], data[split:]

**Verify that the split is correct**

In [5]:
# Show the size of the full dataset and of each partition.
print(f"Data size: {len(data)}")
print(f"Train size: {len(train)}")
print(f"Test size: {len(test)}")

# The two partitions together must account for every element of `data`.
assert len(data) == len(train) + len(test)

Data size: 301829
Train size: 271646
Test size: 30183


In [6]:
# Batching hyperparameters.
SEQUENCE_SIZE = 8  # the length of each sequence to process
BATCH_SIZE = 4  # the number of sequences to process at once (possibly in parallel)

In [14]:
from tensorflow import keras
from tensorflow.keras import layers

def transformer(input_shape, head_size, num_heads, dropout):
    """Build a Keras model with one normalised self-attention stage.

    The stage applies layer normalisation, multi-head self-attention and
    dropout, in that order.

    The model is assembled with the functional API rather than
    ``keras.Sequential``: ``MultiHeadAttention`` has to be called with separate
    ``query`` and ``value`` tensors, whereas ``Sequential`` hands a single
    tensor to each layer and therefore cannot wire the attention layer up.

    Args:
        input_shape: Shape of one input sample, excluding the batch axis.
        head_size: Passed as ``key_dim`` to the attention layer.
        num_heads: Number of attention heads.
        dropout: Dropout rate, used both inside the attention layer and by
            the ``Dropout`` layer that follows it.

    Returns:
        An uncompiled ``keras.Model`` named ``'Transformer'``.
    """
    inputs = layers.Input(shape=input_shape)

    ## Normalization & Attention
    normalized = layers.LayerNormalization(epsilon=1e-6)(inputs)
    # Self-attention: the normalised input serves as both query and value
    # (the key falls back to the value when it is not given).
    attended = layers.MultiHeadAttention(
        key_dim=head_size,
        num_heads=num_heads,
        dropout=dropout,
    )(normalized, normalized)
    outputs = layers.Dropout(dropout)(attended)

    model = keras.Model(inputs=inputs, outputs=outputs, name='Transformer')

    return model
    
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()