<br>

<h1 style="text-align:center;">Transformers</h1>

<br>

### INITIAL SETUP

---

In [1]:
# Turn off the tensorflow logging messages.
# This runs before tensorflow is imported (in the next cell), so the variable is
# already present in the environment when the library loads.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
# Import the libraries
import keras_nlp, os, pathlib, random, warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)

In [3]:
# Turn off the warnings
# (blanket filter: no category or module is given, so every Python warning is ignored from here on)
warnings.filterwarnings('ignore')

In [4]:
# Hyperparameters
BATCH_SIZE = 64             # Number of sentence pairs per batch in make_dataset
EPOCHS = 1                  # Number of training epochs passed to fit
MAX_SEQUENCE_LENGTH = 60    # Token length of the packed encoder / decoder sequences
ENG_VOCAB_SIZE = 30000      # Size of the english word piece vocabulary
SPA_VOCAB_SIZE = 30000      # Size of the spanish word piece vocabulary
EMBED_DIM = 512             # Embedding dimension of the token and position embeddings
INTERMEDIATE_DIM = 2048     # intermediate_dim passed to every encoder / decoder layer
NUM_HEADS = 8               # Number of attention heads in every encoder / decoder layer
N_ENCODER = 2   # Number of encoders
N_DECODER = 2     # Number of decoders


<br>

### DATASET

---

In [5]:
# Download and extract the archive.
# The archive is fetched over HTTPS: it is unpacked locally, so it should not
# travel over an unauthenticated plain-HTTP connection.
archive_path = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)

# Get the path to the extracted spa.txt file (inside "spa-eng", next to the archive)
text_file = pathlib.Path(archive_path).parent / "spa-eng" / "spa.txt"
text_file

PosixPath('/home/soheil/.keras/datasets/spa-eng/spa.txt')

In [6]:
# Load the file.
# The spanish side contains non-ASCII characters (á, ñ, ¡, ...), so the encoding
# is pinned instead of relying on the platform default. Iterating the file keeps
# every line, including the last one when the file has no trailing newline.
with open(text_file, encoding="utf-8") as f:
    lines = [line.rstrip("\n") for line in f]

# Initialize the list of text pairs
text_pairs = []

# Loop over the lines
for line in lines:

    # Split the line into english and spanish (tab-separated)
    eng, spa = line.split("\t")

    # Lowercase the text
    eng, spa = eng.lower(), spa.lower()

    # Append the text pairs
    text_pairs.append((eng, spa))

# Randomly print text pairs
for _ in range(5):
    print(random.choice(text_pairs))

("my alarm clock didn't work. that's why i was late.", 'mi reloj despertador no funcionó. es por eso que llegué tarde.')
("i can't stand being cooped up in this prison!", '¡no puedo soportar estar encerrado en esta prisión!')
('he was somehow able to swim across the river.', 'de alguna manera él fue capaz de nadar hasta el otro lado del río.')
('i did that which she asked me to do.', 'hice lo que ella me pidió.')
("everyone's talking about what tom did.", 'todo el mundo está hablando de lo que hizo tom.')


<br>

### DATA PREPROCESSING

---

In [7]:
# Function for splitting the text pairs into training, validation and test samples
def split_text_pairs(text_pairs):
    """
    This function splits the text pairs into training, validation and test samples.

    The pairs are shuffled before splitting. The shuffle is applied to a copy, so
    the list passed in by the caller keeps its original order (re-running the cell
    therefore does not keep re-shuffling the shared `text_pairs` list).

    ARGUMENTS
    ======================
        - text_pairs (list): List of text pairs

    RETURNS
    ======================
        - train_pairs (list): List of training text pairs (remaining ~70%)
        - val_pairs (list): List of validation text pairs (15%)
        - test_pairs (list): List of test text pairs (15%)
    """

    # Shuffle a copy of the text pairs (the caller's list is left untouched)
    shuffled_pairs = list(text_pairs)
    random.shuffle(shuffled_pairs)

    # Number of training, validation and test samples
    # (validation and test each take 15%; training gets whatever is left)
    num_val_samples = int(0.15 * len(shuffled_pairs))
    num_train_samples = len(shuffled_pairs) - 2 * num_val_samples

    # Split the text pairs into training, validation and test samples
    train_pairs = shuffled_pairs[:num_train_samples]
    val_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_val_samples]
    test_pairs = shuffled_pairs[num_train_samples + num_val_samples :]

    return train_pairs, val_pairs, test_pairs


# Create a function to train a word piece vocabulary
def train_word_piece(text_samples, vocab_size, reserved_tokens=None):
    """
    This function trains a word piece vocabulary on the text samples.

    ARGUMENTS
    =================
        - text_samples (list): A list of text samples.
        - vocab_size (int): The size of the vocabulary.
        - reserved_tokens (list, optional): A list of reserved tokens. Defaults to
          ["[PAD]", "[UNK]", "[START]", "[END]"].

    RETURNS
    =================
        - vocab (list): The vocabulary tokens returned by
          keras_nlp.tokenizers.compute_word_piece_vocabulary.
    """

    # Fall back to the default reserved tokens.
    # NOTE(review): the models use mask_zero=True, which presumably relies on "[PAD]"
    # receiving id 0 - keep it first in any custom list; confirm against the tokenizer.
    if reserved_tokens is None:
        reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

    # Create a TensorFlow dataset
    word_piece_ds = tf.data.Dataset.from_tensor_slices(text_samples)

    # Create a vocabulary (the samples are fed in batches of 1000)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

    return vocab


# Function for preprocessing a batch of text pairs
def preprocess_batch(eng, spa):
    """
    This function preprocesses a batch of text pairs.

    It reads the module-level `eng_tokenizer`, `spa_tokenizer` and
    `MAX_SEQUENCE_LENGTH`, so those must be defined before it is called.

    ARGUMENTS
    ======================
        - eng (tf.Tensor): A tensor containing the english text.
        - spa (tf.Tensor): A tensor containing the spanish text.

    RETURNS
    ======================
        - inputs (dict): A dictionary containing the encoder input (english text) and decoder input (spanish text except the last token).
        - outputs (tf.Tensor): A tensor containing the decoder output (spanish text except the first token).
    """

    # Tokenize the english and spanish text
    eng = eng_tokenizer(eng)
    spa = spa_tokenizer(spa)

    # Pack the english tokens to the maximum sequence length
    # (only padding is configured here: no start / end tokens on the encoder side)
    eng_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )

    # Apply to the english text
    eng = eng_start_end_packer(eng)

    # Initialize a custom layer for adding start, end, and padding token.
    # One extra position is packed so that, after the one-token shift below, both
    # the decoder input and the target have length MAX_SEQUENCE_LENGTH.
    spa_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=spa_tokenizer.token_to_id("[START]"),
        end_value=spa_tokenizer.token_to_id("[END]"),
        pad_value=spa_tokenizer.token_to_id("[PAD]"),
    )

    # Apply to the spanish text
    spa = spa_start_end_packer(spa)

    # Get the inputs and outputs
    inputs = {"encoder_inputs": eng, "decoder_inputs": spa[:, :-1]}     # Encoder input (i.e. english text) and decoder input (i.e. spanish text except the last token)
    outputs = spa[:, 1:]                                                # Decoder output (i.e. spanish text except the first token)

    return (inputs, outputs)


# Create a function to make a dataset
def make_dataset(pairs):
    """
    This function creates a batched, preprocessed TensorFlow dataset.

    ARGUMENTS
    ======================
        - pairs (list): A list of (english, spanish) text pairs.

    RETURNS
    ======================
        - dataset (tf.data.Dataset): A TensorFlow dataset yielding the
          (inputs, outputs) tuples produced by preprocess_batch.
    """

    # Unzip the pairs into the english and spanish texts
    eng_texts, spa_texts = zip(*pairs)

    # Convert into list
    eng_texts, spa_texts = list(eng_texts), list(spa_texts)

    # Convert into TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))

    # Set the batch size
    dataset = dataset.batch(BATCH_SIZE)

    # Preprocess the batch (in parallel)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)

    # Cache, shuffle and prefetch the dataset.
    # Order matters: caching first stores the tokenized batches once, shuffling
    # after the cache lets the batch order change on every pass (a cache placed
    # after the shuffle would replay the first pass's order), and prefetch stays
    # last so it overlaps with the consumer.
    dataset = dataset.cache().shuffle(2048).prefetch(16)

    return dataset

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [None]:
# Split the text pairs into training, validation and test samples
# (split_text_pairs shuffles with the `random` module and no seed is set in the
# visible cells, so the exact split differs between runs)
train_pairs, val_pairs, test_pairs = split_text_pairs(text_pairs)

# Print the number of samples
print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

In [9]:
# Get the english and spanish texts from the text pairs
# (only the training pairs are used to build the vocabularies)
eng_samples = [text_pair[0] for text_pair in train_pairs]
spa_samples = [text_pair[1] for text_pair in train_pairs]

# Get the english and spanish vocabularies
eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE)
spa_vocab = train_word_piece(spa_samples, SPA_VOCAB_SIZE)

# Print sample english and spanish tokens (an arbitrary slice of ten entries)
print("English Tokens: ", eng_vocab[100:110])
print("Spanish Tokens: ", spa_vocab[100:110])

English Tokens:  ['at', 'know', 'him', 'there', 'they', 'go', 'her', 'has', 'will', 're']
Spanish Tokens:  ['con', 'mi', 'qué', 'ella', 'le', 'te', 'para', 'mary', 'las', 'más']


In [10]:
# Create the tokenizer for english and spanish words
# (lowercase=False: the text pairs were already lowercased when the file was loaded)
eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=eng_vocab, lowercase=False)
spa_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=spa_vocab, lowercase=False)

In [13]:
# Convert the text pairs into a TensorFlow dataset
# (test_pairs is not converted here; it is consumed as raw text in the prediction and evaluation cells)
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

# Print sample dataset: shapes of the first batch only
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 60)
inputs["decoder_inputs"].shape: (64, 60)
targets.shape: (64, 60)


<br>

### MODEL ARCHITECTURE

---

In [14]:
#################
#    ENCODER    #
#################

# Input: variable-length sequences of int64 token ids; the name matches the
# "encoder_inputs" key of the dictionary returned by preprocess_batch
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

# Token and Position Embedding
# NOTE(review): mask_zero=True assumes "[PAD]" has token id 0 in the english vocabulary - confirm
x = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=ENG_VOCAB_SIZE,
                                               sequence_length=MAX_SEQUENCE_LENGTH,
                                               embedding_dim=EMBED_DIM,
                                               mask_zero=True)(encoder_inputs)

# Transformer Encoder: N_ENCODER layers stacked, each consuming the previous layer's output
for _ in range(N_ENCODER):
    x = keras_nlp.layers.TransformerEncoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(inputs=x)

# Create the model
encoder = keras.Model(encoder_inputs, x, name="encoder")
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, None)]           0         
                                                                 
 token_and_position_embeddin  (None, None, 512)        15390720  
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_encoder (Transf  (None, None, 512)        3152384   
 ormerEncoder)                                                   
                                                                 
 transformer_encoder_1 (Tran  (None, None, 512)        3152384   
 sformerEncoder)                                                 
                                                                 
Total params: 21,695,488
Trainable params: 21,695,488
Non-t

In [15]:
#################
#    DECODER    #
#################

# Inputs: the spanish token ids (name matches the "decoder_inputs" key returned by
# preprocess_batch) and the encoder output sequence with EMBED_DIM features per position
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

# Token and Position Embedding
# NOTE(review): mask_zero=True assumes "[PAD]" has token id 0 in the spanish vocabulary - confirm
x = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=SPA_VOCAB_SIZE,
                                               sequence_length=MAX_SEQUENCE_LENGTH,
                                               embedding_dim=EMBED_DIM,
                                               mask_zero=True)(decoder_inputs)

# Transformer Decoder: N_DECODER layers stacked; every layer receives the same encoder sequence
for _ in range(N_DECODER):
    x = keras_nlp.layers.TransformerDecoder(intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)

# Dense layer: softmax over the spanish vocabulary, so the model outputs probabilities (not raw logits)
decoder_outputs = keras.layers.Dense(SPA_VOCAB_SIZE, activation="softmax")(x)

# Create the model
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs, name="decoder")
decoder.summary()

Model: "decoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 token_and_position_embedding_1  (None, None, 512)   15390720    ['decoder_inputs[0][0]']         
  (TokenAndPositionEmbedding)                                                                     
                                                                                                  
 decoder_state_inputs (InputLay  [(None, None, 512)]  0          []                               
 er)                                                                                              
                                                                                            

In [16]:
#####################
#    TRANSFORMER    #
#####################

# Get the output of the encoder and decoder
# (this rebinds `decoder_outputs`, which previously held the decoder model's own output tensor)
encoder_outputs = encoder(encoder_inputs)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# Create the model: end-to-end mapping from (english ids, spanish ids) to per-position
# probabilities over the spanish vocabulary
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder (Functional)           (None, None, 512)    21695488    ['encoder_inputs[0][0]']         
                                                                                                  
 decoder (Functional)           (None, None, 30000)  39188784    ['decoder_inputs[0][0]',         
                                                                  'encoder[0][0]']      

<br>

### TRAINING

---

In [17]:
# Compile the model
# (the targets are integer token ids and the model ends in a softmax, hence the sparse categorical loss)
transformer.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [18]:
# Reference only: training log from an earlier run of the same call (the live training call is in the next cell)
#transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
# 1302/1302 [==============================] - 169s 120ms/step - loss: 3.5289 - accuracy: 0.4403 - val_loss: 2.5349 - val_accuracy: 0.5534

In [19]:
# Train the model for EPOCHS epochs on train_ds, using val_ds as validation data
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)



<keras.callbacks.History at 0x7fb0107e3190>

In [20]:
# Save the model to "transformer.h5" (a relative path, i.e. the current working directory)
transformer.save("transformer.h5")

In [21]:
# Load the model
# (this rebinds `transformer`, so the prediction and evaluation cells below run on
# the model reloaded from disk rather than on the in-memory one that was trained)
transformer = tf.keras.models.load_model("transformer.h5")
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder (Functional)           (None, None, 512)    21695488    ['encoder_inputs[0][0]']         
                                                                                                  
 decoder (Functional)           (None, None, 30000)  39188784    ['decoder_inputs[0][0]',         
                                                                  'encoder[0][0]']      

<br>

### PREDICTION

---

In [22]:

def decode_sequences(input_sentences, max_length=40):
    """
    This function translates a batch of english sentences into spanish with greedy decoding.

    It reads the module-level `transformer`, `eng_tokenizer`, `spa_tokenizer` and
    `MAX_SEQUENCE_LENGTH`.

    ARGUMENTS
    ======================
        - input_sentences (tf.Tensor): A 1-D string tensor of english sentences.
        - max_length (int): Length of the decoder prompt, i.e. the maximum number of
          generated tokens including "[START]". Defaults to 40.
          NOTE(review): presumably must not exceed MAX_SEQUENCE_LENGTH, the
          sequence_length of the decoder's position embedding - confirm.

    RETURNS
    ======================
        - generated_sentences (tf.Tensor): The detokenized spanish sentences; special
          tokens such as "[START]", "[END]" and "[PAD]" are left for the caller to strip.
    """

    batch_size = tf.shape(input_sentences)[0]

    # Tokenize the encoder input and densify it to MAX_SEQUENCE_LENGTH columns.
    encoder_input_tokens = eng_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    # Define a function that outputs the next token's distribution given the
    # prompt generated so far. (Named so that it does not shadow the builtin `next`.)
    def next_token_fn(prompt, cache, index):
        # The model ends in a softmax, so these are probabilities rather than raw
        # logits; the greedy choice (argmax) is the same either way.
        next_token_probs = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        # Ignore hidden states for now; only needed for contrastive search.
        hidden_states = None
        return next_token_probs, hidden_states, cache

    # Build a prompt of length `max_length` with a start token and padding tokens.
    start = tf.fill((batch_size, 1), spa_tokenizer.token_to_id("[START]"))
    pad = tf.fill((batch_size, max_length - 1), spa_tokenizer.token_to_id("[PAD]"))
    prompt = tf.concat((start, pad), axis=-1)

    generated_tokens = keras_nlp.samplers.GreedySampler()(
        next_token_fn,
        prompt,
        end_token_id=spa_tokenizer.token_to_id("[END]"),
        index=1,  # Start sampling after start token.
    )
    generated_sentences = spa_tokenizer.detokenize(generated_tokens)
    return generated_sentences


# Translate five randomly chosen english sentences from the test split
test_eng_texts = [eng_spa[0] for eng_spa in test_pairs]
for i in range(5):
    input_sentence = random.choice(test_eng_texts)

    # Decode a batch of one sentence and pull the single result out as text
    translated = decode_sequences(tf.constant([input_sentence]))
    translated = translated.numpy()[0].decode("utf-8")

    # Remove the special tokens left in the detokenized output
    for special_token in ("[PAD]", "[START]", "[END]"):
        translated = translated.replace(special_token, "")
    translated = translated.strip()

    # Header, source sentence, translation and a blank separator line
    print(f"** Example {i} **", input_sentence, translated, "", sep="\n")

** Example 0 **
tom is not welcome in this house.
tom no es el único que mary .

** Example 1 **
i don't feel like taking a walk now.
no me gusta un paseo .

** Example 2 **
my brother has joined the baseball club.
mi padre tiene tres años .

** Example 3 **
she argued with him and then hit him.
ella se sentó en su casa .

** Example 4 **
tom bought mary an expensive umbrella.
tom se encontró un accidente de su perro .



<br>

### EVALUATION

---

In [23]:
# ROUGE-1 and ROUGE-2 metrics, accumulated over the first 30 test pairs
rouge_1 = keras_nlp.metrics.RougeN(order=1)
rouge_2 = keras_nlp.metrics.RougeN(order=2)

for test_pair in test_pairs[:30]:
    input_sentence, reference_sentence = test_pair

    # Decode a batch of one sentence and pull the single result out as text
    translated_sentence = decode_sequences(tf.constant([input_sentence]))
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")

    # Remove the special tokens left in the detokenized output
    for special_token in ("[PAD]", "[START]", "[END]"):
        translated_sentence = translated_sentence.replace(special_token, "")
    translated_sentence = translated_sentence.strip()

    # Update both metrics with (reference, prediction)
    for rouge_metric in (rouge_1, rouge_2):
        rouge_metric(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

ROUGE-1 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.20665598>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.19850121>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.19998457>}
ROUGE-2 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.052222226>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.04666667>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.047683466>}
