In [106]:
# Project 4 - AI and Deep Learning - Shan Ali Shah Sayed

In [107]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
import keras
from keras import layers
from keras.layers import Input, Embedding, Dense, Dropout, TextVectorization, GRU, Bidirectional

import os
# os.chdir(f'{os.getenv("HOME")}/analytics/lib') # place this before data preparation
from transformer import Transformer

import string
import re
import random

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
def custom_standardization(input_string):
    return tf.strings.regex_replace( tf.strings.lower(input_string), f"[{re.escape(strip_chars)}]", "" )

# os.chdir(f'{os.getenv("HOME")}/Data')
text_file = "spa.txt" # http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]


In [108]:
#%% Dataset preparation for English-to-Spanish translation

text_pairs = []
for line in lines:
    english, spanish = line.split("\t")
    spanish = "[start] " + spanish + " [end]" # prepend an SOS token and append an EOS token to the target sequence
    text_pairs.append( (english, spanish) )

random.shuffle( text_pairs )
num_val = int(0.15 * len(text_pairs))
num_train = len(text_pairs) - 2 * num_val
train_pairs, val_pairs, test_pairs = text_pairs[:num_train], text_pairs[num_train:num_train+num_val], text_pairs[num_train+num_val:]

vocab_size, seq_length = 15000, 20

source_vectorization = TextVectorization(  max_tokens=vocab_size, output_mode="int",
                                           output_sequence_length=seq_length )
source_vectorization.adapt( [pair[0] for pair in train_pairs] )

# target sequence is one token longer than the source sequence since we'll construct spa[:, :-1]) and spa[:, 1:]
target_vectorization = TextVectorization(  max_tokens=vocab_size, output_mode="int",
                                           standardize=custom_standardization,
                                           output_sequence_length=seq_length + 1)
target_vectorization.adapt( [pair[1] for pair in train_pairs] )

def format_dataset(eng, spa):
    eng = source_vectorization( eng )
    spa = target_vectorization( spa )
    return ( (eng, spa[:, :-1]), spa[:, 1:]) # target is one step ahead

batch_size = 64
def make_dataset(pairs):
    eng_texts, spa_texts = zip( *pairs ) # unzip the sequence of (en,sp) pairs
    eng_texts, spa_texts = list(eng_texts), list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices( (eng_texts, spa_texts) ).batch( batch_size ).map( format_dataset, num_parallel_calls=4 )
    return dataset.shuffle(2048).prefetch(16).cache() # use in-memory catching to speed up preprocessing

train_ds = make_dataset( train_pairs )
val_ds = make_dataset( val_pairs )


In [109]:
#%% Loss, Accuracy, and Optimizer

def masked_loss(label, pred):
  loss = keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction='none' )(label, pred)
  mask = tf.cast(label != 0, dtype=loss.dtype)
  loss *= mask
  return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  mask = label != 0
  match = (label == pred) & mask
  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)


In [110]:
#%% Transformer

class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(128)
optimizer = keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model = Transformer( n_layers=4, d_emb=128, n_heads=8, d_ff=512, dropout_rate=0.1, src_vocab_size=vocab_size, tgt_vocab_size=vocab_size )
model.compile( loss=masked_loss, optimizer=optimizer, metrics=[masked_accuracy] )
model.fit( train_ds, epochs=10, validation_data=val_ds ) # val_masked_accuracy = 69% after 10 epochs


Epoch 1/10




[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 34ms/step - loss: 7.5209 - masked_accuracy: 0.1610 - val_loss: 3.5376 - val_masked_accuracy: 0.4456
Epoch 2/10
[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 3.2131 - masked_accuracy: 0.5017 - val_loss: 2.3975 - val_masked_accuracy: 0.5934
Epoch 3/10
[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 2.3252 - masked_accuracy: 0.6100 - val_loss: 2.1493 - val_masked_accuracy: 0.6214
Epoch 4/10
[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - loss: 2.0164 - masked_accuracy: 0.6441 - val_loss: 1.9201 - val_masked_accuracy: 0.6538
Epoch 5/10
[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - loss: 1.6868 - masked_accuracy: 0.6887 - val_loss: 1.8040 - val_masked_accuracy: 0.6709
Epoch 6/10
[1m1302/1302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - loss: 1.4696 - mask

<keras.src.callbacks.history.History at 0x7d98d5ca0d10>

In [111]:
#%% Translate

spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab)) # a dict to convert token index prediction to string token

def decode_sequence( input_sentence ):
    tokenized_input_sentence = source_vectorization( [input_sentence] )
    decoded_sentence = "[start]" # seed token
    for i in range( 20 ): # 20 tokens at most for the decoded sentence
        tokenized_target_sentence = target_vectorization( [decoded_sentence] ) # [:, :-1]
        next_token_predictions = model.predict( [tokenized_input_sentence, tokenized_target_sentence] )
        sampled_token_index = np.argmax( next_token_predictions[0, i, :] )
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [ pair[0] for pair in test_pairs ]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))


-
You look like you're about to cry.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[start] te parece que estás a llorar [end]
-
The company wants to hire 20 people.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━

In [112]:
# --

In [113]:
# Save transformer weights
model.save_weights("translation_transformer.weights.h5")


In [114]:
# Save vectorizers
tf.keras.models.save_model(source_vectorization, "source_vectorizer.keras")
tf.keras.models.save_model(target_vectorization, "target_vectorizer.keras")


In [None]:
# --