# Synopsis

This project involved building an encoder-decoder model to translate text from English to Spanish. The model was built using TensorFlow and the dataset was sourced from the website http://www.manythings.org/anki/. The final model was exported for incorporation into a Streamlit web application.

# Setup

Import the libraries and methods required for the project.

In [1]:
! pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 524 kB/s 
[?25hCollecting tensorflow<2.8,>=2.7.0
  Downloading tensorflow-2.7.0-cp37-cp37m-manylinux2010_x86_64.whl (489.6 MB)
[K     |████████████████████████████████| 489.6 MB 20 kB/s 
Collecting libclang>=9.0.1
  Downloading libclang-12.0.0-py2.py3-none-manylinux1_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 17.2 MB/s 
[?25hCollecting tensorflow-estimator<2.8,~=2.7.0rc0
  Downloading tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
[K     |████████████████████████████████| 463 kB 45.5 MB/s 
[?25hCollecting keras<2.8,>=2.7.0rc0
  Downloading keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 47.8 MB/s 
Collecting tensorflow-io-gcs-filesystem>=0.21.0
  Downloading tensorflow_io_gcs_filesystem-0.23.1-cp37-cp37m-manylinux_2_12_x86_64.ma

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow_text as tf_text
import random
import pathlib
import typing
from typing import Any, Tuple
from nltk.translate.bleu_score import corpus_bleu

Create a shape checker class to ensure that all objects have the right dimensions.

In [3]:
class ShapeChecker():
    def __init__(self):
        # keep a cache of every axis-name seen
        self.shapes = {}
        
    def __call__(self,
                 tensor, 
                 names,
                 broadcast = False):
        if not tf.executing_eagerly():
            return
        
        if isinstance(names, str):
            names = (names, )
            
        shape = tf.shape(tensor)
        rank = tf.rank(tensor)
        
        if rank != len(names):
            raise ValueError(f"Rank mismatch:\n"
                             f"   Found {rank}: {shape.numpy()}\n"
                             f"   Expected {len(names)}: {names}\n")
            
        for i, name in enumerate(names):
            if isinstance(name, int):
                old_dim = name
            else:
                old_dim = self.shapes.get(name, None)
            new_dim = shape[i]
            
            if (broadcast and new_dim == 1):
                continue
                
            if old_dim is None:
                # if the axis name is new, add its length to the cache
                self.shapes[name] = new_dim
                continue
                
            if new_dim != old_dim:
                raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                                 f"   Found: {new_dim}\n"
                                 f"   Expected: {old_dim}\n")

# The data

## Load the data

Load the Spanish-to-English dataset.

In [4]:
path_to_zip = tf.keras.utils.get_file("spa-eng.zip",
                                      origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
                                      extract = True)
path_to_file = pathlib.Path(path_to_zip).parent/"spa-eng/spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [5]:
def load_data(path):
    text = path.read_text(encoding = "utf-8")
    
    lines = text.splitlines()
    pairs = [line.split("\t") for line in lines]
    
    inp = [inp for inp, targ in pairs]
    targ = [targ for inp, targ in pairs]
    
    return inp, targ

In [6]:
inp, targ = load_data(path_to_file)

Training the model on the full dataset will take a very long time. Subset a desired number of examples from the original dataset to train the model on within a reasonable amount of time.

In [7]:
print(f"Number of examples in the full dataset: {len(inp)}")

Number of examples in the full dataset: 118964


In [8]:
n_desired_examples = 50000

In [9]:
inp = inp[0:n_desired_examples]
targ = targ[0:n_desired_examples]

## Data partitioning

Carve out a training set and a test set from the original data.

In [10]:
original_indices = list(range(n_desired_examples))
train_size = int(0.9 * n_desired_examples)
train_indices = random.sample(original_indices, train_size)
test_indices = [index for index in original_indices if (index not in train_indices)]

In [11]:
train_inputs = [inp[i] for i in train_indices]
test_inputs = [inp[i] for i in test_indices]
train_targets = [targ[i] for i in train_indices]
test_targets = [targ[i] for i in test_indices]

## Create a tf.data dataset

In [12]:
BUFFER_SIZE = len(train_inputs)
BATCH_SIZE = 64

In [13]:
dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_targets)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)

2021-12-17 00:05:46.312091: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-12-17 00:05:46.327914: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


# Create the translator template

## Text standardization

In [14]:
def tf_lower_and_split_punct(text):
    # split accented characters
    text = tf_text.normalize_utf8(text,
                                  "NFKD")
    text = tf.strings.lower(text)
    
    # keep spaces, a-z, and select punctuation
    text = tf.strings.regex_replace(text,
                                    "[^ a-z.?!,¿]",
                                    "")
    
    # add spaces around punctuation
    text = tf.strings.regex_replace(text,
                                    "[.?!,¿]",
                                    r" \0 ")
    
    # strip whitespace
    text = tf.strings.strip(text)
    
    text = tf.strings.join(["[START]", text, "[END]"],
                           separator = " ")
    return text

## The encoder

In [15]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, 
                 input_vocab_size,
                 embedding_dim,
                 enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.input_vocab_size = input_vocab_size
        
        # the embedding layer converts tokens to vectors
        self.embedding = tf.keras.layers.Embedding(input_dim = self.input_vocab_size,
                                                   output_dim = embedding_dim)
        
        # the GRU RNN layer processes those vectors sequentially
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       # return the sequence and state
                                       return_sequences = True,
                                       return_state = True,
                                       recurrent_initializer = "glorot_uniform")
        
    def call(self,
             tokens, 
             state = None):
        shape_checker = ShapeChecker()
        shape_checker(tokens, ("batch", "s"))
        
        # the embedding layer looks up the embedding for each token
        vectors = self.embedding(tokens)
        shape_checker(vectors, ("batch", "s", "embed_dim"))
        
        # the GRU processes the embedding sequence
        # output shape: (batch, s, enc_units)
        # state shape: (batch, enc_units)
        output, state = self.gru(vectors,
                                 initial_state = state)
        shape_checker(output, ("batch", "s", "enc_units"))
        shape_checker(state, ("batch", "enc_units"))
        
        # returns the new sequence and its state
        return output, state

## The attention head

In [16]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self,
                 units):
        super().__init__()
        
        # the attention scoring function in Bahdanau's additive style
        self.W1 = tf.keras.layers.Dense(units, 
                                        use_bias = False)
        self.W2 = tf.keras.layers.Dense(units,
                                        use_bias = False)
        
        self.attention = tf.keras.layers.AdditiveAttention()
        
    def call(self,
             query,
             value, 
             mask):
        shape_checker = ShapeChecker()
        shape_checker(query, ("batch", "t", "query_units"))
        shape_checker(value, ("batch", "s", "value_units"))
        shape_checker(mask, ("batch", "s"))
        
        # the W1@ht term in the attention score formula
        w1_query = self.W1(query)
        shape_checker(w1_query, ("batch", "t", "attn_units"))
        
        # the W2@hs term in the attention score formula
        w2_key = self.W2(value)
        shape_checker(w2_key, ("batch", "s", "attn_units"))
        
        query_mask = tf.ones(tf.shape(query)[:-1],
                             dtype = bool)
        value_mask = mask
        
        context_vector, attention_weights = self.attention(inputs = [w1_query, value, w2_key],
                                                           mask = [query_mask, value_mask],
                                                           return_attention_scores = True)
        shape_checker(context_vector, ("batch", "t", "value_units"))
        shape_checker(attention_weights, ("batch", "t", "s"))
        
        return context_vector, attention_weights

## The decoder

In [17]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,
                 output_vocab_size, 
                 embedding_dim,
                 dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.output_vocab_size = output_vocab_size
        self.embedding_dim = embedding_dim
        
        # step 1 - the embedding layer converts token IDs to vectors
        self.embedding = tf.keras.layers.Embedding(input_dim = self.output_vocab_size,
                                                   output_dim = embedding_dim)
        
        # step 2 - the RNN keeps track of what's been generated so far
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences = True,
                                       return_state = True,
                                       recurrent_initializer = "glorot_uniform")
        
        # step 3 - the RNN output will be the query for the attention layer
        self.attention = BahdanauAttention(self.dec_units)
        
        # step 4 - converting the context vector to the attention vector
        self.Wc = tf.keras.layers.Dense(dec_units,
                                        activation = tf.math.tanh,
                                        use_bias = False)
        
        # step 5 - this fully connected layer produces the logits for each output token
        self.fc = tf.keras.layers.Dense(self.output_vocab_size)

In [18]:
class DecoderInput(typing.NamedTuple):
    new_tokens: Any
    enc_output: Any
    mask: Any

class DecoderOutput(typing.NamedTuple):
    logits: Any
    attention_weights: Any

In [19]:
def call(self,
         inputs: DecoderInput,
         state = None) -> Tuple[DecoderOutput, tf.Tensor]:
    shape_checker = ShapeChecker()
    shape_checker(inputs.new_tokens, ("batch", "t"))
    shape_checker(inputs.enc_output, ("batch", "s", "enc_units"))
    shape_checker(inputs.mask, ("batch", "s"))
    
    if state is not None:
        shape_checker(state, ("batch", "dec_units"))
        
    # step 1 - lookup the embeddings
    vectors = self.embedding(inputs.new_tokens)
    shape_checker(vectors, ("batch", "t", "embedding_dim"))
    
    # step 2 - process one step with the RNN
    rnn_output, state = self.gru(vectors,
                                 initial_state = state)
    
    shape_checker(rnn_output, ("batch", "t", "dec_units"))
    shape_checker(state, ("batch", "dec_units"))
    
    # step 3 - use the RNN output as the query for the attention over the encoder output
    context_vector, attention_weights = self.attention(query = rnn_output,
                                                       value = inputs.enc_output,
                                                       mask = inputs.mask)
    shape_checker(context_vector, ("batch", "t", "dec_units"))
    shape_checker(attention_weights, ("batch", "t", "s"))
    
    # step 4 - join the context_vector and rnn_output
    # [ct; ht] shape: (batch t, value_units + query_units)
    context_and_rnn_output = tf.concat([context_vector, rnn_output],
                                       axis = -1)
    
    # step 4 (continued) - at = tanh(Wc@[ct; ht])
    attention_vector = self.Wc(context_and_rnn_output)
    shape_checker(attention_vector, ("batch", "t", "dec_units"))
    
    # step 5 - generate logit predictions
    logits = self.fc(attention_vector)
    shape_checker(logits, ("batch", "t", "output_vocab_size"))
    
    return DecoderOutput(logits, attention_weights), state

In [20]:
Decoder.call = call

## Loss function

In [21]:
class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self):
        self.name = "masked_loss"
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True,
                                                                  reduction = "none")
        
    def __call__(self,
                 y_true,
                 y_pred):
        shape_checker = ShapeChecker()
        shape_checker(y_true, ("batch", "t"))
        shape_checker(y_pred, ("batch", "t", "logits"))
        
        # calculate the loss for each item in the batch
        loss = self.loss(y_true,
                         y_pred)
        shape_checker(loss, ("batch", "t"))
        
        # mask off the losses on padding
        mask = tf.cast(y_true != 0,
                       tf.float32)
        shape_checker(mask, ("batch", "t"))
        loss *= mask
        
        # return the total
        return tf.reduce_sum(loss)

## Implementing the training step

In [22]:
class TrainTranslator(tf.keras.Model):
    def __init__(self,
                 embedding_dim,
                 units,
                 input_text_processor,
                 output_text_processor,
                 use_tf_function = True):
        super().__init__()
        
        # build the encoder and decoder
        encoder = Encoder(input_text_processor.vocabulary_size(),
                          embedding_dim,
                          units)
        decoder = Decoder(output_text_processor.vocabulary_size(),
                          embedding_dim,
                          units)
        
        self.encoder = encoder
        self.decoder = decoder
        self.input_text_processor = input_text_processor
        self.output_text_processor = output_text_processor
        self.use_tf_function = use_tf_function
        self.shape_checker = ShapeChecker()
        
    def train_step(self,
                   inputs):
        self.shape_checker = ShapeChecker()
        if self.use_tf_function:
            return self._tf_train_step(inputs)
        else:
            return self._train_step(inputs)

In [23]:
def _preprocess(self, 
                input_text, 
                target_text):
    self.shape_checker(input_text, ("batch", ))
    self.shape_checker(target_text, ("batch", ))
    
    # convert the text to token IDs
    input_tokens = self.input_text_processor(input_text)
    target_tokens = self.output_text_processor(target_text)
    self.shape_checker(input_tokens, ("batch", "s"))
    self.shape_checker(target_tokens, ("batch", "t"))

    # convert IDs to masks
    input_mask = input_tokens != 0
    self.shape_checker(input_mask, ("batch", "s"))

    target_mask = target_tokens != 0
    self.shape_checker(target_mask, ("batch", "t"))

    return input_tokens, input_mask, target_tokens, target_mask

In [24]:
TrainTranslator._preprocess = _preprocess

In [25]:
def _train_step(self, 
                inputs):
    input_text, target_text = inputs  
    (input_tokens, input_mask, target_tokens, target_mask) = self._preprocess(input_text, 
                                                                              target_text)

    max_target_length = tf.shape(target_tokens)[1]

    with tf.GradientTape() as tape:
        # encode the input
        enc_output, enc_state = self.encoder(input_tokens)
        self.shape_checker(enc_output, ("batch", "s", "enc_units"))
        self.shape_checker(enc_state, ("batch", "enc_units"))

        # initialize the decoder's state to the encoder's final state
        # this only works if the encoder and decoder have the same number of units
        dec_state = enc_state
        loss = tf.constant(0.0)

        for t in tf.range(max_target_length-1):
            # pass in two tokens from the target sequence:
            # 1. the current input to the decoder.
            # 2. the target for the decoder's next prediction.
            new_tokens = target_tokens[:, t:t+2]
            step_loss, dec_state = self._loop_step(new_tokens, 
                                                   input_mask,
                                                   enc_output, 
                                                   dec_state)
            loss = loss + step_loss

        # average the loss over all non padding tokens.
        average_loss = loss / tf.reduce_sum(tf.cast(target_mask, tf.float32))

    # apply an optimization step
    variables = self.trainable_variables 
    gradients = tape.gradient(average_loss, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))

    # return a dict mapping metric names to current value
    return {'batch_loss': average_loss}

In [26]:
TrainTranslator._train_step = _train_step

In [27]:
def _loop_step(self, new_tokens, input_mask, enc_output, dec_state):
    input_token, target_token = new_tokens[:, 0:1], new_tokens[:, 1:2]

    # run the decoder one step
    decoder_input = DecoderInput(new_tokens = input_token,
                                 enc_output = enc_output,
                                 mask = input_mask)

    dec_result, dec_state = self.decoder(decoder_input, state = dec_state)
    self.shape_checker(dec_result.logits, ("batch", "t1", "logits"))
    self.shape_checker(dec_result.attention_weights, ("batch", "t1", "s"))
    self.shape_checker(dec_state, ("batch", "dec_units"))

    # 'self.loss' returns the total for non-padded tokens
    y = target_token
    y_pred = dec_result.logits
    step_loss = self.loss(y, y_pred)

    return step_loss, dec_state

In [28]:
TrainTranslator._loop_step = _loop_step

## Testing the training step

In [29]:
@tf.function(input_signature = [[tf.TensorSpec(dtype = tf.string,
                                               shape = [None]),
                                 tf.TensorSpec(dtype = tf.string,
                                               shape = [None])]])

def _tf_train_step(self, inputs):
    return self._train_step(inputs)

In [30]:
TrainTranslator._tf_train_step = _tf_train_step

## Training logs

In [31]:
class BatchLogs(tf.keras.callbacks.Callback):
    def __init__(self, key):
        self.key = key
        self.logs = []

    def on_train_batch_end(self, n, logs):
        self.logs.append(logs[self.key])

## The translator

In [32]:
class Translator(tf.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 input_text_processor,
                 output_text_processor):
        self.encoder = encoder
        self.decoder = decoder
        self.input_text_processor = input_text_processor
        self.output_text_processor = output_text_processor

        self.output_token_string_from_index = (
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary = output_text_processor.get_vocabulary(),
                mask_token = "",
                invert = True))

        # the output should never generate padding, unknown, or start
        index_from_string = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary = output_text_processor.get_vocabulary(), 
            mask_token = "")
        token_mask_ids = index_from_string(['', '[UNK]', '[START]']).numpy()

        token_mask = np.zeros([index_from_string.vocabulary_size()], 
                           dtype=np.bool)
        token_mask[np.array(token_mask_ids)] = True
        self.token_mask = token_mask

        self.start_token = index_from_string(tf.constant('[START]'))
        self.end_token = index_from_string(tf.constant('[END]'))

## Convert token IDs to text

In [33]:
def tokens_to_text(self, result_tokens):
    shape_checker = ShapeChecker()
    shape_checker(result_tokens, ("batch", "t"))
    result_text_tokens = self.output_token_string_from_index(result_tokens)
    shape_checker(result_text_tokens, ("batch", "t"))

    result_text = tf.strings.reduce_join(result_text_tokens,
                                         axis=1, 
                                         separator=" ")
    shape_checker(result_text, ("batch"))

    result_text = tf.strings.strip(result_text)
    shape_checker(result_text, ("batch", ))
    return result_text

In [34]:
Translator.tokens_to_text = tokens_to_text

## Sampling from the decoder's predictions

In [35]:
def sample(self, logits, temperature):
    shape_checker = ShapeChecker()
    # 't' is usually 1 here
    shape_checker(logits, ("batch", "t", "vocab"))
    shape_checker(self.token_mask, ("vocab", ))

    token_mask = self.token_mask[tf.newaxis, tf.newaxis, :]
    shape_checker(token_mask, ("batch", "t", "vocab"), 
                  broadcast = True)

    # set the logits for all masked tokens to -inf, so they are never chosen
    logits = tf.where(self.token_mask, -np.inf, logits)

    if temperature == 0.0:
        new_tokens = tf.argmax(logits, 
                               axis = -1)
    else: 
        logits = tf.squeeze(logits, 
                            axis = 1)
    new_tokens = tf.random.categorical(logits / temperature,
                                       num_samples=1)

    shape_checker(new_tokens, ("batch", "t"))

    return new_tokens

In [36]:
Translator.sample = sample

## Implementing the translation loop

In [37]:
def translate_unrolled(self,
                       input_text, 
                       *,
                       max_length=50,
                       return_attention=True,
                       temperature=1.0):
    batch_size = tf.shape(input_text)[0]
    input_tokens = self.input_text_processor(input_text)
    enc_output, enc_state = self.encoder(input_tokens)

    dec_state = enc_state
    new_tokens = tf.fill([batch_size, 1], self.start_token)

    result_tokens = []
    attention = []
    done = tf.zeros([batch_size, 1], 
                    dtype = tf.bool)

    for _ in range(max_length):
        dec_input = DecoderInput(new_tokens = new_tokens,
                                 enc_output = enc_output,
                                 mask = (input_tokens != 0))

        dec_result, dec_state = self.decoder(dec_input, 
                                             state = dec_state)

        attention.append(dec_result.attention_weights)

        new_tokens = self.sample(dec_result.logits, 
                                 temperature)

        # if a sequence produces an 'end_token', set it 'done'
        done = done | (new_tokens == self.end_token)
        # once a sequence is done it only produces 0-padding
        new_tokens = tf.where(done, 
                              tf.constant(0, dtype = tf.int64), 
                              new_tokens)

        # collect the generated tokens
        result_tokens.append(new_tokens)

        if tf.executing_eagerly() and tf.reduce_all(done):
            break

    # convert the list of generates token ids to a list of strings
    result_tokens = tf.concat(result_tokens,
                              axis = -1)
    result_text = self.tokens_to_text(result_tokens)

    if return_attention:
        attention_stack = tf.concat(attention, 
                                    axis = 1)
        return {"text": result_text, 
                "attention": attention_stack}
    else:
        return {"text": result_text}

In [38]:
Translator.translate = translate_unrolled

In [39]:
@tf.function(input_signature = [tf.TensorSpec(dtype = tf.string, 
                                              shape = [None])])
def tf_translate(self, input_text):
    return self.translate(input_text)

Translator.tf_translate = tf_translate

# Build and train a translator model

Preprocess the text.

In [40]:
max_vocab_size = 5000

In [41]:
input_text_processor = preprocessing.TextVectorization(standardize = tf_lower_and_split_punct,
                                                       max_tokens = max_vocab_size)
input_text_processor.adapt(train_inputs)

In [42]:
output_text_processor = preprocessing.TextVectorization(standardize = tf_lower_and_split_punct,
                                                        max_tokens = max_vocab_size)
output_text_processor.adapt(train_targets)

Compile and train a translator model.

In [43]:
embedding_dim = 256
units = 1024

In [44]:
train_translator = TrainTranslator(embedding_dim, 
                                   units,
                                   input_text_processor = input_text_processor,
                                   output_text_processor = output_text_processor)

In [45]:
train_translator.compile(loss = MaskedLoss(),
                         optimizer = tf.optimizers.Adam())

In [46]:
batch_loss = BatchLogs("batch_loss")

In [47]:
train_translator.fit(dataset, 
                     epochs = 3,
                     callbacks = [batch_loss])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f842d152550>

Use the trained model to create a translator object.

In [48]:
translator = Translator(encoder = train_translator.encoder,
                        decoder = train_translator.decoder,
                        input_text_processor = input_text_processor,
                        output_text_processor = output_text_processor)

# Evaluate the model

Evaluate the accuracy of the model's translations by computing its BLEU (Bilingual Evaluation Understudy) score on the test set.

In [49]:
def StandardizeTarget(target):
    # split accented characters
    target = tf_text.normalize_utf8(target,
                                    "NFKD")
    target = tf.strings.lower(target)
    
    # keep spaces, a-z, and select punctuation
    target = tf.strings.regex_replace(target,
                                      "[^ a-z.?!,¿]",
                                      "")
    
    # add spaces around punctuation
    target = tf.strings.regex_replace(target,
                                      "[.?!,¿]",
                                      r" \0 ")
    
    # strip whitespace
    target = tf.strings.strip(target)
    
    # convert the string tensor to a regular string
    target = target.numpy().decode()
    
    return target

In [50]:
def StandardizeListOfTargets(list_of_targets):
    return [StandardizeTarget(target) for target in list_of_targets]

In [51]:
def TokenizeListOfTargets(list_of_targets):
    return [target.split(" ") for target in list_of_targets]

In [52]:
# make predictions on the test set

input_text = tf.constant(test_inputs)
raw_predictions = translator.translate(input_text = input_text)
predictions = []

for i in range(len(input_text)):
    predictions.append(raw_predictions["text"][i].numpy().decode())

tokenized_predictions = list(map(lambda x: x.split(), predictions))

In [53]:
# collate the targets into a list of list of list of tokens

# make a mapping from each input text to all target texts which it corresponds to
input_to_target = dict()

for input_text, target_text in list(zip(test_inputs, test_targets)):
    if input_text not in input_to_target.keys():
        input_to_target[input_text] = [target_text]
    else:
        input_to_target[input_text] = input_to_target[input_text] + [target_text]

# cluster together the alternative target texts corresponding to each input text
clustered_targets = list(map(input_to_target.get, test_inputs))

# standardize the target text
standardized_targets = [StandardizeListOfTargets(list_of_targets) for list_of_targets in clustered_targets]

# tokenize the target text
tokenized_targets = [TokenizeListOfTargets(list_of_targets) for list_of_targets in standardized_targets]

In [54]:
# obtain the BLEU score across all predictions

bleu_score = corpus_bleu(list_of_references = tokenized_targets,
                         hypotheses = tokenized_predictions)

print(f"BLEU score for the final model: {bleu_score}")

BLEU score for the final model: 0.2322011280125518


# Export the model

Save the architecture and weights associated with the model.

In [55]:
tf.saved_model.save(translator, 
                    "translator",
                    signatures = {"serving_default": translator.tf_translate})

2021-12-17 01:21:08.550804: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
