In [1]:
import numpy as np

import tensorflow as tf

import tensorflow_text

from tensorflow.keras.layers import Add, Concatenate, Dense, Dropout, Embedding, Input, Layer, LayerNormalization, Softmax
from tensorflow.keras.models import Model

In [2]:
#======================================================
#===  Create keras layer for dot-product attention  ===
#======================================================

class LinearDotProductAttention(Layer) :
    def __init__(self, dk, dv, same_transform_QK=False, same_transform_KV=False, use_causal_mask=False):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.dk      = dk
        self.dv      = dv
        self.sqrt_dk = np.sqrt(dk)
        self.use_causal_mask = use_causal_mask
        
        ##  Create Softmax layer, which can handle masking whereas tf.nn.softmax cannot
        self.softmax = Softmax()
        
        ##  Create Dense transform of query feature vectors
        self.query_transform = Dense(dk)
        
        ##  Create Dense transform of key feature vectors
        if same_transform_QK : self.key_transform = self.query_transform
        else                 : self.key_transform = Dense(dk)
            
        ##  Create Dense transform of value feature vectors
        if same_transform_KV : 
            if dk != dk :
                raise ValueError(f"Can only use same transform for keys and values if dk (={dk}) \
                                   is equal to dv (={dv})")
            self.value_transform = self.key_transform
        else : 
            self.value_transform = Dense(dv)
                    
    def call(self, inputs, training=False):
        """
        Call docstring
        """
        ##  Resolve inputs
        if len(inputs) != 2 : raise ValueError(f"inputs of length {len(inputs)}, expected 2")
        query_features = inputs[0]  # shape [batch_size, query_sequence_length, query_feature_length]
        ref_features   = inputs[1]  # shape [batch_size, ref_sequence_length  , ref_feature_length  ]
                
        ##  Calculate Q, K, V matrices
        Q = self.query_transform(query_features, training=training)  # shape [batch_size, query_sequence_length, dk]
        K = self.key_transform  (ref_features  , training=training)  # shape [batch_size, ref_sequence_length  , dk]
        V = self.value_transform(ref_features  , training=training)  # shape [batch_size, ref_sequence_length  , dv]
                
        ##  Calculate transpose of K, without modifying the first axis which indexes batch samples
        K_T = tf.transpose(K, perm=[0,2,1])  # shape [batch_size, dk, ref_sequence_length]
                
        ##  Calculate dot-product attention scores
        x = tf.matmul(Q, K_T)             # shape [batch_size, query_sequence_length, ref_sequence_length]
        x = x / self.sqrt_dk              # shape [batch_size, query_sequence_length, ref_sequence_length]
        
        ##  Create a causal mask on-the-fly if needed
        mask = None
        if self.use_causal_mask :
            mask_shape = tf.shape(x)
            mask = self._create_causal_mask(mask_shape)
                
        ##  Calculate attention weights
        x = self.softmax(x, mask=mask, training=training)   # shape [batch_size, query_sequence_length, ref_sequence_length]
                        
        ##  Attend to reference sequence and return updated feature vector of length dv
        x = tf.matmul(x, V)              # shape [batch_size, query_sequence_length, dv] 
                
        return x
    
    def _create_causal_mask(self, mask_shape) :
        """
        Method docstring
        - using trick for creating causal mask from from keras base_dense_attention class method
        - https://github.com/keras-team/keras/blob/e6784e4302c7b8cd116b74a784f4b78d60e83c26/keras/layers/attention/base_dense_attention.py
        Mask is 1 for elements that we want to include, 0 for elements we want to exclude
        Axis -1 is the "reference sequence" index
        Axis -2 is the "query sequence" index
        We want mask to be 1 only when Axis -1 <= Axis -2, the same as Axis -2 >= Axis -1
        With indices [row, col], we have 1 when row >= col, giving a lower triangular matrix
        """
        ones_like_x = tf.ones(shape=mask_shape, dtype=tf.int32)
        query_index = tf.cumsum(ones_like_x, axis=-2)
        ref_index   = tf.cumsum(ones_like_x, axis=-1)
        return tf.greater_equal(query_index, ref_index)
    

In [3]:
#=====================================================
#===  Create keras layer for multi-head attention  ===
#=====================================================

class MultiHeadAttention(Layer) :
    """
    Class docstring
    """
    def __init__(self, num_heads, d_out, dk_per_head, dv_per_head, use_causal_mask=True, dropout=0.1):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.num_heads       = num_heads
        self.d_out           = d_out
        self.dk_per_head     = dk_per_head   
        self.dv_per_head     = dv_per_head
        self.use_causal_mask = use_causal_mask
        
        ##  Create heads
        self.heads = [LinearDotProductAttention(self.dk_per_head, self.dv_per_head, use_causal_mask=use_causal_mask) 
                      for hi in range(num_heads)]
                    
        ##  Create other keras layers
        self.concat  = Concatenate()
        self.linear  = Dense(d_out)
        self.dropout = Dropout(dropout)
        
    def call(self, inputs, training=False):
        """
        Call docstring
        """    
        ##  Calculate attention heads
        #        shape [batch_size, query_sequence_length, dv] for each list element
        x = [head(inputs, training=training) for head in self.heads]
        
        ##  Concatenate heads and project onto single output
        x = self.concat (x)
        x = self.dropout(x, training=training)  
        x = self.linear (x, training=training)     
        return x


In [38]:
#==============================================
#===  Create keras layer for encoder block  ===
#==============================================

class EncoderBlock(Layer) :
    """
    Class docstring
    """
    def __init__(self, d_model, num_heads, dk_per_head, dv_per_head, dff, dropout=0.1, name=""):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.d_model     = d_model
        self.num_heads   = num_heads
        self.dk_per_head = dk_per_head
        self.dv_per_head = dv_per_head
        self.dff         = dff
        
        ##  Create attention layer
        self.mha = MultiHeadAttention(num_heads, d_model, dk_per_head, dv_per_head)
                    
        ##  Create other keras layers
        self.add1    = Add()
        self.norm1   = LayerNormalization()
        self.add2    = Add()
        self.norm2   = LayerNormalization()
        self.dense1  = Dense(dff, activation="relu")
        self.dense2  = Dense(d_model)
        self.dropout = Dropout(dropout)
        
    def call(self, query_features, training=False):
        """
        Call docstring
        """
        ##  Calculate multi-head attention
        x_skip = query_features
        x = self.mha([query_features, query_features], training=training)   # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Combine attention output with skip-connection
        x = self.add1([x, x_skip], training=training)    # shape [batch_size, query_sequence_length, d_model] 
        x = self.norm1(x, training=training)             # shape [batch_size, query_sequence_length, d_model] 
        x_skip = x
                
        ##  Feed-forward processing of linearly-combined feature vectors from each head
        x = self.dense1 (x, training=training)            # shape [batch_size, query_sequence_length, dff] 
        x = self.dense2 (x, training=training)            # shape [batch_size, query_sequence_length, d_model] 
        x = self.dropout(x, training=training)            # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Skip-connect and return
        x = self.add2([x, x_skip], training=training)    # shape [batch_size, query_sequence_length, d_model] 
        x = self.norm2(x, training=training)             # shape [batch_size, query_sequence_length, d_model]     
        return x


In [5]:

class PositionalEncoding(tf.keras.layers.Layer) :
    
    def __init__(self, d_in, d_model) :
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.d_in      = d_in
        self.d_model   = d_model
        
        ##  Store Tensor object with pre-computed positional encodings
        self.encoded_positions = self.create_encoded_positions_tensor(d_in, d_model)

    def call(self, x) :
        """
        Call docstring
        """
        ##  Return slice of stored encoded_positions Tensor with correct shape
        length = tf.shape(x)[1]
        return self.encoded_positions[tf.newaxis, :length, :]
    
    def create_encoded_positions_tensor(self, d_in, d_model) :
        """
        Method docstring
        """
        ##  Create numpy array with positions
        positions = np.arange(d_in)   # shape (d_in)
        
        ##  Combine with indices to create 2D array of angles
        half_indices = np.arange(d_model/2)   # shape (d_model/2)
        angles = (10000**(-half_indices))     # shape (d_model/2)
        angles = np.outer(positions, angles)  # shape (d_in, d_model/2)

        ##  Interleave sing and cos of angles into single 2D array of positional encodings
        pos_encoding = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)   # shape (d_in, d_model)

        ##  Return Tensor of positional encodings
        return tf.cast(pos_encoding, dtype=tf.float32)
        

In [6]:
#========================================
#===  Create keras layer for encoder  ===
#========================================

class Encoder(Layer) :
    """
    Class docstring
    """
    def __init__(self, num_blocks, d_in, d_model, num_heads, dk_per_head, dv_per_head, dff):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.num_blocks  = num_blocks
        self.d_model     = d_model
        self.num_heads   = num_heads
        self.dk_per_head = dk_per_head
        self.dv_per_head = dv_per_head
        self.dff         = dff
        self.emb_scalar  = tf.math.sqrt(tf.cast(d_model, tf.float32))
                    
        ##  Create embedding layers
        self.add               = Add()
        self.token_embedding   = Embedding(d_in, d_model, mask_zero=True)
        self.position_encoding = PositionalEncoding(d_in, d_model)
        
        ##  Create encoder block layers
        self.encoder_blocks = [EncoderBlock(d_model, num_heads, dk_per_head, dv_per_head, dff) 
                               for ei in range(num_blocks)]
        
    def call(self, query_features, training=False):
        """
        Call docstring
        """
        ##  Calculate embeddings
        token_embedding   = self.token_embedding(query_features, training=training) # shape [batch_size, query_sequence_length, d_model]
        token_embedding  *= self.emb_scalar                                         # shape [batch_size, query_sequence_length, d_model]
        position_encoding = self.position_encoding(query_features)                  # shape [batch_size, query_sequence_length, d_model]
        x = self.add([token_embedding, position_encoding], training=training)       # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Pass through encoder blocks
        for encoder_block in self.encoder_blocks :
            x = encoder_block(x, training=training)
            
        ##  Return encoded sequence
        return x
    
    '''def compute_mask(self, *args, **kwargs):
        """
        Method docstring
        """
        return self.token_embedding.compute_mask(*args, **kwargs)'''


In [7]:
#==============================================
#===  Create keras layer for encoder block  ===
#==============================================

class DecoderBlock(Layer) :
    """
    Class docstring
    """
    def __init__(self, d_model, num_heads, dk_per_head, dv_per_head, dff, dropout=0.1):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.d_model     = d_model
        self.num_heads   = num_heads
        self.dk_per_head = dk_per_head
        self.dv_per_head = dv_per_head
        self.dff         = dff
        
        ##  Create attention layers
        self.masked_attention = MultiHeadAttention(num_heads, d_model, dk_per_head, dv_per_head, use_causal_mask=True )
        self.cross_attention  = MultiHeadAttention(num_heads, d_model, dk_per_head, dv_per_head, use_causal_mask=False)
                    
        ##  Create other keras layers
        self.add1    = Add()
        self.norm1   = LayerNormalization()
        self.add2    = Add()
        self.norm2   = LayerNormalization()
        self.add3    = Add()
        self.norm3   = LayerNormalization()
        self.dense1  = Dense(dff, activation="relu")
        self.dense2  = Dense(d_model)
        self.dropout = Dropout(dropout)
        
    def call(self, inputs, training=False):
        """
        Call docstring
        """
        ##  Resolve inputs
        if len(inputs) != 2 : raise ValueError(f"inputs of length {len(inputs)}, expected 2")
        decoder_input  = inputs[0]
        encoder_output = inputs[1]
        
        ##  Calculate masked self-attention
        x_skip = decoder_input
        x = self.masked_attention([decoder_input, decoder_input], training=training)   # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Combine attention output with skip-connection
        x = self.add1([x, x_skip], training=training)          # shape [batch_size, query_sequence_length, d_model] 
        x = self.norm1(x, training=training)                   # shape [batch_size, query_sequence_length, d_model] 
        x_skip = x
        
        ##  Calculate cross-attention
        x = self.cross_attention([x, encoder_output], training=training)   # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Combine attention output with skip-connection
        x = self.add2([x, x_skip], training=training)          # shape [batch_size, query_sequence_length, d_model] 
        x = self.norm2(x, training=training)                   # shape [batch_size, query_sequence_length, d_model] 
        x_skip = x
        
        ##  Feed-forward processing of linearly-combined feature vectors from each head
        x = self.dense1 (x, training=training)            # shape [batch_size, query_sequence_length, dff] 
        x = self.dense2 (x, training=training)            # shape [batch_size, query_sequence_length, d_model] 
        x = self.dropout(x, training=training)            # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Skip-connect and return
        x = self.add3([x, x_skip], training=training)          # shape [batch_size, query_sequence_length, d_model] 
        x = self.norm3(x, training=training)                   # shape [batch_size, query_sequence_length, d_model] 
                
        return x


In [37]:
#========================================
#===  Create keras layer for decoder  ===
#========================================

class Decoder(Layer) :
    """
    Class docstring
    """
    def __init__(self, num_blocks, d_in, d_model, num_heads, dk_per_head, dv_per_head, dff, name=""):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.num_blocks  = num_blocks
        self.d_in        = d_in
        self.d_model     = d_model
        self.num_heads   = num_heads
        self.dk_per_head = dk_per_head
        self.dv_per_head = dv_per_head
        self.dff         = dff
        self.emb_scalar  = tf.math.sqrt(tf.cast(d_model, tf.float32))
                    
        ##  Create embedding layers
        self.add               = Add()
        self.token_embedding   = Embedding(d_in, d_model, mask_zero=True)
        self.position_encoding = PositionalEncoding(d_in, d_model)
        
        ##  Create decoder block layers
        self.decoder_blocks = [DecoderBlock(d_model, num_heads, dk_per_head, dv_per_head, dff) 
                               for ei in range(num_blocks)]
        
    def call(self, inputs, training=False):
        """
        Call docstring
        """
        ##  Resolve inputs
        if len(inputs) != 2 : raise ValueError(f"inputs of length {len(inputs)}, expected 2")
        decoder_input  = inputs[0]
        encoder_output = inputs[1]
        
        ##  Calculate embeddings
        token_embedding   = self.token_embedding(decoder_input, training=training) # shape [batch_size, query_sequence_length, d_model]
        token_embedding  *= self.emb_scalar                                        # shape [batch_size, query_sequence_length, d_model]
        position_encoding = self.position_encoding(decoder_input)                  # shape [batch_size, query_sequence_length, d_model]
        x = self.add([token_embedding, position_encoding], training=training)      # shape [batch_size, query_sequence_length, d_model] 
                
        ##  Pass through decoder blocks
        for decoder_block in self.decoder_blocks :
            x = decoder_block([x, encoder_output], training=training)
            
        ##  Return token probabilities
        return x


In [9]:
#========================================
#===  Create keras layer for decoder  ===
#========================================

class Transformer(Layer) :
    """
    Class docstring
    """
    def __init__(self, num_blocks, d_in, d_out, d_model, num_heads, dk_per_head, dv_per_head, dff):
        """
        Init docstring
        """
        ##  Initialise base class
        super().__init__()
        
        ##  Store dimension information
        self.num_blocks  = num_blocks
        self.d_in        = d_in
        self.d_out       = d_out
        self.d_model     = d_model
        self.num_heads   = num_heads
        self.dk_per_head = dk_per_head
        self.dv_per_head = dv_per_head
        self.dff         = dff
        self.emb_scalar  = tf.math.sqrt(tf.cast(d_model, tf.float32))
                    
        ##  Create encoder and decoder layers
        self.encoder = Encoder(num_blocks, d_in , d_model, num_heads, dk_per_head, dv_per_head, dff)
        self.decoder = Decoder(num_blocks, d_out, d_model, num_heads, dk_per_head, dv_per_head, dff)
                
        ##  Create layers to convert decoder output to token probabilities
        self.linear  = Dense(d_out)
        self.softmax = Softmax()
        
    def call(self, inputs, training=False):
        """
        Call docstring
        """
        ##  Resolve inputs
        if len(inputs) != 2 : raise ValueError(f"inputs of length {len(inputs)}, expected 2")
        encoder_input = inputs[0]
        decoder_input = inputs[1]
        
        ##  Calculate encoding
        x = self.encoder(encoder_input, training=training)
                
        ##  Calculate decoding
        x = self.decoder([decoder_input, x], training=training)
                        
        ##  Turn decoder outputs into token probabilities
        x = self.linear (x, training=training)
        x = self.softmax(x, training=training)
            
        ##  Return token probabilities
        return x


In [10]:
#============================================================================
#===  Create method for building simple model that performs the decoding  ===
#============================================================================

def build_transformer_model(num_blocks, d_in, d_out, d_model, num_heads, dk_per_head, dv_per_head, dff, name=None) :
    """
    Method docstring
    """ 
    ##  Create model inputs
    original_message   = Input((None,))
    translated_message = Input((None,))
    
    ##  Perform attention step
    x = Transformer(num_blocks, d_in, d_out, d_model, num_heads, dk_per_head, dv_per_head, dff)([original_message, translated_message])
    
    ##  Create model
    model = Model([original_message, translated_message], x, name=name)
    
    ##  Return
    return model
    

In [30]:
#==============================================================
#===  Build and print model to check for structural errors  ===
#==============================================================

d_in        = 7765    # portuguese vocab size
d_out       = 7010    # english vocab size
d_model     = 128
num_blocks  = 4
num_heads   = 8
dk_per_head = d_model  # int(d_model / num_heads)
dv_per_head = d_model  # int(d_model / num_heads)
dff         = 512

transformer = build_transformer_model(num_blocks, d_in, d_out, d_model, num_heads, dk_per_head, dv_per_head, 
                                      dff, name="transformer")

transformer.summary(expand_nested=True)


Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 transformer_1 (Transformer)    (None, None, 7010)   10184162    ['input_3[0][0]',                
                                                                  'input_4[0][0]']                
                                                                                                  
Total params: 10,184,162
Trainable params: 10,184,162
Non-trainable params: 0
__________

In [31]:
transformer([tf.cast([[2, 5, 912, 3]], dtype=tf.int32), tf.cast([[2, 84]], dtype=tf.int32)])

<tf.Tensor: shape=(1, 2, 7010), dtype=float32, numpy=
array([[[1.74837885e-04, 1.42590186e-04, 1.58885814e-04, ...,
         9.54626012e-05, 1.51287401e-04, 1.15952156e-04],
        [1.75502399e-04, 1.42746474e-04, 1.57207760e-04, ...,
         9.56099975e-05, 1.51239874e-04, 1.16575655e-04]]], dtype=float32)>

In [32]:

model_weights_fname = 'saved_model/transformer_weights'

transformer.load_weights(model_weights_fname)


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2ca5be350>

In [33]:
transformer([tf.cast([[2, 5, 912, 3]], dtype=tf.int32), tf.cast([[2, 84]], dtype=tf.int32)])

<tf.Tensor: shape=(1, 2, 7010), dtype=float32, numpy=
array([[[1.74837885e-04, 1.42590186e-04, 1.58885814e-04, ...,
         9.54626012e-05, 1.51287401e-04, 1.15952156e-04],
        [1.75502399e-04, 1.42746474e-04, 1.57207760e-04, ...,
         9.56099975e-05, 1.51239874e-04, 1.16575655e-04]]], dtype=float32)>

In [20]:

tokenizer_model_fname = 'ted_hrlr_translate_pt_en_converter'

tokenizers = tf.saved_model.load(tokenizer_model_fname)


In [42]:

def translate_sentence(sentence, transformer, tokenizers, max_tokens=100) :
    with tf.device("CPU") :
        tokenised_sentence = tokenizers.pt.tokenize([sentence]).to_tensor()
        print(tokenised_sentence)
        
        start_end   = tokenizers.en.tokenize([''])[0]
        begin_token = start_end[0 ]
        end_token   = start_end[-1]
        
        tokenised_translated_sentence = tf.cast([[begin_token]], dtype=tf.int64)
                
        best_token, num_tokens = begin_token, 1
        while best_token != end_token and num_tokens < max_tokens :
            #print("CALL")
            #print(tokenised_sentence)
            #print(tokenised_translated_sentence)
            token_probs = transformer([tokenised_sentence, tokenised_translated_sentence])
            #print(token_probs[0,-1,:])
            #print(token_probs[0,-1,5906:5911])
            best_token  = tf.argmax(token_probs[0,-1,:], axis=-1)
            tokenised_translated_sentence = tf.concat([tokenised_translated_sentence, 
                                                       tf.reshape(best_token, (1,1))],
                                                      axis=-1)
            num_tokens += 1
            
        translated_sentence = tokenizers.en.detokenize(tokenised_translated_sentence)[0]
        return translated_sentence.numpy().decode('utf-8')
    

In [45]:
translate_sentence('Olá, como você está, minha cabeça dói', transformer, tokenizers)

tf.Tensor([[   2 1616   14   97  483  105   14  138  693   93  266    3]], shape=(1, 12), dtype=int64)


'primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates primates'

In [46]:

tokenised_translated_sentence = tf.cast([[2, 2265,   13,  100,   79,   86,   13,   99,  589,   13,   45,
           9,   49,  613,   15,    3]], dtype=tf.int64)
translated_sentence = tokenizers.en.detokenize(tokenised_translated_sentence)[0]
print(translated_sentence.numpy().decode('utf-8'))

hi , as you are , my head , i ' m voice .


In [23]:
def masked_loss(label, pred):
    mask = label != 0
    #loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
    loss = loss_object(label, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss


def masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match)/tf.reduce_sum(mask)


In [24]:

transformer.compile(loss=masked_loss, optimizer="adam", metrics=[masked_accuracy])


In [25]:
train_batches = tf.data.Dataset.load("tokenised_train_batches")
val_batches   = tf.data.Dataset.load("tokenised_val_batches")

with tf.device("CPU") :
    transformer.fit(
        train_batches.take(10), 
        epochs=1, 
        validation_data=val_batches.take(10))
                           

