In [None]:
def get_angles(pos, i, d_model):
  """Return the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

  Args:
    pos: position index, or an array of positions.
    i: embedding-dimension index, or an array of indices.
    d_model: size of the embedding dimension.

  Returns:
    ``pos`` multiplied by the per-dimension angle rate; NumPy broadcasting
    applies when ``pos`` and ``i`` are arrays.
  """
  # Dimensions 2k and 2k+1 share the same exponent 2k / d_model.
  exponent = (2 * (i // 2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: size of the embedding dimension (columns).

  Returns:
    A float32 tensor of shape (1, position, d_model) with sine values in
    the even columns and cosine values in the odd columns.
  """
  positions = np.arange(position)[:, np.newaxis]   # column vector of positions
  dimensions = np.arange(d_model)[np.newaxis, :]   # row vector of dimension indices
  table = get_angles(positions, dimensions, d_model)

  # Even columns (2i) get sin, odd columns (2i+1) get cos, written in place.
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Prepend a leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
def create_padding_mask(seq, pad_value=0):
  """Return a float32 mask that is 1.0 wherever ``seq`` equals ``pad_value``.

  Two singleton axes are inserted between the first and last axis so the
  mask, shaped (batch_size, 1, 1, seq_len), can be added to the attention
  logits.
  """
  is_padding = tf.math.equal(seq, pad_value)
  padding_mask = tf.cast(is_padding, tf.float32)
  return padding_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [None]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1.0 strictly above the main diagonal.

  The lower triangle (diagonal included) is 0.0, so entry (row, col) is 1.0
  exactly when col > row.
  """
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

In [5]:
# Tensorflow create task mask
def create_task_mask_tf(seq):
  """Build a task mask for every sequence in the batch.

  ``seq`` is first turned into a float32 indicator (1.0 where the value
  equals 1), then ``get_task_one_mask_tf`` is mapped over the leading axis.
  An extra axis is inserted at position 1 so the result is compatible with
  the padding mask.
  """
  task_indicator = tf.cast(tf.math.equal(seq, 1), tf.float32)
  per_sequence = tf.map_fn(get_task_one_mask_tf, elems=task_indicator)
  return tf.expand_dims(per_sequence, 1)

In [None]:
def get_task_one_mask_tf(A):
  """Expand an indicator vector ``A`` into an (N, N) float32 matrix.

  Assumes ``A`` is 1-D with a statically known length N (it is reshaped to
  (N, 1) and tiled N times below).

  Row ``i`` of the result is a copy of ``A`` in which entry ``j`` is set to
  0 whenever some index ``r`` with ``j < r <= i`` has a non-positive
  ``A[r]``. Entries on or right of the diagonal (``j >= i``) are never
  zeroed by this rule.
  """
  N = A.shape[0]
  # Per-row flag, True where A is positive; shaped (N, 1) so it broadcasts
  # across the columns of the (N, N) mask.
  column = tf.reshape((A>0), (N,1))
  mask = tf.ones((N, N), dtype=tf.bool)
  # band_part(mask, 0, -1) keeps the upper triangle including the diagonal,
  # so its negation is True strictly below the diagonal. Rows whose A entry
  # is positive are replaced by all-False.
  mask = tf.where(column, False, tf.math.logical_not(tf.linalg.band_part(mask, 0, -1)))
  # Running sum down the rows (cumsum's default axis is 0): once a row has
  # marked a column, every later row stays marked for that column.
  mask = tf.cumsum(tf.cast(mask, dtype=tf.float32))
  # mask = tf.cast(tf.cumsum(tf.cast(mask, dtype=tf.int8)), dtype=tf.bool)
  # Any positive count becomes True.
  mask = tf.cast(mask, dtype=tf.bool)
  # Repeat A as N identical rows and zero the marked positions.
  A = tf.expand_dims(A, axis=0)
  B = tf.where(mask, 0, tf.tile(A, (N,1)))
  B = tf.cast(B, tf.float32)
  return B

In [None]:
def create_masks(inp, pad_value):
    """Combine the look-ahead mask and the padding mask for ``inp``.

    Look-ahead masking is needed for both the encoder and the decoder, so a
    single combined mask is returned: a position is masked (1.0) when it is
    either padding or a look-ahead position.
    """
    causal_mask = create_look_ahead_mask(tf.shape(inp)[1])
    pad_mask = create_padding_mask(inp, pad_value=pad_value)
    # Element-wise maximum acts as a logical OR on the 0/1 masks.
    return tf.maximum(pad_mask, causal_mask)

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
  """Compute scaled dot-product attention.

  Args:
    q: queries, shape (..., seq_len_q, depth).
    k: keys, shape (..., seq_len_k, depth).
    v: values, shape (..., seq_len_k, depth_v).
    mask: tensor broadcastable to (..., seq_len_q, seq_len_k) holding 1.0
      at positions to suppress, or None for no masking.

  Returns:
    A tuple ``(output, attention_weights)``.
  """
  logits = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # Scale by the square root of the key depth.
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = logits / tf.math.sqrt(key_depth)

  # Masked positions receive a large negative logit.
  if mask is not None:
    logits = logits + (mask * -1e9)

  # Softmax over the key axis so each query's weights sum to 1.
  attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  weighted_values = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
  return weighted_values, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention built on ``scaled_dot_product_attention``.

  ``d_model`` must be divisible by ``num_heads``; each head works on
  ``d_model // num_heads`` features.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Linear projections for queries, keys and values.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied after the heads are concatenated.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape ``x`` to (batch_size, num_heads, seq_len, depth).

    The last dimension is split into (num_heads, depth) and the head axis
    is moved in front of the sequence axis.
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return ``(output, attention_weights)`` for the given v, k, q and mask."""
    batch_size = tf.shape(q)[0]

    # Project to d_model, then split into heads:
    # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    # heads.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    heads, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # Move the head axis back next to depth and merge the two axes.
    heads = tf.transpose(heads, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    merged = tf.reshape(heads, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    return self.dense(merged), attention_weights  # output: (batch_size, seq_len_q, d_model)

NameError: ignored

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer feed-forward block: Dense(dff, relu) -> Dense(d_model)."""
  hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention followed by a feed-forward network.

  Each sub-block applies dropout to its output, adds the residual input and
  normalises the sum with layer normalisation.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, mask):
    """Run the block on ``x`` (batch_size, input_seq_len, d_model) with ``mask``."""
    # Self-attention sub-block; the attention weights are not needed here.
    attended, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attended = self.dropout1(attended)
    normed_attention = self.layernorm1(x + attended)  # (batch_size, input_seq_len, d_model)

    # Feed-forward sub-block.
    transformed = self.dropout2(self.ffn(normed_attention))  # (batch_size, input_seq_len, d_model)
    return self.layernorm2(normed_attention + transformed)  # (batch_size, input_seq_len, d_model)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: self-attention, attention over the encoder output,
  then a feed-forward network.

  Each sub-block applies dropout, a residual addition and layer
  normalisation. The same ``mask`` is passed to both attention sub-blocks.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, mask):
    """Run the block; ``enc_output`` is (batch_size, input_seq_len, d_model)."""
    # Self-attention over the decoder input (weights are discarded).
    self_attended, _ = self.mha1(x, x, x, mask)  # (batch_size, target_seq_len, d_model)
    self_attended = self.dropout1(self_attended)
    after_self = self.layernorm1(self_attended + x)

    # Queries come from the decoder; keys and values from the encoder output.
    cross_attended, _ = self.mha2(
        enc_output, enc_output, after_self, mask)  # (batch_size, target_seq_len, d_model)
    cross_attended = self.dropout2(cross_attended)
    after_cross = self.layernorm2(cross_attended + after_self)  # (batch_size, target_seq_len, d_model)

    # Feed-forward sub-block.
    transformed = self.dropout3(self.ffn(after_cross))  # (batch_size, target_seq_len, d_model)
    return self.layernorm3(transformed + after_cross)  # (batch_size, target_seq_len, d_model)

In [None]:
def create_model(conf):
  """Build the encoder-decoder Keras functional model from one stacked input.

  The model takes a single input of shape (window_size, n_features) and
  slices individual features out of its last axis. No list comprehensions
  are used.

  Keys read from ``conf``: "window_size", "n_features", "padding_values",
  "enc_emb", "enc_dense", "dec_emb", "dec_dense", "vocab_sizes", "d_model",
  "num_heads", "dff", "dropout_rate", "enc_num_layers", "dec_num_layers".

  Returns:
    A ``tf.keras.Model`` whose output comes from a final ``Dense(3)`` layer.
  """
  inputs = tf.keras.Input(shape=(conf["window_size"], conf["n_features"]), name = "input")
  seq_len = tf.shape(inputs)[1]
  # Padding value of feature 0 is used (input 0 = E, input 1 = r).
  # Earlier variant: create_masks(inputs[:,:,0], pad_value = conf["padding_values"][0])
  mask = create_final_mask(inputs, pad_value = conf["padding_values"][0])

  ###### Encoder ###########
  enc = 0  # integer seed; becomes a tensor when the first term is added

  # Embedded features
  for feature in conf["enc_emb"]:
    vocab_size = conf["vocab_sizes"][feature]
    embed = tf.keras.layers.Embedding(int(vocab_size), conf["d_model"])
    enc += embed(inputs[:,:,feature])  # TODO: Multiply? Remove the zero seed?

  # Densely projected features
  for feature in conf["enc_dense"]:
    projected = tf.keras.layers.Dense(conf["d_model"])(inputs[:,:,feature])
    enc += tf.expand_dims(projected, axis=1)

  # Positional encoding, cut to the sequence length
  pos_encoding = positional_encoding(conf["window_size"], conf["d_model"])[:, :seq_len, :]
  enc += pos_encoding

  for _ in range(conf["enc_num_layers"]):
    enc = EncoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])(enc, mask)
    out = enc

  ####### Decoder #######
  dec = 0  # integer seed; becomes a tensor when the first term is added

  # Embedded features
  for feature in conf["dec_emb"]:
    vocab_size = conf["vocab_sizes"][feature]
    embed = tf.keras.layers.Embedding(int(vocab_size), conf["d_model"])
    dec += embed(inputs[:,:,feature])  # TODO: Multiply? Remove the zero seed?

  # Densely projected features
  for feature in conf["dec_dense"]:
    projected = tf.keras.layers.Dense(conf["d_model"])(inputs[:,:,feature])
    dec += tf.expand_dims(projected, axis=1)

  # Positional encoding (same table as the encoder)
  dec += pos_encoding

  for _ in range(conf["dec_num_layers"]):
    dec = DecoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])(dec, enc, mask)
    out = dec

  output = tf.keras.layers.Dense(3)(out)  # final layer

  # Instantiate an end-to-end model
  return tf.keras.Model(inputs, output)

In [1]:
def create_model_seperate_input(conf):
  """Build the encoder-decoder Keras functional model with one input per feature.

  Unlike ``create_model``, every feature gets its own ``tf.keras.Input`` of
  shape (window_size,), named after ``conf["features"][i]``.

  Keys read from ``conf``: "window_size", "n_features", "features",
  "padding_values", "enc_emb", "enc_dense", "dec_emb", "dec_dense",
  "vocab_sizes", "d_model", "num_heads", "dff", "dropout_rate",
  "enc_num_layers", "dec_num_layers".

  Returns:
    A ``tf.keras.Model`` taking the list of per-feature inputs; its output
    comes from a final ``Dense(3)`` layer.
  """
  # `shape` is given as a tuple, the documented form for tf.keras.Input.
  inputs = [
      tf.keras.Input(shape=(conf["window_size"],), name=conf["features"][i])
      for i in range(conf["n_features"])
  ]
  # Each input is (batch, window_size), so the sequence length is axis 1 of
  # a single input; taking the shape of the Python list would not give it.
  seq_len = tf.shape(inputs[0])[1]
  mask = create_masks(inputs[0], pad_value = conf["padding_values"][0]) # input 0 = E, input 1 = r

  ###### Encoder ###########
  x = 0 # integer seed; becomes a tensor when the first term is added

  # Embedding layers
  for i in conf["enc_emb"]:
    size = conf["vocab_sizes"][i]
    embedding = tf.keras.layers.Embedding(int(size), conf["d_model"])
    x += embedding(inputs[i]) # TODO: Multiply? Remove x=0?

  # Dense layers
  for i in conf["enc_dense"]:
    dense = tf.keras.layers.Dense(conf["d_model"])
    dense_out = dense(inputs[i])
    dense_out = tf.expand_dims(dense_out, axis=1)
    x += dense_out

  # Pos encoding
  pos_encoding = positional_encoding(conf["window_size"], conf["d_model"])
  pos_encoding = pos_encoding[:, :seq_len, :]
  x += pos_encoding

  for _ in range(conf["enc_num_layers"]):
    enc_layer = EncoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])
    x = enc_layer(x, mask)

  # Falls back to the encoder result when there are no decoder layers, and
  # stays defined even if both layer counts are zero.
  out = x

  ####### Decoder #######
  y = 0 # integer seed; becomes a tensor when the first term is added

  # Embedding layers
  for i in conf["dec_emb"]:
    size = conf["vocab_sizes"][i]
    embedding = tf.keras.layers.Embedding(int(size), conf["d_model"])
    y += embedding(inputs[i]) # TODO: Multiply? Remove y=0?

  # Dense layers
  for i in conf["dec_dense"]:
    dense = tf.keras.layers.Dense(conf["d_model"])
    dense_out = dense(inputs[i])
    dense_out = tf.expand_dims(dense_out, axis=1)
    y += dense_out

  # Pos encoding
  y += pos_encoding

  for _ in range(conf["dec_num_layers"]):
    dec_layer = DecoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])
    y = dec_layer(y, x, mask)
    out = y

  output = tf.keras.layers.Dense(3)(out) # final layer

  # Instantiate an end-to-end model
  model = tf.keras.Model(inputs, output)
  return model

In [None]:
# COPY OF WORKING MODEL!!
# def create_model(conf): # Functional model without list comprehension
#   input = tf.keras.Input(shape=(conf["window_size"], conf["n_features"]), name = "input")
#   seq_len = tf.shape(input)[1]
#   mask = create_masks(input[:,:,0], pad_value = conf["padding_values"][0]) # input 0 = E, input 1 = r

#   ###### Encoder ###########
#   x = 0 # will be casted to sum first embedding
  
#   # Embedding layers
#   for i in conf["enc_emb"]:
#     size = conf["vocab_sizes"][i]
#     embedding = tf.keras.layers.Embedding(int(size), conf["d_model"])
#     x += embedding(input[:,:,i]) # TODO: Multiply? Remove x=0?
  
#   # Dense layers
#   for i in conf["enc_dense"]:
#     dense = tf.keras.layers.Dense(conf["d_model"])
#     dense_out = dense(input[:,:,i])
#     dense_out = tf.expand_dims(dense_out, axis=1)
#     x += dense_out
  
#   # Pos encoding
#   pos_encoding = positional_encoding(conf["window_size"], conf["d_model"])
#   pos_encoding = pos_encoding[:, :seq_len, :]
#   x += pos_encoding

#   for i in range(conf["enc_num_layers"]):
#       enc_layer = EncoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])
#       x = enc_layer(x, mask)
#       out = x
  
#   ####### Decoder #######
#   y = 0 # will be casted to sum first embedding
  
#   # Embedding layers
#   for i in conf["dec_emb"]:
#     size = conf["vocab_sizes"][i]
#     embedding = tf.keras.layers.Embedding(int(size), conf["d_model"])
#     y += embedding(input[:,:,i]) # TODO: Multiply? Remove x=0?

#   # Dense layers
#   for i in conf["dec_dense"]:
#     dense = tf.keras.layers.Dense(conf["d_model"])
#     dense_out = dense(input[:,:,i])
#     dense_out = tf.expand_dims(dense_out, axis=1)
#     y += dense_out

#   # Pos encoding
#   y += pos_encoding

#   for i in range(conf["dec_num_layers"]):
#     dec_layer = DecoderLayer(conf["d_model"], conf["num_heads"], conf["dff"], conf["dropout_rate"])
#     y = dec_layer(y, x, mask)
#     out = y
  
#   output = tf.keras.layers.Dense(3)(out) # final layer

#   # Instantiate an end-to-end model
#   model = tf.keras.Model(input, output)
#   return model

In [None]:
# shapes = [(None, ), (None, )]
# def create_model():
#   inputs = [tf.keras.Input(shape=shapes[i], name=selections_final[i]) for i in range(len(selections_final))]
#   seq_len = tf.shape(inputs[0])[1]
#   mask = create_masks(inputs[0], pad_value = pad_mapping["E"])
  
#   enc_embeddings = [tf.keras.layers.Embedding(size, d_model) for size in input_vocab_size]
#   x = [enc_embeddings[i](inputs[i]) for i in range(len(inputs))]
#   x = tf.keras.layers.Add()(x)

#   pos_encoding = positional_encoding(THR_E, d_model)
#   pos_encoding = pos_encoding[:, :seq_len, :]
  
#   x = tf.keras.layers.Add()([x, pos_encoding])
#   enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
  
#   for i in range(num_layers):
#       x = enc_layers[i](x, mask, training=True)

#   output = tf.keras.layers.Dense(3)(x) # final layer
  
#   # Instantiate an end-to-end model
#   model = tf.keras.Model(inputs, output)
  
#   return model

In [None]:
# x = [emb * tf.math.sqrt(tf.cast(self.d_model, tf.float32)) for emb in x] # Scale it or not???

In [None]:
# if q_inputs:
#     embs = [self.embedding(inp) for inp in q_inputs] # New: incorporate for more than one embedding. Each (batch_size, input_seq_len, d_model)
#     q = [emb * tf.math.sqrt(tf.cast(self.d_model, tf.float32)) for emb in embs] 
#     q = tf.math.add_n(inputs=q)
#     q += self.pos_encoding[:, :seq_len, :]
#     q = self.dropout(q, training=training)


In [None]:
# # From https://machinetalk.org/2019/04/29/create-the-transformer-with-tensorflow-2-0/
# def positional_embedding(pos, model_size):
#     PE = np.zeros((1, model_size))
#     for i in range(model_size):
#         if i % 2 == 0:
#             PE[:, i] = np.sin(pos / 10000 ** (i / model_size))
#         else:
#             PE[:, i] = np.cos(pos / 10000 ** ((i - 1) / model_size))
#     return PE

In [None]:
# # https://stackoverflow.com/questions/63072898/tensorflow-2-how-to-use-stack-of-dense-layers-in-keras-functional-api
# def get_layers(inp, layer, n_layers, mask, training):
#     for i in range(n_layers):
#         x = layer(inp, mask, training)
#         inp = x
#     return x

In [None]:
    # task_mask =  create_task_mask(inputs[:,:,-1]) 
    # task_mask = tf.py_function(func=create_task_mask, inp=[inputs[:,:,-1]], Tout=tf.float32)

In [None]:
# Numpy create task mask

In [None]:
# def create_task_mask(task_input):
#   seq = tf.cast(tf.math.equal(task_input, 1), tf.float32)
#   mask = np.apply_along_axis(get_task_one_mask, 1, seq)
#   mask = tf.expand_dims(mask, 1) # make compatible with padding mask
#   return mask

In [None]:
# def get_task_one_mask(A): # https://stackoverflow.com/questions/65045232/set-values-in-row-to-zero-before-index-value-of-row-numpy-or-tensorflow
#   N = A.shape[0]
#   column = (A > 0).reshape((N, 1))
#   mask = np.ones((N, N), dtype=np.bool)
#   mask = np.where(column, False, np.tril(mask, -1))
#   mask = np.cumsum(mask, axis=0)
#   B = np.where(mask, 0, np.tile(A, (N, 1)))
#   return B