https://github.com/uzaymacar/attention-mechanisms/blob/master/layers.py

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import argparse
import time
import os
import unicodedata
import re
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import get_file, to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, RepeatVector, TimeDistributed, Flatten, Lambda, Concatenate, Permute, Reshape
from tensorflow.compat.v1.keras.layers import CuDNNLSTM   # CuDNNLSTM not yet released for TF 2.0
from tensorflow.keras.backend import permute_dimensions

In [10]:
# Command-line interface: a single optional --config flag selecting the model configuration.
# No `type=` is declared, so a value given on the command line arrives as a string while the
# default stays the integer 0.
parser = argparse.ArgumentParser()
parser.add_argument("--config", default=0, help="Integer value representing a model configuration")

_StoreAction(option_strings=['--config'], dest='config', nargs=None, const=None, default=0, type=None, choices=None, help='Integer value representing a model configuration', metavar=None)

In [9]:
# args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--config CONFIG]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-aa6a7091-e67a-428b-897f-21ec4597b988.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [0]:
# Set seeds for reproducibility (the NumPy and TensorFlow global generators get the same seed)
np.random.seed(500)
tf.random.set_seed(500)

# Set global constants
embedding_dim = 128     # number of dimensions to represent each word token in vector space
batch_size = 100        # feed in the neural network in 100-example training batches
num_epochs = 30         # number of times the neural network goes over EACH training example
# config = int(args.config)  # model-configuration (disabled together with the parse_args() call)

In [12]:
# Load Spanish-to-English dataset (.zip)
# The archive is requested over HTTPS rather than plain HTTP so that it cannot be altered in
# transit. With extract=True the archive is unpacked; the text file is then looked up in the
# directory that holds the downloaded archive.
zipped = get_file(
    fname='spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Tab-separated sentence pairs, one pair per line
file = os.path.join(os.path.dirname(zipped), 'spa-eng', 'spa.txt')

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [0]:
def unicode_to_ascii(string):
    """Strip combining marks (e.g. accents) from the string after NFD decomposition"""
    decomposed = unicodedata.normalize('NFD', string)
    # Category 'Mn' (nonspacing mark) covers the accents split off by the decomposition
    kept = [symbol for symbol in decomposed if unicodedata.category(symbol) != 'Mn']
    return ''.join(kept)

In [0]:
def preprocess_sentence(sentence):
    """
    Normalize one raw sentence for the translation model.

    The sentence is lowercased and trimmed, stripped of accents via unicode_to_ascii, its
    punctuation is padded with spaces, every run of characters outside
    (a-z, A-Z, '.', '?', '!', ',', '¿') is collapsed into a single space, and the result is
    wrapped in '<start>' / '<end>' tokens.
    """
    cleaned = unicode_to_ascii(sentence.lower().strip())

    # Surround punctuation with spaces, ex: "hi dad." => "hi dad . "
    cleaned = re.sub(r'([?.!,¿])', r' \1 ', cleaned)
    # Collapse runs of spaces and double quotes into one space
    cleaned = re.sub(r'[" "]+', ' ', cleaned)
    # Any run of characters outside the allowed set becomes a single space
    cleaned = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', cleaned)

    # Trim the surrounding whitespace, then add the start and end tokens for the model
    return ' '.join(('<start>', cleaned.strip(), '<end>'))

In [0]:
def create_dataset(path, num_examples):
    """
    Returns sentence pairs in [ENGLISH, SPANISH] format

    Arguments:
        path: location of a UTF-8 text file with one tab-separated sentence pair per line
        num_examples: number of leading lines to use; None uses every line

    Returns:
        A zip iterator yielding one tuple per tab-separated column, each holding the
        preprocessed sentences of that column in file order.
    """
    # Read through a context manager so the file handle is closed as soon as the text is loaded
    with open(path, encoding='utf8') as dataset_file:
        lines = dataset_file.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(sentence)
                   for sentence in line.split('\t')]
                  for line in lines[:num_examples]]
    return zip(*word_pairs)

In [0]:
# Load and process pairwise English and Spanish sentences
# (num_examples=None keeps every line; unpacking the returned zip yields one tuple per language)
english_sentences, spanish_sentences = create_dataset(path=file, num_examples=None)

In [18]:
# Peek at the first five preprocessed English sentences
english_sentences[:5]

('<start> go . <end>',
 '<start> go . <end>',
 '<start> go . <end>',
 '<start> go . <end>',
 '<start> hi . <end>')

In [0]:
def max_length(tensor):
    """Return the length of the longest element contained in the given tensor"""
    return max(map(len, tensor))


In [0]:
def tokenize(language):
    """Map the words of the given sentences to integer indices and pad them to equal length"""
    # filters='' disables the tokenizer's character filtering
    word_indexer = Tokenizer(filters='')
    word_indexer.fit_on_texts(language)

    # Encode every sentence, then pad with 0s at the end up to the longest encoded sentence
    encoded = word_indexer.texts_to_sequences(language)
    padded = pad_sequences(sequences=encoded, padding='post')

    return padded, word_indexer

In [0]:
def load_dataset(path, num_examples=None):
    """Build padded index tensors and fitted tokenizers for both languages of the corpus"""
    # create_dataset yields the columns in file order: the first one is used as the target
    # language, the second one as the input language
    target_sentences, input_sentences = create_dataset(path, num_examples)

    # Tokenize the input language first, then the target language
    (input_ids, input_tokenizer), (target_ids, target_tokenizer) = map(
        tokenize, (input_sentences, target_sentences)
    )
    return input_ids, target_ids, input_tokenizer, target_tokenizer

In [0]:
# Get example (input) tensors, label (target) tensors, and distinct tokenizers for both languages
# NOTE: load_dataset calls create_dataset itself, so the file is read and preprocessed again
# here, independently of english_sentences / spanish_sentences.
input_tensor, target_tensor, input_language_tokenizer, target_language_tokenizer = load_dataset(
    path=file, num_examples=None
)

In [0]:
# Setup more global constants
# Vocabulary sizes are the number of known words plus one
# (presumably the extra slot accounts for index 0, which is used for padding -- TODO confirm)
input_vocabulary_size = len(input_language_tokenizer.word_index) + 1
target_vocabulary_size = len(target_language_tokenizer.word_index) + 1

# Calculate maximum sequence lengths of the input and target tensors
# (the tensors are already padded, so this is the padded width of each one)
input_sequence_length, target_sequence_length = max_length(input_tensor), max_length(target_tensor)

# Split data to training and validation sets (80% / 20%); the validation part is named *_test
X_train, X_test, Y_train, Y_test = train_test_split(input_tensor,
                                                    target_tensor,
                                                    test_size=0.2,
                                                    random_state=500)

In [34]:
# Padded width of the input-language tensor
input_sequence_length

53

In [35]:
# Padded width of the target-language tensor
target_sequence_length

51

In [0]:
# Trim the training & validation sets so that their sizes are exact multiples of batch_size
# (the model inputs are declared with a fixed batch_size).
# The slices end at an explicit length instead of using [:-remainder]: when a set already
# divides evenly the remainder is 0, and [:-0] would return an EMPTY array.
training_cutoff, test_cutoff = len(X_train) % batch_size, len(X_test) % batch_size

X_train, Y_train = (X_train[:len(X_train) - training_cutoff],
                    Y_train[:len(Y_train) - training_cutoff])

X_test, Y_test = (X_test[:len(X_test) - test_cutoff],
                  Y_test[:len(Y_test) - test_cutoff])

In [25]:
# (number of training examples, padded input length)
X_train.shape

(95100, 53)

In [26]:
# First training input as a batch of one: token indices followed by 0-padding
X_train[:1]

array([[   1,   10,  254, 3263,   36,  144,    4, 2227,    7,   10,  560,
           3,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [0]:
# Teacher forcing: the unshifted label sequences become the decoder inputs, while the labels
# the model has to predict are the same sequences with their first token dropped and a 0
# appended at the end, so they keep their original length.
# Check https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg for better understanding
X_train_target, X_test_target = Y_train, Y_test

# Drop the first column of every row, then pad back to target_sequence_length at the end
Y_train = np.array(pad_sequences(sequences=Y_train[:, 1:],
                                 maxlen=target_sequence_length,
                                 padding='post'))

Y_test = np.array(pad_sequences(sequences=Y_test[:, 1:],
                                maxlen=target_sequence_length,
                                padding='post'))

In [28]:
# Shifted labels of the first training example (the leading token has been dropped)
Y_train[0]

array([   5,  387, 1187,    5,  152,   80,   60, 1999,    5,  300,    3,
          2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [29]:
# Decoder input of the same example: the unshifted label sequence
X_train_target[0]

array([   1,    5,  387, 1187,    5,  152,   80,   60, 1999,    5,  300,
          3,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

## Model

In [0]:
class Attention(tf.keras.layers.Layer):
  """
  Global attention layer using the "general" (multiplicative) score function.

  Call it on a list `[source_hidden_states, target_hidden_state, current_timestep]`:
    - source_hidden_states: hidden states of the whole source sequence, shape (B, S, H)
    - target_hidden_state: decoder hidden state of the current step, shape (B, H)
    - current_timestep: index of the decoding step; not used by the computation, because
      global attention attends over every source position

  Returns the tuple `(context_vector, attention_weights)`:
    - context_vector: source hidden states scaled by their attention weights, shape (B, S, H)
    - attention_weights: scores normalized over the source positions, shape (B, S, 1)
  """

  def __init__(self, **kwargs):
    super(Attention, self).__init__(**kwargs)

  def build(self, input_shape):
    # input_shape[0] describes the source hidden states: (B, S, H)
    self.input_sequence_length = input_shape[0][1]  # S: number of source timesteps
    self.hidden_dim = input_shape[0][2]             # H: units of the layer producing the states
    # Second axis of the second input.
    # NOTE(review): the second input is used as a (B, H) hidden state in call(), in which case
    # this attribute holds H rather than a sequence length -- confirm before relying on it.
    self.target_sequence_length = input_shape[1][1]

    # Projection W_a of the "general" score; its weights are registered on this layer explicitly
    self.W_a = Dense(units=self.hidden_dim, use_bias=False)
    self.W_a.build(input_shape=(None, None, self.hidden_dim))
    self._trainable_weights += self.W_a._trainable_weights

    super(Attention, self).build(input_shape)

  def call(self, inputs):
    source_hidden_states = inputs[0]                    # (B, S, H)
    target_hidden_state = tf.expand_dims(inputs[1], 1)  # (B, H) -> (B, 1, H)

    # General score: (W_a h_s) . h_t for every source position -> (B, S, 1)
    weighted_hidden_states = self.W_a(source_hidden_states)
    attention_score = tf.keras.layers.Dot(axes=[2, 2])([weighted_hidden_states, target_hidden_state])

    # Normalize ACROSS the source positions (axis 1). A softmax over the default last axis
    # would run over an axis of size 1 and turn every weight into 1.0.
    attention_weights = tf.nn.softmax(attention_score, axis=1)

    # Scale each source hidden state by its normalized weight (not by the raw score)
    context_vector = source_hidden_states * attention_weights
    return context_vector, attention_weights


In [0]:
# Toy encoder: sequences of 10 token ids, vocabulary of 100, 128-dimensional embeddings
test_input = Input(shape=(10,))
x = Embedding(100, 128)(test_input)
x = CuDNNLSTM(128, return_sequences=True)(x)  # encoder output: one 128-d vector per timestep

# Toy decoder input, embedded the same way
d_input = Input(shape=(10,))
d_e = Embedding(100, 128)(d_input)

# return_state=True makes the layer return its output together with two state tensors
decoder_lstm = CuDNNLSTM(128, return_state=True)
d, d_h_s, d_h_c = decoder_lstm(d_e)

# Output projection onto 10 classes with a softmax activation
decoder_dense_layer = Dense(units=10, activation='softmax')

In [68]:
# Embedded decoder input
d_e

<tf.Tensor 'embedding_16/Identity:0' shape=(None, 10, 128) dtype=float32>

In [51]:
print(x)      # encoder output for every timestep (return_sequences=True)
print(d)      # decoder output
print(d_h_s)  # first state tensor returned by the decoder
print(d_h_c)  # second state tensor returned by the decoder (was `d_c_s`, which is never defined)

Tensor("cu_dnnlstm_8/Identity:0", shape=(None, 10, 128), dtype=float32)
Tensor("cu_dnnlstm_9/Identity:0", shape=(None, 128), dtype=float32)
Tensor("cu_dnnlstm_9/Identity_1:0", shape=(None, 128), dtype=float32)
Tensor("cu_dnnlstm_9/Identity_2:0", shape=(None, 128), dtype=float32)


In [59]:
# Try the layer on a single decoding step (timestep 0). The result is bound to its own names
# so that `x` keeps referring to the encoder output, which the decoding loop passes to the
# attention layer again.
test_context_vector, test_attention_weights = Attention()([x, d_h_s, 0])

target_hidden_state Tensor("cu_dnnlstm_13/Identity_1:0", shape=(None, 128), dtype=float32)
current_timestep tf.Tensor(0, shape=(), dtype=int32)
source_hidden_states Tensor("cu_dnnlstm_12/Identity:0", shape=(None, 10, 128), dtype=float32)
target_hidden_state_expand_dim Tensor("ExpandDims_2:0", shape=(None, 1, 128), dtype=float32)


In [0]:
# Reset the global state Keras has accumulated so far
tf.keras.backend.clear_session()

In [92]:
attention_layer = Attention()

# Unroll the decoder for 10 steps; one attention layer, LSTM and dense layer are shared by all
# steps, and the LSTM states are carried from one step to the next.
outputs = []
for timestep in range(10):
  # Slice out the embedding of the current word: 0:1, 1:2, ...
  # The index is handed over through `arguments` so that each Lambda keeps its own value. A
  # closure over the loop variable would be looked up lazily, so every Lambda would slice at
  # the final index whenever its function runs again after the loop has finished.
  current_word = tf.keras.layers.Lambda(
      lambda embedded, step: embedded[:, step:step + 1, :],
      arguments={'step': timestep})(d_e)
  context_vector, attention_weights = attention_layer([x, d_h_s, timestep])

  # Append the current word to the context along the time axis
  decoder_input = Concatenate(axis=1)([context_vector, current_word])
  print(decoder_input)

  d, d_h_s, d_h_c = decoder_lstm(decoder_input, initial_state=[d_h_s, d_h_c])

  decoder_output = decoder_dense_layer(d)
  outputs.append(decoder_output)

Tensor("concatenate_24/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_25/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_26/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_27/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_28/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_29/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_30/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_31/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_32/Identity:0", shape=(None, 11, 128), dtype=float32)
Tensor("concatenate_33/Identity:0", shape=(None, 11, 128), dtype=float32)


In [93]:
# One dense-layer output tensor per decoding step
outputs

[<tf.Tensor 'dense_10/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_1/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_2/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_3/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_4/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_5/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_6/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_7/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_8/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense_10_9/Identity:0' shape=(None, 10) dtype=float32>]

In [94]:
# Stacking along the default axis 0 puts the timestep axis first
tf.stack(outputs)

<tf.Tensor 'stack:0' shape=(10, None, 10) dtype=float32>

In [95]:
# Merge the per-timestep predictions into a single tensor with the batch axis first, by
# stacking them along axis 1: (batch, timestep, class)
outputs = tf.keras.layers.Lambda(lambda step_outputs: tf.stack(step_outputs, axis=1))(outputs)

outputs

<tf.Tensor 'lambda_75/Identity:0' shape=(None, 10, 10) dtype=float32>

In [0]:
# Create word-level multi-class classification (machine translation), sequence-to-sequence model
# Input Layers
# i)  Initialize input & target sequences (both declared with a fixed batch size)
X_input = Input(shape=(input_sequence_length,), batch_size=batch_size, name='input_sequences')
X_target = Input(shape=(target_sequence_length,), batch_size=batch_size, name='target_sequences')


# ii) Initialize hidden & cell states
# (128 is hard-coded here; the recurrent layers of this model are created with units=128)
initial_hidden_state = Input(shape=(128,), batch_size=batch_size, name='hidden_state')
initial_cell_state = Input(shape=(128,), batch_size=batch_size, name='cell_state')

# Working names for the states, starting out as the initial-state inputs
hidden_state, cell_state = initial_hidden_state, initial_cell_state
# NOTE: Here hidden state refers to the recurrently propagated input to the cell, whereas cell
# state refers to the cell state directly from the previous cell.

In [0]:
# Word-Embedding Layers
# (each language gets its own embedding table, sized by its own vocabulary)
# i)  Embed input sequences from the input language
embedded_input = Embedding(input_dim=input_vocabulary_size, output_dim=embedding_dim)(X_input)
# ii) Embed target sequences from the target language
embedded_target = Embedding(input_dim=target_vocabulary_size, output_dim=embedding_dim)(X_target)
# NOTE: The embedded target sequences (deriving from X_target) allow us to enforce Teacher Forcing:
# using the actual output (correct translation) from the training dataset at the current time step
# as input in the next time step, rather than the output generated by the network.

In [0]:
# Recurrent Layers
# i)  Encoder (return_sequences=True: one output per input timestep)
encoder_output = CuDNNLSTM(units=128, return_sequences=True)(embedded_input)

# ii) Decoder (return_state=True: the layer also returns its states when called)
decoder_recurrent_layer = CuDNNLSTM(units=128, return_state=True)
# NOTE: The encoder is always fully vectorized and returns the hidden representations of the whole
# sequence at once, whereas the decoder does this step by step.
