# Attention is All You Need

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

These hyperparameters are mostly borrowed from the Google paper.

In [2]:
# Notebook-wide hyperparameters. Of these, only BATCH_SIZE is referenced by
# the cells shown in this notebook excerpt (steps_per_epoch and Dataset.batch).
EMBEDDING_DIM = 256
ATTENTION_UNITS = 10
ENCODER_UNITS = 1024
DECODER_UNITS = 1024
BATCH_SIZE = 64  # number of (input, target) pairs per training batch

### Load Data

In [3]:
def _read_text(path):
    """Return the entire contents of the text file at `path`.

    The file is opened inside a `with` block so the handle is closed as soon
    as the contents have been read, rather than being left open.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used exactly as before -- confirm the corpus encoding and pin it here.
    with open(path) as handle:
        return handle.read()


# Training split: formal and informal text, each read as one big string.
formal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt')
informal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt')

# Held-out (validation/test) split, read the same way.
formal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt')
informal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt')

In [4]:
def process_sequence(seq):
    """Space out punctuation and wrap a sentence in <start>/<end> markers.

    Each of the characters ``. , ! ? ( )`` is surrounded by spaces so that it
    becomes its own whitespace-separated token. Runs of two or more
    whitespace characters are then collapsed to a single space, and
    leading/trailing whitespace is removed so the markers are separated from
    the text by exactly one space.

    Args:
        seq: A raw sentence string.

    Returns:
        The cleaned sentence as ``'<start> ' + cleaned + ' <end>'``.
    """
    # Raw strings keep the regex escapes (\s, \1) from being interpreted as
    # (invalid) Python string escapes.
    s = re.sub(r'([.,!?()])', r' \1 ', seq)
    s = re.sub(r'\s{2,}', ' ', s).strip()

    return '<start> ' + s + ' <end>'

In [5]:
# Split each raw text blob on newlines and run every line through
# process_sequence (punctuation spacing plus <start>/<end> markers).
f_corpus = list(map(process_sequence, formal.split('\n')))
if_corpus = list(map(process_sequence, informal.split('\n')))

# Same treatment for the held-out split.
f_holdout = list(map(process_sequence, formal_holdout.split('\n')))
if_holdout = list(map(process_sequence, informal_holdout.split('\n')))

### Preprocess data

In [6]:
def tokenize(corpus):
    """Fit a tokenizer on `corpus` and return its post-padded id sequences.

    Args:
        corpus: The texts to fit the tokenizer on and to convert.

    Returns:
        A ``(padded_seqs, tokenizer)`` pair: the output of ``pad_sequences``
        (called with ``padding='post'``) and the fitted ``Tokenizer``.
    """
    # The filter string contains neither '<' nor '>', so the <start>/<end>
    # markers are not stripped by the tokenizer's character filter.
    tokenizer = Tokenizer(
        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
        oov_token='<OOV>',
    )
    tokenizer.fit_on_texts(corpus)

    padded_seqs = pad_sequences(
        tokenizer.texts_to_sequences(corpus),
        padding='post',
    )
    return padded_seqs, tokenizer

In [7]:
# The informal corpus is the model input and the formal corpus the target;
# each side is tokenized separately and so gets its own tokenizer.
input_train, input_tokenizer = tokenize(if_corpus)
target_train, target_tokenizer = tokenize(f_corpus)

In [8]:
# Size bookkeeping derived from the tokenized training data.
buffer_size = len(input_train)
steps_per_epoch = buffer_size // BATCH_SIZE
# One more than the number of known words -- presumably to leave id 0 free
# for the padding value; confirm against the Keras Tokenizer documentation.
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Shuffle with a buffer as large as the dataset, then cut into fixed-size
# batches, dropping the final short batch.
train = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(buffer_size)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [9]:
# Pull a single (input, target) batch out of the dataset for interactive checks.
example_input_batch, example_target_batch = next(iter(train))

## Positional Embedding

In [10]:
def positional_embedding(p, model_size):
    """Return the sinusoidal positional encoding for position `p`.

    Dimensions are grouped in (even, odd) pairs that share one frequency:

        PE(p, 2k)     = sin(p / 10000 ** (2k / model_size))
        PE(p, 2k + 1) = cos(p / 10000 ** (2k / model_size))

    Args:
        p: Position index in the sequence.
        model_size: Number of dimensions of the encoding.

    Returns:
        A float NumPy array of shape ``(1, model_size)``.
    """
    dims = np.arange(model_size)
    # 2 * (i // 2) maps both members of a pair (2k, 2k + 1) to 2k, so the
    # cosine entry uses the same frequency as the sine entry before it.
    angles = p / np.power(10000.0, (2 * (dims // 2)) / model_size)

    p_emb = np.zeros((1, model_size))
    p_emb[0, 0::2] = np.sin(angles[0::2])
    p_emb[0, 1::2] = np.cos(angles[1::2])
    return p_emb

max_length = input_train.shape[1]
MODEL_SIZE = 128

# Both the Encoder (input sequences) and the Decoder (target sequences) index
# `pes` by position, so the table has to cover the longer of the two padded
# lengths, not just the input length.
pes_length = max(max_length, target_train.shape[1])

# Stack one (1, MODEL_SIZE) row per position into a (pes_length, MODEL_SIZE)
# table and freeze it as a float32 constant.
pes = np.concatenate(
    [positional_embedding(i, MODEL_SIZE) for i in range(pes_length)],
    axis=0,
)
pes = tf.constant(pes, dtype=tf.float32)

## Multi-Head Attention

Computing
$$ \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h)\,W^O $$
where
$$ \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) $$
and attention is
$$ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V $$

In [55]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Every head has its own query, key and value projection of width
    ``model_size // h``. The head outputs are concatenated along the last
    axis and projected back to ``model_size`` by ``wo``.

    Args:
        model_size: Width of the output projection.
        h: Number of attention heads.
    """

    def __init__(self, model_size, h):
        super(MultiHeadAttention, self).__init__()
        self.query_size = model_size // h
        self.key_size = model_size // h
        self.value_size = model_size // h
        self.h = h
        self.wq = [tf.keras.layers.Dense(self.query_size) for _ in range(h)]
        self.wk = [tf.keras.layers.Dense(self.key_size) for _ in range(h)]
        self.wv = [tf.keras.layers.Dense(self.value_size) for _ in range(h)]
        self.wo = tf.keras.layers.Dense(model_size)

    def __one_head_attention(self, query, value, i):
        """Compute head `i`: softmax(Q K^T / sqrt(d_k)) V with projected Q, K, V.

        Keys and values are both derived from `value`.
        """
        # query shape: (batch_size, query_length, model_size)
        # value shape: (batch_size, value_length, model_size)
        q = self.wq[i](query)
        k = self.wk[i](value)
        # Project the values with this head's W^V; multiplying the attention
        # weights with the raw `value` would leave self.wv unused.
        v = self.wv[i](value)

        # score shape: (batch_size, query_length, value_length)
        score = tf.matmul(q, k, transpose_b=True)

        # Scale by sqrt(d_k), eq. (1) of "Attention Is All You Need"
        score /= tf.math.sqrt(tf.cast(self.key_size, dtype=tf.float32))

        # attention shape: (batch_size, query_length, value_length)
        attention = tf.nn.softmax(score, axis=2)

        # head shape: (batch_size, query_length, value_size)
        return tf.matmul(attention, v)

    def call(self, query, value):
        """Run all heads on (query, value), concatenate them and apply `wo`."""
        heads = [self.__one_head_attention(query, value, i) for i in range(self.h)]

        # concatenated shape: (batch_size, query_length, h * value_size)
        multi_head = tf.concat(heads, axis=2)

        # output shape: (batch_size, query_length, model_size)
        return self.wo(multi_head)

## Encoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """Unimplemented stub; this docstring is the class body so the cell parses."""

In [78]:
class Encoder(tf.keras.Model):
    """Stack of self-attention + feed-forward layers over embedded token ids.

    Each of the `num_layers` layers applies multi-head self-attention and a
    two-layer feed-forward network, each followed by a residual add and a
    normalization layer. The output of one layer is the input of the next.

    Args:
        vocab_size: Number of rows of the embedding table.
        model_size: Embedding width and width of every layer output.
        num_layers: Number of stacked layers.
        h: Number of attention heads per layer.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h

        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)

        self.attention = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        # NOTE(review): the paper uses layer normalization here; the
        # BatchNormalization layers are kept as in the original code.
        self.attention_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.FFN_l1 = [tf.keras.layers.Dense(4 * model_size, activation='relu') for _ in range(num_layers)]
        self.FFN_l2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.FFN_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

    def call(self, seq):
        """Encode a batch of token-id sequences.

        Args:
            seq: Integer tensor indexed as (batch, position).

        Returns:
            Tensor of shape (batch_size, sequence_length, model_size).
        """
        seq_len = seq.shape[1]

        # Embed every position at once and add the matching rows of the
        # module-level positional table `pes` (broadcast over the batch).
        sub_in = self.embedding(seq) + pes[:seq_len, :]

        for i in range(self.num_layers):
            # Self-attention: all positions query the whole sequence in one call.
            # sub_out shape: (batch_size, sequence_length, model_size)
            sub_out = self.attention[i](sub_in, sub_in)

            # add and norm
            sub_out = self.attention_norm[i](sub_out + sub_in)

            # Feed Forward Network
            FFN_out = self.FFN_l2[i](self.FFN_l1[i](sub_out))

            # add and norm
            FFN_out = self.FFN_norm[i](FFN_out + sub_out)

            # Feed this layer's output into the next layer; without this
            # every layer would re-read the embeddings and only the last
            # layer would affect the result.
            sub_in = FFN_out

        return sub_in

## Decoder

In [57]:
class Decoder(tf.keras.Model):
    """Stack of decoder layers followed by a projection to vocabulary logits.

    Each layer applies causal self-attention over the target sequence,
    attention over the encoder output, and a two-layer feed-forward network;
    every sub-layer is followed by a residual add and a normalization layer.
    The output of one layer is the input of the next.

    Args:
        vocab_size: Embedding rows and width of the final projection.
        model_size: Embedding width and width of every layer output.
        num_layers: Number of stacked layers.
        h: Number of attention heads per attention block.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h

        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)

        # NOTE(review): the paper uses layer normalization; the
        # BatchNormalization layers are kept as in the original code.
        self.mha1 = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.mha1_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]
        self.mha2 = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.mha2_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        # ReLU between the two dense layers, matching the Encoder's FFN;
        # without it the two layers collapse into one linear map.
        self.FFN_l1 = [tf.keras.layers.Dense(4 * model_size, activation='relu') for _ in range(num_layers)]
        self.FFN_l2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.FFN_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, seq, enc_opt):
        """Decode a batch of target token ids against the encoder output.

        Args:
            seq: Integer tensor of target ids indexed as (batch, position).
            enc_opt: Encoder output that the second attention block attends to.

        Returns:
            Tensor of shape (batch_size, sequence_length, vocab_size).
        """
        seq_len = seq.shape[1]

        # Embed every position at once and add the matching rows of the
        # module-level positional table `pes` (broadcast over the batch).
        dec_in = self.embedding(seq) + pes[:seq_len, :]

        for i in range(self.num_layers):
            # First MHA layer (causal self-attention). MultiHeadAttention has
            # no mask argument, so each position j gets its own call with
            # only positions 0..j as values; it can see itself and earlier
            # positions but nothing later.
            mha1_out = tf.concat(
                [
                    self.mha1[i](dec_in[:, j:j + 1, :], dec_in[:, :j + 1, :])
                    for j in range(seq_len)
                ],
                axis=1,
            )

            # add and norm
            mha1_out = self.mha1_norm[i](mha1_out + dec_in)

            # Second MHA layer: attend over the encoder output passed in as
            # `enc_opt` (not a global), all query positions in one call.
            mha2_out = self.mha2[i](mha1_out, enc_opt)

            # add and norm
            mha2_out = self.mha2_norm[i](mha2_out + mha1_out)

            # FFN, then add and norm
            FFN_out = self.FFN_l2[i](self.FFN_l1[i](mha2_out))
            FFN_out = self.FFN_norm[i](FFN_out + mha2_out)

            # Feed this layer's output into the next layer.
            dec_in = FFN_out

        # Project to vocabulary logits once, after the last layer.
        return self.fc(dec_in)

In [58]:
# Small configuration used to instantiate the Encoder / Decoder below.
H = 2           # number of attention heads
NUM_LAYERS = 2  # number of stacked layers

# First example of the sample input / target batch.
example_input_sequence = example_input_batch[0]
example_output_sequence = example_target_batch[0]

In [79]:
# Build one encoder over the input vocabulary and one decoder over the target
# vocabulary, sharing the model width, depth and head count.
encoder = Encoder(input_vocab_size, MODEL_SIZE, NUM_LAYERS, H)
decoder = Decoder(target_vocab_size, MODEL_SIZE, NUM_LAYERS, H)

In [80]:
# Hand-written dummy id batches (two identical rows each, zero-padded at the
# end): 10 ids per row for en_sequence_in, 14 ids per row for fr_sequence_in.
en_sequence_in = tf.constant([[1, 2, 3, 4, 6, 7, 8, 0, 0, 0], 
                              [1, 2, 3, 4, 6, 7, 8, 0, 0, 0]])
fr_sequence_in = tf.constant([[1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0],
                              [1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0]])

In [81]:
# Display the dummy input batch (notebook cell output follows).
en_sequence_in

<tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 2, 3, 4, 6, 7, 8, 0, 0, 0],
       [1, 2, 3, 4, 6, 7, 8, 0, 0, 0]], dtype=int32)>

In [82]:
# Add a leading batch axis of size 1. Using -1 lets TensorFlow infer the
# sequence length, so this works for any padded length rather than only 32.
ex = tf.reshape(example_input_sequence, (1, -1))

In [84]:
# Smoke test: encode the single real example, then decode the dummy target
# batch against that encoder output.
# NOTE(review): `ex` holds one sequence while fr_sequence_in holds two rows,
# so the batch sizes differ; this presumably relies on batch broadcasting in
# the attention matmul. en_sequence_in (also two rows) may have been the
# intended encoder input -- confirm.
enc_output = encoder(ex)
dec_output = decoder(fr_sequence_in, enc_output)