# Attention is All You Need

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load Data

In [3]:
formal = open('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt').read()
informal = open('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt').read()

formal_holdout = open('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt').read()
informal_holdout = open('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt').read()

In [4]:
def process_sequence(seq):
    """This inserts a space in between the last word and a period"""
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    
    return '<start> ' + s + ' <end>'

In [5]:
def process_seq_target_input(seq):
    """
    This inserts a space in between the last word and a period
    This function covers shifting right for being fed to the Transformer
    """
    
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    
    return s + ' <end>'

In [6]:
f_corpus = [process_sequence(seq) for seq in formal.split('\n')]
f_corpus_input = [process_seq_target_input(seq) for seq in formal.split('\n')]
if_corpus = [process_sequence(seq) for seq in informal.split('\n')]

f_holdout = [process_sequence(seq) for seq in formal_holdout.split('\n')]
if_holdout = [process_sequence(seq) for seq in informal_holdout.split('\n')]

### Preprocess data

In [7]:
def tokenize(corpus):
    """ Tokenize data and pad sequences """
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', oov_token='<OOV>')
    tokenizer.fit_on_texts(corpus)
    
    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, maxlen=30, padding='post')
    return padded_seqs, tokenizer

In [8]:
input_train, input_tokenizer = tokenize(if_corpus)
target_train, target_tokenizer = tokenize(f_corpus)
target_input_train, target_input_tokenizer = tokenize(f_corpus_input)

In [9]:
buffer_size = len(input_train)
steps_per_epoch = len(input_train) // BATCH_SIZE
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

train = tf.data.Dataset.from_tensor_slices((input_train, target_input_train, target_train)).shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

In [10]:
example_input_batch, example_target_input_batch, example_target_batch = next(iter(train))

### Declare Static Variables

In [14]:
BATCH_SIZE = 64
D_MODEL = 512
NX = 6
H = 8

## Positional Embedding

Need to compute Positional Embeddign from 3.5

$$ PE_{(pos, 2i)} = \sin(pos, 10000^{2i/d_{model}}) \\
PE_{(pos, 2i+1)} = \cos(pos, 10000^{2i/d_{model}})
$$

## Multi-Head Attention

Computing 
$$ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,...,head_h)W^o$$ 
where $$head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)$$
and attention is 
$$ \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V$$ 

In seciton 3.2.3 of AAYN encoder-decoder for seq2seq models keys and values are both form the encoder output, so treating key and value as the same input here.

In [20]:
class MultiHeadAttention(tf.keras.Model):
    def __init__(self, model_size, h):
        super(MultiHeadAttention, self).__init__()
        self.mha_size = model_size // h
        self.h = h

        # learn different weights for all 
        self.wq = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        self.wk = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        self.wv = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        self.wo = tf.keras.layers.Dense(model_size)

    def scaled_dot_product_attention(self, q, k, i, mask=None):
        """run for each query, value, key in h"""
        # query shape: (batch_size, query_length, model_size)
        # value shape: (batch_size, key_length, model_size)
        score = tf.matmul(self.wq[i](q), self.wk[i](k), transpose_b=True)

        # eq(1) from AAYN
        d_k = tf.math.sqrt(tf.cast(self.mha_size, dtype=tf.float32))

        # score shape: (batch_size, query_length, value_length)
        score /= d_k

        # apply mask
        if mask:
            score += mask * 1e-8 

        # attention shape: (batch_size, query_length, value_length)
        attention = tf.nn.softmax(score, axis=2)

        # context shape: (batch_size, query_length, value_length)
        head = tf.matmul(attention, self.wv[i](k))

        return head 

    def call(self, q, k, mask=None):
        """This computes the multi head attention by calling for each h"""
        # compute one head attention for each head
        multi_head = [self.scaled_dot_product_attention(q, k, i, mask) for i in range(self.h)]
        
        # concat all heads 
        multi_head = tf.concat(multi_head, axis=2)

        # multi_head shape: (batch_size, query_length, model_size)
        mutli_head = self.wo(multi_head)

        return mutli_head

## Encoder

In [24]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, h):
        super(EncoderLayer, self).__init__()
        self.d_model = d_model
        self.h = h 

        self.mha = MultiHeadAttention(d_model, h)
        self.mha_norm = tf.keras.layers.LayerNormalization()

        self.FFN_l1 = tf.keras.layers.Dense(4 * d_model, activation='relu')
        self.FFN_l2 = tf.keras.layers.Dense(d_model)
        self.FFN_norm = tf.keras.layers.LayerNormalization()
    
    def call(self, E_out, mask=None):
        mha_out = self.mha(E_out, E_out, mask)
        mha_out = self.mha_norm(E_out + mha_out)

        # Feed Forward Network
        FFN_out = self.FFN_l2(self.FFN_l1(mha_out))

        #  add and norm
        FFN_out = self.FFN_norm(FFN_out + mha_out)
        
        return FFN_out

In [97]:
np.concatenate(np.array(pes), axis=0).shape

(3840,)

In [78]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, d_model, num_layers, h):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.h = h

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.encoder_layers = [EncoderLayer(vocab_size, d_model, h) 
                               for _ in range(num_layers)]

    def positional_encoding(self, pos):
        return np.array([
            np.sin(pos / 10000 ** (i / self.d_model)) 
            if i % 2 == 0 else np.cos(pos / 10000 ** (i / self.d_model))
            for i in range(self.d_model)
        ])
    
    def call(self, seq, mask=None):
        
        # Embedding Layer
        # E_out shape: (batch_size x max_length x d_model)
        E_out = self.embedding(seq)
        
        # Positional Embedding 
        pes = [self.positional_encoding(i) for i in range(seq.shape[1])]
        
        print(pes)
        # E_out shape: 
        E_out += pes[:seq.shape[1], :]
        
        # create x variable
        x = E_out

        # MultiHeadAttention
        for i in range(self.num_layers):
            x = self.encoder_layers[i](x, mask)
            
        return x 

In [79]:
encoder = Encoder(input_vocab_size, D_MODEL, NX, H)

## Decoder

In [80]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h

        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)

        self.mha1 = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.mha1_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]
        self.mha2 = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.mha2_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.FFN_l1 = [tf.keras.layers.Dense(4 * model_size) for _ in range(num_layers)]
        self.FFN_l2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.FFN_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, seq, enc_opt, mask=None):
        E_out = self.embedding(seq)
        E_out *= pes[:seq.shape[1], :]
        
        for i in range(self.num_layers):
            # Define mask
            pad_mask = tf.linalg.band_part(tf.ones((len(seq), len(seq))), -1, 0)
            
            # First MHA layer
            mha1_out = self.mha1[i](E_out, E_out, pad_mask)
            mha1_out = self.mha1_norm[i](mha1_out + E_out)
            
            # Second MHA layer
            mha2_out = self.mha2[i](E_out, enc_opt, pad_mask)
            mha2_out = self.mha2_norm[i](mha2_out + mha1_out)
            
            # FFN
            FFN_out = self.FFN_l2[i](self.FFN_l1[i](mha2_out))
            FFN_out = self.FFN_norm[i](FFN_out + mha2_out)
        
            output = self.fc(FFN_out)
        
        return output

In [81]:
example_input_sequence = example_input_batch[0]
example_output_sequence = example_target_batch[0]

In [82]:
encoder = Encoder(input_vocab_size, D_MODEL, NX, H)
decoder = Decoder(target_vocab_size, D_MODEL, NX, H)

In [85]:
ex = tf.reshape(example_input_sequence, (1,example_input_sequence.shape[0]))
ex1 = tf.reshape(example_output_sequence, (1,example_output_sequence.shape[0]))

In [86]:
enc_output = encoder(ex)
dec_output = decoder(ex1, enc_output)

[array([0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1

TypeError: list indices must be integers or slices, not tuple

## Define Training

In [19]:
optimizer = tf.keras.optimizers.Adam()
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True)

def loss_func(real, preds):
    """Calculate and return loss"""
    # caclulate loss
    loss = static_loss(real, preds)

    # create mask 
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)

    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [20]:
@tf.function
def train_step(in_seq, targ_in_seq, targ_out_seq):
    with tf.GradientTape() as tape:
        mask = 1 - tf.cast(tf.equal(in_seq, 0), dtype=tf.float32)
        enc_opt = encoder(in_seq, mask)
        dec_opt = decoder(targ_in_seq, enc_opt, mask)
        loss = loss_func(targ_out_seq, dec_opt)
        
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss / targ_out_seq.shape[1]

In [21]:
NUM_EPOCHS = 10
start = datetime.now()
for epoch in range(NUM_EPOCHS):
    total_loss=0
    for batch, (inpt, targ_inpt, targ_out) in enumerate(train.take(steps_per_epoch)):
        batch_loss = train_step(inpt, targ_inpt, targ_out)
        total_loss += batch_loss
        
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                     batch,
                                                     batch_loss.numpy()))
    print('Epoch {} Loss {:.4f}'.format(epoch+1,
                                        total_loss/steps_per_epoch))
    print('Time taken for 1 epoch {} seconds\n'.format(datetime.now() - start))



KeyboardInterrupt: 