https://blog.floydhub.com/attention-mechanism/

https://github.com/gabrielloye/Attention_Seq2seq-Translation/blob/master/main.ipynb

https://github.com/uzaymacar/attention-mechanisms/blob/master/layers.py#L237


In [3]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import tensorflow as tf
import unicodedata
from tqdm import tqdm_notebook
print(tf.__version__)

2.1.0
2.1.0


In [5]:
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip

--2020-03-10 11:43:04--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196, 2606:4700:3033::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5939832 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip’


2020-03-10 11:43:05 (5.14 MB/s) - ‘fra-eng.zip’ saved [5939832/5939832]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [0]:
df = pd.read_csv('/content/fra.txt', sep='\t', header=None)
df.columns = ['eng', 'fra', 'attr']

In [7]:
df.head()

Unnamed: 0,eng,fra,attr
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Cours !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Courez !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [0]:
df = df[['eng', 'fra']]

In [9]:
df.head()

Unnamed: 0,eng,fra
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


### Clear text

In [0]:
def unicode_to_ascii(sentence):
  return ''.join(char for char in unicodedata.normalize('NFD', sentence) if unicodedata.category(char) != 'Mn')

def clean_text(sentence):
  s = unicode_to_ascii(sentence.lower().strip())
  s = re.sub(r'([!.,?])', r' \1 ', s)
  s = re.sub(r'[^a-zA-Z!.,?]+', r' ', s)
  s = re.sub(r'[\s]+', ' ', s)
  s = '<start> ' + s + ' <end>'
  return s

In [11]:
test_sentence = 'helllo!  its   me.'

print(clean_text(test_sentence))

<start> helllo ! its me .  <end>


In [0]:
df['eng_clean'] = df['eng'].apply(clean_text)
df['fra_clean'] = df['fra'].apply(clean_text)

In [13]:
df.head()

Unnamed: 0,eng,fra,eng_clean,fra_clean
0,Go.,Va !,<start> go . <end>,<start> va ! <end>
1,Hi.,Salut !,<start> hi . <end>,<start> salut ! <end>
2,Hi.,Salut.,<start> hi . <end>,<start> salut . <end>
3,Run!,Cours !,<start> run ! <end>,<start> cours ! <end>
4,Run!,Courez !,<start> run ! <end>,<start> courez ! <end>


In [0]:
eng_data = df['eng_clean'].tolist()
fra_data = df['fra_clean'].tolist()

In [0]:
def lang_tokenize(data):
  lang_token = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_token.fit_on_texts(data)

  convert_data = lang_token.texts_to_sequences(data)
  return convert_data, lang_token

In [0]:
eng_token_data, eng_token = lang_tokenize(eng_data)
fra_token_data, fra_token = lang_tokenize(fra_data)

In [17]:
# Get # of vocabulary

eng_vocab_size = len(eng_token.word_index) + 1
fra_vocab_size = len(fra_token.word_index) + 1

print(eng_vocab_size)
print(fra_vocab_size)

13860
22791


In [0]:
def max_seq_length(data):
  return max([len(seq) for seq in data])

In [19]:
eng_maxlen = max_seq_length(eng_token_data)
fra_maxlen = max_seq_length(fra_token_data)


print(eng_maxlen)
print(fra_maxlen)

54
65


In [0]:
def padding_data(data):
  maxlen = max_seq_length(data)
  return tf.keras.preprocessing.sequence.pad_sequences(data, maxlen=maxlen, padding='post')

In [0]:
eng_token_data = padding_data(eng_token_data)
fra_token_data = padding_data(fra_token_data)

In [22]:
eng_token_data

array([[   1,   49,    3, ...,    0,    0,    0],
       [   1, 2658,    3, ...,    0,    0,    0],
       [   1, 2658,    3, ...,    0,    0,    0],
       ...,
       [   1,  365,   51, ...,    0,    0,    0],
       [   1,   69,  280, ...,    3,    2,    0],
       [   1,   14,  175, ..., 3418,    3,    2]], dtype=int32)

In [23]:
fra_token_data

array([[   1,  123,   38, ...,    0,    0,    0],
       [   1, 3538,   38, ...,    0,    0,    0],
       [   1, 3538,    3, ...,    0,    0,    0],
       ...,
       [   1, 7296,   12, ...,    0,    0,    0],
       [   1,   60,  175, ..., 2108,    3,    2],
       [   1,   12,    9, ...,    2,    0,    0]], dtype=int32)

In [0]:
eng_input = eng_token_data
fra_input = fra_token_data
fra_output = np.zeros(shape=fra_input.shape, dtype=np.int)
fra_output[:, :-1] = fra_input[:, 1:]

In [25]:
print(eng_input.shape)
print(eng_input)
print('==========')
print(fra_input.shape)
print(fra_input)
print('==========')
print(fra_output.shape)
print(fra_output)

(174481, 54)
[[   1   49    3 ...    0    0    0]
 [   1 2658    3 ...    0    0    0]
 [   1 2658    3 ...    0    0    0]
 ...
 [   1  365   51 ...    0    0    0]
 [   1   69  280 ...    3    2    0]
 [   1   14  175 ... 3418    3    2]]
(174481, 65)
[[   1  123   38 ...    0    0    0]
 [   1 3538   38 ...    0    0    0]
 [   1 3538    3 ...    0    0    0]
 ...
 [   1 7296   12 ...    0    0    0]
 [   1   60  175 ... 2108    3    2]
 [   1   12    9 ...    2    0    0]]
(174481, 65)
[[ 123   38    2 ...    0    0    0]
 [3538   38    2 ...    0    0    0]
 [3538    3    2 ...    0    0    0]
 ...
 [7296   12   42 ...    0    0    0]
 [  60  175   21 ...    3    2    0]
 [  12    9  105 ...    0    0    0]]


In [26]:
print(fra_input.shape)
print(fra_input[0])
print('==========')
print(fra_output.shape)
print(fra_output[0])

(174481, 65)
[  1 123  38   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]
(174481, 65)
[123  38   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0]


## Define Architecture

In [0]:
class Encoder(tf.keras.models.Model):
  def __init__(self, vocab_size, emb_dim, rnn_units):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.rnn_units = rnn_units

    self.emb = tf.keras.layers.Embedding(vocab_size, emb_dim)
    self.gru = tf.compat.v1.keras.layers.CuDNNGRU(rnn_units, return_sequence=True, return_state=True)

  def call(self, x, hidden):
    emb = self.emb(x)
    rnn_output, rnn_hidden = self.gru(emb, initial_state=hidden)
  
  def init_state(self, bn):
    return tf.zeros((bn, self.rnn_units))

In [0]:
class Bahdanau_Attention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(Bahdanau_Attention, self).__init__()
    self.units = units

    self.W = tf.keras.layers.Dense(units)
    self.U = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, encoder_output, decoder_hidden):
                                                        # encoder_output: b, t, u 
    decoder_hidden = tf.expand_dims(decoder_hidden, 1)  # b, 1, u
    score = self.V(tf.nn.tanh(self.W(decoder_hidden) + self.U(encoder_output)))
    attention_weight = tf.nn.softmax(score, axis=1)
    # print(attention_weight)

    context_vector = encoder_output * attention_weight
    context_vector = tf.reduce_mean(context_vector, axis=1)
    # print(context_vector)

    return context_vector, attention_weight

In [0]:
a = tf.random.uniform(shape=(1,3,5))
b = tf.random.uniform(shape=(1,5))

print(a)
print('---------------')
print(b)

tf.Tensor(
[[[0.00771248 0.9852612  0.25292265 0.13914967 0.6024177 ]
  [0.9131743  0.43149817 0.8123046  0.9156581  0.99011695]
  [0.7070732  0.20018566 0.64653873 0.6798359  0.22464204]]], shape=(1, 3, 5), dtype=float32)
---------------
tf.Tensor([[0.06153405 0.91346765 0.9422804  0.58728063 0.41951406]], shape=(1, 5), dtype=float32)


In [0]:
b_attention = Bahdanau_Attention(10)

b_attention(a,b)

(<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[0.10603117, 0.12666608, 0.17720748, 0.041311  , 0.20936732]],
       dtype=float32)>, <tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
 array([[[0.39734358],
         [0.28802815],
         [0.31462827]]], dtype=float32)>)

In [0]:
w = tf.keras.layers.Dense(10, use_bias=False)

b = tf.expand_dims(b, 1)
a = w(a)
b = w(b)

In [0]:
a

<tf.Tensor: shape=(1, 3, 10), dtype=float32, numpy=
array([[[ 0.20375574, -0.23621656, -0.25879854,  0.13877232,
         -0.38118407, -0.25291228,  0.09400544,  0.59773743,
         -0.27770942,  0.80091876],
        [ 0.22341207, -1.2450184 , -0.79486835, -0.2050021 ,
         -0.30260095, -0.35428983, -0.26468548,  1.1088781 ,
         -0.04022653,  0.68044156],
        [-0.15711586, -0.7898226 , -0.52895784, -0.2123275 ,
          0.12146198,  0.05117954,  0.01428309,  0.639457  ,
         -0.10310812,  0.26187828]]], dtype=float32)>

In [0]:
b

<tf.Tensor: shape=(1, 1, 10), dtype=float32, numpy=
array([[[-0.08647674, -0.3943518 , -0.24134779,  0.0051918 ,
          0.33981574, -0.10482366,  0.13457695,  0.7195976 ,
         -0.6077579 ,  0.66818184]]], dtype=float32)>

In [0]:
c = a + b
c

<tf.Tensor: shape=(1, 3, 10), dtype=float32, numpy=
array([[[ 0.11727899, -0.6305684 , -0.5001463 ,  0.14396413,
         -0.04136834, -0.35773593,  0.22858238,  1.317335  ,
         -0.88546735,  1.4691006 ],
        [ 0.13693532, -1.6393702 , -1.0362161 , -0.1998103 ,
          0.03721479, -0.45911348, -0.13010854,  1.8284757 ,
         -0.64798445,  1.3486234 ],
        [-0.2435926 , -1.1841744 , -0.77030563, -0.20713569,
          0.46127772, -0.05364412,  0.14886004,  1.3590546 ,
         -0.71086603,  0.93006015]]], dtype=float32)>

In [0]:
tf.nn.tanh(c)

<tf.Tensor: shape=(1, 3, 10), dtype=float32, numpy=
array([[[ 0.11674424, -0.5584435 , -0.46223223,  0.14297773,
         -0.04134475, -0.3432182 ,  0.22468273,  0.8661196 ,
         -0.7091477 ,  0.8994057 ],
        [ 0.1360858 , -0.9273845 , -0.7763895 , -0.19719301,
          0.03719762, -0.4293614 , -0.1293793 ,  0.94967675,
         -0.57031155,  0.873728  ],
        [-0.23888624, -0.82876337, -0.6471071 , -0.20422328,
          0.43112504, -0.05359272,  0.14777015,  0.8761736 ,
         -0.61121964,  0.73062193]]], dtype=float32)>

In [0]:
v = tf.keras.layers.Dense(1, use_bias=False)

v(tf.nn.tanh(c))

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[0.12602353],
        [0.7237127 ],
        [0.63241607]]], dtype=float32)>

In [0]:
tf.nn.softmax(v(tf.nn.tanh(c)), axis=-1)

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[1.],
        [1.],
        [1.]]], dtype=float32)>

In [0]:
weight = tf.nn.softmax(v(tf.nn.tanh(c)), axis=1)

In [0]:
weight

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[0.2233535 ],
        [0.40603727],
        [0.37060925]]], dtype=float32)>

In [0]:
a 

<tf.Tensor: shape=(1, 3, 10), dtype=float32, numpy=
array([[[ 0.20375574, -0.23621656, -0.25879854,  0.13877232,
         -0.38118407, -0.25291228,  0.09400544,  0.59773743,
         -0.27770942,  0.80091876],
        [ 0.22341207, -1.2450184 , -0.79486835, -0.2050021 ,
         -0.30260095, -0.35428983, -0.26468548,  1.1088781 ,
         -0.04022653,  0.68044156],
        [-0.15711586, -0.7898226 , -0.52895784, -0.2123275 ,
          0.12146198,  0.05117954,  0.01428309,  0.639457  ,
         -0.10310812,  0.26187828]]], dtype=float32)>

In [0]:
a * weight

<tf.Tensor: shape=(1, 3, 10), dtype=float32, numpy=
array([[[ 0.04550956, -0.0527598 , -0.05780356,  0.03099529,
         -0.0851388 , -0.05648885,  0.02099644,  0.13350675,
         -0.06202737,  0.17888801],
        [ 0.09071363, -0.50552386, -0.3227462 , -0.08323849,
         -0.12286726, -0.14385487, -0.10747217,  0.45024586,
         -0.01633347,  0.27628464],
        [-0.05822859, -0.29271555, -0.19603667, -0.07869054,
          0.04501493,  0.01896761,  0.00529345,  0.23698868,
         -0.03821282,  0.09705451]]], dtype=float32)>

In [0]:
a = tf.random.uniform(shape=(1,3,5))
b = tf.random.uniform(shape=(1,5))

print(a)
print('---------------')
print(b)

tf.Tensor(
[[[5.3998423e-01 2.2149825e-01 9.1811538e-01 1.7248273e-01 4.3145978e-01]
  [2.4858928e-01 4.6512210e-01 8.7995028e-01 4.1377544e-04 8.4980643e-01]
  [6.4341414e-01 2.4284780e-01 7.1873355e-01 4.9809349e-01 7.3244667e-01]]], shape=(1, 3, 5), dtype=float32)
---------------
tf.Tensor([[0.27627218 0.14845407 0.19686663 0.7210643  0.84055376]], shape=(1, 5), dtype=float32)


In [0]:
b = tf.expand_dims(b, 1)
print(a)
print(b)

tf.Tensor(
[[[5.3998423e-01 2.2149825e-01 9.1811538e-01 1.7248273e-01 4.3145978e-01]
  [2.4858928e-01 4.6512210e-01 8.7995028e-01 4.1377544e-04 8.4980643e-01]
  [6.4341414e-01 2.4284780e-01 7.1873355e-01 4.9809349e-01 7.3244667e-01]]], shape=(1, 3, 5), dtype=float32)
tf.Tensor([[[0.27627218 0.14845407 0.19686663 0.7210643  0.84055376]]], shape=(1, 1, 5), dtype=float32)


In [0]:
a

<tf.Tensor: shape=(1, 3, 5), dtype=float32, numpy=
array([[[5.3998423e-01, 2.2149825e-01, 9.1811538e-01, 1.7248273e-01,
         4.3145978e-01],
        [2.4858928e-01, 4.6512210e-01, 8.7995028e-01, 4.1377544e-04,
         8.4980643e-01],
        [6.4341414e-01, 2.4284780e-01, 7.1873355e-01, 4.9809349e-01,
         7.3244667e-01]]], dtype=float32)>

In [0]:
tf.transpose(b)

<tf.Tensor: shape=(5, 1, 1), dtype=float32, numpy=
array([[[0.27627218]],

       [[0.14845407]],

       [[0.19686663]],

       [[0.7210643 ]],

       [[0.84055376]]], dtype=float32)>

In [0]:
tf.matmul(a, b, transpose_b=True)

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[0.8498475],
        [1.0255667],
        [1.330122 ]]], dtype=float32)>

In [0]:
tf.keras.layers.dot([a, b], axes=[2,2])

<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[0.8498475],
        [1.0255667],
        [1.330122 ]]], dtype=float32)>

In [0]:
a = tf.random.uniform(shape=(2,5,5))
b = tf.random.uniform(shape=(2,2,5))
print(a)
print('---------------')
print(b)

tf.matmul(a, b, transpose_b=True)

tf.Tensor(
[[[0.59639084 0.9540185  0.564361   0.6283909  0.83372307]
  [0.9372194  0.49399424 0.50710547 0.33437884 0.07578528]
  [0.18463588 0.4235196  0.43641293 0.65767455 0.36156726]
  [0.77167857 0.4514879  0.04866529 0.56070924 0.73819876]
  [0.39059412 0.07711887 0.87296903 0.81454194 0.7500534 ]]

 [[0.92707443 0.89167714 0.10521507 0.07309866 0.15030444]
  [0.03697574 0.7833693  0.37346756 0.0651269  0.08595693]
  [0.291754   0.03229833 0.3438704  0.7623366  0.09506691]
  [0.9384333  0.1959933  0.33813167 0.22404528 0.08026564]
  [0.40205932 0.01453149 0.44727743 0.9365039  0.7399223 ]]], shape=(2, 5, 5), dtype=float32)
---------------
tf.Tensor(
[[[0.10912609 0.7621201  0.42228377 0.55083966 0.6046891 ]
  [0.8642427  0.48143375 0.60686326 0.59518373 0.31394482]]

 [[0.12075412 0.88272786 0.705027   0.09797454 0.30141854]
  [0.7983682  0.6327152  0.7704388  0.6638516  0.7214711 ]]], shape=(2, 2, 5), dtype=float32)


<tf.Tensor: shape=(2, 5, 2), dtype=float32, numpy=
array([[[1.880765  , 1.9529642 ],
        [0.9229161 , 1.5783633 ],
        [1.1081206 , 1.1332592 ],
        [1.2040905 , 1.479291  ],
        [1.3722696 , 1.6247461 ]],

       [[1.0257021 , 1.542353  ],
        [0.9915614 , 0.9181539 ],
        [0.40952355, 1.0929604 ],
        [0.5708646 , 1.3403752 ],
        [0.6915002 , 1.8303177 ]]], dtype=float32)>

In [0]:
a = tf.random.uniform(shape=(1,2,3))
b = tf.random.uniform(shape=(1,4,3))
print(a)
print('---------------')
print(b)

tf.matmul(a, b, transpose_b=True)

tf.Tensor(
[[[0.18214393 0.783435   0.9724239 ]
  [0.01555264 0.7151736  0.3147776 ]]], shape=(1, 2, 3), dtype=float32)
---------------
tf.Tensor(
[[[0.8125795  0.19316065 0.21604371]
  [0.16776252 0.40517437 0.00324726]
  [0.1728605  0.23059046 0.9968631 ]
  [0.33894432 0.75076044 0.00746596]]], shape=(1, 4, 3), dtype=float32)


<tf.Tensor: shape=(1, 2, 4), dtype=float32, numpy=
array([[[0.5094213 , 0.3511424 , 1.1815116 , 0.65716875],
        [0.21878688, 0.29340133, 0.48139083, 0.5445456 ]]], dtype=float32)>

In [0]:
b_attention = Bahdanau_Attention(512)

b_attention(tf.constant(1., shape=(1,3,5)), tf.constant(2., shape=(1,5)))

(<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[0.33333334, 0.33333334, 0.33333334, 0.33333334, 0.33333334]],
       dtype=float32)>, <tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
 array([[[0.33333334],
         [0.33333334],
         [0.33333334]]], dtype=float32)>)

In [0]:
a = tf.random.uniform(shape=(2,5))
b = tf.random.uniform(shape=(2,5))
print(a)
print('---------------')
print(b)

tf.matmul(a, b, transpose_b=True)

tf.Tensor(
[[0.2543515  0.5521877  0.7471309  0.3309946  0.8241925 ]
 [0.78437185 0.8765317  0.35523176 0.20180106 0.06392014]], shape=(2, 5), dtype=float32)
---------------
tf.Tensor(
[[0.2784196  0.8702729  0.75741565 0.18727171 0.95813024]
 [0.18460798 0.02440369 0.4302225  0.02116096 0.45810235]], shape=(2, 5), dtype=float32)


<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.9689288, 0.7664319],
       [1.3492998, 0.3525729]], dtype=float32)>

In [0]:
tf.nn.softmax(tf.matmul(a, b, transpose_b=True), axis=-1)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.76896864, 0.23103133],
       [0.73041457, 0.26958543]], dtype=float32)>

In [0]:
test_target_hidden = tf.random.uniform(shape=(10, 1, 200))

In [0]:
test_layer = tf.keras.layers.Dense(100, use_bias=False, activation='softmax')(test_target_hidden)

In [0]:
test_layer.shape

TensorShape([10, 1, 100])

In [0]:
tf.squeeze(test_layer)

<tf.Tensor: shape=(10, 100), dtype=float32, numpy=
array([[0.03566845, 0.01072334, 0.0145654 , 0.00372909, 0.0132218 ,
        0.01663513, 0.01808441, 0.01086651, 0.01443409, 0.00328492,
        0.00368103, 0.00399323, 0.01067732, 0.0046064 , 0.01113123,
        0.00301603, 0.00401986, 0.00649715, 0.00432437, 0.01134801,
        0.00349345, 0.02041277, 0.00953795, 0.0077641 , 0.01186767,
        0.01620325, 0.01079843, 0.00305346, 0.00569801, 0.00217793,
        0.00216136, 0.01824531, 0.02106147, 0.00423268, 0.01300748,
        0.00614769, 0.01152934, 0.01551523, 0.0218783 , 0.01227031,
        0.00432486, 0.01284889, 0.00934043, 0.00819353, 0.00443064,
        0.01080658, 0.02323211, 0.00759577, 0.00639365, 0.00838301,
        0.00946607, 0.00722797, 0.00341699, 0.02055689, 0.01316148,
        0.01380438, 0.03710008, 0.01580246, 0.00959969, 0.00422876,
        0.01129325, 0.00166759, 0.00538054, 0.00453612, 0.00623049,
        0.00574626, 0.02068364, 0.0138677 , 0.01214216, 0.0161769

In [0]:
x = tf.keras.layers.RepeatVector(51)(tf.squeeze(test_layer))

In [0]:
x

<tf.Tensor: shape=(10, 51, 100), dtype=float32, numpy=
array([[[0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        ...,
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905]],

       [[0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        [0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        [0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        ...,
        [0.02327354, 0.01062937, 0.00816432, ..., 0.0

In [0]:
x.shape

TensorShape([10, 51, 100])

In [0]:
x

<tf.Tensor: shape=(10, 51, 100), dtype=float32, numpy=
array([[[0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        ...,
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905],
        [0.03566845, 0.01072334, 0.0145654 , ..., 0.00489827,
         0.00870736, 0.01106905]],

       [[0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        [0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        [0.02327354, 0.01062937, 0.00816432, ..., 0.00570092,
         0.00565667, 0.00804364],
        ...,
        [0.02327354, 0.01062937, 0.00816432, ..., 0.0

In [0]:
tf.reduce_sum(x, axis=-1)

<tf.Tensor: shape=(10, 51), dtype=float32, numpy=
array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        ],
       [0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
        0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
        0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
        0.99999994, 0.99999994, 0.99999994, 0

In [0]:
tf.random.set_seed(1228)

a = tf.random.uniform(shape=(1,5,5))
x = tf.keras.layers.GRU(units=5)(a)

x2 = tf.keras.layers.GRU(units=5, return_sequences=True, return_state=True)(a)
x3 = tf.keras.layers.GRU(units=5, return_state=True)(a)

In [45]:
x # no return sequence

<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
array([[ 0.5979072 ,  0.26505458, -0.0029679 ,  0.14039624,  0.2028591 ]],
      dtype=float32)>

In [46]:
x2

[<tf.Tensor: shape=(1, 5, 5), dtype=float32, numpy=
 array([[[-0.12080424, -0.09952988, -0.04060221,  0.16478652,
           0.18125446],
         [-0.35564423,  0.09981193,  0.04436762,  0.06537628,
           0.3626107 ],
         [-0.4599059 ,  0.11599981, -0.0143155 ,  0.0832132 ,
           0.45370576],
         [-0.53886944,  0.26404035,  0.05731445, -0.02444088,
           0.4354181 ],
         [-0.6695756 ,  0.36148447,  0.13612437, -0.0632332 ,
           0.5600181 ]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.6695756 ,  0.36148447,  0.13612437, -0.0632332 ,  0.5600181 ]],
       dtype=float32)>]

In [47]:
x3

[<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.45943835,  0.12916149,  0.2460988 , -0.00969852,  0.4139257 ]],
       dtype=float32)>, <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.45943835,  0.12916149,  0.2460988 , -0.00969852,  0.4139257 ]],
       dtype=float32)>]

In [0]:
class Luong_Attention(tf.keras.layers.Layer):
  def __init__(self, units, method='dot'):
    super(Luong_Attention, self).__init__()
    self.units = units
    self.method = method

    # pytorch nn.linear(input, output)
    if method == 'general':
      self.fc = tf.keras.layers.Dense(units, use_bias=False)
    elif method == 'concat':
      self.fc = tf.keras.layers.Dense(units, use_bias=False)
      # self.qc = tf.keras.layers.Dense(units, use_bias=False)
      self.V = tf.keras.layers.Dense(1, use_bias=False)
  
  def call(self, decoder_hidden, encoder_hidden):
    score = tf.keras.layers.Dot(axes=[2,2])([decoder_hidden, encoder_hidden])

    if self.method == 'dot':
      score = tf.keras.layers.Dot(axes=[2,2])([decoder_hidden, encoder_hidden])
      
    elif self.method == 'general':
      score = self.fc(encoder_hidden)
      score = tf.keras.layers.Dot(axes=[2,2])([score, decoder_hidden])

    elif self.method == 'concat':
      score = self.V(tf.nn.tanh(self.fc(encoder_hidden + decoder_hidden)))
    
    attention_weight = tf.nn.softmax(score)

    context_vector = encoder_hidden * attention_weight
    return context_vector, attention_weight

In [0]:
l_attention = Luong_Attention(10)

a = tf.constant(-0.2, shape=(1, 1, 10))
b = tf.constant(0.2, shape=(1, 1, 10))

l_attention(a,b)

(<tf.Tensor: shape=(1, 1, 10), dtype=float32, numpy=
 array([[[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 1, 1), dtype=float32, numpy=array([[[1.]]], dtype=float32)>)

In [0]:
tf.constant(0.6, shape=(1, 1, 10))

<tf.Tensor: shape=(1, 1, 10), dtype=float32, numpy=
array([[[0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6]]],
      dtype=float32)>

In [0]:
tf.constant(0.2, shape=(1, 1, 10))

<tf.Tensor: shape=(1, 1, 10), dtype=float32, numpy=
array([[[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]]],
      dtype=float32)>

In [0]:
tf.keras.layers.Dot(axes=[2,2])([tf.constant(2., shape=(1, 1, 5)), tf.constant(3., shape=(1, 1, 5))])

<tf.Tensor: shape=(1, 1, 1), dtype=float32, numpy=array([[[30.]]], dtype=float32)>

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Flatten, Activation, Permute
from tensorflow.keras.layers import Multiply, Lambda, Reshape, Dot, Concatenate, RepeatVector, \
    TimeDistributed, Permute, Bidirectional


class Attention(Layer):
    """
    Layer for implementing two common types of attention mechanisms, i) global (soft) attention
    and ii) local (hard) attention, for two types of sequence tasks, i) many-to-one and
    ii) many-to-many.
    The setting use_bias=False converts the Dense() layers into annotation weight matrices. Softmax
    activation ensures that all weights sum up to 1. Read more here to make more sense of the code
    and implementations:
    i)   https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention
    ii)  https://github.com/philipperemy/keras-attention-mechanism/issues/14
    iii) https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html
    SUGGESTION: If model doesn't converge or the test accuracy is lower than expected, try playing
    around with the hidden size of the recurrent layers, the batch size in training process, or the
    param @window_width if using a 'local' attention.
    NOTE: This implementation takes the hidden states associated with the last timestep of the input
    sequence as the target hidden state (h_t) as suggested by @felixhao28 in i) for many-to-one
    scenarios. Hence, when trying to predict what word (token) comes after sequence ['I', 'love',
    'biscuits', 'and'], we take h('and') with shape (1, H) as the target hidden state. For
    many-to-many scenarios, it takes the hidden state associated with the timestep that is being
    currently iterated in the target sequence, usually by a decoder-like architecture.
    @param (str) context: the context of the problem at hand, specify 'many-to-many' for
           sequence-to-sequence tasks such as machine translation and question answering, or
           specify 'many-to-one' for tasks such as sentiment classification and language modelling
    @param (str) alignment_type: type of attention mechanism to be applied, 'local-m' corresponds to
           monotonic alignment where we take the last @window_width timesteps, 'local-p' corresponds
           to having a Gaussian distribution around the predicted aligned position, whereas
           'local-p*' corresponds to the newly proposed method to adaptively learning the unique
           timesteps to give attention (currently only works for many-to-one scenarios)
    @param (int) window_width: width for set of source hidden states in 'local' attention
    @param (str) score_function: alignment score function config; current implementations include
           the 'dot', 'general', and 'location' both by Luong et al. (2015), 'concat' by Bahdanau et
           al. (2015), and 'scaled_dot' by Vaswani et al. (2017)
    @param (str) model_api: specify to use TF's Sequential OR Functional API, note that attention
           weights are not outputted with the former as it only accepts single-output layers
    """
    def __init__(self, context='many-to-many', alignment_type='global', window_width=None,
                 score_function='general', model_api='functional', **kwargs):
        if context not in ['many-to-many', 'many-to-one']:
            raise ValueError("Argument for param @context is not recognized")
        if alignment_type not in ['global', 'local-m', 'local-p', 'local-p*']:
            raise ValueError("Argument for param @alignment_type is not recognized")
        if alignment_type == 'global' and window_width is not None:
            raise ValueError("Can't use windowed approach with global attention")
        if context == 'many-to-many' and alignment_type == 'local-p*':
            raise ValueError("Can't use local-p* approach in many-to-many scenarios")
        if score_function not in ['dot', 'general', 'location', 'concat', 'scaled_dot']:
            raise ValueError("Argument for param @score_function is not recognized")
        if model_api not in ['sequential', 'functional']:
            raise ValueError("Argument for param @model_api is not recognized")
        super(Attention, self).__init__(**kwargs)
        self.context = context
        self.alignment_type = alignment_type
        self.window_width = window_width  # D
        self.score_function = score_function
        self.model_api = model_api

    def get_config(self):
        base_config = super(Attention, self).get_config()
        base_config['alignment_type'] = self.alignment_type
        base_config['window_width'] = self.window_width
        base_config['score_function'] = self.score_function
        base_config['model_api'] = self.model_api
        return base_config

    def build(self, input_shape):
        # Declare attributes for easy access to dimension values
        if self.context == 'many-to-many':
            self.input_sequence_length, self.hidden_dim = input_shape[0][1], input_shape[0][2]
            self.target_sequence_length = input_shape[1][1]
        elif self.context == 'many-to-one':
            self.input_sequence_length, self.hidden_dim = input_shape[0][1], input_shape[0][2]
        super(Attention, self).build(input_shape)

    def call(self, inputs):
      pass

In [0]:
att = Attention()

In [53]:
att.input_shape

AttributeError: ignored