# Bahdanau Attention

In [1]:
import re
import numpy as np
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Encdoer Class

In [2]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, enc_dim=256, num_embedding=256, batch_size=32):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.enc_dim = enc_dim
        self.num_embedding = num_embedding
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.num_embedding)
        self.gru = tf.keras.layers.GRU(enc_dim,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        # 워드 임베딩
        # (batch, seq_length) -> (batch, seq_length, num_embedding)
        x = self.embedding(x)
        
        # RNN 출력
        # output.shape: (batch, seq_length, enc_dim)
        # hidden.shape: (batch, enc_dim)
        output, hidden = self.gru(x, initial_state=hidden)
        
        return output, hidden
    
    def init_hidden(self, input_):
        return tf.zeros((tf.shape(input_)[0], self.enc_dim))
  

### Bidirectional RNN Encoder

In [3]:
# class Encoder(tf.keras.layers.Layer):
#     def __init__(self, vocab_size, enc_dim=256, num_embedding=256, batch_size=32):
#         super(Encoder, self).__init__()
#         self.vocab_size = vocab_size
#         self.batch_size = batch_size
#         self.enc_dim = enc_dim
#         self.num_embedding = num_embedding
#         self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.num_embedding)
        
#         self.gru_fw = tf.keras.layers.GRU(enc_dim,
#                                           return_sequences=True,
#                                           return_state=True,
#                                           recurrent_initializer='glorot_uniform')
        
#         self.gru_bw = tf.keras.layers.GRU(enc_dim,
#                                           go_backwards=True,
#                                           return_sequences=True,
#                                           return_state=True,
#                                           recurrent_initializer='glorot_uniform')
        
#         self.gru = tf.keras.layers.Bidirectional(self.gru_fw, backward_layer=self.gru_bw)
        
#     def call(self, x, hidden):
#         # 워드 임베딩
#         # (batch, seq_length) -> (batch, seq_length, num_embedding)
#         x = self.embedding(x)
        
#         # RNN 출력
#         # output.shape: (batch, seq_length, enc_dim * 2)
#         # fw_hidden.shape: (batch, enc_dim)
#         # bw_hidden.shape: (batch, enc_dim)
#         output, fw_hidden, bw_hidden = self.gru(x, initial_state=hidden)
        
#         hidden = tf.concat([fw_hidden, bw_hidden], axis=-1)  # (bs, d_model * 2)
        
#         return output, hidden
    
#     def init_hidden(self, input_):
#         return [tf.zeros((tf.shape(input_)[0], self.enc_dim)) for _ in range(2)]

## Bahdanau Attention Class

In [4]:
class BahdanauAttention(tf.keras.models.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
    
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, encoder_out, hidden):
        # output.shape: (batch, seq_length, enc_dim)
        # hidden.shape: (batch, enc_dim)
        
        # hidden에 시계열 축 추가
        hidden = tf.expand_dims(hidden, axis=1) #out: (16, 1, 1024)
        
        # Bahdanau attention score 계산
        # (batch, enc_dim) -> (batch, 1, enc_dim)
        score = self.V(tf.nn.tanh(self.W1(encoder_out) +\
                                  self.W2(hidden))) #out: 
        
        # softmax를 통해 attention weights 계산
        attn_weights = tf.nn.softmax(score, axis=1)
        
        # context vector 계산
        # ((batch, 1, enc_dim) * (batch, seq_length, enc_dim)) -> (batch, seq_length, enc_dim)
        context_vector =  attn_weights * encoder_out
        
        # (batch, seq_length, enc_dim) -> (batch, enc_dim)
        context_vector = tf.reduce_sum(context_vector, axis=1) 
        return context_vector, attn_weights

## Decoder Class

In [5]:
class Decoder(tf.keras.models.Model):
    def __init__(self, vocab_size, embedding_dim=256, dec_dim=256, batch_size=32):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.dec_dim = dec_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.attn = BahdanauAttention(self.dec_dim)
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_dim,
                                       recurrent_initializer='glorot_uniform',
                                       return_sequences=True,
                                       return_state=True)
        self.fc = tf.keras.layers.Dense(self.vocab_size)
        
    def call(self, x, hidden, enc_out):
        # x.shape = (None, 1)
        # enc_out.shape = (None, seq_length, enc_dim)
        # enc_hidden.shape = (None, enc_dim)
        
        # decoder input의 워드 임베딩
        # (None, 1) -> (None, 1, embedding_dim)
        x = self.embedding(x)
        
        # attention 가중치 계산
        # context.shape = (None, enc_dim)
        # attn_weights.shape = (None, seq_Length, enc_dim)
        context, attn_weights = self.attn(enc_out, hidden)
        
        # x.shape = (None, 1, enc_dim + embedding_dim)
        x = tf.concat((tf.expand_dims(context, 1), x), -1)
        
        # Decoder RNN sequence 출력
        # r_out.shape = (None, 1, dec_dim)
        # r_out.shape = (None, dec_dim)
        r_out, hidden = self.gru(x, initial_state=hidden)
        
        # 시계열 축 제거
        # (None, 1, dec_dim) -> (None, dec_dim)
        out = tf.reshape(r_out,shape=(-1, r_out.shape[2]))
        
        
        return self.fc(out), hidden, attn_weights

## Loss & Accuracy

In [6]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss(real, pred):
    # [PAD] - 0 태그를 빼고 loss를 구하기 위해
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

def accuracy(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real,0))
    mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)
    pred *= mask
    acc = train_accuracy(real, pred)
    
    return tf.reduce_mean(acc)

## Main Class

In [7]:
class seq2seq(tf.keras.Model):
    def __init__(self, enc_vocab_size, dec_vocab_size, embedding_dim, enc_dim, dec_dim, batch_size, end_token_idx=3):
        super(seq2seq, self).__init__()
        # 문장의 끝 토큰 [EOS] index - 3 
        self.end_token_idx = end_token_idx
        
        self.encoder = Encoder(enc_vocab_size, embedding_dim, enc_dim, batch_size)
        self.decoder = Decoder(dec_vocab_size, embedding_dim, dec_dim, batch_size)
        
    def call(self, x):
        # encoder, decoder input
        input_, target = x
        
        # encoder 초기값 설정
        enc_hidden = self.encoder.init_hidden(input_)
        # encoder의 RNN 연산 후 출력값
        enc_out, enc_hidden = self.encoder(input_, enc_hidden)
        
        # dec_hidden 초기값 지정
        dec_hidden = enc_hidden
        
        predict_tokens = list()
        for t in range(target.shape[1]):
            # decoder input에 시계열 축 추가 (None, 1, 1)
            dec_input = tf.dtypes.cast(tf.expand_dims(target[:, t], 1), tf.float32)
            # decoder RNN 연산 결과
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_out)
            predict_tokens.append(tf.dtypes.cast(predictions, tf.float32))
            
        return tf.stack(predict_tokens, axis=1)
    
    def inference(self, x):
        input_ = x
        
        enc_hidden = self.encoder.init_hidden(input_)
        enc_out, enc_hidden = self.encoder(input_, enc_hidden)
        
        dec_hidden = enc_hidden
        
        dec_input = np.array([2]) # [BOF] index
        dec_input = tf.expand_dims(dec_input, 1)
        
        predict_tokens = list()
        for t in range(0, MAX_SEQUENCE):
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_out)
            predict_token = tf.argmax(predictions[0])
            
            if predict_token == self.end_token_idx:
                break
                
            predict_tokens.append(predict_token)
            dec_input = tf.dtypes.cast(tf.expand_dims([predict_token], 0), tf.float32)
        
        return tf.stack(predict_tokens, axis=0).numpy()

# 데이터 불러오기

In [8]:
import pandas as pd

data = pd.read_csv('data/fra.txt', sep='\t')
data.columns=['eng', 'fra', 'etc']
data.drop('etc', axis=1, inplace=True)
data

Unnamed: 0,eng,fra
0,Go.,Marche.
1,Go.,Bouge !
2,Hi.,Salut !
3,Hi.,Salut.
4,Run!,Cours !
...,...,...
190200,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
190201,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
190202,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
190203,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


## 전처리
* 띄어쓰기 기준

In [9]:
def preprocessing(x):
    text_eng = x['eng']
    text_fra = x['fra']
    
    text_eng = re.sub(r"([!.,?])", r" \1", text_eng)
    text_fra = re.sub(r"([!.,?])", r" \1", text_fra)
    
    text_eng = re.sub(r"[^a-zA-Z?.,!]+", " ", text_eng)
    text_fra = re.sub(r"[^a-zA-Z?.,!]+", " ", text_fra)
    
    return text_eng, text_fra

def sentence2length(x):
    x['preprocessed_eng'].split()
    x['preprocessed_fra'].split()
    
    return eng, fra

In [10]:
data['preprocessed_eng'] = data.apply(lambda x: preprocessing(x)[0], axis=1)
data['preprocessed_fra'] = data.apply(lambda x: preprocessing(x)[1], axis=1)
data.tail()
# seq_length = data.apply(lambda x: data, axis=1)

Unnamed: 0,eng,fra,preprocessed_eng,preprocessed_fra
190200,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
190201,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...,Death is something that we re often discourage...,La mort est une chose qu on nous d courage sou...
190202,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...,Since there are usually multiple websites on a...,Puisqu il y a de multiples sites web sur chaqu...
190203,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...,If someone who doesn t know your background sa...,Si quelqu un qui ne conna t pas vos ant c dent...
190204,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...,It may be impossible to get a completely error...,Il est peut tre impossible d obtenir un Corpus...


In [11]:
# 가장 긴 문장은?
engLen = data['preprocessed_eng'].apply(lambda x: len(x.split()))
fraLen = data['preprocessed_fra'].apply(lambda x: len(x.split()))
engLen.max(), fraLen.max()

(52, 66)

## 영어, 프랑스어 단어 사전 만들기(word->id, id->word)

In [12]:
eng_word_to_id = {'[PAD]': 0, '[UNK]': 1, '[BOS]': 2, '[EOS]': 3}
fra_word_to_id = {'[PAD]': 0, '[UNK]': 1, '[BOS]': 2, '[EOS]': 3}

words_eng = ""
for eng in data['preprocessed_eng']:
    words_eng += eng + " "
words_eng = list(set(words_eng.split()))

for word_eng in words_eng:
    eng_word_to_id[word_eng] = len(eng_word_to_id)

words_fra = ""
for fra in data['preprocessed_fra']:
    words_fra += fra + " "
words_fra = list(set(words_fra.split()))

for word_fra in words_fra:
    fra_word_to_id[word_fra] = len(fra_word_to_id)

# len(vocab_eng), len(vocab_fra) # (17920, 33067)
# len(eng_word_to_id), len(fra_word_to_id)

# 각 숫자별 단어 부여
id_to_eng_word = {_id:word for word, _id in eng_word_to_id.items()}
id_to_fra_word = {_id:word for word, _id in fra_word_to_id.items()}

In [13]:
inputs = []
labels = []
MAX_SEQUENCE = 70
shffled_data = data.sample(frac=1).copy()
shffled_data.reset_index(inplace=True, drop=True)

inputs_eng = []
for eng_sentence in shffled_data['preprocessed_eng']:
    row = [eng_word_to_id[eng_word] for eng_word in eng_sentence.split()]
    row += [0] * (MAX_SEQUENCE - len(row))
    inputs_eng.append(row)

inputs_fra = []
for fra_sentence in shffled_data['preprocessed_fra']:
    # decoder 입력과 label 생성
    row = [fra_word_to_id['[BOS]']]
    row_label = [fra_word_to_id[fra_word] for fra_word in fra_sentence.split()]
    row += row_label
    row_label += [fra_word_to_id['[EOS]']]
    row += [0] * (MAX_SEQUENCE - len(row))
    row_label += [0] * (MAX_SEQUENCE - len(row_label))
    inputs_fra.append(row)
    labels.append(row_label)
    
inputs.append(inputs_eng)
inputs.append(inputs_fra)

In [14]:
inputs = np.array(inputs)
labels = np.array(labels)
inputs.shape, labels.shape

((2, 190205, 70), (190205, 70))

In [15]:
train_idx = int(len(labels) * 0.6)
val_idx = (len(labels) - train_idx)//2 + train_idx

train_X = inputs[:,:train_idx,:]
val_X = inputs[:,train_idx:val_idx,:]
test_X = inputs[:,val_idx:,:]

train_y = labels[:train_idx,:]
val_y = labels[train_idx:val_idx,:]
test_y = labels[val_idx:,:]

train_X.shape, val_X.shape, test_X.shape

((2, 114123, 70), (2, 38041, 70), (2, 38041, 70))

# 학습

In [16]:
enc_dim = 256
dec_dim = 256  # 256 # bidirection RNN이 아닐 경우
embedding_dim = 256
batch_size = 32
EPOCH = 50

model = seq2seq(len(eng_word_to_id),
                len(fra_word_to_id),
                embedding_dim,
                enc_dim,
                dec_dim,
                batch_size)

model.compile(loss=loss,
             optimizer=tf.keras.optimizers.Adam(),
             metrics=[accuracy])


In [None]:
import os
MODEL_PATH = "model"
if not(os.path.isdir(MODEL_PATH)):
    os.makedirs(os.path.join(MODEL_PATH))
    
checkpoint_path = MODEL_PATH + '/weights.h5'

cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1,
                              save_best_only=True, save_weights_only=True)

earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=16)

history = model.fit([train_X[0], train_X[1]], train_y,
                    batch_size=batch_size,
                    epochs=EPOCH,
                    validation_data=((val_X[0], val_X[1]), val_y),
                    callbacks=[earlystop_callback, cp_callback])

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Epoch 00001: val_accuracy improved from -inf to 0.91760, saving model to model\weights.h5
Epoch 2/5

# 추론

In [18]:
x_enc, x_dec = np.zeros(MAX_SEQUENCE), np.zeros(MAX_SEQUENCE)
print(x_enc.shape, x_dec.shape)
x_enc = x_enc.reshape(-1,x_enc.shape[0])
x_dec = x_dec.reshape(-1,x_dec.shape[0])
x_enc.shape

(70,) (70,)


(1, 70)

In [19]:
t1 = test_X[0][-100]
t2 = test_X[1][-100]
print(t1.shape, t2.shape)
t1 = t1.reshape(-1,t1.shape[0])
t2 = t2.reshape(-1,t2.shape[0])
print(t1.shape, t2.shape)

(70,) (70,)
(1, 70) (1, 70)


In [20]:
model([x_enc, x_dec])

model.load_weights('model/weights-17.h5')



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [21]:
model.summary()

Model: "seq2seq"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  4667136   
_________________________________________________________________
decoder (Decoder)            multiple                  12352398  
Total params: 17,019,534
Trainable params: 17,019,534
Non-trainable params: 0
_________________________________________________________________


In [36]:
sentence = test_X[0][38038]
print(' '.join([id_to_eng_word[token] for token in sentence if token != 0]))
sentence = sentence.reshape(-1, sentence.shape[0])
pred = model.inference(sentence)
print(pred)
print(' '.join([id_to_fra_word[token] for token in pred if token != 0]))
# pred

She can jump high .
[ 2659  9328 13094 21752  1063 18291  2002   197  7193 14414 12607  3328
 10009 15016 12585  6548 14404]
fruitiers console accueillante puissantes remplit laissent craintif couchais historique rayon estivales disco Et vidences papillons maisons suivit


In [35]:
print(' '.join([id_to_fra_word[token] for token in test_X[1][38038] if token != 0]))

[BOS] Elle peut sauter haut .


In [33]:
t = pd.DataFrame(test_X[0])
t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,16424,10644,790,12904,5141,4303,14552,7842,1458,907,...,0,0,0,0,0,0,0,0,0,0
1,2876,3535,12833,7923,11813,5136,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2372,16166,12030,12795,790,12272,5136,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14251,5991,1954,4303,16029,2190,2350,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13349,10651,12833,16588,10372,5136,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38036,4303,880,2941,11222,13120,5016,7837,9933,5136,0,...,0,0,0,0,0,0,0,0,0,0
38037,9373,14038,11739,5991,12469,13120,2350,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38038,15018,16166,13593,11355,5136,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38039,4303,9503,5517,16436,5136,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
t