# Transformer tensorflow 2.0


## install tensorflow 2.0

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
    %tensorflow_version 2.x
except Exception:
    pass
# !pip install tensorflow_probability==0.8.0rc0 --upgrade
!pip install sentencepiece
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

TensorFlow 2.x selected.
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.5MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.83


In [0]:
!mkdir -p checkpoints
!mkdir -p datasets

In [0]:
# print tensorflow versions
!pip freeze | grep tensorflow
!nvidia-smi

mesh-tensorflow==0.0.5
tensorflow==2.0.0
tensorflow-estimator==2.0.0
tensorflow-gpu==2.0.0
tensorflow-hub==0.6.0
tensorflow-metadata==0.14.0
tensorflow-probability==0.7.0
Fri Oct 18 15:04:51 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------------------------------------------------------

In [0]:
print('is gpu available?: ', tf.test.is_gpu_available())
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

is gpu available?:  True


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 1221983531376434526, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 15163588495924811008
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 6209084037304517343
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 11330115994
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1766124167096464211
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"]

## Layer implementation
![transformer](https://github.com/strutive07/TIL/raw/master/images/transformer0.PNG)

구현할 layer 목록
- embedding
- positional encoding
- encoder
- decoder
- multi head attention
- scaled dot product attention
- position wise feed forward network


### Embedding layer, Positional Encoding
word embedding: [tf.keras.layers.Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)

Convert 2D sequence (batch_size, input_length) -> 3D (batch_size, input_length, $d_{model}$)

![positional encoding](https://github.com/strutive07/TIL/raw/master/images/1566655262694.png)

Positional encoding 은 $pos/10000^{2i/d_{model}}$ 을 각도로 sin, cos 에 대입했을때 가지는 값이다.
짝수는 sin, 홀수는 cos 값을 대입해준다.

여기서 pos 는 sequence batch 수, index 는 해당 index 의 embedding dimention 이다.


In [0]:
class Embeddinglayer(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        # model hyper parameter variables
        super(Embeddinglayer, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    
    def call(self, sequences):
        max_sequence_len = sequences.shape[1]
        output = self.embedding(sequences) * tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))
        output += self.positional_encoding(max_sequence_len)
        
        return output
    
    def positional_encoding(self, max_len):
        pos = np.expand_dims(np.arange(0, max_len), axis=1)
        index = np.expand_dims(np.arange(0, self.d_model), axis=0)
        
        pe = self.angle(pos, index)
        
        pe[:, 0::2] = np.sin(pe[:, 0::2])
        pe[:, 1::2] = np.cos(pe[:, 1::2])        
        
        pe = np.expand_dims(pe, axis=0)
        return tf.cast(pe, dtype=tf.float32)
        
    def angle(self, pos, index):
        return pos / np.power(10000, (index - index % 2) / np.float32(self.d_model))

### Scaled Dot-Product Attention, Multi-Head Attention
![대체 텍스트](https://github.com/strutive07/TIL/raw/master/images/1566648835945.png)




Scaled Dot-Product Attention 은 learnable parameter 가 없다. 기존 additive attention 은 attention score 를 구하는 구간에 feed forward layer 가 있었지만, scaled dot-product attention 은 dot-product 연산으로 대체하므로 learnable parameter가 존재하지 않는다.

따라서 구현도 matmul -> scale -> mask -> softmax -> matmul 순서로 진행해주면된다.
주의할점은 추후에 넣어줄 mask 에 encoder 에서도 padding을 학습에 사용하지 않도록 padding mask 를 추가해주어야 한다는것이다.

In [0]:
class ScaledDotProductAttention(tf.keras.layers.Layer):
    def __init__(self, d_h):
        super(ScaledDotProductAttention, self).__init__()
        self.d_h = d_h
        
    def call(self, query, key, value, mask=None):
        matmul_q_and_transposed_k = tf.matmul(query, key, transpose_b=True)
        scale = tf.sqrt(tf.cast(self.d_h, dtype=tf.float32))
        scaled_attention_score = matmul_q_and_transposed_k / scale
        if mask is not None:
            scaled_attention_score += (mask * -1e9)
        
        attention_weight = tf.nn.softmax(scaled_attention_score, axis=-1)
        
        return tf.matmul(attention_weight, value), attention_weight

이제 핵심인 multi head attention 을 구현해보자.

Multi head attention 은 Query, Key, Value head 수 만큼 나뉜 후, 각각 다른 linear projection, 각각 다른 scaled dot-product 를 진행한 후 concat, linear projection 을 진행하는것으로 논문에 나와있는데, 이는 tf.reshape, tf.transform 으로 한번에 연산할 수 있다.

아래 코드의 42번째 줄을 보면, 기존 N 개의 head 로 split 하기 이전, 원본 query, key, value 의 shape 은 (batch_size, seq_len, d_model) 인 것을 알 수 있다. tf.reshape, tf.transform 으로 (batch_size, attention_head_count, seq_len, d_h) 으로 쪼개준다.
이는 논문에서 $d_{query} = d_{key} = d_{value} = d_{model}/heads$ 를 구현한것이다.

코드를 보면 split 하기 이전에 linear projection 을 진행한것을 볼 수 있는데, 이는 어짜피 reshape 으로 쪼개져서 사용되므로 쪼개고 곱하던, 곱하고 쪼개던 상관이 없다. 따라서 하나의 연산으로 진행해준다.

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, attention_head_count, d_model, dropout_prob):
        super(MultiHeadAttention, self).__init__()

        # model hyper parameter variables
        self.attention_head_count = attention_head_count
        self.d_model = d_model
        self.dropout_prob = dropout_prob

        if d_model % attention_head_count != 0:
            raise ValueError(
                f"d_model({d_model}) % attention_head_count({attention_head_count}) is not zero."
                f"d_model must be multiple of attention_head_count."
            )
        
        self.d_h = d_model // attention_head_count
        
        self.w_query = tf.keras.layers.Dense(d_model)
        self.w_key = tf.keras.layers.Dense(d_model)
        self.w_value = tf.keras.layers.Dense(d_model)
        
        self.scaled_dot_product = ScaledDotProductAttention(self.d_h)
        
        self.ff = tf.keras.layers.Dense(d_model)
    
    def call(self, query, key, value, mask=None):
        batch_size = tf.shape(query)[0]
        
        query = self.w_query(query)
        key = self.w_key(key)
        value = self.w_value(value)
        
        query = self.split_head(query, batch_size)
        key = self.split_head(key, batch_size)
        value = self.split_head(value, batch_size)
        
        output, attention = self.scaled_dot_product(query, key, value, mask)
        output = self.concat_head(output, batch_size)
        
        return self.ff(output), attention
        
    
    def split_head(self, tensor, batch_size):
        # input tensor: (batch_size, seq_len, d_model)
        return tf.transpose(
            tf.reshape(
                tensor, 
                (batch_size, -1, self.attention_head_count, self.d_h)
                # tensor: (batch_size, seq_len_splited, attention_head_count, d_h)
            ),
            [0, 2, 1, 3]
            # tensor: (batch_size, attention_head_count, seq_len_splited, d_h)
        )
    
    def concat_head(self, tensor, batch_size):
        return tf.reshape(
            tf.transpose(tensor, [0, 2, 1, 3]), 
            (batch_size, -1, self.attention_head_count * self.d_h)
        )

### Position-Wise Feed-Forward Network
![대체 텍스트](https://github.com/strutive07/TIL/raw/master/images/1566658909167.png)

Multi head attention 에서 나온 여러가지 attention 정보를 정리해주는 역활인 Position-wise feed-forward network이다.
구현은 FF-Relu-Dropout-FF 순서로 sequential 하게 진행하면된다.

In [0]:
class PositionWiseFeedForwardLayer(tf.keras.layers.Layer):
    def __init__(self, d_point_wise_ff, d_model, dropout_prob):
        super(PositionWiseFeedForwardLayer, self).__init__()
        self.w_1 = tf.keras.layers.Dense(d_point_wise_ff)
        self.w_2 = tf.keras.layers.Dense(d_model)
    
    def call(self, input):
        input = self.w_1(input)
        input = tf.nn.relu(input)
        return self.w_2(input)

### Encoder

![대체 텍스트](https://github.com/strutive07/TIL/raw/master/images/1566654859388.png)

이제 내부 layer 들이 준비되었으니, encoder 를 구현해보자.
encoder 는 Multi head attention - Dropout - LayerNorm with Residual connection - Position-wise Feed-Forward - Dropout - LayerNorm with Residual connection 순서로 진행된다.

이제 하나하나 layer 를 쌓아보자.

특이점은 딱히 없고, tf.add 를 통해 Residual connection 을 잘 해주면될듯하다. 또한 padding 을 학습에 적용하지 못하도록 padding mask 를 잘 넣어주는것도 중요하다.

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
      def __init__(self, attention_head_count, d_model, d_point_wise_ff, dropout_prob):
            super(EncoderLayer, self).__init__()

            # model hyper parameter variables
            self.attention_head_count = attention_head_count
            self.d_model = d_model
            self.d_point_wise_ff = d_point_wise_ff
            self.dropout_prob = dropout_prob

            self.multi_head_attention = MultiHeadAttention(attention_head_count, d_model, dropout_prob)
            self.dropout_1 = tf.keras.layers.Dropout(dropout_prob)
            self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

            self.position_wise_feed_forward_layer = PositionWiseFeedForwardLayer(
                d_point_wise_ff, 
                d_model,
                dropout_prob
            )
            self.dropout_2 = tf.keras.layers.Dropout(dropout_prob)
            self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    
      def call(self, input, mask, training):
            output, attention = self.multi_head_attention(input, input, input, mask)
            output = self.dropout_1(output, training=training)
            output = self.layer_norm_1(tf.add(input, output)) # residual network
            
            output = self.position_wise_feed_forward_layer(output)
            output = self.dropout_2(output, training=training)
            output = self.layer_norm_2(tf.add(input, output)) #residual network
            
            return output, attention

### Decoder
![대체 텍스트](https://github.com/strutive07/TIL/raw/master/images/1566657684622.png)

decoder 는 조금 주의할구간이 있다. 기존 encoder 에서 residual connection을 input 만 진행했다면, 이번에는 decoder input, masked multi-head attention 두게를 해야하므로 변수 사용을 조심하자.

또한 encoder 의 output을 key, value 로 사용하고있다. 이번에도 padding mask를 잘 넣어주여야하고, masked multi head attention 에는 look ahead padding 을 넣어주어야한다. 미래의 데이터를 기반으로 현재 데이터를 생성하지 못하도록 마스크를 넣어준다.


In [0]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, attention_head_count, d_model, d_point_wise_ff, dropout_prob):
        super(DecoderLayer, self).__init__()
        
        # model hyper parameter variables
        self.attention_head_count = attention_head_count
        self.d_model = d_model
        self.d_point_wise_ff = d_point_wise_ff
        self.dropout_prob = dropout_prob
        
        self.masked_multi_head_attention = MultiHeadAttention(attention_head_count, d_model, dropout_prob)
        self.dropout_1 = tf.keras.layers.Dropout(dropout_prob)
        self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.encoder_decoder_attention = MultiHeadAttention(attention_head_count, d_model, dropout_prob)
        self.dropout_2 = tf.keras.layers.Dropout(dropout_prob)
        self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.position_wise_feed_forward_layer = PositionWiseFeedForwardLayer(
            d_point_wise_ff, 
            d_model,
            dropout_prob
        )
        self.dropout_3 = tf.keras.layers.Dropout(dropout_prob)
        self.layer_norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, decoder_input, encoder_output, look_ahead_mask, padding_mask, training):
        output, attention_1 = self.masked_multi_head_attention(
            decoder_input,
            decoder_input,
            decoder_input,
            look_ahead_mask
        )
        output = self.dropout_1(output, training=training)
        query = self.layer_norm_1(tf.add(decoder_input, output)) # residual network
        output, attention_2 = self.encoder_decoder_attention(
            query,
            encoder_output,
            encoder_output,
            padding_mask
        )
        output = self.dropout_2(output, training=training)
        encoder_decoder_attention_output = self.layer_norm_2(tf.add(output, query))
        
        output = self.position_wise_feed_forward_layer(encoder_decoder_attention_output)
        output = self.dropout_3(output, training=training)
        output = self.layer_norm_3(tf.add(encoder_decoder_attention_output, output)) #residual network
        
        return output, attention_1, attention_2

### Masking

![masking_1](https://user-images.githubusercontent.com/26921984/67160277-331bb100-f38a-11e9-9a4a-c5520fdb1b4a.png)


![masking_2](https://user-images.githubusercontent.com/26921984/67160309-7bd36a00-f38a-11e9-991d-5575080a2c3d.png)


padding과 decoder의 look a head 부분을 학습하지 못하도록 masking을 해주어야한다.

In [0]:
class Mask:
    @classmethod
    def create_padding_mask(cls, sequences):
        sequences = tf.cast(tf.math.equal(sequences, 0), dtype=tf.float32)
        return sequences[:, tf.newaxis, tf.newaxis, :]
    
    @classmethod
    def create_look_ahead_mask(cls, seq_len):
        return 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    
    @classmethod
    def create_masks(cls, input, target):
        encoder_padding_mask = Mask.create_padding_mask(input)
        
        decoder_padding_mask = Mask.create_padding_mask(input)
        
        look_ahead_mask = tf.maximum(
            Mask.create_look_ahead_mask(tf.shape(target)[1]),
            Mask.create_padding_mask(target)
        )
        
        return encoder_padding_mask, look_ahead_mask, decoder_padding_mask

## Make Model

자 이제 모든 준비가 끝났다. Encoder 를 잘 스택해서 결과를 얻고, 해당 결과를 스택된 decoder 에 넣어준다.
모델의 마지막 부분인 linear projection 을 추가한다.

마지막 softmax 는 trainer 에 넣을지 모델에 넣을지 고민중이다.

In [0]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 input_vocab_size,
                 target_vocab_size,
                 encoder_count,
                 decoder_count,
                 attention_head_count,
                 d_model,
                 d_point_wise_ff,
                 dropout_prob):
        super(Transformer, self).__init__()

        # model hyper parameter variables
        self.encoder_count = encoder_count
        self.decoder_count = decoder_count
        self.attention_head_count = attention_head_count
        self.d_model = d_model,
        self.d_point_wise_ff = d_point_wise_ff,
        self.dropout_prob = dropout_prob

        self.encoder_embedding_layer = Embeddinglayer(input_vocab_size, d_model)
        self.encoder_embedding_dropout = tf.keras.layers.Dropout(dropout_prob)
        self.decoder_embedding_layer = Embeddinglayer(target_vocab_size, d_model)
        self.decoder_embedding_dropout = tf.keras.layers.Dropout(dropout_prob)

        self.encoder_layers = [
            EncoderLayer(
                attention_head_count,
                d_model,
                d_point_wise_ff,
                dropout_prob
            ) for _ in range(encoder_count)
        ]

        self.decoder_layers = [
            DecoderLayer(
                attention_head_count,
                d_model,
                d_point_wise_ff,
                dropout_prob
            )for _ in range(decoder_count)
        ]

        self.linear = tf.keras.layers.Dense(target_vocab_size)
            
    def call(self,
             input,
             target,
             input_padding_mask,
             look_ahead_mask,
             target_padding_mask,
             training
            ):
        encoder_tensor = self.encoder_embedding_layer(input)
        encoder_tensor = self.encoder_embedding_dropout(encoder_tensor, training=training)
        
        for i in range(self.encoder_count):
            encoder_tensor, _ = self.encoder_layers[i](encoder_tensor, input_padding_mask, training=training)
        target = self.decoder_embedding_layer(target)
        decoder_tensor = self.decoder_embedding_dropout(target, training=training)
        for i in range(self.decoder_count):
            decoder_tensor, _, _ = self.decoder_layers[i](
                decoder_tensor,
                encoder_tensor,
                look_ahead_mask,
                target_padding_mask,
                training=training
            )
        return self.linear(decoder_tensor)

## Data loader

Data는 WMT14 데이터를 사용한다.
전처리는 byte pair encoding을 사용한다. Byte pair encoding을 사용하기위해 sentencepiece module을 사용한다.

전처리 순서
1. Data download
2. Data validation, 개행 단위 문장 분리
3. Training byte pair encoding 
4. Encoding data with trained byte pair encoding
5. Split training set, validation set
6. Create tf.data.Dataset

In [0]:
import os
from urllib.request import urlretrieve

from tqdm import tqdm

import tensorflow as tf
import pickle
import sentencepiece

class DataLoader:
    DIR = None
    PATHS = {}
    BPE_VOCAB_SIZE=0
    dictionary = {
        'source': {
            'token2idx':None,
            'idx2token':None,
        },
        'target': {
            'token2idx':None,
            'idx2token':None,
        }
    }
    CONFIG = {
        'wmt14/en-de': {
            'source_lang': 'en',
            'target_lang': 'de',
            'base_url': 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/',
            'train_files': ['train.en', 'train.de'],
            'vocab_files': ['vocab.50K.en', 'vocab.50K.de'],
            'dictionary_files': ['dict.en-de'],
            'test_files': [
                'newstest2012.en', 'newstest2012.de',
                'newstest2013.en', 'newstest2013.de',
                'newstest2014.en', 'newstest2014.de',
                'newstest2015.en', 'newstest2015.de',
            ]
        }
    }
    BPE_MODEL_SUFFIX= '.model'
    BPE_VOCAB_SUFFIX= '.vocab'
    BPE_RESULT_SUFFIX= '.sequences'
    SEQ_MAX_LEN = {
        'source': 100,
        'target': 100
    }
    DATA_LIMIT = None
    TRAIN_RATIO = 0.9
    BATCH_SIZE = 16

    source_sp = None
    target_sp = None

    def __init__(self, dataset_name, data_dir, batch_size=16, bpe_vocab_size=32000, seq_max_len_source=100, seq_max_len_target=100, data_limit=None, train_ratio=0.9):
        if dataset_name is None or data_dir is None:
            raise ValueError('dataset_name and data_dir must be defined')
        self.DIR = data_dir
        self.DATASET = dataset_name
        self.BPE_VOCAB_SIZE = bpe_vocab_size
        self.SEQ_MAX_LEN['source'] = seq_max_len_source
        self.SEQ_MAX_LEN['target'] = seq_max_len_target
        self.DATA_LIMIT = data_limit
        self.TRAIN_RATIO = train_ratio
        self.BATCH_SIZE = batch_size

        self.PATHS['source_data'] = os.path.join(self.DIR, self.CONFIG[self.DATASET]['train_files'][0])
        self.PATHS['source_bpe_prefix'] = self.PATHS['source_data'] + '.segmented'

        self.PATHS['target_data'] = os.path.join(self.DIR, self.CONFIG[self.DATASET]['train_files'][1])
        self.PATHS['target_bpe_prefix'] = self.PATHS['target_data'] + '.segmented'

    def load(self):
        print('#1 download data')
        self.download_dataset()

        print('#2 parse data')
        source_data = self.parse_data_and_save(self.PATHS['source_data'])
        target_data = self.parse_data_and_save(self.PATHS['target_data'])
        
        print('#3 train bpe')
        
        self.train_bpe(self.PATHS['source_data'], self.PATHS['source_bpe_prefix'])
        self.train_bpe(self.PATHS['target_data'], self.PATHS['target_bpe_prefix'])

        print('#4 load bpe vocab')
        self.load_bpe_encoder()

        print('#5 encode data with bpe')
        source_sequences = self.texts_to_sequences(
            self.bulk_sentence_piece(
                source_data,
                self.PATHS['source_bpe_prefix'] + self.BPE_MODEL_SUFFIX,
                self.PATHS['source_bpe_prefix'] + self.BPE_RESULT_SUFFIX
            ),
            mode="source"
        )
        target_sequences = self.texts_to_sequences(
            self.bulk_sentence_piece(
                target_data,
                self.PATHS['target_bpe_prefix'] + self.BPE_MODEL_SUFFIX,
                self.PATHS['target_bpe_prefix'] + self.BPE_RESULT_SUFFIX
            ),
            mode="target"
        )

        print('source sequence example:', source_sequences[0])
        print('target sequence example:', target_sequences[0])


        source_sequences_train, source_sequences_val, target_sequences_train, target_sequences_val = train_test_split(
            source_sequences, target_sequences, train_size=self.TRAIN_RATIO
        )

        if self.DATA_LIMIT is not None:
            print('data size limit ON. limit size:', DATA_LIMIT)
            source_sequences_train = source_sequences_train[:DATA_LIMIT]
            target_sequences_train = target_sequences_train[:DATA_LIMIT]

        print('source_sequences_train', len(source_sequences_train))
        print('source_sequences_val', len(source_sequences_val))
        print('target_sequences_train', len(target_sequences_train))
        print('target_sequences_val', len(target_sequences_val))

        print('train set size: ', len(source_sequences_train))
        print('validation set size: ', len(source_sequences_val))
        TRAIN_SET_SIZE = len(source_sequences_train)
        VALIDATION_SET_SIZE = len(source_sequences_val)

        train_dataset = self.create_dataset(
            source_sequences_train,
            target_sequences_train
        )

        val_dataset = self.create_dataset(
            source_sequences_val,
            target_sequences_val
        )

        return train_dataset, val_dataset
    
    def load_test(self, index=0):
        if index < 0 or index >= len(self.CONFIG[self.DATASET]['test_files']) // 2:
            raise ValueError('test file index out of range. min: 0, max: {}'.format(len(self.CONFIG[self.DATASET]['test_files']) // 2 - 1))
        print('#1 download data')
        self.download_dataset()

        print('#2 parse data')

        source_test_data_path = os.path.join(self.DIR, self.CONFIG[self.DATASET]['test_files'][index * 2])
        target_test_data_path = os.path.join(self.DIR, self.CONFIG[self.DATASET]['test_files'][index * 2 + 1])

        source_data = self.parse_data_and_save(source_test_data_path)
        target_data = self.parse_data_and_save(target_test_data_path)

        print('#3 load bpe vocab')

        self.dictionary['source']['token2idx'], self.dictionary['source']['idx2token'] = self.load_bpe_vocab(
            self.PATHS['source_bpe_prefix'] + self.BPE_VOCAB_SUFFIX)
        self.dictionary['target']['token2idx'], self.dictionary['target']['idx2token'] = self.load_bpe_vocab(
            self.PATHS['target_bpe_prefix'] + self.BPE_VOCAB_SUFFIX)

        return source_data, target_data
    
    def load_bpe_encoder(self):
        self.dictionary['source']['token2idx'], self.dictionary['source']['idx2token'] =  self.load_bpe_vocab(self.PATHS['source_bpe_prefix'] + self.BPE_VOCAB_SUFFIX)
        self.dictionary['target']['token2idx'], self.dictionary['target']['idx2token'] =  self.load_bpe_vocab(self.PATHS['target_bpe_prefix'] + self.BPE_VOCAB_SUFFIX)

    def download_dataset(self):
        for file in (self.CONFIG[self.DATASET]['train_files']
                     + self.CONFIG[self.DATASET]['vocab_files']
                     + self.CONFIG[self.DATASET]['dictionary_files']
                     + self.CONFIG[self.DATASET]['test_files']):
            self._download("{}{}".format(self.CONFIG[self.DATASET]['base_url'], file))

    def _download(self, url):
        path = os.path.join(self.DIR, url.split('/')[-1])
        if not os.path.exists(path):
            with TqdmCustom(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=url) as t:
                urlretrieve(url, path, t.update_to)
    
    def parse_data_and_save(self, path):
        print('load data from {}'.format(path))
        with open(path, encoding='utf-8') as f:
            lines = f.read().strip().split('\n')

        if lines is None:
            raise ValueError('Vocab file is invalid')
            
        with open(path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        
        return lines
    
    def train_bpe(self, data_path, model_prefix):
        model_path = model_prefix + self.BPE_MODEL_SUFFIX
        vocab_path = model_prefix + self.BPE_VOCAB_SUFFIX

        if not(os.path.exists(model_path) and os.path.exists(vocab_path)):
            print('bpe model does not exist. train bpe. model path:', model_path, ' vocab path:', vocab_path)
            train_source_params = "--input={} \
                --pad_id=0 \
                --unk_id=1 \
                --bos_id=2 \
                --eos_id=3 \
                --model_prefix={} \
                --vocab_size={} \
                --model_type=bpe ".format(
                data_path,
                model_prefix,
                self.BPE_VOCAB_SIZE
            )
            sentencepiece.SentencePieceTrainer.Train(train_source_params)
        else:
            print('bpe model exist. load bpe. model path:', model_path, ' vocab path:', vocab_path)

    def bulk_sentence_piece(self, source_data, source_bpe_model_path, result_data_path):
        if os.path.exists(result_data_path):
            print('encoded data exist. load data. path:', result_data_path)
            with open(result_data_path, 'r', encoding='utf-8') as f:
                 sequences = f.read().strip().split('\n')
                 return sequences

        sp = sentencepiece.SentencePieceProcessor()
        sp.load(source_bpe_model_path)
        print('encoded data does not exist. encode data. path:', result_data_path)
        sequences = []
        with open(result_data_path, 'w') as f:
            for sentence in tqdm(source_data):
                pieces = sp.EncodeAsPieces(sentence)
                sequence = " ".join(pieces)
                sequences.append(sequence)
                f.write(sequence + "\n")
        return sequences
    
    def encode_data(self, input, mode='source'):
        if mode != 'source' and mode != 'target':
            ValueError('not allowed mode.')

        if mode == 'source':
            if self.source_sp is None:
                self.source_sp = sentencepiece.SentencePieceProcessor()
                self.source_sp.load(self.PATHS['source_bpe_prefix'] + self.BPE_MODEL_SUFFIX)

            pieces = self.source_sp.EncodeAsPieces(input)
            sequence = " ".join(pieces)

            return sequence
        elif mode == 'target':
            if self.target_sp is None:
                self.target_sp = sentencepiece.SentencePieceProcessor()
                self.target_sp.load(self.PATHS['target_bpe_prefix'] + self.BPE_MODEL_SUFFIX)
            
            pieces = self.target_sp.EncodeAsPieces(input)
            sequence = " ".join(pieces)

            return sequence
        else:
            ValueError('not allowed mode.')
    
    def load_bpe_vocab(self, bpe_vocab_path):
        vocab = [line.split()[0] for line in open(bpe_vocab_path, 'r').read().splitlines()]
        token2idx = {}
        idx2token = {}

        for idx, token in enumerate(vocab):
            token2idx[token] = idx
            idx2token[idx] = token
        return token2idx, idx2token
    
    def texts_to_sequences(self, texts, mode='source'):
        if mode != 'source' and mode != 'target':
            ValueError('not allowed mode.')
            
        sequences = []
        for text in texts:
            text_list = ["<s>"] + text.split() + ["</s>"]
            
            sequence = [
                        self.dictionary[mode]['token2idx'].get(
                            token, self.dictionary[mode]['token2idx']["<unk>"]
                        )
                        for token in text_list
            ]
            sequences.append(sequence)
        return sequences

    def sequences_to_texts(self, sequences, mode='source'):
        if mode != 'source' and mode != 'target':
            ValueError('not allowed mode.')
            
        texts = []
        for sequence in sequences:
            if mode == 'source':
                if self.source_sp is None:
                    self.source_sp = sentencepiece.SentencePieceProcessor()
                    self.source_sp.load(self.PATHS['source_bpe_prefix'] + self.BPE_MODEL_SUFFIX)
                text = self.source_sp.DecodeIds(sequence)
            else:
                if self.target_sp is None:
                    self.target_sp = sentencepiece.SentencePieceProcessor()
                    self.target_sp.load(self.PATHS['target_bpe_prefix'] + self.BPE_MODEL_SUFFIX)
                text = self.target_sp.DecodeIds(sequence)
            texts.append(text)
        return texts
    
    def create_dataset(self, source_sequences, target_sequences):
        new_source_sequences = []
        new_target_sequences = []
        for source, target in zip(source_sequences, target_sequences):
            if len(source) > self.SEQ_MAX_LEN['source']:
                continue
            if len(target) > self.SEQ_MAX_LEN['target']:
                continue
            new_source_sequences.append(source)
            new_target_sequences.append(target)
            
        source_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            sequences=source_sequences, maxlen=self.SEQ_MAX_LEN['source'], padding='post'
        )
        target_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            sequences=target_sequences, maxlen=self.SEQ_MAX_LEN['target'], padding='post'
        )
        dataset = tf.data.Dataset.from_tensor_slices(
            (source_sequences, target_sequences)
        )
        dataset = dataset.padded_batch(self.BATCH_SIZE, self.DATA_SHAPES, self.PADDINGS)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset


class TqdmCustom(tqdm):

    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)

## Hyper parameters

In [0]:
# hyper paramaters
TRAIN_RATIO = 0.9
D_POINT_WISE_FF = 2048
D_MODEL = 512
ENCODER_COUNT = DECODER_COUNT = 6
EPOCHS = 20
ATTENTION_HEAD_COUNT = 8
DROPOUT_PROB = 0.1
BATCH_SIZE = 16
SEQ_MAX_LEN_SOURCE = 100
SEQ_MAX_LEN_TARGET = 100
BPE_VOCAB_SIZE = 32000
DATA_LIMIT = None

## Training model

### Load data

In [0]:
data_loader = DataLoader(
    'wmt14/en-de', 
    'datasets',
    batch_size=BATCH_SIZE
)

In [0]:
dataset, val_dataset = data_loader.load()

### Load model

In [0]:
 transformer = Transformer(
     input_vocab_size=BPE_VOCAB_SIZE,
     target_vocab_size=BPE_VOCAB_SIZE,
     encoder_count=ENCODER_COUNT,
     decoder_count=DECODER_COUNT,
     attention_head_count=ATTENTION_HEAD_COUNT,
     d_model=D_MODEL,
     d_point_wise_ff=D_POINT_WISE_FF,
     dropout_prob=DROPOUT_PROB
 )

### Optimizer

![optimizer](https://user-images.githubusercontent.com/26921984/67160202-4c702d80-f389-11e9-9ec3-6d43e222ff6f.png)

Adam optimizer를 사용하고, optimizer의 learning rate에 warmup 을 넣어주기위해 tf.keras.optimizers.schedules.LearningRateSchedule 를 사용한다.

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
            
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
learning_rate = CustomSchedule(D_MODEL)
optimizer = tf.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.losses.CategoricalCrossentropy(from_logits=True, reduction='none')

### Label smoothing

![label_smoothing](https://user-images.githubusercontent.com/26921984/67160345-b806ca80-f38a-11e9-8035-019f8542fdf4.png)

In [0]:
def label_smoothing(target_data, depth, epsilon=0.1):
    target_data_one_hot = tf.one_hot(target_data, depth=depth)
    n = target_data_one_hot.get_shape().as_list()[-1]
    return ((1 - epsilon) * target_data_one_hot) + (epsilon / n)

In [0]:
import time
import datetime
class Trainer:
    def __init__(
            self,
            model,
            dataset,
            loss_object=None,
            optimizer=None,
            checkpoint_dir='./checkpoints',
            batch_size=None,
            distribute_strategy=None,
            vocab_size=32000,
            epoch=20,
    ):
        self.batch_size = batch_size
        self.distribute_strategy = distribute_strategy
        self.model = model
        self.loss_object = loss_object
        self.optimizer = optimizer
        self.checkpoint_dir = checkpoint_dir
        self.vocab_size = vocab_size
        self.epoch = epoch
        self.dataset = dataset

        os.makedirs(self.checkpoint_dir, exist_ok=True)
        if self.optimizer is None:
            self.checkpoint = tf.train.Checkpoint(step=tf.Variable(1), model=self.model)
        else:
            self.checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=self.optimizer, model=self.model)
        self.checkpoint_manager = tf.train.CheckpointManager(self.checkpoint, self.checkpoint_dir, max_to_keep=3)

        # metrics
        self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('train_accuracy')
        self.validation_loss = tf.keras.metrics.Mean('validation_loss', dtype=tf.float32)
        self.validation_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('validation_accuracy')

    def multi_gpu_train(self, reset_checkpoint=False):
        with self.distribute_strategy.scope():
            self.dataset = self.distribute_strategy.experimental_distribute_dataset(self.dataset)
            self.trainer(reset_checkpoint=reset_checkpoint, is_distributed=True)

    
    def single_gpu_train(self, reset_checkpoint=False):
        self.trainer(reset_checkpoint=reset_checkpoint, is_distributed=False)

    def trainer(self, reset_checkpoint, is_distributed=False):
        current_day = datetime.datetime.now().strftime("%Y%m%d")
        train_log_dir = './logs/gradient_tape/' + current_day + '/train'
        os.makedirs(train_log_dir, exist_ok=True)
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)

        if not reset_checkpoint:
            if self.checkpoint_manager.latest_checkpoint:
                print("Restored from {}".format(self.checkpoint_manager.latest_checkpoint))
            else:
                print("Initializing from scratch.")

            self.checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint
            )
        else:
            print("reset and initializing from scratch.")

        for epoch in range(self.epoch):
            start = time.time()
            print('start learning')

            for (batch, (input, target)) in enumerate(self.dataset):
                if is_distributed:
                    loss = self.distributed_train_step(input, target)
                else:
                    loss = self.train_step(input, target)

                self.checkpoint.step.assign_add(1)
                if batch % 50 == 0:
                    print(
                        "Epoch: {}, Batch: {}, Loss:{}, Accuracy: {}".format(epoch, batch, self.train_loss.result(),
                                                                             self.train_accuracy.result()))
                if batch % 10000 == 0 and batch != 0:
                    tf.py_function(self.checkpoint_manager.save, [], [])
            print("{} | Epoch: {} Loss:{}, Accuracy: {}, time: {} sec".format(
                datetime.datetime.now(), epoch, self.train_loss.result(), self.train_accuracy.result(),
                time.time() - start
            ))
            with train_summary_writer.as_default():
                tf.summary.scalar('train_loss', self.train_loss.result(), step=epoch)
                tf.summary.scalar('train_accuracy', self.train_accuracy.result(), step=epoch)

            tf.py_function(self.checkpoint_manager.save, [], [])

            self.train_loss.reset_states()
            self.train_accuracy.reset_states()
            self.validation_loss.reset_states()
            self.validation_accuracy.reset_states()

        tf.py_function(self.checkpoint_manager.save, [], [])

    def train_step(self, input, target):
        target_include_start = target[:, :-1]
        target_include_end = target[:, 1:]
        encoder_padding_mask, look_ahead_mask, decoder_padding_mask = Mask.create_masks(
            input, target_include_start
        )

        with tf.GradientTape() as tape:
            pred = self.model.call(
                input=input,
                target=target_include_start,
                input_padding_mask=encoder_padding_mask,
                look_ahead_mask=look_ahead_mask,
                target_padding_mask=decoder_padding_mask,
                training=True
            )

            loss = self.loss_function(target_include_end, pred)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        self.train_loss(loss)
        self.train_accuracy(target_include_end, pred)
        if self.distribute_strategy is None:
            return tf.reduce_mean(loss)
        else:
            return loss

    def loss_function(self, real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        real_one_hot = label_smoothing(real, depth=self.vocab_size)
        loss = self.loss_object(real_one_hot, pred)

        mask = tf.cast(mask, dtype=loss.dtype)

        loss *= mask
        return tf.reduce_mean(loss)

    @tf.function
    def distributed_train_step(self, input, target):
        loss = self.distribute_strategy.experimental_run_v2(self.train_step, args=(input, target))
        loss_value = self.distribute_strategy.reduce(tf.distribute.ReduceOp.MEAN, loss, axis=None)
        return tf.reduce_mean(loss_value)

### Single GPU learning

In [0]:
trainer = Trainer(
    model=transformer,
    dataset=dataset,
    loss_object=loss_object,
    optimizer=optimizer,
    checkpoint_dir='./checkpoints2',
    vocab_size=BPE_VOCAB_SIZE,
    epoch=EPOCHS,
)

In [0]:
trainer.single_gpu_train(True)

### Multi GPU Learning

In [0]:
strategy = tf.distribute.MirroredStrategy()
trainer = Trainer(
    model=transformer,
    dataset=dataset,
    loss_object=loss_object,
    optimizer=optimizer,
    batch_size=BATCH_SIZE,
    distribute_strategy=strategy,
    vocab_size=BPE_VOCAB_SIZE,
    epoch=EPOCHS,
)

In [0]:
trainer.multi_gpu_train(True)

## Evaluation

In [0]:
def translate(input, data_loader, trainer, seq_max_len_target=100):
    if data_loader is None:
        ValueError('data loader is None')
    
    if trainer is None:
        ValueError('trainer is None')

    if trainer.model is None:
        ValueError('model is None')
    
    if not isinstance(seq_max_len_target, int):
        ValueError('seq_max_len_target is not int')

    encoded_data = data_loader.encode_data(input, mode='source')
    encoded_data = data_loader.texts_to_sequences([encoded_data])
    encoder_input = tf.convert_to_tensor(
        encoded_data,
        dtype=tf.int32
    )
    decoder_input = [data_loader.dictionary['target']['token2idx']['<s>']]
    decoder_input = tf.expand_dims(decoder_input, 0)
    decoder_end_token = data_loader.dictionary['target']['token2idx']['</s>']
    
    for i in range(SEQ_MAX_LEN_TARGET):
        encoder_padding_mask, look_ahead_mask, decoder_padding_mask = Mask.create_masks(
            encoder_input, decoder_input
        )
        pred = trainer.model.call(
            input=encoder_input,
            target=decoder_input,
            input_padding_mask=encoder_padding_mask,
            look_ahead_mask=look_ahead_mask,
            target_padding_mask=decoder_padding_mask,
            training=False
        )
        pred = pred[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(pred, axis=-1), dtype=tf.int32)
        
        if predicted_id == decoder_end_token:
            return tf.squeeze(decoder_input, axis=0)
        decoder_input = tf.concat([decoder_input, predicted_id], axis=-1)

    
    return tf.squeeze(decoder_input, axis=0)

### Sample

In [0]:
data_loader.load_bpe_encoder()

In [0]:
import time
import datetime
from google.colab import files

trainer = Trainer(
    model=transformer,
    dataset=None,
    loss_object=None,
    optimizer=None,
    checkpoint_dir='./checkpoints'
)
if trainer.checkpoint_manager.latest_checkpoint:
    print("Restored from {}".format(trainer.checkpoint_manager.latest_checkpoint))
else:
    print("Initializing from scratch.")

trainer.checkpoint.restore(
    trainer.checkpoint_manager.latest_checkpoint
)

Restored from ./checkpoints/ckpt-92


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f882013de80>

In [0]:
output = translate('I love apple.', data_loader, trainer)
res = data_loader.sequences_to_texts([output.numpy().tolist()], mode='target')
print(res)

['Ich liebe Apple .']


### Calculate BLEU score

In [0]:
source_data, target_data = data_loader.load_test(index=3)

#1 download data
#2 parse data
load data from datasets/newstest2015.en
load data from datasets/newstest2015.de
#3 load bpe vocab


In [0]:
from tqdm import tqdm
import pickle
translated_data = []
data = zip(source_data, target_data)

for source, target in tqdm(data):
    output = translate(source, data_loader, trainer)
    res = data_loader.sequences_to_texts([output.numpy().tolist()], mode='target')
    translated_data.append({
        'source': source,
        'target': target,
        'output': res
    })

2169it [3:40:42,  5.42s/it]


In [0]:
import pickle
with open('translated_data_2015_with_labelsmoothing_checkpoint_92.pickle', 'wb') as f:
    pickle.dump(translated_data, f)

Copying file://translated_data_2015_with_labelsmoothing_checkpoint_92.pickle [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/908.5 KiB.                                    


In [0]:
import pickle
with open('translated_data_2014.pickle', 'rb') as f:
    data = pickle.load(f)

In [0]:
translated = ""
for d in data:
    translated += d['output'][0] + '\n'
with open('datasets/res', 'w') as f:
    f.write(translated)

In [0]:
import os, re
ref = 'datasets/newstest2014.de'
translation = 'datasets/res'
get_bleu_score = "perl datasets/multi-bleu.perl {} < {} > {}".format(ref, translation, "temp")
os.system(get_bleu_score)
bleu_score_report = open("temp", "r").read()
with open(translation, "a") as fout:
    fout.write("\n{}".format(bleu_score_report))
score = re.findall("BLEU = ([^,]+)", bleu_score_report)[0]
new_translation = translation + "B{}".format(score)
os.system("mv {} {}".format(translation, new_translation))


In [0]:
bleu_score_report