In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import re
import konlpy
from collections import Counter

In [2]:
korean_train_data = []
english_train_data = []

with open("./data/korean-english-park.train.ko", "r") as f:
    korean_train_data = f.read().splitlines()

with open("./data/korean-english-park.train.en", "r") as f:
    english_train_data = f.read().splitlines()

In [3]:
train_data = pd.DataFrame([i for i in zip(korean_train_data, english_train_data)], columns=["korean", "english"])
train_data 

Unnamed: 0,korean,english
0,"개인용 컴퓨터 사용의 상당 부분은 ""이것보다 뛰어날 수 있느냐?""","Much of personal computing is about ""can you t..."
1,모든 광마우스와 마찬가지 로 이 광마우스도 책상 위에 놓는 마우스 패드를 필요로 하...,so a mention a few weeks ago about a rechargea...
2,그러나 이것은 또한 책상도 필요로 하지 않는다.,"Like all optical mice, But it also doesn't nee..."
3,"79.95달러하는 이 최첨단 무선 광마우스는 허공에서 팔목, 팔, 그외에 어떤 부분...",uses gyroscopic sensors to control the cursor ...
4,정보 관리들은 동남 아시아에서의 선박들에 대한 많은 (테러) 계획들이 실패로 돌아갔...,Intelligence officials have revealed a spate o...
...,...,...
94118,“우리는 3월 8일 김승연 회장과 그의 아들이 보복폭행에 가담한 혐의를 찾기 위해 ...,””We are hoping to seize material evidence to ...
94119,월요일 술집 종업원 6명은 김회장과 아들에게 폭행을 당했음을 진술했다고 경찰은 말했다.,"” On Monday, police secured statements from si..."
94120,그러나 불충분한 증거 확보로 수사에 어려움이 있다.,But the lack of material evidence is making it...
94121,김회장과 그의 아들은 보복폭행 혐의를 강력히 부인하고 있다.,Kim and his son both deny the allegations.


In [4]:
train_data = train_data.sample(20000)

In [5]:
print(len(train_data[train_data.duplicated(["korean", "english"])]))
train_data = train_data.drop_duplicates(["korean", "english"], ignore_index=True)

1990


In [6]:
def preprocess(sentence, korean=True):
    sentence = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s.,?!\'\"]", "", sentence)
    sentence = re.sub("\"+", "", sentence)
    sentence = re.sub(r"\.+", " .", sentence)
    sentence = re.sub("!+", " !", sentence)
    sentence = re.sub("\?+", " ?", sentence)
    sentence.lower().strip()
    if not korean:
        sentence = "<sos> " + sentence + " <eos>"
    
    return sentence

In [7]:
korean_corpus = train_data["korean"]
english_corpus = train_data["english"]

In [8]:
korean_corpus = [preprocess(sentence, korean=True) for sentence in korean_corpus]
english_corpus = [preprocess(sentence, korean=False) for sentence in english_corpus]

In [9]:
def konlpy_tokenizer(tokenizer, corpus):
    words = []
    sentences = []
    for c in corpus:
        temp = tokenizer.morphs(c)
        words += temp
        sentences.append(temp)
        
    counter = Counter(words)
    counter = counter.most_common()
    
    words = ["<pad>", "<unk>"] + [key for key, _ in counter]
    word_to_index = {word : index for index, word in enumerate(words)}
    tensor = []
    
    for c in sentences:
        temp = []
        for s in c:
            if s in words:
                temp.append(word_to_index[s])
            else:
                temp.append(word_to_index["<unk>"])
        tensor.append(temp)
    
    return tensor, word_to_index

In [10]:
mecab = konlpy.tag.Mecab()
korean_tokens, korean_word_to_index = konlpy_tokenizer(mecab, korean_corpus)

In [11]:
WORD_SIZE = len(korean_word_to_index)

In [12]:
korean_index_to_word = {word : index for word, index in enumerate(korean_word_to_index)}

In [13]:
english_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=" ")
english_tokenizer.fit_on_texts(english_corpus)
english_tokens = english_tokenizer.texts_to_sequences(english_corpus)

In [14]:
encoder_input = []
decoder_input = []

for k, e in zip(korean_tokens, english_tokens):
    if len(e) < 40 and len(k) < 40:
        encoder_input.append(k)
        decoder_input.append(e)

In [15]:
encoder_input = tf.keras.preprocessing.sequence.pad_sequences(encoder_input, padding='post')
decoder_input = tf.keras.preprocessing.sequence.pad_sequences(decoder_input, padding='post')

In [16]:
print(len(encoder_input))
print(len(decoder_input))
print(WORD_SIZE)
print(len(english_tokenizer.word_index))

13957
13957
28771
35762


In [17]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i)/d_model)
    
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]
    
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    
    return sinusoid_table

In [18]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
            
        self.depth = d_model // self.num_heads
            
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
            
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
            
    def split_heads(self, x):
        batch_size = x.shape[0]
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        batch_size = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (batch_size, -1, self.d_model))

        return combined_x

        
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
            
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask
        )
        out = self.combine_heads(out)
        out = self.linear(out)
                
        return out, attention_weights

In [19]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.w_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w_2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
            
        return out

In [20]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.dropout(out)
        out += residual
        
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual
        
        return out, enc_attn

In [21]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, causality_mask, padding_mask):
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.dropout(out)
        out += residual

        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, causality_mask)
        out = self.dropout(out)
        out += residual
       
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [22]:
class Encoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [23]:
class Decoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]                  
                            
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [24]:
class Transformer(tf.keras.Model):
    def __init__(self, 
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 src_vocab_size,
                 tgt_vocab_size,
                 pos_len,
                 dropout=0.2,
                 shared=True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)

        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared = shared

        if shared: self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]
        out = emb(x)

        if self.shared: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.dropout(out)

        return out

    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = self.decoder(dec_in, enc_out, causality_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [25]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [26]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [27]:
transformer = Transformer(n_layers=2,
                          d_model=512,
                          n_heads=8,
                          d_ff=2048,
                          src_vocab_size=WORD_SIZE,
                          tgt_vocab_size=len(english_tokenizer.word_index),
                          pos_len=39,
                          dropout=0.2,
                          shared=True)

In [28]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [29]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
        
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])
    
    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss, enc_attns, dec_attns, dec_enc_attns

In [37]:
from tqdm import tqdm
import random

EPOCHS = 1
BATCH_SIZE = 128
loss = []

for epoch in range(EPOCHS):
    total_loss = 0
    
    idx_list = list(range(0, encoder_input.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)  

    for (batch, idx) in enumerate(t):
        batch_loss, _, _, _ = train_step(encoder_input[idx:idx+BATCH_SIZE],
                                decoder_input[idx:idx+BATCH_SIZE],
                                transformer,
                                optimizer)
        
        total_loss += batch_loss
        loss.append((total_loss.numpy() / (batch + 1)))
        
        t.set_description_str('Epoch %2d' % (epoch + 1)) 
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

Epoch  1: 100%|██████████| 110/110 [00:43<00:00,  2.55it/s, Loss 1.3574]


In [33]:
def decoder_inference(input_sentence):
    sentence = preprocess(input_sentence)
    input_sentence = [korean_word_to_index[s] for s in mecab.morphs(input_sentence)]
    input_sentence = tf.keras.preprocessing.sequence.pad_sequences([input_sentence], padding="post", maxlen=39)
    
    output_sentence = []
    output_sequence = tf.expand_dims([english_tokenizer.word_index["<sos>"]], 0)
    
    flag = True
    while(flag):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(input_sentence, output_sequence)
        predictions, _, _, _ = transformer(input_sentence, output_sequence, enc_padding_mask, combined_mask, dec_padding_mask)
        
        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()
        
        if predicted_id == english_tokenizer.word_index["<eos>"]:
            flag = False
            
        output_sentence.append(english_tokenizer.index_word[predicted_id])
        output_sequence = tf.concat([output_sequence, tf.expand_dims([predicted_id], 0)], axis=-1)
    return " ".join(output_sentence[:-1])


In [38]:
print(decoder_inference("처음부터 시작"))
print(decoder_inference("전쟁이 발발하였습니다"))
print(decoder_inference("지구온난화가 심각하다고 주장했습니다"))

it began it began out
the war is now war a war .
in swine gaza gaza .


## 결론
### attentional seq2seq와 비교해보기 위해 이전 프로젝트와 전처리를 같게 놓고 모델을 트랜스포머로 바꿔봤습니다. 트랜스포머의 강력함을 엿볼수 있었던 프로젝트ㅋㅋㅋㅋ 좋네..좋아
### 일단 이전 attention하고 마찬가지로 데이터를 20000개로 샘플링하고 중복제거 최대 문장길이 날려줬기 때문에 대략 만 4천개 정도를 학습데이터로 잡아서 품질은 그렇게까지 좋다고는 못합니다만, 아무래도 핵심은 어텐션과 트랜스포머와의 비교였기 때문에
### 학습 속도부터 비교해 보면 어텐션에서는 학습속도가 매우 느리다는 문제가 있었습니다. 그게 한국어와 영어의 특성때문인지 원인은 알수없지만 트랜스포머를 학습에서부터 굉장히 강력한 퍼포먼스를 보였습니다. 작은 데이터셋에 학습률이 에폭당 너무 빨라서 적당한 에폭에도 오버피팅이 나버린..ㅠ 
### 데이터의 결과에서도 확연한 차이가 났습니다. 저 문장들이 사실 어텐션에서 너무 번역이 안되서 겨우겨우 비슷한 번역되는 몇개 찾은거랔ㅋㅋㅋㅋ 그거 찾는것도 힘들었는데 확실히 트랜스포머를 왠만한 문장도 다 먹히니 역시 압도적
### 끝으로 사실상 이번 프로젝트는 트랜스포머보다 텐서 다루기가 더 늘었던 플젝 하 진짜 텐서 연습해야되는데 너무빡셈 다음플젝까지해서 텐서 잡아놔야지

In [101]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
