<a href="https://colab.research.google.com/github/sonhs99/NLP/blob/main/implementation/Transformer_KoChat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer 모델 구현

사용 데이터: 인공적으로 제작된 문답 쌍 11,876개 (https://github.com/songys/Chatbot_data)

In [None]:
# Optional Google Drive mount (Colab only). Left disabled here; enable it when
# the weights should be read from / written to Drive.
# from google.colab import drive
# drive.mount('/content/drive')

# Directory (relative to the working directory) that holds the weight (.h5)
# and training-history (.json) files.
# NOTE(review): presumably resolves inside the Drive mount when the working
# directory is /content — confirm.
MODEL_PATH = 'drive/MyDrive/weights/Transformer/'
# Checkpoint restored right after the model is built; a falsy value (e.g. '')
# skips the restore.
PREV_MODEL_NAME = 'TRv1-0'
# Base file name used when the weights and history are saved after training.
MODEL_NAME = 'TRv1-0'

In [None]:
import tensorflow as tf
import numpy as np
import urllib.request
import pandas as pd

# Download the chatbot Q/A corpus into the working directory (re-downloaded on
# every run), then load it into a DataFrame.
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
# Last expression of the cell: show the first rows.
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## 1. 데이터 정제 및 사전 구축

In [None]:
import re

# Data cleaning:
# detach punctuation marks from the words they are attached to.

def preprocess(sentence):
    """Surround each of ? . ! , with spaces and trim the ends of the sentence.

    Inner whitespace is not collapsed, so a mark that already had a space next
    to it ends up with two.
    """
    spaced = re.sub(r"([?.!,])", r" \1 ", sentence)
    return spaced.strip()

# Clean every question and every answer of the corpus.
questions = [preprocess(raw_q) for raw_q in train_data['Q']]
answers = [preprocess(raw_a) for raw_a in train_data['A']]

print(questions[:5])
print(answers[:5])

# The last 1,000 pairs are held out for evaluation; the rest is training data.
train_Q, train_A = questions[:-1000], answers[:-1000]
test_Q, test_A = questions[-1000:], answers[-1000:]

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']


In [None]:
import tensorflow_datasets as tfds

# Build the subword tokenizer from the cleaned questions and answers.
# NOTE(review): `questions + answers` still contains the last 1,000 pairs that
# were split off as test_Q / test_A, so the vocabulary is fitted on the
# held-out split as well.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13
)

# Extend the vocabulary:
# the start and end tokens take the two ids right after the tokenizer's own ids.
start_token, end_token = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
vocab_size = tokenizer.vocab_size + 2

print('시작 토큰 :', start_token)
print('종결 토큰 :', end_token)
print('사전 크기 :', vocab_size)

시작 토큰 : [8178]
종결 토큰 : [8179]
사전 크기 : 8180


In [None]:
# Sanity check: encode one cleaned question and decode it back.
print('원본 :', questions[20])
tokenized_string = tokenizer.encode(questions[20])
print('부호화 문자열 :', tokenized_string)
print('복호화 :', tokenizer.decode(tokenized_string))

원본 : 가스비 비싼데 감기 걸리겠어
부호화 문자열 : [5766, 611, 3509, 141, 685, 3747, 849]
복호화 : 가스비 비싼데 감기 걸리겠어


In [None]:
# Maximum sequence length in tokens, start and end tokens included.
max_length = 40

def tokenize_and_filter(inputs, outputs):
    """Encode sentence pairs, drop over-long pairs and pad the rest.

    Each sentence is encoded with the module-level `tokenizer` and wrapped in
    `start_token` / `end_token`. A pair is kept only when both encoded
    sequences fit into `max_length` tokens.

    Args:
        inputs: iterable of cleaned question strings.
        outputs: iterable of cleaned answer strings, aligned with `inputs`.

    Returns:
        A pair of integer arrays, each of shape (kept_pairs, max_length),
        zero-padded on the right.
    """
    tokenized_inputs, tokenized_outputs = [], []

    for s1, s2 in zip(inputs, outputs):
        s1 = start_token + tokenizer.encode(s1) + end_token
        s2 = start_token + tokenizer.encode(s2) + end_token

        # pad_sequences truncates from the front by default, which would cut
        # off the start token of an over-long sequence. Filter such pairs out
        # instead of feeding the model sequences without a start token.
        if len(s1) > max_length or len(s2) > max_length:
            continue

        tokenized_inputs.append(s1)
        tokenized_outputs.append(s2)

    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=max_length, padding='post'
    )
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=max_length, padding='post'
    )

    return tokenized_inputs, tokenized_outputs

# Replace the raw training sentences with their padded token-id matrices.
# NOTE: this rebinds train_Q / train_A, so the cell is not idempotent — running
# it a second time would pass the id matrices back into tokenizer.encode.
train_Q, train_A = tokenize_and_filter(train_Q, train_A)
print(train_Q, train_A)

[[8178 7915 4207 ...    0    0    0]
 [8178 7971   47 ...    0    0    0]
 [8178 7973 1435 ...    0    0    0]
 ...
 [8178 2537 4180 ...    0    0    0]
 [8178 2537  166 ...    0    0    0]
 [8178 4096   93 ...    0    0    0]] [[8178 3844   74 ...    0    0    0]
 [8178 1830 5502 ...    0    0    0]
 [8178 3400  777 ...    0    0    0]
 ...
 [8178 1413 5134 ...    0    0    0]
 [8178 1255 2941 ...    0    0    0]
 [8178  822 6535 ...    0    0    0]]


In [None]:
# Number of pairs per training batch and size of the shuffle buffer.
batch_size = 256
buffer_size = 20000

def create_dataset(questions, answers):
    """Turn padded question/answer id matrices into a batched tf.data pipeline.

    The decoder is fed the answer without its last position and is trained to
    predict the answer without its first position (the start token), i.e. the
    target is the decoder input shifted left by one.

    Returns:
        A cached, shuffled, batched and prefetched dataset whose elements are
        ({'input', 'dec_inputs'}, {'outputs'}) dictionaries.
    """
    features = {
        'input': questions,
        'dec_inputs': answers[:, :-1],  # last position dropped
    }
    targets = {
        'outputs': answers[:, 1:],  # start token dropped
    }

    return (
        tf.data.Dataset.from_tensor_slices((features, targets))
        .cache()
        .shuffle(buffer_size)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

train_dataset = create_dataset(train_Q, train_A)

## 2. Custom Layer 선언

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Layer that adds a fixed sinusoidal position table to its input.

    Args:
        position: number of positions (rows) to precompute.
        dim: size of the last axis of the inputs the table is added to.
    """

    def __init__(self, position, dim):
        super(PositionalEncoding, self).__init__()
        # Computed once at construction time; shape (1, position, dim).
        self.pos_vec = self.get_vec(position, dim)

    def get_vec(self, pos, dim):
        """Return the (1, pos, dim) float32 table: sine in even columns, cosine in odd ones."""
        row_idx = tf.range(pos, dtype=tf.float32)[:, tf.newaxis]
        col_idx = tf.range(dim, dtype=tf.float32)[tf.newaxis, :]
        # Columns 2k and 2k+1 share the same rate 1 / 10000^(2k / dim).
        angles = row_idx / tf.pow(10000, 2 * (col_idx // 2) / tf.cast(dim, tf.float32))

        # Pick sin(angle) for even column indices and cos(angle) for odd ones.
        even_column = tf.equal(tf.range(dim) % 2, 0)[tf.newaxis, :]
        table = tf.where(even_column, tf.math.sin(angles), tf.math.cos(angles))

        # Leading axis of size 1 so the table broadcasts over the batch.
        return tf.cast(table[tf.newaxis, :], tf.float32)

    def call(self, x):
        # Only the first seq_len rows of the table are needed for this input.
        seq_len = tf.shape(x)[1]
        return x + self.pos_vec[:, :seq_len, :]

In [None]:
from tensorflow.keras.layers import Dense

def attention(query, key, value, mask):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two axes are (length, depth).
        mask: tensor broadcastable to the score matrix with 1 at positions
            that must not be attended to, or None for no masking.

    Returns:
        The attention-weighted sum of `value`.
    """
    key_depth = tf.cast(tf.shape(key)[-1], dtype=tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # A large negative score becomes (almost) zero weight after softmax.
        scores = scores + mask * -1e9

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

class AttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention built on top of `attention`.

    Args:
        num_head: number of attention heads; must divide `dim`.
        dim: model width; every head works on dim // num_head channels.
    """

    def __init__(self, num_head, dim):
        super(AttentionLayer, self).__init__()

        assert dim % num_head == 0

        self.num_head = num_head
        self.dim = dim
        self.head_size = dim // num_head

        # Input projections followed by the output projection.
        self.query = Dense(dim)
        self.key = Dense(dim)
        self.value = Dense(dim)

        self.w_o = Dense(dim)

    def split_value(self, x, batch_size):
        """Reshape [batch, seq_len, dim] into [batch, num_head, seq_len, head_size]."""
        per_head = tf.reshape(x, [batch_size, -1, self.num_head, self.head_size])
        return tf.transpose(per_head, [0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention to `inputs` = [query source, key source, value source, mask]."""
        query_src, key_src, value_src, mask = inputs

        query = self.query(query_src)
        key = self.key(key_src)
        value = self.value(value_src)

        batch_size = tf.shape(query)[0]

        query = self.split_value(query, batch_size)
        key = self.split_value(key, batch_size)
        value = self.split_value(value, batch_size)

        heads = attention(query, key, value, mask)        # [batch, num_head, seq_len, head_size]
        heads = tf.transpose(heads, perm=[0, 2, 1, 3])    # [batch, seq_len, num_head, head_size]
        # Concatenate the heads again: num_head * head_size == dim.
        merged = tf.reshape(heads, [batch_size, -1, self.dim])
        return self.w_o(merged)

## 3. Encoder-Decoder 블록 및 Transformer 모델 생성

In [None]:
def encoder_block(dif, num_head, dim, dropout):
    """Build one encoder layer as a standalone Keras model.

    Args:
        dif: width of the hidden layer of the feed-forward network.
        num_head: number of attention heads.
        dim: model width.
        dropout: dropout rate applied to each sub-layer output.

    Returns:
        A model mapping [sequence, padding mask] to the transformed sequence.
    """
    block_in = tf.keras.layers.Input([None, dim])
    pad_mask = tf.keras.layers.Input([1, 1, None])

    # Sub-layer 1: self-attention -> dropout -> residual connection + layer norm.
    self_attn = AttentionLayer(num_head, dim)([block_in, block_in, block_in, pad_mask])
    self_attn = tf.keras.layers.Dropout(dropout)(self_attn)
    hidden = tf.keras.layers.LayerNormalization(epsilon=1e-6)(block_in + self_attn)

    # Sub-layer 2: feed-forward network -> dropout -> residual connection + layer norm.
    ffn = Dense(dif, activation='relu')(hidden)
    ffn = Dense(dim)(ffn)
    ffn = tf.keras.layers.Dropout(dropout)(ffn)
    block_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(hidden + ffn)

    return tf.keras.Model(inputs=[block_in, pad_mask], outputs=block_out)

def encoder(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Build the encoder stack: embedding, positional encoding and `num_layers` blocks.

    Returns:
        A model named 'encoder' mapping [token ids, padding mask] to a
        sequence of `dim`-wide vectors.
    """
    token_ids = tf.keras.layers.Input([None,])
    pad_mask = tf.keras.layers.Input([1, 1, None])

    embedded = tf.keras.layers.Embedding(vocab_size, dim)(token_ids)
    # Scale by sqrt(dim) so the embeddings are not learned too small.
    embedded = embedded * tf.math.sqrt(tf.cast(dim, dtype=tf.float32))
    # NOTE(review): the position table is sized by vocab_size although only its
    # first seq_len rows are ever used; a maximum sequence length would
    # presumably be enough — confirm before changing.
    with_position = PositionalEncoding(vocab_size, dim)(embedded)
    hidden = tf.keras.layers.Dropout(dropout)(with_position)

    for _ in range(num_layers):
        hidden = encoder_block(dif, num_head, dim, dropout)([hidden, pad_mask])

    return tf.keras.Model(inputs=[token_ids, pad_mask], outputs=hidden, name='encoder')

In [None]:
def decoder_block(dif, num_head, dim, dropout):
    """Build one decoder layer as a standalone Keras model.

    The layer consists of three sub-layers, each followed by dropout, a
    residual connection and layer normalisation: masked self-attention,
    attention over the encoder output, and a feed-forward network.

    Args:
        dif: width of the hidden layer of the feed-forward network.
        num_head: number of attention heads.
        dim: model width.
        dropout: dropout rate applied to each sub-layer output.

    Returns:
        A model mapping [decoder sequence, encoder output, padding mask,
        look-ahead mask] to the transformed decoder sequence.
    """
    input = tf.keras.layers.Input([None, dim])
    encoder_input = tf.keras.layers.Input([None, dim])
    padding_input = tf.keras.layers.Input([1, 1, None])
    mask_input = tf.keras.layers.Input([1, None, None])

    # Sub-layer 1: masked self-attention. Dropout is applied here as well so
    # that all sub-layers are regularised the same way as in encoder_block.
    mask_attention_x = AttentionLayer(num_head, dim)([input, input, input, mask_input])
    mask_attention_x = tf.keras.layers.Dropout(dropout)(mask_attention_x)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(input + mask_attention_x)

    # Sub-layer 2: attention over the encoder output (queries come from the decoder).
    attention_x = AttentionLayer(num_head, dim)([x, encoder_input, encoder_input, padding_input])
    attention_x = tf.keras.layers.Dropout(dropout)(attention_x)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + attention_x)

    # Sub-layer 3: position-wise feed-forward network.
    ffnn_x = Dense(dif, activation='relu')(x)
    ffnn_x = Dense(dim)(ffnn_x)

    ffnn_x = tf.keras.layers.Dropout(dropout)(ffnn_x)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + ffnn_x)
    return tf.keras.Model(inputs=[input, encoder_input, padding_input, mask_input], outputs=x)

def decoder(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Build the decoder stack: embedding, positional encoding and `num_layers` blocks.

    Returns:
        A model named 'decoder' mapping [token ids, encoder output, padding
        mask, look-ahead mask] to a sequence of `dim`-wide vectors.
    """
    token_ids = tf.keras.layers.Input([None,])
    memory = tf.keras.layers.Input([None, dim])
    pad_mask = tf.keras.layers.Input([1, 1, None])
    causal_mask = tf.keras.layers.Input([1, None, None])

    embedded = tf.keras.layers.Embedding(vocab_size, dim)(token_ids)
    # Same sqrt(dim) scaling of the embeddings as in the encoder.
    embedded = embedded * tf.math.sqrt(tf.cast(dim, dtype=tf.float32))
    # NOTE(review): the position table is sized by vocab_size although only its
    # first seq_len rows are ever used — confirm before changing.
    with_position = PositionalEncoding(vocab_size, dim)(embedded)
    hidden = tf.keras.layers.Dropout(dropout)(with_position)

    for _ in range(num_layers):
        hidden = decoder_block(dif, num_head, dim, dropout)([hidden, memory, pad_mask, causal_mask])

    return tf.keras.Model(inputs=[token_ids, memory, pad_mask, causal_mask], outputs=hidden, name='decoder')

In [None]:
def padding_mask(x):
    """Return a [batch, 1, 1, seq_len] float mask that is 1 where `x` holds the padding id 0."""
    is_padding = tf.math.equal(x, 0)
    return tf.cast(is_padding, dtype=tf.float32)[:, tf.newaxis, tf.newaxis, :]

def look_ahead_mask(x):
    """Return a mask that is 1 at future positions and at padding positions of `x`."""
    length = tf.shape(x)[1]
    # band_part(-1, 0) keeps the lower triangle (diagonal included), so the
    # complement marks the strictly-upper triangle, i.e. the future positions.
    visible = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future = 1 - visible
    return tf.maximum(future, padding_mask(x))

def transformer(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Assemble the full encoder-decoder model.

    Inputs are named 'input' (question ids) and 'dec_inputs' (shifted answer
    ids); the final layer, named 'outputs', produces one logit per vocabulary
    entry for every decoder position.
    """
    enc_tokens = tf.keras.layers.Input([None,], name='input')
    dec_tokens = tf.keras.layers.Input([None,], name='dec_inputs')

    # Both padding masks are computed from the encoder tokens: one for the
    # encoder's self-attention, one for the decoder's attention over the
    # encoder output.
    enc_pad_mask = tf.keras.layers.Lambda(padding_mask, output_shape=[1, 1, None])(enc_tokens)
    cross_pad_mask = tf.keras.layers.Lambda(padding_mask, output_shape=[1, 1, None])(enc_tokens)

    # Hides future and padding positions in the decoder's self-attention.
    causal_mask = tf.keras.layers.Lambda(look_ahead_mask, output_shape=[1, None, None])(dec_tokens)

    encoder_model = encoder(vocab_size, num_layers, dif, num_head, dim, dropout)
    memory = encoder_model([enc_tokens, enc_pad_mask])

    decoder_model = decoder(vocab_size, num_layers, dif, num_head, dim, dropout)
    decoded = decoder_model([dec_tokens, memory, cross_pad_mask, causal_mask])

    logits = tf.keras.layers.Dense(vocab_size, name='outputs')(decoded)
    return tf.keras.Model(inputs=[enc_tokens, dec_tokens], outputs=logits)


In [None]:
# Model hyper-parameters.
# dim: model width; must be divisible by num_head (asserted in AttentionLayer).
dim = 512
# num_layers: number of blocks stacked in the encoder and in the decoder.
num_layers = 6
# num_head: number of attention heads per attention layer.
num_head = 8
# diff: hidden width of the feed-forward networks (passed on as `dif`).
diff = 2048
# dropout: dropout rate used throughout the model.
dropout = 0.1

model = transformer(
    vocab_size=vocab_size,
    num_layers=num_layers,
    dif=diff,
    dim=dim,
    num_head=num_head,
    dropout=dropout
)

# Resume from a previous checkpoint when PREV_MODEL_NAME is non-empty.
# NOTE(review): presumably raises when the file is missing (e.g. Drive not
# mounted) — confirm; set PREV_MODEL_NAME to '' to train from scratch.
if PREV_MODEL_NAME: model.load_weights(MODEL_PATH+PREV_MODEL_NAME+'.h5')

## 4. 모델 Compile 및 학습

In [None]:
class TransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warmup followed by inverse-sqrt decay.

    lr(step) = dim ** -0.5 * min(step ** -0.5, step * warmup_step ** -1.5)

    Args:
        dim: model width used to scale the learning rate.
        warmup_step: number of steps over which the rate grows linearly.
    """

    def __init__(self, dim, warmup_step=4000):
        # Keep the plain Python value so get_config() stays serialisable;
        # d_model is the float32 tensor used in the computation.
        self.dim = dim
        self.d_model = tf.cast(dim, tf.float32)
        self.warmup_step = warmup_step

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # At step 0 rsqrt gives inf while the warmup term is 0, so the minimum
        # (and therefore the learning rate) is 0.
        decay = tf.math.rsqrt(step)
        warmup = step * (self.warmup_step ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay, warmup)

    def get_config(self):
        """Return the constructor arguments as plain Python values."""
        config = {
            'dim': self.dim,
            'warmup_step': self.warmup_step,
        }
        return config

In [None]:
# Adam optimizer whose learning rate follows the warmup schedule for this
# model width.
lr = TransformerSchedule(dim)
opt = tf.optimizers.Adam(lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    """Token-level accuracy over non-padding positions only.

    Args:
        y_true: target ids, reshaped to (batch, max_length - 1); 0 is padding.
        y_pred: logits of shape (batch, max_length - 1, vocab_size).

    Returns:
        Scalar: correct predictions at real-token positions divided by the
        number of real-token positions in the batch.
    """
    y_true = tf.reshape(y_true, shape=(-1, max_length - 1))
    matches = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    # Ignore padding positions, exactly as the loss does; otherwise the metric
    # is dominated by positions the model is never trained on.
    mask = tf.cast(tf.not_equal(y_true, 0), matches.dtype)
    # Guard against a batch without any real token.
    return tf.reduce_sum(matches * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)

def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Args:
        y_true: target ids, reshaped to (batch, max_length - 1); 0 is padding.
        y_pred: logits of shape (batch, max_length - 1, vocab_size).

    Returns:
        Scalar mean loss per real (non-padding) target token.
    """
    y_true = tf.reshape(y_true, shape=(-1, max_length - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    # Normalise by the number of real tokens rather than by all positions, so
    # the loss value does not depend on how much padding a batch contains.
    # The maximum guards against a batch without any real token.
    return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)

# Compile with the custom masked loss and the token-accuracy metric.
model.compile(
    optimizer=opt, loss=loss_function, metrics=[accuracy])

In [None]:
# Print the layer table, expanding the nested encoder / decoder models.
model.summary(expand_nested=True)

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, None)]               0         []                            
                                                                                                  
 dec_inputs (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 lambda (Lambda)             (None, 1, 1, None)           0         ['input[0][0]']               
                                                                                                  
 encoder (Functional)        (None, None, 512)            2310246   ['input[0][0]',               
                                                          4          'lambda[0][0]']       

In [None]:
# Number of training epochs. No callbacks are passed to fit(), so there is no
# early stopping or intermediate checkpointing.
epoch = 1000
# `history` is used by the save cell to write the per-epoch metrics.
history = model.fit(train_dataset, epochs=epoch)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
import pandas as pd

# Create the target directory (and its parents) first; succeeds if it already
# exists. Without this the save fails — after the whole training run — when
# the directory is missing.
tf.io.gfile.makedirs(MODEL_PATH)

# Persist the trained weights.
model.save_weights(MODEL_PATH + MODEL_NAME+'.h5', save_format='h5')

# Persist the per-epoch loss / metric values next to the weights.
hist = pd.DataFrame(history.history)
hist_filename = MODEL_NAME + '.json'
with open(MODEL_PATH + hist_filename, 'w') as file:
    hist.to_json(file)

## 5. Test

In [None]:
def eval(input_token):
    """Greedily decode an answer for one encoded question.

    Note: the name shadows the built-in `eval`; it is kept because other cells
    call it.

    Args:
        input_token: question ids of shape (1, length), start/end tokens included.

    Returns:
        1-D tensor of generated ids, beginning with the start token and
        without the end token.
    """
    decoded = tf.expand_dims(start_token, 0)
    for _ in range(max_length):
        logits = model(inputs=[input_token, decoded], training=False)
        # Only the distribution at the newest position matters.
        last_step = logits[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_step, axis=-1), dtype=tf.int32)

        # Stop as soon as the model emits the end token.
        if tf.equal(predicted_id, end_token[0]):
            break

        decoded = tf.concat([decoded, predicted_id], axis=-1)
    return tf.squeeze(decoded, axis=0)

def pred(sentence):
    """Answer a raw question string, print the result and return the answer text."""
    sentence = preprocess(sentence)
    encoded = start_token + tokenizer.encode(sentence) + end_token
    tokenized = tf.expand_dims(encoded, axis=0)

    answer_token = eval(tokenized)
    # Ids at or above the tokenizer's vocabulary size are the start/end
    # tokens, which the tokenizer cannot decode.
    word_ids = [x for x in answer_token if x < tokenizer.vocab_size]
    answer = tokenizer.decode(word_ids)

    print(f"질문: {sentence}")
    print(f'정답(Token): {answer_token}')
    print(f'정답: {answer}')

    return answer

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Q collects the predicted token-id sequences (hypotheses); A collects, per
# sample, a list with the single reference token-id sequence.
Q, A = [], []
for test_q, test_a in tqdm(zip(test_Q, test_A), total=len(test_Q)):
    # test_Q / test_A were sliced from the already-cleaned `questions` /
    # `answers`, so they are encoded directly. Running preprocess() a second
    # time would insert extra spaces around punctuation and produce a
    # tokenisation the model was not trained on.
    test_q = tf.expand_dims(start_token + tokenizer.encode(test_q) + end_token, axis=0)
    # Drop the start token (ids >= vocab_size are the special tokens).
    pred_token = [x for x in eval(test_q).numpy() if x < tokenizer.vocab_size]
    Q.append(pred_token)

    A.append([tokenizer.encode(test_a)])

1000it [41:29,  2.49s/it]


In [None]:
# Corpus-level BLEU-1 .. BLEU-4 with exactly uniform weights over the first n
# n-gram orders (1/n each, so the weights always sum to 1).
for n in range(1, 5):
    bleu_weights = tuple(1 / n if order < n else 0 for order in range(4))
    print(f'BLEU-{n} :', corpus_bleu(A, Q, weights=bleu_weights))

BLEU-1 : 0.008939462117329095
BLEU-2 : 0.0036108485651868876
BLEU-3 : 0.0019243356049317322
BLEU-4 : 0.0011447194277632124


In [None]:
# Spot-check with a one-word prompt.
# NOTE(review): the recorded output below shows the question '영화 볼래 ?', which
# this call cannot print — the output is stale; re-run the cell to refresh it.
pred('영화')

질문: 영화 볼래 ?
정답(Token): [8178 6072 1089   27   75   18   22    1]
정답: 최신 영화가 좋을 것 같아요 .


'최신 영화가 좋을 것 같아요 .'

In [None]:
# Spot-check with a one-word prompt.
# NOTE(review): the recorded output below shows the question '밥 먹고 싶어', which
# this call cannot print — the output is stale; re-run the cell to refresh it.
pred('밥')

질문: 밥 먹고 싶어
정답(Token): [8178  842    8  126    1]
정답: 맛있는 거 드세요 .


'맛있는 거 드세요 .'

In [None]:
# Spot-check with a one-word prompt.
# NOTE(review): the recorded output below shows the question '곧 연말이야', which
# this call cannot print — the output is stale; re-run the cell to refresh it.
pred('곧')

질문: 곧 연말이야
정답(Token): [8178 1202  213    1]
정답: 혼자가 아니에요 .


'혼자가 아니에요 .'