<a href="https://colab.research.google.com/github/sonhs99/NLP/blob/main/implementation/GPT1_KoChat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-1 모델 구현 - Pretrain

사용 데이터: 인공적으로 제작된 한국어 문답 쌍 11,823개 — 이 중 마지막 1,000개는 테스트용으로 분리 (https://github.com/songys/Chatbot_data)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; MODEL_PATH below points into it.
drive.mount('/content/drive')

# Directory prefix for the weight and history files. It is a relative path, so
# it only lands inside the mount above while the working directory is /content.
MODEL_PATH = 'drive/MyDrive/weights/GPT1/'
# Name of an earlier checkpoint to load before training; None skips loading.
PREV_MODEL_NAME = None #'koGPTv3'
# Base file name used when saving this run's weights and training history.
MODEL_NAME = 'koGPTv5'

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import urllib.request
import pandas as pd

# Download the chatbot Q/A CSV into the working directory (re-fetched on every run).
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
# Load it into a DataFrame; the 'Q' and 'A' columns are used below.
train_data = pd.read_csv('ChatBotData.csv')
# Last expression of the cell: display the first rows.
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## 1. 데이터 정제 및 사전 구축

In [None]:
import re

# Data cleaning:
# detach punctuation marks from the words they are attached to.

def preprocess(sentence):
    """Surround every ?, ., ! and , with spaces, then trim both ends.

    Args:
        sentence: raw sentence string.

    Returns:
        The sentence with each of the four punctuation marks padded by one
        space on either side and leading/trailing whitespace removed.
    """
    spaced = re.sub(r"([?.!,])", r" \1 ", sentence)
    return spaced.strip()

# Clean every question and answer, keeping the corpus order so pairs stay aligned.
questions = [preprocess(text) for text in train_data['Q']]
answers = [preprocess(text) for text in train_data['A']]

# Preview the first five cleaned sentences of each column.
print(questions[:5])
print(answers[:5])

# The last 1000 pairs are held out for testing; everything before them is training data.
train_Q, test_Q = questions[:-1000], questions[-1000:]
train_A, test_A = answers[:-1000], answers[-1000:]

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']


In [None]:
import tensorflow_datasets as tfds

# Build the subword tokenizer from all cleaned questions and answers.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13
)

# Reserve three ids directly after the tokenizer's own vocabulary for the
# start, end and delimiter tokens. Each is kept as a one-element list so it
# can be concatenated with an encoded sentence.
start_token = [tokenizer.vocab_size]
end_token = [tokenizer.vocab_size + 1]
delimiter_token = [tokenizer.vocab_size + 2]

# Model vocabulary = tokenizer vocabulary + the three special ids above.
vocab_size = tokenizer.vocab_size + 3

print('시작 토큰 :', start_token)
print('종결 토큰 :', end_token)
print('사전 크기 :', vocab_size)

시작 토큰 : [8178]
종결 토큰 : [8179]
사전 크기 : 8181


In [None]:
# Sanity check: encode one cleaned question and decode it back.
print('원본 :', questions[20])
tokenized_string = tokenizer.encode(questions[20])
print('부호화 문자열 :', tokenized_string)
print('복호화 :', tokenizer.decode(tokenized_string))

원본 : 가스비 비싼데 감기 걸리겠어
부호화 문자열 : [5766, 611, 3509, 141, 685, 3747, 849]
복호화 : 가스비 비싼데 감기 걸리겠어


In [None]:
# Sequence length seen by the model. Sequences are padded to max_length + 1 so
# that dropping the last / first position yields input / target of max_length.
max_length = 40

def tokenize_and_filter(inputs, outputs):
    """Encode paired sentences, drop over-long pairs and pad the rest.

    Every sentence is encoded with ``tokenizer`` and wrapped in
    ``start_token`` / ``end_token``. A pair is kept only if both encoded
    sequences fit into ``max_length + 1`` ids; the surviving sequences are
    zero-padded on the right to exactly that length.

    Args:
        inputs: iterable of cleaned question strings.
        outputs: iterable of cleaned answer strings, aligned with ``inputs``.

    Returns:
        Tuple ``(tokenized_inputs, tokenized_outputs)`` of integer arrays,
        each of shape ``(kept_pairs, max_length + 1)``.
    """
    padded_length = max_length + 1
    tokenized_inputs, tokenized_outputs = [], []

    for s1, s2 in zip(inputs, outputs):
        s1 = start_token + tokenizer.encode(s1) + end_token
        s2 = start_token + tokenizer.encode(s2) + end_token

        # pad_sequences truncates from the front by default, which would
        # silently cut off the start token of an over-long sequence. Skip
        # such pairs instead of feeding malformed sequences to the model.
        if len(s1) > padded_length or len(s2) > padded_length:
            continue

        tokenized_inputs.append(s1)
        tokenized_outputs.append(s2)

    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=padded_length, padding='post'
    )
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=padded_length, padding='post'
    )

    return tokenized_inputs, tokenized_outputs

# NOTE: this rebinds train_Q / train_A from lists of strings to padded id
# arrays, so re-running this cell alone would pass arrays back to the tokenizer.
train_Q, train_A = tokenize_and_filter(train_Q, train_A)
print(train_Q.shape, train_A.shape)

(10823, 41) (10823, 41)


In [None]:
batch_size = 256
buffer_size = 20000

def create_dataset(questions, answers):
    """Build the cached, shuffled and batched next-token dataset.

    Question rows and answer rows are stacked along axis 0, so each sentence
    is an independent example. The feature is the sequence without its last
    position and the target is the sequence without its first position.

    Args:
        questions: padded id array of question sequences.
        answers: padded id array of answer sequences with the same width.

    Returns:
        A ``tf.data.Dataset`` yielding ``({'input': ...}, {'output': ...})``.
    """
    sequences = tf.concat([questions, answers], axis=0)

    features = {'input': sequences[:, :-1]}   # every position except the last one
    targets = {'output': sequences[:, 1:]}    # every position except the first one

    return (
        tf.data.Dataset.from_tensor_slices((features, targets))
        .cache()
        .shuffle(buffer_size)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

train_dataset = create_dataset(train_Q, train_A)

## 2. Custom Layer 선언

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a learned per-position embedding to its input.

    Args:
        sequence_length: number of positions in the embedding table.
        dim: size of each position vector.
    """

    def __init__(self, sequence_length, dim):
        super().__init__()
        self.sequence_length = sequence_length
        # Trainable table with one vector of size `dim` per position index.
        self.pos_vec = tf.keras.layers.Embedding(sequence_length, dim)

    def call(self, x):
        # Look up the full table, then keep only as many rows as x has
        # entries along axis 1 before adding.
        current_length = tf.shape(x)[1]
        all_positions = self.pos_vec(tf.range(self.sequence_length))
        return x + all_positions[:current_length]

In [None]:
def attention(query, key, value, mask):
    """Scaled dot-product attention.

    Args:
        query: query tensor.
        key: key tensor; its last-axis size sets the scaling factor.
        value: value tensor.
        mask: tensor added to the scores as ``mask * -1e9`` (1 = blocked),
            or None for no masking.

    Returns:
        The softmax-weighted combination of ``value``.
    """
    scale = tf.math.sqrt(tf.cast(tf.shape(key)[-1], dtype=tf.float32))
    scores = tf.matmul(query, key, transpose_b=True) / scale

    if mask is not None:
        # A large negative score drives the softmax weight of blocked positions to ~0.
        scores = scores + mask * -1e9

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

class AttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention built on the ``attention`` function.

    Args:
        num_head: number of attention heads; must divide ``dim``.
        dim: size of the projected query/key/value vectors and of the output.
    """

    def __init__(self, num_head, dim):
        super().__init__()

        assert dim % num_head == 0

        self.num_head = num_head
        self.dim = dim
        self.head_size = dim // num_head

        # Bias-free input projections, created in query/key/value order.
        self.query = tf.keras.layers.Dense(dim, use_bias=False)
        self.key = tf.keras.layers.Dense(dim, use_bias=False)
        self.value = tf.keras.layers.Dense(dim, use_bias=False)

        # Bias-free projection applied to the concatenated heads.
        self.w_o = tf.keras.layers.Dense(dim, use_bias=False)

    def split_value(self, x, batch_size):
        """Reshape [batch, seq, dim] into [batch, num_head, seq, head_size]."""
        per_head = tf.reshape(x, [batch_size, -1, self.num_head, self.head_size])
        return tf.transpose(per_head, [0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention to ``inputs = [query_src, key_src, value_src, mask]``."""
        query_src, key_src, value_src, mask = inputs[0], inputs[1], inputs[2], inputs[3]

        query = self.query(query_src)
        key = self.key(key_src)
        value = self.value(value_src)

        batch_size = tf.shape(query)[0]

        query = self.split_value(query, batch_size)
        key = self.split_value(key, batch_size)
        value = self.split_value(value, batch_size)

        heads = attention(query, key, value, mask)                   # [batch, num_head, seq, head_size]
        heads = tf.transpose(heads, perm=[0, 2, 1, 3])               # [batch, seq, num_head, head_size]
        merged = tf.reshape(heads, [batch_size, -1, self.dim])       # [batch, seq, dim]
        return self.w_o(merged)

In [None]:
class EmbeddingTransposeLayer(tf.keras.layers.Layer):
    """Output projection that reuses an embedding layer's weight matrix.

    The input is multiplied by the transpose of the embedding matrix, so the
    last axis goes from the embedding size to the embedding's vocabulary size.

    Args:
        embedding_layer: the embedding layer whose matrix is shared. It must
            already have its weights created when this layer is first called.
        **kargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embedding_layer: tf.keras.layers.Embedding, **kargs):
        super(EmbeddingTransposeLayer, self).__init__(**kargs)
        self.embeddings = embedding_layer

    def build(self, input_shape):
        # Share the embedding matrix instead of creating new weights.
        self.custom_weights = self.embeddings.weights[0]
        # Mark the layer as built via the base class. Assigning to
        # `self.build` would replace this method with a bool.
        super(EmbeddingTransposeLayer, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        # Only the last axis changes, so leading axes (batch, sequence, ...)
        # are passed through unchanged.
        return tuple(input_shape[:-1]) + (self.embeddings.input_dim,)

    def call(self, x):
        return tf.keras.backend.dot(
            x, tf.keras.backend.transpose(self.custom_weights))

## 3. Decoder 블록 및 GPT 모델 생성

In [None]:
def decoder_block(dif, num_head, dim, dropout):
    """Build one decoder block as a Keras model.

    The block is masked self-attention with a residual connection and layer
    normalization, followed by a two-layer feed-forward network with dropout,
    a second residual connection and layer normalization.

    Args:
        dif: hidden size of the feed-forward network.
        num_head: number of attention heads.
        dim: model dimension of the block's input and output.
        dropout: dropout rate applied to the feed-forward output.

    Returns:
        A ``tf.keras.Model`` taking ``[hidden_states, mask]``.
    """
    block_in = tf.keras.layers.Input([None, dim])
    mask_in = tf.keras.layers.Input([1, None, None])

    # Self-attention: the same tensor is used as query, key and value.
    attended = AttentionLayer(num_head, dim)([block_in, block_in, block_in, mask_in])
    normed = tf.keras.layers.LayerNormalization(epsilon=1e-6)(block_in + attended)

    # Position-wise feed-forward network.
    hidden = tf.keras.layers.Dense(dif, activation='gelu')(normed)
    projected = tf.keras.layers.Dense(dim)(hidden)
    projected = tf.keras.layers.Dropout(dropout)(projected)

    block_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(normed + projected)
    return tf.keras.Model(inputs=[block_in, mask_in], outputs=block_out)

def decoder(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Stack positional encoding, dropout and ``num_layers`` decoder blocks.

    Args:
        vocab_size: accepted for signature symmetry; not used in the body.
        num_layers: number of decoder blocks to stack.
        dif: feed-forward hidden size of each block.
        num_head: number of attention heads per block.
        dim: model dimension.
        dropout: dropout rate for the embeddings and inside each block.

    Returns:
        A ``tf.keras.Model`` named 'decoder' taking ``[embeddings, mask]``.
    """
    embedded = tf.keras.layers.Input([None, dim])
    mask_in = tf.keras.layers.Input([1, None, None])

    # The position table holds max_length * 2 + 3 entries.
    hidden = PositionalEncoding(max_length * 2 + 3, dim)(embedded)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    for _ in range(num_layers):
        hidden = decoder_block(dif, num_head, dim, dropout)([hidden, mask_in])

    return tf.keras.Model(inputs=[embedded, mask_in], outputs=hidden, name='decoder')

In [None]:
def padding_mask(x):
    """Return 1.0 where ``x`` equals 0 and 0.0 elsewhere.

    Two singleton axes are inserted after the first axis, turning a
    [batch, seq] input into a [batch, 1, 1, seq] float mask.
    """
    is_pad = tf.math.equal(x, 0)
    return tf.cast(is_pad, dtype=tf.float32)[:, tf.newaxis, tf.newaxis, :]

def look_ahead_mask(x):
    """Combine the causal mask with the padding mask of ``x``.

    Returns a float mask that is 1.0 wherever a position lies in the future
    (above the diagonal) or the attended-to token is padding.
    """
    length = tf.shape(x)[1]
    # Ones on and below the diagonal mark the positions that may be attended to.
    visible = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future = 1 - visible
    return tf.maximum(future, padding_mask(x))

def gpt_model(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Assemble the full decoder-only language model.

    Token ids are embedded, passed through the decoder stack together with
    the look-ahead mask, and projected to vocabulary logits by reusing the
    embedding matrix.

    Args:
        vocab_size: number of token ids, including the special tokens.
        num_layers: number of decoder blocks.
        dif: feed-forward hidden size.
        num_head: number of attention heads.
        dim: embedding / model dimension.
        dropout: dropout rate.

    Returns:
        A ``tf.keras.Model`` with input named 'input' and output layer named 'output'.
    """
    token_ids = tf.keras.layers.Input((None,), name='input')

    embedding_layer = tf.keras.layers.Embedding(vocab_size, dim)
    embedded = embedding_layer(token_ids)
    mask = tf.keras.layers.Lambda(look_ahead_mask, output_shape=[1, None, None])(token_ids)

    hidden = decoder(vocab_size, num_layers, dif, num_head, dim, dropout)([embedded, mask])

    # The output projection shares its weights with the input embedding.
    logits = EmbeddingTransposeLayer(embedding_layer, name='output')(hidden)
    return tf.keras.Model(inputs=token_ids, outputs=logits)

In [None]:
# Model hyperparameters.
dim = 768          # embedding / model dimension
num_layers = 12    # number of decoder blocks
num_head = 12      # attention heads per block
diff = 3072        # feed-forward hidden size (passed as `dif`)
dropout = 0.1

model = gpt_model(
    vocab_size=vocab_size,
    num_layers=num_layers,
    dif=diff,
    num_head=num_head,
    dim=dim,
    dropout=dropout,
)

# Resume from an earlier checkpoint when one is configured.
if PREV_MODEL_NAME:
    model.load_weights(f'{MODEL_PATH}{PREV_MODEL_NAME}.h5')

## 4. 모델 Compile 및 학습

In [None]:
class TransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by inverse-square-root decay.

    The learning rate at ``step`` is::

        dim ** -0.5 * min(step ** -0.5, step * warmup_step ** -1.5)

    Args:
        dim: model dimension used to scale the learning rate.
        warmup_step: step at which warm-up ends and decay begins.
    """

    def __init__(self, dim, warmup_step=4000):
        # Keep the raw value so get_config() returns something serializable;
        # d_model is the float tensor used in the computation.
        self.dim = dim
        self.d_model = tf.cast(dim, tf.float32)
        self.warmup_step = warmup_step

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        temp1 = tf.math.rsqrt(step)                   # decay branch
        temp2 = step * (self.warmup_step ** -1.5)     # warm-up branch

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(temp1, temp2)

    def get_config(self):
        # Keys match the __init__ parameters so the schedule can be rebuilt
        # from this config. Returning the raw `dim` rather than the cast
        # tensor keeps the config JSON-serializable.
        config = {
            'dim': self.dim,
            'warmup_step': self.warmup_step,
        }
        return config

In [None]:
# Alternative schedule, currently disabled: cosine decay with warm-up.
# lr = tf.keras.optimizers.schedules.CosineDecay(
#     initial_learning_rate=0,
#     decay_steps=2000,
#     warmup_target=2.5e-4,
#     warmup_steps=2000
# )
# Active schedule: TransformerSchedule scaled by the model dimension, with its
# default warm-up length.
lr = TransformerSchedule(dim)
opt = tf.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    """Per-position sparse categorical accuracy.

    ``y_true`` is reshaped to (-1, max_length) before comparison. No padding
    mask is applied, so padded positions are scored like any other.
    """
    labels = tf.reshape(y_true, shape=(-1, max_length))
    return tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)

def perplexity(y_true, y_pred):
    """Perplexity over non-padding tokens: exp(mean per-token cross-entropy).

    The mean is taken over real tokens only (label != 0). Averaging over
    every position, with padding counted as zero loss, would shrink the
    exponent by the padding ratio and understate the perplexity.

    Args:
        y_true: integer labels, reshaped here to (-1, max_length); 0 is padding.
        y_pred: logits over the vocabulary for every position.

    Returns:
        Scalar tensor; 1.0 when the batch contains no real tokens.
    """
    y_true = tf.reshape(y_true, shape=(-1, max_length))
    token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)

    # Guard the denominator so an all-padding batch yields exp(0) instead of NaN.
    token_count = tf.maximum(tf.reduce_sum(mask), 1.0)
    mean_token_loss = tf.reduce_sum(token_loss * mask) / token_count
    return tf.math.exp(mean_token_loss)

def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy with padded positions zeroed out.

    ``y_pred`` is treated as logits. Positions whose label is 0 contribute
    zero loss, and the result is the mean over all positions (padded ones
    included in the denominator).
    """
    labels = tf.reshape(y_true, shape=(-1, max_length))
    not_pad = tf.cast(tf.not_equal(labels, 0), tf.float32)

    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )
    per_token = cross_entropy(labels, y_pred)

    return tf.reduce_mean(per_token * not_pad)

# Compile with the masked loss and report accuracy and perplexity during training.
model.compile(
    optimizer=opt, loss=loss_function, metrics=[accuracy, perplexity])

In [None]:
# Print the layer table, expanding the nested decoder sub-models.
model.summary(expand_nested=True)

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 768)            6283008   ['input[0][0]']               
                                                                                                  
 lambda (Lambda)             (None, 1, None, None)        0         ['input[0][0]']               
                                                                                                  
 decoder (Functional)        (None, None, 768)            8508134   ['embedding[0][0]',           
                                                          4          'lambda[0][0]']       

In [None]:
# Train for at most 400 epochs. Early stopping monitors the training loss (no
# validation data is passed to fit) with a patience of 5 epochs, and only
# starts checking from epoch 100.
epoch = 400
history = model.fit(train_dataset, epochs=epoch, callbacks=[
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, start_from_epoch=100)
])

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

In [None]:
import pandas as pd

# Store the trained weights next to any earlier checkpoints.
model.save_weights(f'{MODEL_PATH}{MODEL_NAME}.h5', save_format='h5')

# Store the per-epoch training metrics as JSON under the same base name.
hist = pd.DataFrame(history.history)
hist_filename = f'{MODEL_NAME}.json'
with open(f'{MODEL_PATH}{hist_filename}', 'w') as history_file:
    hist.to_json(history_file)

## 5. Test

In [None]:
def eval(tokens):
    """Greedily extend a prompt one token at a time.

    NOTE: this name shadows Python's built-in ``eval`` within the notebook.

    Args:
        tokens: list of token ids for the prompt, without the start token.

    Returns:
        1-D int tensor holding the start token, the prompt and the generated
        ids. Generation stops after ``max_length`` steps or as soon as the end
        token is predicted; the end token itself is not appended.
    """
    generated = tf.expand_dims(start_token + tokens, 0)
    for _ in range(max_length):
        logits = model(inputs=generated, training=False)
        # Only the prediction for the last position decides the next token.
        last_step = logits[:, -1:, :]
        next_token = tf.cast(tf.argmax(last_step, axis=-1), dtype=tf.int32)

        if tf.equal(next_token, end_token[0]):
            break

        generated = tf.concat([generated, next_token], axis=-1)
    return tf.squeeze(generated, axis=0)

def pred(sentence):
    """Generate, print and return the model's continuation of ``sentence``.

    Args:
        sentence: raw prompt string; it is cleaned with ``preprocess`` first.

    Returns:
        The decoded text of the prompt plus the generated continuation, with
        special-token ids removed.
    """
    sentence = preprocess(sentence)
    tokenized = tokenizer.encode(sentence)

    # Convert the eager tensor to a NumPy array so the ids print as a plain
    # array and are handed to the tokenizer as Python ints, not tensor scalars.
    answer_token = eval(tokenized).numpy()
    answer = tokenizer.decode(
        # Ids at or above the tokenizer's vocabulary size are the special
        # tokens added in this notebook and cannot be decoded.
        [int(token_id) for token_id in answer_token if token_id < tokenizer.vocab_size]
    )

    print(f"단어: {sentence}")
    print(f'생성(Token): {answer_token}')
    print(f'생성: {answer}')

    return answer

In [None]:
# Generate a continuation for a one-word prompt.
pred('라면')

단어: 라면
생성(Token): [8178 4423 8157 8102 8059   31 7356 7954 2380   32    1]
생성: 라면봉지 레시피 그대로 해보세요 .


'라면봉지 레시피 그대로 해보세요 .'

In [None]:
# Generate a continuation for a second one-word prompt.
pred('밥')

단어: 밥
생성(Token): [8178 1330 4192  456  505    1]
생성: 밥심으로 사는 거죠 .


'밥심으로 사는 거죠 .'

In [None]:
# Generate a continuation for a third one-word prompt.
pred('땅')

단어: 땅
생성(Token): [8178 2662  657 1322]
생성: 땅이나 살까


'땅이나 살까'