<a href="https://colab.research.google.com/github/sonhs99/NLP/blob/main/implementation/GPT1_KoChat_ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT - 1 모델 구현 - Fine-tuning

사용 데이터: 인공적으로 제작된 문답 쌍 11,876개 (https://github.com/songys/Chatbot_data) — 단, 이 노트북에서 실제로 로드되어 학습에 사용되는 행 수는 11,823개입니다.

In [None]:
from google.colab import drive

# Mount Google Drive so checkpoints can be read from and written to it.
drive.mount('/content/drive')

# Directory on the mounted Drive that holds the weight files. Anchored at the
# mount point used above so it no longer depends on the current working
# directory of the kernel.
MODEL_PATH = '/content/drive/MyDrive/weights/GPT1/'
# Checkpoint loaded into the combined model when non-empty (resume training).
# It currently equals MODEL_NAME, so saving later overwrites the same file.
PREV_MODEL_NAME = 'koGPTv5-chatbot2'
# Pre-trained GPT checkpoint; only loaded when PREV_MODEL_NAME is empty.
GPT_MODEL_NAME = 'koGPTv5'
# Base name used when saving this run's weights (.h5) and history (.json).
MODEL_NAME = 'koGPTv5-chatbot2'

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import urllib.request
import pandas as pd

# Download the chatbot question/answer CSV into the working directory and load
# it; the frame has the columns Q (question), A (answer) and label.
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


## 1. 데이터 정제 및 사전 구축

In [None]:
import re
import random

# Data cleaning:
# detach punctuation marks from the words they are glued to.

def preprocess(sentence):
    """Surround every `?`, `.`, `!` and `,` with spaces and trim the ends.

    Runs of spaces produced by adjacent punctuation are kept as they are.
    """
    spaced = re.sub(r"([?.!,])", r" \1 ", sentence)
    return spaced.strip()

# Normalise every question and answer and coerce the labels to int.
# Comprehensions replace the append loops and, unlike module-level `for`
# loops, do not leave their loop variables behind in the notebook namespace.
questions = [preprocess(text) for text in train_data['Q']]
answers = [preprocess(text) for text in train_data['A']]
labels = [int(value) for value in train_data['label']]

print(questions[:5])
print(answers[:5])
print(labels[:5])

# Optional shuffled hold-out split (disabled):
# dataset = list(zip(questions, answers, labels))
# random.shuffle(dataset)
# questions, answers, labels = zip(*dataset)

# train_Q, train_A, train_l = questions[:-1000], answers[:-1000], labels[:-1000]
# test_Q, test_A, test_l = questions[-1000:], answers[-1000:], labels[-1000:]

# With the split disabled, the whole corpus is used for training.
train_Q, train_A, train_l = questions, answers, labels

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']
[0, 0, 0, 0, 0]


In [None]:
import tensorflow_datasets as tfds

# Build the subword tokenizer from every question and answer in the corpus.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers,
    target_vocab_size=2**13,
)

# Extend the vocabulary with three special ids placed right after the
# tokenizer's own range: start, end and question/answer delimiter.
# Each is kept as a one-element list so it can be concatenated to id lists.
start_token = [tokenizer.vocab_size]
end_token = [tokenizer.vocab_size + 1]
delimiter_token = [tokenizer.vocab_size + 2]
vocab_size = tokenizer.vocab_size + 3

print('시작 토큰 :', start_token)
print('종결 토큰 :', end_token)
print('사전 크기 :', vocab_size)

시작 토큰 : [8178]
종결 토큰 : [8179]
사전 크기 : 8181


In [None]:
# Round-trip check: encode one question to subword ids and decode it back.
print('원본 :', questions[20])
tokenized_string = tokenizer.encode(questions[20])
print('부호화 문자열 :', tokenized_string)
print('복호화 :', tokenizer.decode(tokenized_string))

원본 : 가스비 비싼데 감기 걸리겠어
부호화 문자열 : [5766, 611, 3509, 141, 685, 3747, 849]
복호화 : 가스비 비싼데 감기 걸리겠어


In [None]:
max_length = 40

def tokenize_and_filter(inputs, outputs):
    """Turn question/answer strings into padded model inputs and labels.

    For each pair the input row is
    ``start + question_ids + delimiter + answer_ids + end`` and the label row
    is zero for the start/question positions followed by
    ``answer_ids + end``. Label position ``t`` therefore holds the token found
    at input position ``t + 1`` for the answer part, and 0 elsewhere (0 is
    treated as "ignore" by the loss mask).

    Despite the name, no sample is dropped: rows are padded with zeros (or
    truncated) to ``2 * max_length + 3`` columns.

    Args:
        inputs: iterable of question strings.
        outputs: iterable of answer strings, aligned with ``inputs``.

    Returns:
        Tuple ``(sequence, label)`` of 2-D integer arrays with
        ``2 * max_length + 3`` columns.
    """
    padded_len = 2 * max_length + 3
    sequence, label = [], []

    for question, answer in zip(inputs, outputs):
        question_ids = tokenizer.encode(question)
        answer_ids = tokenizer.encode(answer)

        sequence.append(
            start_token + question_ids + delimiter_token + answer_ids + end_token
        )
        label.append([0] * (len(question_ids) + 1) + answer_ids + end_token)

    # truncating='post' keeps the head of over-long rows. The default ('pre')
    # would cut off the start token and, because the input row is one token
    # longer than its label row, shift inputs and labels out of alignment.
    sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=padded_len, padding='post', truncating='post'
    )
    label = tf.keras.preprocessing.sequence.pad_sequences(
        label, maxlen=padded_len, padding='post', truncating='post'
    )

    return sequence, label

# NOTE: `train_l` is rebound here. From this point on it is the padded
# next-token label matrix returned by tokenize_and_filter, not the integer
# `label` column it was assigned from earlier.
train_S, train_l = tokenize_and_filter(train_Q, train_A)
# test_S = tokenize_and_filter(test_Q, test_A)
print(train_S.shape)

(11823, 83)


In [None]:
# Mini-batch size passed to `fit` further down.
batch_size = 128
# buffer_size = 20000

# def create_dataset(questions, answers):
#     dataset = tf.data.Dataset.from_tensor_slices((
#         {
#             'main_input': questions[:, :-1] # 종결 토큰 제거, 1문장 40단어
#         },
#         {
#             'gpt': questions[:, 1:] # 종결 토큰 제거, 1문장 40단어
#         },
#         {
#             'head': answers
#         }
#     ))

#     dataset = dataset.cache()
#     dataset = dataset.shuffle(buffer_size)
#     dataset = dataset.batch(batch_size)
#     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
#     return dataset

# train_dataset = create_dataset(train_S, np.array(train_l))
# train_X = train_S
# train_gpt = train_S[:, 1:]
# train_h = np.array(train_l)

## 2. Custom Layer 선언

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a learned position embedding to its input.

    A table with `sequence_length` rows of width `dim` is looked up in full
    and then cut down to the length of the incoming sequence (axis 1 of `x`).
    """

    def __init__(self, sequence_length, dim):
        super().__init__()
        self.sequence_length = sequence_length
        self.pos_vec = tf.keras.layers.Embedding(sequence_length, dim)

    def call(self, x):
        all_positions = tf.range(self.sequence_length)
        position_table = self.pos_vec(all_positions)
        current_length = tf.shape(x)[1]
        # Only the first `current_length` rows are added to the input.
        return x + position_table[:current_length]

In [None]:
def attention(query, key, value, mask):
    """Scaled dot-product attention.

    Scores are `query @ key^T / sqrt(depth)`, where depth is the size of the
    last axis of `key`. When `mask` is given, positions where it is 1 receive
    a -1e9 penalty before the softmax over the last axis.

    Returns the softmax-weighted combination of `value`.
    """
    scale = tf.math.sqrt(tf.cast(tf.shape(key)[-1], dtype=tf.float32))
    scores = tf.matmul(query, key, transpose_b=True) / scale

    if mask is not None:
        scores = scores + (mask * -1e9)

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

class AttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention built from bias-free Dense projections.

    `call` expects a 4-element list `[query_src, key_src, value_src, mask]`.
    `dim` must be divisible by `num_head`; each head works on
    `dim // num_head` features.
    """

    def __init__(self, num_head, dim):
        super().__init__()

        assert dim % num_head == 0

        # Projection layers are created in this fixed order
        # (query, key, value, output).
        self.query = tf.keras.layers.Dense(dim, use_bias=False)
        self.key = tf.keras.layers.Dense(dim, use_bias=False)
        self.value = tf.keras.layers.Dense(dim, use_bias=False)

        self.w_o = tf.keras.layers.Dense(dim, use_bias=False)

        self.num_head = num_head
        self.dim = dim
        self.head_size = dim // num_head

    def split_value(self, x, batch_size):
        """Reshape [batch, seq, dim] into [batch, num_head, seq, head_size]."""
        per_head = tf.reshape(x, [batch_size, -1, self.num_head, self.head_size])
        return tf.transpose(per_head, [0, 2, 1, 3])

    def call(self, inputs):
        query_src, key_src, value_src, mask = (
            inputs[0], inputs[1], inputs[2], inputs[3]
        )

        projected_q = self.query(query_src)
        projected_k = self.key(key_src)
        projected_v = self.value(value_src)

        batch_size = tf.shape(projected_q)[0]

        heads_q = self.split_value(projected_q, batch_size)
        heads_k = self.split_value(projected_k, batch_size)
        heads_v = self.split_value(projected_v, batch_size)

        # [batch, num_head, seq, head_size]
        context = attention(heads_q, heads_k, heads_v, mask)
        # Back to [batch, seq, num_head, head_size], then merge the heads
        # into [batch, seq, dim] (num_head * head_size == dim).
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, [batch_size, -1, self.dim])
        return self.w_o(merged)

In [None]:
class EmbeddingTransposeLayer(tf.keras.layers.Layer):
    """Output projection that reuses (ties) an Embedding layer's weight matrix.

    The input is multiplied by the transpose of the embedding matrix, mapping
    features of width `dim` to one score per vocabulary entry.

    Args:
        embedding_layer: the Embedding layer whose first weight is reused.
            It must already have created its weights when this layer is built.
        **kargs: forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, embedding_layer: tf.keras.layers.Embedding, **kargs):
        super().__init__(**kargs)
        self.embeddings = embedding_layer

    def build(self, input_shape):
        self.custom_weights = self.embeddings.weights[0]
        # Mark the layer as built. The original assigned `self.build = True`,
        # which replaced this method with a bool instead of setting the flag.
        self.built = True

    def compute_output_shape(self, input_shape):
        # Keep every leading axis (batch, and sequence when present) and swap
        # the feature axis for the number of rows in the embedding matrix.
        vocab = tf.keras.backend.int_shape(self.custom_weights)[0]
        return tuple(input_shape[:-1]) + (vocab,)

    def call(self, x):
        return tf.keras.backend.dot(
            x, tf.keras.backend.transpose(self.custom_weights))

## 3. Decoder 블록 및 GPT 모델 생성

In [None]:
def decoder_block(dif, num_head, dim, dropout):
    """Build one decoder block as a two-input Keras model.

    Structure: masked self-attention -> residual + LayerNorm ->
    Dense(dif, gelu) -> Dense(dim) -> Dropout -> residual + LayerNorm.

    Args:
        dif: width of the inner feed-forward layer.
        num_head: number of attention heads.
        dim: feature width of the block's input and output.
        dropout: dropout rate applied to the feed-forward output.

    Returns:
        A `tf.keras.Model` taking `[hidden_states, mask]`.
    """
    hidden_in = tf.keras.layers.Input([None, dim])
    mask_in = tf.keras.layers.Input([1, None, None])

    # Self-attention: query, key and value all come from the same tensor.
    attended = AttentionLayer(num_head, dim)(
        [hidden_in, hidden_in, hidden_in, mask_in]
    )
    attn_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(hidden_in + attended)

    # Position-wise feed-forward network.
    feed_forward = tf.keras.layers.Dense(dif, activation='gelu')(attn_out)
    feed_forward = tf.keras.layers.Dense(dim)(feed_forward)
    feed_forward = tf.keras.layers.Dropout(dropout)(feed_forward)

    block_out = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_out + feed_forward)
    return tf.keras.Model(inputs=[hidden_in, mask_in], outputs=block_out)

def decoder(vocab_size, num_layers, dif, num_head, dim, dropout):
    """Stack `num_layers` decoder blocks on top of positional encoding.

    The positional table is sized `2 * max_length + 3`, read from the
    module-level `max_length`. `vocab_size` is accepted but not used here.

    Returns:
        A `tf.keras.Model` named 'decoder' taking `[embeddings, mask]`.
    """
    embedded_in = tf.keras.layers.Input([None, dim])
    mask_in = tf.keras.layers.Input([1, None, None])

    with_positions = PositionalEncoding(2 * max_length + 3, dim)(embedded_in)
    hidden = tf.keras.layers.Dropout(dropout)(with_positions)

    for _ in range(num_layers):
        block = decoder_block(dif, num_head, dim, dropout)
        hidden = block([hidden, mask_in])

    return tf.keras.Model(inputs=[embedded_in, mask_in], outputs=hidden, name='decoder')

In [None]:
def padding_mask(x):
    """Return a float mask shaped [batch, 1, 1, seq] that is 1.0 where `x` is 0."""
    is_pad = tf.math.equal(x, 0)
    as_float = tf.cast(is_pad, dtype=tf.float32)
    return as_float[:, tf.newaxis, tf.newaxis, :]

def look_ahead_mask(x):
    """Combine a causal mask with the padding mask of `x`.

    A position is masked (value 1) when it lies after the querying position
    or when the key token id is 0.
    """
    length = tf.shape(x)[1]
    lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_positions = 1 - lower_triangle
    return tf.maximum(future_positions, padding_mask(x))

def gpt(vocab_size, num_layers, dif, num_head, dim, dropout, output_dim):
    """Build the GPT network and return three models sharing its weights.

    Returns a list `[ml_model, cls_model, joint_model]`:
      * ml_model   - decoder output without its last time step, projected
                     through the transposed embedding matrix (head 'ml').
      * cls_model  - full decoder output -> Dropout(0.1) -> Dense(output_dim)
                     (head 'cls').
      * joint_model - outputs `[cls_output, ml_output]`.
    """
    token_ids = tf.keras.layers.Input([None,], name='input')

    embedding_layer = tf.keras.layers.Embedding(vocab_size, dim)
    embedded = embedding_layer(token_ids)
    attention_mask = tf.keras.layers.Lambda(
        look_ahead_mask, output_shape=[1, None, None]
    )(token_ids)

    hidden = decoder(vocab_size, num_layers, dif, num_head, dim, dropout)(
        [embedded, attention_mask]
    )

    # Weight-tied head over every position except the last one.
    ml_output = EmbeddingTransposeLayer(embedding_layer, name='ml')(hidden[:, :-1, :])

    # Separate dense head applied to every position.
    cls_output = tf.keras.layers.Dropout(0.1)(hidden)
    cls_output = tf.keras.layers.Dense(output_dim, name='cls')(cls_output)

    ml_model = tf.keras.Model(inputs=token_ids, outputs=ml_output)
    cls_model = tf.keras.Model(inputs=token_ids, outputs=cls_output)
    joint_model = tf.keras.Model(inputs=token_ids, outputs=[cls_output, ml_output])
    return [ml_model, cls_model, joint_model]

In [None]:
# Model hyper-parameters.
dim = 768
num_layers = 12
num_head = 12
diff = 3072
dropout = 0.1

# gpt_model: weight-tied 'ml' head, inf_model: dense 'cls' head,
# model: both heads.
gpt_model, inf_model, model = gpt(
    vocab_size=vocab_size,
    num_layers=num_layers,
    dif=diff,
    num_head=num_head,
    dim=dim,
    dropout=dropout,
    output_dim=vocab_size,
)

# Prefer a previous fine-tuning checkpoint; otherwise fall back to the
# pre-trained GPT weights, if a name for them is configured.
if PREV_MODEL_NAME:
    model.load_weights(MODEL_PATH + PREV_MODEL_NAME + '.h5')
elif GPT_MODEL_NAME:
    gpt_model.load_weights(MODEL_PATH + GPT_MODEL_NAME + '.h5')

## 4. 모델 Compile 및 학습

In [None]:
class TransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root learning-rate schedule.

    lr(step) = dim^-0.5 * min(step^-0.5, step * warmup_step^-1.5)

    Args:
        dim: model width used for the dim^-0.5 scale factor.
        warmup_step: number of steps over which the rate grows linearly.
    """

    def __init__(self, dim, warmup_step=4000):
        super().__init__()
        # Keep the raw argument so get_config() returns a plain Python value
        # rather than the tensor used in the computation.
        self.dim = dim
        self.d_model = tf.cast(dim, tf.float32)
        self.warmup_step = warmup_step

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # At step 0 rsqrt gives inf, but the linear term is 0, so the minimum
        # (and hence the rate) is 0.
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_step ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

    def get_config(self):
        # Keys match the __init__ parameters.
        return {
            'dim': self.dim,
            'warmup_step': self.warmup_step,
        }

In [None]:
def gen(tokens):
    """Greedily generate an answer for an already-tokenized question.

    The prompt is `start + tokens + delimiter`. At most `max_length` tokens
    are produced, and generation stops early at the end token.

    Args:
        tokens: list of token ids for the question.

    Returns:
        1-D int32 numpy array of generated token ids (possibly empty).
    """
    max_positions = 2 * max_length + 3  # size of the positional table
    sequence = tf.expand_dims(start_token + tokens + delimiter_token, 0)
    answer = []
    for _ in range(max_length):
        # Stop rather than fail once the sequence no longer fits the
        # positional table.
        if sequence.shape[1] > max_positions:
            break

        prediction = inf_model(inputs=sequence, training=False)[:, -1:, :]
        next_token = tf.cast(tf.argmax(prediction, axis=-1), dtype=tf.int32)
        token_id = int(next_token[0, 0])

        if token_id == end_token[0]:
            break
        # A predicted delimiter is never appended, so the input would stay
        # unchanged and every remaining step would predict it again; stopping
        # here yields the same answer without the repeated forward passes.
        if token_id == delimiter_token[0]:
            break

        sequence = tf.concat([sequence, next_token], axis=-1)
        answer.append(token_id)
    return np.array(answer, dtype=np.int32)

# Sample question whose generated answer is printed after every epoch.
begin_token = tokenizer.encode('1지망 학교 떨어졌어')

class TextGenerationCallback(tf.keras.callbacks.Callback):
    """Prints a generated sample at the end of every epoch.

    Args:
        generator: callable mapping a list of prompt token ids to generated
            token ids.
        s_token: prompt token ids passed to `generator`.
        tokenizer: object whose `decode` turns token ids into text.
    """

    def __init__(self, generator, s_token, tokenizer):
        # Initialise the base Callback so its own attributes exist.
        super().__init__()
        self.generator = generator
        self.start_token = s_token
        self.tokenizer = tokenizer

    def on_epoch_end(self, epoch, logs=None):
        pred_token = self.generator(self.start_token)
        # Use the tokenizer handed to the constructor, not the notebook global.
        answer = self.tokenizer.decode(pred_token)
        print('\nText :', answer, '$')

In [None]:
# Generate an answer for the sample question with the weights loaded so far
# (before this run's training) and print the decoded text.
pred_token = gen(begin_token)
print(tokenizer.decode(pred_token))

In [None]:
# Alternative schedule, currently disabled:
# lr = tf.keras.optimizers.schedules.CosineDecay(
#     initial_learning_rate=0,
#     decay_steps=2000,
#     warmup_target=2.5e-4,
#     warmup_steps=2000
# )
# Active schedule: TransformerSchedule with its default warm-up, fed to Adam.
lr = TransformerSchedule(dim)
opt = tf.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def gpt_loss_function(y_true, y_pred):
    """Masked sparse cross-entropy on logits.

    `y_true` is reshaped to rows of `2 * max_length + 3` ids. Positions whose
    label is 0 contribute zero loss, and the result is the mean over all
    positions (masked ones included in the denominator).
    """
    targets = tf.reshape(y_true, shape=(-1, 2 * max_length + 3))
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none'
    )
    per_position = cross_entropy(targets, y_pred)
    keep = tf.cast(tf.not_equal(targets, 0), tf.float32)
    return tf.reduce_mean(tf.multiply(per_position, keep))

# Train the model with the dense 'cls' head using the masked loss above.
inf_model.compile(
    optimizer=opt,
    loss=[gpt_loss_function],
    # loss_weights=[1, 0.5],
    )

In [None]:
# Print the layer/parameter summary, expanding nested sub-models.
inf_model.summary(expand_nested=True)

In [None]:
# Upper bound on epochs; early stopping on val_loss normally ends training sooner.
epoch = 400
# NOTE(review): Keras documents validation_split as taking the *last* fraction
# of the arrays before shuffling; the CSV preview shows rows grouped by label,
# so the validation set may not be representative — confirm.
# NOTE(review): `start_from_epoch` is presumably only available in newer Keras
# releases — confirm against the installed version.
history = inf_model.fit(train_S, train_l,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_split=0.1,
                    epochs=epoch,
                    callbacks=[
                        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, start_from_epoch=10),
                        TextGenerationCallback(gen, begin_token, tokenizer)
                    ])

In [None]:
import pandas as pd

# Save the weights of the combined (two-head) model in HDF5 format.
model.save_weights(MODEL_PATH + MODEL_NAME+'.h5', save_format='h5')

# Persist the per-epoch training history next to the weights as JSON.
hist = pd.DataFrame(history.history)
hist_filename = MODEL_NAME + '.json'
with open(MODEL_PATH + hist_filename, 'w') as file:
    hist.to_json(file)

## 5. Test

In [None]:
idx = 20
# Rebuild the un-padded training sequence for one sample as a (1, T) integer
# batch; an explicit array avoids Keras treating a nested list as several inputs.
s = np.array([start_token + tokenizer.encode(train_Q[idx]) + delimiter_token + tokenizer.encode(train_A[idx]) + end_token])
# `model` outputs [cls_output, ml_output]. Take the 'cls' logits and the most
# likely token at every position; without `axis`, argmax would return a single
# flat index over the whole logits tensor.
pred = np.argmax(model.predict(s)[0], axis=-1)[0]
print(train_Q[idx])
print(train_A[idx])
# Show the predictions next to the label row, cut to the same length.
print(pred, train_l[idx][:len(pred)])

In [None]:
# Re-compile for evaluation with a token-accuracy metric. The 'cls' head emits
# raw logits (Dense without activation), so the loss must be built with
# from_logits=True; the string alias assumes probabilities.
inf_model.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['sparse_categorical_accuracy'],
)

# NOTE: evaluation runs on the training data, and neither the loss nor the
# accuracy here masks positions whose label is 0.
train_h = np.array(train_l)
inf_model.evaluate(train_S, train_h)

In [None]:
def eval(tokens):
    """Greedily continue `start + tokens` for up to `max_length` steps.

    Unlike `gen`, no delimiter is appended to the prompt. Generation stops at
    the end token, which is not included in the result.

    Returns:
        1-D tensor holding the prompt (start token included) followed by the
        generated ids.
    """
    generated = tf.expand_dims(start_token + tokens, 0)
    for _ in range(max_length):
        logits = inf_model(inputs=generated, training=False)
        last_step = logits[:, -1:, :]
        next_token = tf.cast(tf.argmax(last_step, axis=-1), dtype=tf.int32)

        if tf.equal(next_token, end_token[0]):
            break

        generated = tf.concat([generated, next_token], axis=-1)
    return tf.squeeze(generated, axis=0)

def pred(sentence):
    """Answer a raw question string, print the exchange, and return the answer.

    The sentence is preprocessed and tokenized, an answer is generated with
    `gen`, and ids outside the tokenizer's own vocabulary are dropped before
    decoding.
    """
    sentence = preprocess(sentence)
    answer_token = gen(tokenizer.encode(sentence))

    decodable = [x for x in answer_token if x < tokenizer.vocab_size]
    answer = tokenizer.decode(decodable)

    print(f"질문: {sentence}")
    print(f'정답(Token): {answer_token}')
    print(f'정답: {answer}')

    return answer

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

# Collect hypotheses (Q) and references (A) for corpus-level BLEU: each
# sentence is regenerated from its first token and compared with the original.
# NOTE(review): `test_Q` / `test_A` are not defined anywhere in this notebook
# as it stands (the hold-out split is commented out) — they must be provided
# before this cell can run.
Q, A = [], []
for sentence in tqdm(test_Q + test_A):
    # The loop variable was misspelled `sentense`, so the body kept
    # re-processing a stale global `sentence` instead of the current item.
    sentence = preprocess(sentence)
    tokenized = tokenizer.encode(sentence)
    if not tokenized:
        # Nothing to seed generation with; skip instead of raising IndexError.
        continue
    pred_token = [x for x in eval([tokenized[0]]).numpy() if x < tokenizer.vocab_size]
    Q.append(pred_token)

    # corpus_bleu expects a list of reference lists per hypothesis.
    A.append([tokenized])

In [None]:
# Cumulative BLEU-1..4 with uniform n-gram weights (A: references, Q: hypotheses).
print('BLEU-1 :', corpus_bleu(A, Q, weights=(1, 0, 0, 0)))
print('BLEU-2 :', corpus_bleu(A, Q, weights=(0.5, 0.5, 0, 0)))
# Exact thirds: 0.33 * 3 only sums to 0.99.
print('BLEU-3 :', corpus_bleu(A, Q, weights=(1 / 3, 1 / 3, 1 / 3, 0)))
print('BLEU-4 :', corpus_bleu(A, Q, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# Demo: question taken from the training data preview.
pred('1지망 학교 떨어졌어')

질문: 1지망 학교 떨어졌어
정답(Token): [  70 5377   81    1]
정답: 시간이 무색하네요 .


'시간이 무색하네요 .'

In [None]:
# Demo: free-form question.
pred('잠이 온다')

질문: 잠이 온다
정답(Token): [560 653   1]
정답: 일찍 주무세요 .


'일찍 주무세요 .'

In [None]:
# Demo: free-form question.
pred('비가 내리네')

질문: 비가 내리네
정답(Token): [ 535   45 3276    1]
정답: 우리 다 참아보세요 .


'우리 다 참아보세요 .'