### Transformer to GPT-1 

In [15]:
# 1. Encoder 제거 
# 2. Masked Self-Attention으로 변경 
# 3. 입력 데이터 수정 (Auto-Regressive)
# 4. Loss (3.1 Unsupervised pre-training + 3.2 Supervised fine-tuning)로 수정

#### 전처리

In [4]:
import tensorflow
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print(tensorflow.__version__)

2.6.0


In [28]:
# Load the chatbot question/answer corpus into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and environment-specific, and the file name contains a
# space before ".csv" — confirm both before running this notebook elsewhere.
data_path = '/aiffel/aiffel/transformer_chatbot/data/ChatbotData .csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [29]:
def _normalize_sentence(sentence):
    """Normalize one sentence: lowercase, isolate punctuation, drop unsupported characters."""
    sentence = sentence.lower().strip()
    # Surround ? . ! , with spaces so each punctuation mark stands alone.
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    # Collapse runs of spaces into one space (the character class also lists the double
    # quote, so double quotes are replaced by a space here as well).
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Replace every run of characters outside the whitelist (Hangul syllables, ASCII
    # letters, digits, whitespace and . ? ! ,) with a single space.
    # NOTE(review): because the class starts with "^^", a literal caret is whitelisted
    # too — confirm that this is intended.
    sentence = re.sub(r"[^^가-힣a-zA-Z0-9\s.?!,]+", " ", sentence)
    return sentence.strip()


def preprocess_sentence(data, column=None):
    """Normalize text for tokenization.

    Args:
        data: either a single string (when ``column`` is None) or a mapping/DataFrame
            whose ``data[column]`` yields strings.
        column: key of the text column to normalize; None means ``data`` is one string.

    Returns:
        A normalized string when ``column`` is None, otherwise a list of normalized
        strings in the same order as ``data[column]``.
    """
    # Compare against None explicitly so that falsy column keys (e.g. 0) still select a column.
    if column is None:
        return _normalize_sentence(data)
    return [_normalize_sentence(sentence) for sentence in data[column]]

In [30]:
# Normalize every question and every answer; each result is a list of cleaned strings.
questions = preprocess_sentence(df, 'Q')
answers = preprocess_sentence(df, 'A')

In [31]:
# Build the SubwordTextEncoder vocabulary from the normalized questions and answers.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

# Use the two ids directly after the subword vocabulary as start/end markers,
# and enlarge the vocabulary size by two to account for them.
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
VOCAB_SIZE = tokenizer.vocab_size + 2

In [32]:
# Store the length of each question/answer as new DataFrame columns and print the maxima.
# NOTE(review): len() is applied to the raw column values, so these appear to be character
# counts rather than token counts — keep that in mind when relating them to MAX_LENGTH.
df['Qlength'] = df['Q'].apply(len)
df['Alength'] = df['A'].apply(len)
print(max(df['Qlength']), max(df['Alength']))

56 76


In [33]:
# Input data for GPT-style training: every question and every answer is used as an
# independent sentence in one combined dataset.
# The normalized lists are used (not the raw DataFrame columns) so the training text has
# the same form as the corpus the tokenizer vocabulary was built from.
sentences = questions + answers

In [34]:
MAX_LENGTH = 50


def tokenize_and_filter(sentences):
    """Encode sentences as fixed-width id sequences.

    Each sentence is encoded with the global ``tokenizer`` and wrapped in the start/end
    marker ids. Samples longer than ``MAX_LENGTH`` (markers included) are dropped; the
    remaining ones are zero-padded on the right to ``MAX_LENGTH``.

    Args:
        sentences: iterable of strings.

    Returns:
        The padded id matrix produced by ``pad_sequences``.
    """
    # Integer encoding: start marker + sentence + end marker.
    encoded = (START_TOKEN + tokenizer.encode(text) + END_TOKEN for text in sentences)

    # Drop samples that exceed the maximum length.
    kept = [ids for ids in encoded if len(ids) <= MAX_LENGTH]

    # Apply post-padding up to the fixed width.
    return tf.keras.preprocessing.sequence.pad_sequences(
        kept, maxlen=MAX_LENGTH, padding='post')

In [35]:
# Replace the list of strings with the padded integer id matrix and display it.
# NOTE(review): this rebinds `sentences`, so re-running this cell on its own would pass the
# id matrix back into tokenize_and_filter — re-run the cell that builds `sentences` first.
sentences = tokenize_and_filter(sentences)
sentences

array([[8164, 7902, 4198, ...,    0,    0,    0],
       [8164, 7957,   47, ...,    0,    0,    0],
       [8164, 7959, 1433, ...,    0,    0,    0],
       ...,
       [8164, 5204,  287, ...,    0,    0,    0],
       [8164,   14, 3191, ...,    0,    0,    0],
       [8164,  221,  554, ...,    0,    0,    0]], dtype=int32)

In [36]:
# Auto-regressive (next-token) training pairs: the model input is every position except
# the last one, and the target is the same sequence shifted left by one position.
inputs = sentences[:, :-1]
outputs = sentences[:, 1:]
print(inputs.shape)
print(outputs.shape)

(23646, 49)
(23646, 49)


In [37]:
BATCH_SIZE = 64
BUFFER_SIZE = 25000

# Input pipeline over (inputs, outputs) pairs: cache -> shuffle -> batch -> prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

### PositionalEncoding & Self Attention

In [38]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    The table is computed once in the constructor for ``position`` positions and
    ``d_model`` channels, and sliced to the input's sequence length on every call.
    """

    def __init__(self, position, d_model):
        super().__init__()
        # Precomputed table of shape (1, position, d_model).
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        """Return position * 1 / 10000^(2 * (i // 2) / d_model)."""
        exponent = (2 * (i // 2)) / tf.cast(d_model, tf.float32)
        return position * (1 / tf.pow(10000, exponent))

    def positional_encoding(self, position, d_model):
        """Build the (1, position, d_model) table with sine/cosine values interleaved."""
        positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
        channels = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
        angle_rads = self.get_angles(position=positions, i=channels, d_model=d_model)

        # Sine of the even-indexed columns, cosine of the odd-indexed columns.
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])

        # Pair each sine with its cosine along a new last axis, then flatten that axis
        # so the two alternate along the channel dimension.
        paired = tf.stack([sines, cosines], axis=-1)
        table = tf.reshape(paired, [position, d_model])

        # Add a leading axis so the table broadcasts over the batch dimension.
        return tf.cast(table[tf.newaxis, ...], tf.float32)

    def call(self, inputs):
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :seq_len, :]

In [39]:
def scaled_dot_product_attention(query, key, value, mask):
    """Compute softmax(Q K^T / sqrt(depth) + mask * -1e9) V.

    Args:
        query, key, value: tensors whose last two axes are multiplied as matrices.
        mask: tensor added to the scores after scaling by -1e9 (non-zero entries are
            effectively removed from the softmax), or None for no masking.

    Returns:
        The attention-weighted combination of ``value``.
    """
    # Scale the raw Q·K^T scores by the square root of the key depth.
    key_depth = tf.cast(tf.shape(key)[-1], tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

    # Push masked positions towards -inf so softmax assigns them ~0 weight.
    if mask is not None:
        scores += (mask * -1e9)

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

In [54]:
def create_padding_mask(x):
    """Return a float mask that is 1 where ``x`` equals 0, with two singleton axes inserted.

    For a 2-D ``x`` the result has shape (dim0, 1, 1, dim1).
    """
    is_pad = tf.math.equal(x, 0)
    return tf.cast(is_pad, tf.float32)[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(x):
    """Combine a causal (no-peeking) mask with the padding mask of ``x``.

    A value of 1 marks a position that must not be attended to: either it lies after the
    current position, or it is a padding token (id 0).
    """
    length = tf.shape(x)[1]  # sequence length (second axis of x)
    # Lower triangle (diagonal included) marks allowed positions; invert it.
    allowed = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_mask = 1 - allowed
    # Element-wise maximum acts as a logical OR of the two masks (with broadcasting).
    return tf.maximum(future_mask, create_padding_mask(x))

<!-- ![img](/aiffel/aiffel/assets/decoder/decoder.png) -->
![img](./assets/decoder/decoder.png)

### Decoder

In [75]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    ``call`` expects a dict with the keys 'query', 'key', 'value' and 'mask'.
    ``d_model`` must be divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super().__init__(name=name)
        assert d_model % num_heads == 0

        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # channels per head

        # Linear projections for Q, K, V and for the concatenated heads.
        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)
        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        per_head = tf.reshape(inputs, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        mask = inputs['mask']
        batch_size = tf.shape(inputs['query'])[0]

        # Project, then split the channel axis into heads.
        query = self.split_heads(self.query_dense(inputs['query']), batch_size)
        key = self.split_heads(self.key_dense(inputs['key']), batch_size)
        value = self.split_heads(self.value_dense(inputs['value']), batch_size)

        # Attend per head, then move the head axis back next to the depth axis.
        attended = scaled_dot_product_attention(query, key, value, mask)
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])

        # Merge the heads back into d_model channels and apply the output projection.
        merged = tf.reshape(attended, (batch_size, -1, self.d_model))
        return self.dense(merged)


In [80]:
#TODO: 
# Transformer의 Encoder 관련 부분 제거
# L1과 L2에 따라 look_ahead_mask 작동 구분이 필요할거 같음 (future work)

def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    """Build one decoder block as a Keras functional model.

    The block consists of masked self-attention and a two-layer feed-forward network,
    each followed by a residual connection and layer normalization. There is no
    encoder-decoder attention.

    Args:
        units: hidden width of the feed-forward network.
        d_model: channel width of the block's input and output.
        num_heads: number of attention heads.
        dropout: dropout rate applied to the feed-forward output.
        name: name of the returned model.

    Returns:
        A ``tf.keras.Model`` taking ``[inputs, look_ahead_mask]``.
    """
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    # Only the look-ahead mask is needed because there is no encoder output to attend to.
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")

    # Masked self-attention with residual connection and layer normalization.
    self_attention = MultiHeadAttention(d_model, num_heads, name="attention_1")
    attended = self_attention({
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': look_ahead_mask,
    })
    attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attended + inputs)

    # Position-wise feed-forward network.
    hidden = tf.keras.layers.Dense(units=units, activation='relu')(attended)
    projected = tf.keras.layers.Dense(units=d_model)(hidden)

    # Dropout, residual connection and layer normalization.
    projected = tf.keras.layers.Dropout(rate=dropout)(projected)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(projected + attended)

    return tf.keras.Model(inputs=[inputs, look_ahead_mask], outputs=outputs, name=name)


def decoder(vocab_size, num_layers, units, d_model, num_heads, dropout, name='decoder'):
    """Build the decoder-only stack as a Keras functional model.

    Token ids are embedded, scaled by sqrt(d_model), given positional encodings, passed
    through dropout and then through ``num_layers`` decoder blocks.

    Args:
        vocab_size: size of the embedding table.
        num_layers: number of stacked decoder blocks.
        units: hidden width of each block's feed-forward network.
        d_model: embedding / hidden width.
        num_heads: number of attention heads per block.
        dropout: dropout rate.
        name: name of the returned model.

    Returns:
        A ``tf.keras.Model`` mapping token ids of shape (batch, seq_len) to hidden states
        of shape (batch, seq_len, d_model). It does NOT project onto the vocabulary; a
        Dense(vocab_size) head is required to obtain logits.
    """
    inputs = tf.keras.Input(shape=(None,), name='inputs')

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))

    # Positional encoding.
    # NOTE(review): the table is sized by vocab_size rather than by a maximum sequence
    # length; it is sliced to the input length at call time.
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)
    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    # The mask depends only on the token ids, so build it once and share it across layers
    # instead of rebuilding an identical mask in every loop iteration.
    look_ahead_mask = create_look_ahead_mask(inputs)

    # Stack the decoder blocks.
    for i in range(num_layers):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i)
        )([outputs, look_ahead_mask])

    return tf.keras.Model(inputs=inputs, outputs=outputs, name=name)


### GPT-1 Loss

![img](./assets/decoder/loss.png)

In [81]:
#TODO: 3.1 Unsupervised pre-training + 3.2 Supervised fine-tuning loss

# No masking (auto-regressive language-modelling loss).
def L1_function(y_true, y_pred):
    """Mean sparse categorical cross-entropy over every position.

    No padding mask is applied, so positions whose target id is 0 contribute to the mean.
    """
    targets = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))  # sequence length - 1
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    return tf.reduce_mean(cross_entropy(targets, y_pred))

# Masking required.
def L2_function(y_true, y_pred):
    """Sparse categorical cross-entropy with padding positions zeroed out.

    Positions whose target id is 0 are multiplied by 0 before averaging; the mean is still
    taken over all positions, including the zeroed ones.
    """
    targets = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_token = cross_entropy(targets, y_pred)
    keep = tf.cast(tf.not_equal(targets, 0), tf.float32)
    return tf.reduce_mean(tf.multiply(per_token, keep))

# The variable names follow the notation used in the GPT-1 paper.
def auxiliary_function(y_true, y_pred, lambda_weigh=0.3):
    """Combined objective: L2 + lambda_weigh * L1.

    Args:
        y_true: target token ids.
        y_pred: model logits.
        lambda_weigh: weight of the unmasked L1 term.
    """
    masked_term = L2_function(y_true, y_pred)
    unmasked_term = L1_function(y_true, y_pred)
    return masked_term + (lambda_weigh * unmasked_term)

### Train

In [82]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by inverse-square-root decay.

    lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    Args:
        d_model: model width used to scale the learning rate.
        warmup_steps: number of steps over which the learning rate increases linearly.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the value as passed in for get_config; the float tensor is used in __call__.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The optimizer may hand over the step as an integer tensor; rsqrt and the
        # multiplication below require a floating-point value.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}
    
# Model width used to scale the learning-rate schedule. It is defined here because this
# cell runs before the hyperparameter cell; on a fresh kernel D_MODEL would otherwise be
# undefined at this point (NameError). Keep it equal to D_MODEL in the hyperparameter cell.
D_MODEL = 256

learning_rate = CustomSchedule(D_MODEL)

# Adam with the warm-up schedule above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
    """Per-position sparse categorical accuracy.

    No padding mask is applied, so positions whose target id is 0 are scored as well.
    """
    targets = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(targets, y_pred)

In [83]:
tf.keras.backend.clear_session()

# Hyperparameters
EPOCHS = 2
NUM_LAYERS = 2  # number of stacked decoder layers
D_MODEL = 256  # width of the embeddings and of each decoder layer's input/output
NUM_HEADS = 8  # number of heads in multi-head attention
UNITS = 512  # hidden size of the feed-forward network
DROPOUT = 0.1  # dropout rate

decoder_stack = decoder(
            vocab_size=VOCAB_SIZE,
            num_layers=NUM_LAYERS,
            units=UNITS,
            d_model=D_MODEL,
            num_heads=NUM_HEADS,
            dropout=DROPOUT)

# Language-model head: project the D_MODEL-wide hidden states onto the vocabulary.
# Without this projection the model emits D_MODEL-wide vectors while the targets are token
# ids in [0, VOCAB_SIZE); sparse cross-entropy then receives out-of-range class ids, which
# produces NaN loss on GPU (and an error on CPU).
logits = tf.keras.layers.Dense(units=VOCAB_SIZE, name='outputs')(decoder_stack.output)
model = tf.keras.Model(inputs=decoder_stack.input, outputs=logits, name='gpt1')

model.compile(optimizer=optimizer, loss=auxiliary_function, metrics=[accuracy])
model.fit(dataset, epochs=EPOCHS, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7a1412fd1040>

In [None]:
'''
Futurework 
1. Train loss: nan 처리
2. Decoder Layer network L1/L2 처리
3. Inference 
'''