In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
data_path = os.path.join(os.getenv('HOME'), 'aiffel/transformer_chatbo/data/ChatbotData .csv')

In [3]:
#전처리 함수
def preprocess_sentence(sentence):
    # 양쪽 공백을 제거
    sentence = sentence.strip()

    # 단어와 구두점(punctuation) 사이의 거리를 만듭니다.
    # 예를 들어서 "나는 학생입니다." => "나는 학생입니다 ."와 같이
    # 학생과 온점 사이에 거리를 만듭니다.
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)

    # (가-힣, ".", "?", "!", ",")를 제외한 모든 문자를 공백인 ' '로 대체합니다.
    sentence = re.sub(r'[^가-힣.?!,]', ' ', sentence)
    sentence = sentence.strip()
    return sentence

In [4]:
data =pd.read_csv(data_path)

In [5]:
def load_conversations(data_path, max_samples=None):
    data = pd.read_csv(data_path)
    
    inputs = []
    outputs = []
    
    for idx, row in data.iterrows():
        question = preprocess_sentence(row['Q'])
        answer = preprocess_sentence(row['A'])
        
        inputs.append(question)
        outputs.append(answer)
        
        if max_samples and len(inputs) >= max_samples:
            break
    
    return inputs, outputs

In [6]:
questions, answers = load_conversations(data_path)

In [7]:
questions[0]
print(len(questions))

11823


In [8]:
answers[0]
print(len(answers))

11823


### GPT의 경우 이 모든 단어들을 다 한번에 학습시켜야 하므로, 일단 한번에 합쳐버림

In [31]:
all = questions+answers

In [32]:
len(all)  #잘 합쳐진 것 확인

23646

In [33]:
#단어장 만들기 (각 단어에 고유한 정수 인덱스 부여)

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(all, target_vocab_size=2**13)

print(f"어휘 크기: {tokenizer.vocab_size}")

어휘 크기: 8132


In [34]:
sample_string = all[0]
tokenized_string = tokenizer.encode(sample_string)
original_string = tokenizer.decode(tokenized_string)

In [35]:
print(f'원본 문장: {sample_string}')
print(f'토큰화된 문장: {tokenized_string}')
print(f'디코딩된 문장: {original_string}')

원본 문장: 시 땡 !
토큰화된 문장: [5161, 3053, 42]
디코딩된 문장: 시 땡 !


In [36]:
#unsupervised learning일 때, 종료 토큰만 있으면 된다 (?)

end_token = [tokenizer.vocab_size +1]
print(end_token)

[8133]


In [37]:
# end 가 추가되었으므로, vocab_size도 키우기
VOCAB_SIZE = tokenizer.vocab_size + 1
print(VOCAB_SIZE)

8133


In [38]:
# 패딩하기 전 max_len

sentence_lengths = [len(sentence.split()) for sentence in all]
print(f"질문 길이의 95% 백분위수: {np.percentile(sentence_lengths, 95)}")
print(f"답변 길이의 99% 백분위수: {np.percentile(sentence_lengths, 99)}")

질문 길이의 95% 백분위수: 8.0
답변 길이의 99% 백분위수: 10.0


In [39]:
#정수 인코딩 및 패딩
MAX_LENGTH = 10  #99% 백분위수 기준

# 정수 인코딩, 최대 길이를 초과하는 샘플 제거, 패딩
def tokenize_and_filter(all):
    tokenized_all = []
    for sentence in all:
        sentence = tokenizer.encode(sentence) + end_token 

        # 최대 길이 10 이하인 경우에만 데이터셋으로 허용
        if len(sentence) <= MAX_LENGTH:
            tokenized_all.append(sentence)

    # 최대 길이 10으로 모든 데이터셋을 패딩
    tokenized_all = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_all, maxlen=MAX_LENGTH, padding='post')
   
    return tokenized_all

In [40]:
all_tokenized = tokenize_and_filter(all)
print('단어장의 크기 :',(VOCAB_SIZE))
print('필터링 후의 질문 샘플 개수: {}'.format(len(all_tokenized)))  #필터링 전 23646

단어장의 크기 : 8133
필터링 후의 질문 샘플 개수: 21829


In [41]:
# 입력 시퀀스, 출력 시퀀스 생성
input_sequences = []
output_sequences = []
for seq in all_tokenized:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i])
        output_sequences.append([seq[i]])

In [42]:
input_sequences

[array([5161], dtype=int32),
 array([5161, 3053], dtype=int32),
 array([5161, 3053,   42], dtype=int32),
 array([5161, 3053,   42, 8133], dtype=int32),
 array([5161, 3053,   42, 8133,    0], dtype=int32),
 array([5161, 3053,   42, 8133,    0,    0], dtype=int32),
 array([5161, 3053,   42, 8133,    0,    0,    0], dtype=int32),
 array([5161, 3053,   42, 8133,    0,    0,    0,    0], dtype=int32),
 array([5161, 3053,   42, 8133,    0,    0,    0,    0,    0], dtype=int32),
 array([46], dtype=int32),
 array([ 46, 918], dtype=int32),
 array([  46,  918, 7908], dtype=int32),
 array([  46,  918, 7908,  995], dtype=int32),
 array([  46,  918, 7908,  995, 1722], dtype=int32),
 array([  46,  918, 7908,  995, 1722, 8133], dtype=int32),
 array([  46,  918, 7908,  995, 1722, 8133,    0], dtype=int32),
 array([  46,  918, 7908,  995, 1722, 8133,    0,    0], dtype=int32),
 array([  46,  918, 7908,  995, 1722, 8133,    0,    0,    0], dtype=int32),
 array([3539], dtype=int32),
 array([3539,   86], 

In [43]:
len(input_sequences)

196461

In [44]:
output_sequences

[[3053],
 [42],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [918],
 [7908],
 [995],
 [1722],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [86],
 [3647],
 [68],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [86],
 [1392],
 [3647],
 [68],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [918],
 [78],
 [821],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [203],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [1123],
 [7908],
 [67],
 [5102],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [7580],
 [175],
 [8],
 [2567],
 [439],
 [49],
 [192],
 [8133],
 [0],
 [7580],
 [204],
 [85],
 [183],
 [348],
 [1256],
 [8133],
 [0],
 [0],
 [196],
 [2976],
 [45],
 [3809],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [751],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [0],
 [7171],
 [32],
 [751],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [3797],
 [5],
 [770],
 [8133],
 [0],
 [0],
 [0],
 [0],
 [0],
 [747],
 [24],
 [146],
 [58],
 [955],
 [946],
 [8133],
 [0],
 [0],
 [63],
 [1140],
 [5469],
 [1580],
 [8133],


In [45]:
len(output_sequences)

196461

In [48]:
# 각 시퀀스를 동일한 길이로 패딩
input_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=MAX_LENGTH, padding='post')
output_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(output_sequences, MAX_LENGTH, padding='post')

# numpy 배열로 변환하여 형태 확인
input_sequences_padded = np.array(input_sequences_padded)
output_sequences_padded = np.array(output_sequences_padded)

print("Input sequences shape:", input_sequences_padded.shape)
print("Output sequences shape:", output_sequences_padded.shape)

Input sequences shape: (196461, 10)
Output sequences shape: (196461, 10)


In [49]:
# TensorFlow 데이터셋 생성
BATCH_SIZE = 64
BUFFER_SIZE = len(input_sequences_padded)

dataset = tf.data.Dataset.from_tensor_slices((input_sequences_padded, output_sequences_padded))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

## 필요한 함수들 구현

## 1. Position Embedding 구현

In [51]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)
    
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'position': self.pos_encoding.shape[0],
            'd_model': self.pos_encoding.shape[1],
        })
        return config
    
    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis],
                                     np.arange(d_model)[np.newaxis, :],
                                     d_model)
        
        # apply sin to even indices in the array; 2i
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        
        # apply cos to odd indices in the array; 2i+1
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        
        pos_encoding = angle_rads[np.newaxis, ...]
        
        return tf.cast(pos_encoding, dtype=tf.float32)
    
    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates
    
    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

## 2. 멀티헤드 어텐션

In [55]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads ==0
        self.depth = d_model//self.num_heads
        
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size): #x 는 qkv 중 하나. 각 텐서는 batch_size, seq_len, d_model으로 이루어짐
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm = [0,2,1,3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q) 
        k = self.wk(k)
        v = self.wv(v)
        
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        
        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights
    
    def scaled_dot_product_attention(self, q, k, v, mask):
        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (batch_size, num_heads, seq_len_q, seq_len_k)

        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)

        output = tf.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len_q, depth)

        return output, attention_weights

        

## 3. GPT 모델 정의

In [56]:
# 모델 정의
def create_gpt_model(vocab_size, d_model, num_heads, num_layers, max_length):
    inputs = tf.keras.layers.Input(shape=(max_length,))
    embedding = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    pos_encoding = PositionalEncoding(max_length, d_model)(embedding)
    x = pos_encoding

    for _ in range(num_layers):
        attn_output, _ = MultiHeadAttention(d_model, num_heads)(x, x, x, None)
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + attn_output)
        ffn_output = tf.keras.layers.Dense(d_model, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + ffn_output)

    outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [57]:
vocab_size = 8133
d_model = 256
num_heads = 8
num_layers = 4
max_length = 10

model = create_gpt_model(vocab_size, d_model, num_heads, num_layers, max_length)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 모델 학습
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7d6c7ea07ca0>

In [58]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 256)      2082048     input_2[0][0]                    
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, 10, 256)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
multi_head_attention_1 (MultiHe ((None, None, 256),  263168      positional_encoding_1[0][0]      
                                                                 positional_encoding_1[0][0]  

## 4. 새롭게 fine_tuning 해보기

In [80]:
questions = data['Q'].tolist()
answers = data['A'].tolist()

In [81]:
questions[0]

'12시 땡!'

In [82]:
answers[0]

'하루가 또 가네요.'

In [83]:
START_TOKEN = [tokenizer.vocab_size]
END_TOKEN = [tokenizer.vocab_size + 1]
SEP_TOKEN = [tokenizer.vocab_size + 2]  # $ <- 얘

In [84]:
VOCAB_SIZE = tokenizer.vocab_size + 3

In [85]:
def tokenize_and_filter(questions, answers):
    tokenized_sentences = []
    for question, answer in zip(questions, answers):
        sentence = START_TOKEN + tokenizer.encode(question) + SEP_TOKEN + tokenizer.encode(answer) + END_TOKEN
        if len(sentence) <= MAX_LENGTH:
            tokenized_sentences.append(sentence)
    tokenized_sentences = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_sentences, maxlen=MAX_LENGTH, padding='post')
    return tokenized_sentences

In [86]:
all_tokenized = tokenize_and_filter(questions, answers)

In [87]:
input_sequences = []
output_sequences = []
for seq in all_tokenized:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i])
        output_sequences.append([seq[i]])

input_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=MAX_LENGTH, padding='post')
output_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(output_sequences, maxlen=MAX_LENGTH, padding='post')

input_sequences_padded = np.array(input_sequences_padded)
output_sequences_padded = np.array(output_sequences_padded)

print("Input sequences shape:", input_sequences_padded.shape)
print("Output sequences shape:", output_sequences_padded.shape)

BATCH_SIZE = 64
BUFFER_SIZE = len(input_sequences_padded)

dataset = tf.data.Dataset.from_tensor_slices((input_sequences_padded, output_sequences_padded))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

Input sequences shape: (15228, 10)
Output sequences shape: (15228, 10)


In [88]:
# 이미 학습된 모델을 로드하기
pretrained_model = create_gpt_model(VOCAB_SIZE, 256, 8, 4, 10)

In [89]:
# 우리는 Multiple Choice이기 때문에 학습된 모델 위에 Linear랑 Softmax 추가하기

def create_multiple_choice_model(pretrained_model, max_length, num_choices):
    inputs = tf.keras.layers.Input(shape= (num_choices, max_length))
    reshaped_inputs = tf.reshape(inputs, [-1, max_length])
    
    pretrained_output = pretrained_model(reshaped_inputs)
    
    # Global Average Pooling을 통해 시퀀스를 벡터로 변환
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(pretrained_output)
    
    # Dense 레이어를 추가
    linear_output = tf.keras.layers.Dense(1)(pooled_output)
    reshaped_output = tf.reshape(linear_output, [-1, num_choices])
    # 최종적으로 Softmax를 통해 확률 분포를 생성
    softmax_output = tf.keras.layers.Softmax()(reshaped_output)
    
    model = tf.keras.Model(inputs=inputs, outputs=softmax_output)
    return model

In [90]:
num_choices = 4  # 예시로 4개의 선택지를 가정
multiple_choice_model = create_multiple_choice_model(pretrained_model, max_length, num_choices)
multiple_choice_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [94]:
def create_multiple_choice_dataset(questions, answers, tokenizer, max_length, num_choices):
    contexts = []
    labels = []
    
    for question, correct_answer in zip(questions, answers):
        question_encoded = tokenizer.encode(question)
        
        # 3개의 랜덤 오답을 생성
        wrong_answers = np.random.choice(answers, num_choices - 1, replace=False)
        all_answers = [correct_answer] + list(wrong_answers)
        
        context = []
        for answer in all_answers:
            sentence = START_TOKEN + question_encoded + SEP_TOKEN + tokenizer.encode(answer) + END_TOKEN
            context.append(sentence)
        
        context_padded = tf.keras.preprocessing.sequence.pad_sequences(context, maxlen=max_length, padding='post')
        contexts.append(context_padded)
        labels.append(0)  # 정답의 인덱스는 항상 0으로 설정
        
    contexts = np.array(contexts, dtype=np.int32)
    labels = np.array(labels, dtype=np.int32)
    
    return contexts, labels

# 데이터셋 생성
contexts, labels = create_multiple_choice_dataset(questions, answers, tokenizer, max_length, num_choices)

dataset = tf.data.Dataset.from_tensor_slices((contexts, labels))
dataset = dataset.cache()
dataset = dataset.shuffle(len(contexts))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)


In [96]:
# 모델 학습
multiple_choice_model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7d6bf5b8c580>

In [97]:
multiple_choice_model.fit(dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7d6bf5a14f10>

In [98]:
multiple_choice_model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 4, 10)]           0         
_________________________________________________________________
tf.reshape (TFOpLambda)      (None, 10)                0         
_________________________________________________________________
model_3 (Functional)         (None, 10, 8135)          5493191   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 8135)              0         
_________________________________________________________________
dense_68 (Dense)             (None, 1)                 8136      
_________________________________________________________________
tf.reshape_1 (TFOpLambda)    (None, 4)                 0         
_________________________________________________________________
softmax_1 (Softmax)          (None, 4)                 0   

## inferene 함수 구현하기

In [109]:
def multiple_choice_inference(question, possible_answers):
    question_encoded = tokenizer.encode(question)
    contexts = []

    for answer in possible_answers:
        sentence = START_TOKEN + question_encoded + SEP_TOKEN + tokenizer.encode(answer) + END_TOKEN
        contexts.append(sentence)

    contexts_padded = tf.keras.preprocessing.sequence.pad_sequences(contexts, maxlen=10, padding='post')
    contexts_padded = np.expand_dims(contexts_padded, axis=0)  # 배치 차원을 추가

    predictions =multiple_choice_model(contexts_padded, training=False)
    predicted_id = np.argmax(predictions, axis=-1)

    return possible_answers[predicted_id[0]]

def sentence_generation(question, possible_answers):
    predicted_answer = multiple_choice_inference(question, possible_answers)

    print('질문 : {}'.format(question))
    print('답변 : {}'.format(predicted_answer))

    return predicted_answer


In [111]:
question = "안녕하세요, 오늘 기분은 어떠신가요?"
possible_answers = ["좋아요", "나빠요", "그냥 그래요", "잘 모르겠어요"]
generated_answer = sentence_generation(question, possible_answers)
print(generated_answer)

질문 : 안녕하세요, 오늘 기분은 어떠신가요?
답변 : 좋아요
좋아요


In [113]:
question = "오늘 날씨는 어때요?"
possible_answers = ["맑아요", "비가 와요", "흐려요", "모르겠어요"]
generated_answer = sentence_generation(question, possible_answers)
print(generated_answer)

질문 : 오늘 날씨는 어때요?
답변 : 모르겠어요
모르겠어요


In [114]:
question = "나 결혼할 수 있을까?"
possible_answers = ["글쎄요", "곧 할듯", "못해요", "소개팅부터 하세요"]
generated_answer = sentence_generation(question, possible_answers)
print(generated_answer)

질문 : 나 결혼할 수 있을까?
답변 : 글쎄요
글쎄요


In [117]:
question = "부트캠프 잘 끝낼 수 있을까?"
possible_answers = ["네", "그럼요", "힘내요", "잘 할 수 있어요"]
generated_answer = sentence_generation(question, possible_answers)
print(generated_answer)

질문 : 부트캠프 잘 끝낼 수 있을까?
답변 : 네
네


In [116]:
question = "점심 메뉴 추천"
possible_answers = ["떡볶이", "닭도리탕", "컴퓨터", "룰렛"]
generated_answer = sentence_generation(question, possible_answers)
print(generated_answer)

질문 : 점심 메뉴 추천
답변 : 떡볶이
떡볶이
