### 필요한 패키지 import

In [1]:
import glob
import os
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split

### 데이터 읽어오기

In [2]:
# Glob pattern matching every lyrics file. expanduser resolves "~" without
# requiring the HOME environment variable to be set (os.getenv('HOME') returns
# None in that case and the string concatenation would raise TypeError).
txt_file_path = os.path.expanduser('~/aiffel/lyricist/data/lyrics/*')

# glob returns paths in arbitrary order; sort so the corpus order is the same
# on every run.
txt_list = sorted(glob.glob(txt_file_path))

raw_corpus = []

# Read every txt file and collect its lines into raw_corpus.
for txt_file in txt_list:
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the lyrics files are UTF-8 — confirm.
    with open(txt_file, "r", encoding="utf-8") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:10])

데이터 크기: 187088
Examples:
 ['When somebody loved me', 'Everything was beautiful', 'Every hour we spent together', 'Lives within my heart And when she was sad', 'I was there to dry her tears', 'And when she was happy so was I', 'When she loved me Through the summer and the fall', 'We had each other that was all', 'Just she and I together', 'Like it was meant to be And when she was lonely']


### 데이터 정제

In [3]:
def preprocess_sentence(sentence):
    """Normalize one raw lyric line and wrap it in <start>/<end> markers.

    The line is lower-cased and trimmed, the punctuation marks ? . ! , ¿ are
    padded with spaces, and every run of characters other than ASCII letters
    and those punctuation marks is replaced by a single space.

    Args:
        sentence: Raw text line.

    Returns:
        The cleaned text prefixed with '<start> ' and suffixed with ' <end>'.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),     # put a space on both sides of punctuation
        (r'[" "]+', " "),            # squeeze runs of spaces / double quotes
        (r"[^a-zA-Z?.!,¿]+", " "),   # any other run of characters -> one space
    )

    cleaned = sentence.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return '<start> ' + cleaned.strip() + ' <end>'

# Longest sentence kept, counted in tokens including <start> and <end>.
MAX_TOKENS = 15

corpus = []

for sentence in raw_corpus:
    # Trim first so whitespace-only lines and lines ending in ": " are caught
    # by the two filters below.
    sentence = sentence.strip()
    if not sentence: continue
    if sentence.endswith(":"): continue
    processed = preprocess_sentence(sentence)
    token_count = len(processed.split())
    # Exactly two tokens means only <start>/<end> survived cleaning (e.g. a
    # digits-only line); such samples carry no text, so they are dropped.
    if 2 < token_count <= MAX_TOKENS:
        corpus.append(processed)

def tokenize(corpus, num_words=12000):
    """Fit a Keras tokenizer on `corpus` and return the padded id matrix.

    Args:
        corpus: Iterable of already-cleaned sentences (space-separated tokens).
        num_words: Vocabulary size limit passed to the Keras Tokenizer.
            Defaults to 12000, the value this notebook previously hard-coded.

    Returns:
        A `(tensor, tokenizer)` tuple: `tensor` is the array produced by
        `pad_sequences` (rows padded at the end), `tokenizer` is the fitted
        Tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        # Only split on spaces: the sentences are already cleaned, and the
        # default filter would strip the "<" and ">" of the marker tokens.
        filters=' ',
        oov_token="<unk>"
    )
    tokenizer.fit_on_texts(corpus)

    # Text -> lists of integer ids.
    tensor = tokenizer.texts_to_sequences(corpus)

    # padding='post' appends the padding after the sentence.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    print(tensor,tokenizer)
    return tensor, tokenizer

# Build the padded id matrix and the fitted tokenizer used by the later cells.
tensor, tokenizer = tokenize(corpus)

[[  2  45 246 ...   0   0   0]
 [  2 177  53 ...   0   0   0]
 [  2 124 860 ...   0   0   0]
 ...
 [  2   8   4 ...   0   0   0]
 [  2  44  17 ...   0   0   0]
 [  2   6 172 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7f07808c8290>


### 평가 데이터셋 분리

In [4]:
# Inputs drop the last column and targets drop the first, so the target at
# position t is the token that follows the input token at position t.
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

# Fixed seed so the train/validation split is identical on every run.
enc_train, enc_val, dec_train, dec_val = train_test_split(
    src_input, tgt_input, test_size=0.2, shuffle=True, random_state=42)

print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (124810, 14)
Target Train: (124810, 14)


### Dataset 객체 생성


In [5]:
def create_dataset(src_input, tgt_input, batch_size=256):
    """Wrap aligned input/target arrays in a shuffled, batched tf.data.Dataset.

    Args:
        src_input: Model inputs; its length is used as the shuffle buffer size.
        tgt_input: Targets aligned row-by-row with `src_input`.
        batch_size: Number of rows per batch. Defaults to 256, the value this
            function previously hard-coded.

    Returns:
        A dataset of `(src, tgt)` batches. `drop_remainder=True` discards a
        final batch that would be smaller than `batch_size`.
    """
    # Buffer as large as the data, so the shuffle covers the whole dataset.
    buffer_size = len(src_input)

    dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
    return dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

# One batched dataset for the training split and one for the held-out split.
train_dataset = create_dataset(enc_train, dec_train)
test_dataset = create_dataset(enc_val, dec_val)

### 인공지능 만들기

In [6]:
class TextGenerator(tf.keras.Model):
    """Embedding -> two stacked LSTM layers -> Dense projection over the vocabulary."""

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Create the layers.

        Args:
            vocab_size: Size of the embedding table and of the output layer.
            embedding_size: Width of each embedding vector.
            hidden_size: Number of units in each LSTM layer.
        """
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        # return_sequences=True keeps one output per time step, which the next
        # layer consumes position by position.
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        # Created without an activation argument.
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Run `x` through embedding, both LSTMs and the Dense layer, in that order."""
        embedded = self.embedding(x)
        hidden = self.rnn_2(self.rnn_1(embedded))
        return self.linear(hidden)
    
# Model hyperparameters.
embedding_size, hidden_size = 512, 1024

# NOTE(review): the "+ 1" presumably leaves room for the padding id 0 — confirm.
model = TextGenerator(
    vocab_size=tokenizer.num_words + 1,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
)

# Pull a single batch and call the model on it once before asking for the summary.
for src_sample, tgt_sample in train_dataset.take(1):
    break
model(src_sample)

model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6144512   
_________________________________________________________________
lstm (LSTM)                  multiple                  6295552   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
Total params: 33,133,793
Trainable params: 33,133,793
Non-trainable params: 0
_________________________________________________________________


In [7]:
optimizer = tf.keras.optimizers.Adam()

# from_logits=True: the loss is told that the model outputs are raw scores,
# not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
# Pass the held-out split so a validation loss is reported after every epoch;
# without validation_data only the training loss is computed.
model.fit(train_dataset, validation_data=test_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f078191d8d0>

In [8]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    """Greedily extend `init_sentence` one token at a time.

    At each step the whole sequence so far is fed to `model` and the
    highest-scoring token at the last position is appended. Generation stops
    when "<end>" is produced or the sequence holds `max_len` tokens.

    Args:
        model: Callable mapping a (1, seq_len) int tensor to per-position
            scores over the vocabulary.
        tokenizer: Fitted tokenizer providing `texts_to_sequences`,
            `word_index` and `index_word`.
        init_sentence: Prompt text; must contain at least one token.
        max_len: Maximum number of tokens in the result, prompt included.

    Returns:
        The tokens joined by spaces, with a trailing space after the last one.

    Raises:
        ValueError: If `init_sentence` contains no tokens.
    """
    test_input = tokenizer.texts_to_sequences([init_sentence])
    if not test_input[0]:
        # An empty sequence would otherwise fail later with an obscure
        # out-of-bounds slice error when taking the last position.
        raise ValueError("init_sentence must contain at least one token")
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Checking the length before predicting means a prompt that already has
    # max_len tokens is returned unchanged instead of growing by one.
    while test_tensor.shape[1] < max_len:
        predict = model(test_tensor)
        # argmax over the raw scores: softmax is monotonic, so applying it
        # first would not change which index wins.
        predict_word = tf.argmax(predict, axis=-1)[:, -1]

        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break

    # Ids without a vocabulary entry (e.g. 0, the padding id) are skipped
    # rather than raising KeyError.
    words = [
        tokenizer.index_word[word_index]
        for word_index in test_tensor[0].numpy()
        if word_index in tokenizer.index_word
    ]

    return "".join(word + " " for word in words)

# Generate a lyric line that continues the prompt "<start> i love".
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)

'<start> i love you , i love you <end> '

### 회고   

**새로 알게된 것**   

1. 지금까지는 numpy array 타입의 데이터와 라벨을 model의 fit 메서드에 직접 전달해 학습해 왔는데, 텐서플로우에서는 Dataset 객체를 생성해 사용하는 편이 더 빠르고 편리하다고 한다. 구체적으로 어떤 점에서 더 빠르고 편리한지는 따로 찾아봐야 할 것 같다.

2. tf.keras.Sequential()로 모델을 생성하고 add 메서드로 레이어를 추가하는 방식이 아니라, tf.keras.Model을 상속한 클래스와 그 메서드로 모델을 정의하는 방법을 처음 사용해 봤다. (참고: https://www.tensorflow.org/guide/keras/custom_layers_and_models)

**정리**

해당 모델로 validation loss를 10 epochs 이내에 2.2 이하로 떨어뜨리는 것이 목표였다.
처음엔 embedding_size를 256, hidden_size를 1024로 설정하고 학습을 시켰는데, 10 번째 epoch에서 2.3이 나왔다.
그 다음엔 hidden_size를 2048로 변경하고 학습을 시켰는데 비슷한 결과가 나왔다.
마지막으로 다시 hidden_size를 1024로 되돌리고, embedding_size를 512로 늘렸더니 10 번째 epoch에서 2.0에 가까운 값이 나왔다.