[E04] Making Lyrics Writer

In [11]:
import glob
import os
import re 
import numpy as np
import tensorflow as tf

1. 데이터 읽기

In [5]:
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

# 여러개의 txt 파일을 모두 읽어서 raw_corpus 에 담습니다.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ["Now I've heard there was a secret chord", 'That David played, and it pleased the Lord', "But you don't really care for music, do you?"]


2. 데이터 처리

In [28]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip() 
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)  
    sentence = re.sub(r'[" "]+', " ", sentence) 
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) 
    sentence = sentence.strip() 
    sentence = '<start> ' + sentence + ' <end>' 
    return sentence

print(preprocess_sentence(raw_corpus[0]))

<start> now i ve heard there was a secret chord <end>


- 데이터 정제
1. 소문자로 바꾸고, 양쪽 공백을 지우기
2. 특수문자 양쪽에 공백을 넣기
3. 여러개의 공백은 하나의 공백으로 바꾸기
4. a-zA-Z?.!,¿가 아닌 모든 문자를 하나의 공백으로 바꾸기
5. 다시 양쪽 공백을 지우기
6. 문장 시작에는 <start>, 끝에는 <end>를 추가

In [35]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: 
        continue
    
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
        
corpus[:10]

['<start> now i ve heard there was a secret chord <end>',
 '<start> that david played , and it pleased the lord <end>',
 '<start> but you don t really care for music , do you ? <end>',
 '<start> it goes like this <end>',
 '<start> the fourth , the fifth <end>',
 '<start> the minor fall , the major lift <end>',
 '<start> the baffled king composing hallelujah hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah <end>',
 '<start> hallelujah your faith was strong but you needed proof <end>']

-길이가 0인 것을 제외하고 말뭉치에 추가

In [39]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words = 14000,
        filters=' ',
        oov_token="<unk>")
    
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = [x for x in tensor if len(x) <= 15]
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2   50    5 ...    0    0    0]
 [   2   17 2643 ...    0    0    0]
 [   2   35    7 ...   43    3    0]
 ...
 [   2    5  107 ...    0    0    0]
 [   2  261  200 ...   12    3    0]
 [   2    7   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7ff167418640>


-텐서플로우의 tokenizer를 활용해 말뭉치를 텐서로 변환

In [17]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : i
6 : the
7 : you
8 : and
9 : a
10 : to


-구축된 단어사전 확인

In [41]:
src_input = tensor[:, :-1]  
tgt_input = tensor[:, 1:]    

print(src_input[0])
print(tgt_input[0])

[   2   50    5   91  297   64   57    9  970 6048    3    0    0    0]
[  50    5   91  297   64   57    9  970 6048    3    0    0    0    0]


- 뒤의 토큰을 잘라내 소스 문장을, 앞의 토큰을 잘라내 타겟 문장을 만든다.
- 시퀀스 길이보다 짧은 문장은 패딩문자 <pad>로 채워진다. 0이 <pad> 일것. -> 마지막 토큰의 대부분은 <end>가 아니라 <pad>일 것이다.

3. 학습 평가 데이터 분리

In [68]:
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1   

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

- 5번째 줄의 +1은 패딩문자

In [73]:
from sklearn.model_selection import train_test_split

enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size=0.2, random_state=42)

print("shape of source train set: ", enc_train.shape)
print("shape of target train set: ", dec_train.shape)

shape of source train set:  (124981, 14)
shape of target train set:  (124981, 14)


4. 모델 설계, 학습

In [74]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 516
hidden_size = 2048
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [76]:
for src_sample, tgt_sample in dataset.take(1): break

model(src_sample)

<tf.Tensor: shape=(256, 14, 14001), dtype=float32, numpy=
array([[[-5.78149848e-05, -1.13515249e-04, -1.85804558e-04, ...,
         -1.79725539e-04, -3.06374423e-04, -1.97737536e-04],
        [ 2.60355329e-04,  4.99273665e-05, -2.82239751e-04, ...,
         -1.39900309e-04, -2.61207548e-04, -3.06390895e-04],
        [ 4.36784467e-04,  2.32820399e-04, -4.31363878e-04, ...,
         -2.15692795e-04, -1.26270141e-04, -1.19117904e-04],
        ...,
        [-1.00755936e-03,  6.60831283e-04, -4.79863869e-04, ...,
          4.48741572e-04,  1.57329207e-03, -6.27967820e-04],
        [-1.47650030e-03,  1.28646928e-03, -6.11653741e-05, ...,
          3.03866051e-04,  1.81185326e-03, -5.08448516e-04],
        [-1.90108072e-03,  1.93406874e-03,  4.05342260e-04, ...,
          2.18282948e-04,  1.94689154e-03, -2.83511326e-04]],

       [[-5.78149848e-05, -1.13515249e-04, -1.85804558e-04, ...,
         -1.79725539e-04, -3.06374423e-04, -1.97737536e-04],
        [-3.44416825e-04, -3.12199118e-04, -4

In [77]:
model.summary()

Model: "text_generator_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      multiple                  7224516   
_________________________________________________________________
lstm_6 (LSTM)                multiple                  21012480  
_________________________________________________________________
lstm_7 (LSTM)                multiple                  33562624  
_________________________________________________________________
dense_3 (Dense)              multiple                  28688049  
Total params: 90,487,669
Trainable params: 90,487,669
Non-trainable params: 0
_________________________________________________________________


In [78]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer)
model.fit(enc_train, dec_train, epochs=10, validation_data=(enc_val, dec_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff09a5d84c0>

In [None]:
5. 평가

In [79]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]


    while True:

        predict = model(test_tensor) 
 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 

        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)

        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""

    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated    

In [80]:
generate_text(model, tokenizer, init_sentence="<start> She lives", max_len=20)

'<start> she lives in the crowd and shed not a tear <end> '

6. 회고

- 토큰의 갯수를 15개로 제한하기 위해 말뭉치를 텐서로 변환할 때 maxlen= 15를 쓰면 임의로 잘려버려 <start>로 시작하지 않는 문장이 생기는 문제가 발생했다.
- validation loss 가 줄어들다가 다시 늘어나는 것은 무슨 문제인지 아직도 도무지 알 수가 없다.
- 데이터셋을 만들었지만 학습은 이전과 똑같이 한 것이 아쉬움.
- 중복된 것을 제거하고 다시 해보고 싶은 생각이 들었지만, 학습에 시간이 너무 걸려 엄두를 못냈다.