In [1]:
import os, re 
import numpy as np
import tensorflow as tf                            ##os는 자꾸 왜 불러오는거지, re는 또 뭐구

In [2]:
file_path = os.getenv('HOME') + '/aiffel/lyricist/data/shakespeare.txt'
with open(file_path, "r") as f:
    raw_corpus = f.read().splitlines()             ##자동으로 행별 리스트 생성

In [3]:
print(raw_corpus[:9])

['First Citizen:', 'Before we proceed any further, hear me speak.', '', 'All:', 'Speak, speak.', '', 'First Citizen:', 'You are all resolved rather to die than to famish?', '']


In [4]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   # 길이가 0인 문장은 건너뜁니다.
    if sentence[-1] == ":": continue  # 문장의 끝이 : 인 문장은 건너뜁니다.    -> 대화가 아니라 화자가 누군지 표기한거라서

    if idx > 9: break   # 일단 문장 10개만 확인해 볼 겁니다.
        
    print(sentence)

Before we proceed any further, hear me speak.
Speak, speak.
You are all resolved rather to die than to famish?


In [5]:
## 문장들을 토큰화 해주는 방법
# 입력된 문장을
#     1. 소문자로 바꾸고, 양쪽 공백을 지웁니다
#     2. 특수문자 양쪽에 공백을 넣고
#     3. 여러개의 공백은 하나의 공백으로 바꿉니다
#     4. a-zA-Z?.!,¿가 아닌 모든 문자를 하나의 공백으로 바꿉니다
#     5. 다시 양쪽 공백을 지웁니다
#     6. 문장 시작에는 <start>, 끝에는 <end>를 추가합니다
# 이 순서로 처리해주면 문제가 되는 상황을 방지할 수 있겠네요!
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip() # 1
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence) # 2
    sentence = re.sub(r'[" "]+', " ", sentence) # 3
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence) # 4
    sentence = sentence.strip() # 5
    sentence = '<start> ' + sentence + ' <end>' # 6
    return sentence

In [6]:
print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence . <end>


In [7]:
corpus = []           # 문장 들어갈 리스트

In [8]:
for sentence in raw_corpus:
    # 우리가 원하지 않는 문장은 건너뜁니다
    if len(sentence) == 0: continue    # 아무것도 없는 공백
    if sentence[-1] == ":": continue   # :로 끝나는 화자표시
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)

In [9]:
corpus[:10]

['<start> before we proceed any further , hear me speak . <end>',
 '<start> speak , speak . <end>',
 '<start> you are all resolved rather to die than to famish ? <end>',
 '<start> resolved . resolved . <end>',
 '<start> first , you know caius marcius is chief enemy to the people . <end>',
 '<start> we know t , we know t . <end>',
 '<start> let us kill him , and we ll have corn at our own price . <end>',
 '<start> is t a verdict ? <end>',
 '<start> no more talking on t let it be done away , away ! <end>',
 '<start> one word , good citizens . <end>']

In [10]:
def tokenize(corpus):      #토큰화 함수
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=7000, 
        filters=' ',
        oov_token="<unk>")  # 7000개를 토큰으로, 넘치는건 unk로
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)  # 토큰화된텍스트를 텐서라는 변수에 넣고
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')  #텐서로 변환해주는 함수인가봄
    
    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)

[[   2  143   40 ...    0    0    0]
 [   2  110    4 ...    0    0    0]
 [   2   11   50 ...    0    0    0]
 ...
 [   2  149 4553 ...    0    0    0]
 [   2   34   71 ...    0    0    0]
 [   2  945   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f8590fb0370>


In [11]:
print(tensor[:3, :10])

[[   2  143   40  933  140  591    4  124   24  110]
 [   2  110    4  110    5    3    0    0    0    0]
 [   2   11   50   43 1201  316    9  201   74    9]]


In [12]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : ,
5 : .
6 : the
7 : and
8 : i
9 : to
10 : of


In [13]:
src_input = tensor[:, :-1]  
tgt_input = tensor[:, 1:]    

print(src_input[0])
print(tgt_input[0])

[  2 143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0
   0   0]
[143  40 933 140 591   4 124  24 110   5   3   0   0   0   0   0   0   0
   0   0]


BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

In [15]:
VOCAB_SIZE = tokenizer.num_words + 1   #패딩문자 : 할당된 비트수를 맞춰주기위해 0으로 채운값, 여기서 1에 해당

In [16]:
dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))      # 텐서를 데이터셋 속에 넣음.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 20), (256, 20)), types: (tf.int32, tf.int32)>

In [17]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [18]:
for src_sample, tgt_sample in dataset.take(1): break
model(src_sample)

<tf.Tensor: shape=(256, 20, 7001), dtype=float32, numpy=
array([[[ 2.16087166e-04, -2.87092233e-04,  1.13450806e-04, ...,
          1.20215060e-04, -8.62177549e-05, -1.09402055e-04],
        [ 1.52047432e-04, -7.69345323e-04,  1.76481481e-04, ...,
          2.95109756e-04, -6.36725890e-05, -3.81494232e-04],
        [ 1.88440536e-04, -1.35809788e-03,  3.48108617e-04, ...,
          5.20972710e-04, -2.26982040e-04, -4.38983698e-04],
        ...,
        [ 6.18554041e-05,  1.07557094e-03,  2.35607941e-03, ...,
         -7.11598143e-04, -4.81768249e-04, -2.20777723e-03],
        [-5.20847752e-05,  1.24820881e-03,  2.42381147e-03, ...,
         -5.59397333e-04, -3.62589839e-04, -2.36866227e-03],
        [-1.26954372e-04,  1.41539646e-03,  2.50698673e-03, ...,
         -4.10016655e-04, -2.58310378e-04, -2.51017231e-03]],

       [[ 2.16087166e-04, -2.87092233e-04,  1.13450806e-04, ...,
          1.20215060e-04, -8.62177549e-05, -1.09402055e-04],
        [ 4.94466920e-04, -4.40303149e-04,  2.

In [20]:
model.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1792256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  7176025   
Total params: 22,607,961
Trainable params: 22,607,961
Non-trainable params: 0
_________________________________________________________________


In [21]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f858f05f7c0>

In [24]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    # 단어 하나씩 예측해 문장을 만듭니다
    #    1. 입력받은 문장의 텐서를 입력합니다
    #    2. 예측된 값 중 가장 높은 확률인 word index를 뽑아냅니다
    #    3. 2에서 예측된 word index를 문장 뒤에 붙입니다
    #    4. 모델이 <end>를 예측했거나, max_len에 도달했다면 문장 생성을 마칩니다
    while True:
        # 1
        predict = model(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # tokenizer를 이용해 word index를 단어로 하나씩 변환합니다 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [25]:
generate_text(model, tokenizer, init_sentence="<start> he")

'<start> he s not a man , and i am able to see <end> '

In [26]:
generate_text(model, tokenizer, init_sentence="<start> hungry")

'<start> hungry <unk> <unk> in the <unk> s death , <end> '

In [28]:
generate_text(model, tokenizer, init_sentence="<start> dog")

'<start> dog , sir , i will not . <end> '

## 여기부터 프로젝트

In [35]:
#데이터 가져오기부터...
import glob
import os

txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'
txt_list = glob.glob(txt_file_path)

In [31]:
txt_list[:3]

['/aiffel/aiffel/lyricist/data/lyrics/leonard-cohen.txt',
 '/aiffel/aiffel/lyricist/data/lyrics/lil-wayne.txt',
 '/aiffel/aiffel/lyricist/data/lyrics/blink-182.txt']

In [32]:
raw_corpus = []

In [33]:
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

In [34]:
print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

데이터 크기: 187088
Examples:
 ["Now I've heard there was a secret chord", 'That David played, and it pleased the Lord', "But you don't really care for music, do you?"]


### 데이터 정제(전처리)

1. 문장중 길이가 0이거나 15이상인 문장 제외
2. 문장을 정규표현식을 사용해 기호, 띄어쓰기등이 없도록 변환

### 토큰화

1. 문장을 단어로 변환 후 각 단어에 인덱스를 붙여서 텐서로 변환

In [70]:
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   
    if len(sentence) > 75: continue
    if idx > 5: break   
        
    print(sentence)     ## 15개는 너무 작은가보다. 애시당초 센텐스면 단어 개수가 아니라 알파벳으로 자르는건가
                        ## 1줄에 대충 띄어쓰기 포함 5단어라고 치고 걸러보자  -> 나중에 행렬에서 열이 너무 많아짐

Now I've heard there was a secret chord
That David played, and it pleased the Lord
But you don't really care for music, do you?
It goes like this
The fourth, the fifth
The minor fall, the major lift


In [44]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()                       # 소문자, 좌우 공백삭제
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)       # 특수문자 좌우에 공백넣기
    sentence = re.sub(r'[" "]+', " ", sentence)               # 2개이상의 공백은 1개로
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)     #  특수문자 = 공백 치환
    sentence = sentence.strip()                               # 공백 삭제
    sentence = '<start> ' + sentence + ' <end>'               #문장 처음과 끝에 <start>, <end> 넣기
    return sentence

In [72]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    preprocessed_sentence = preprocess_sentence(sentence)
    if len(preprocessed_sentence.split()) > 15: continue          #오케이 됐다
    corpus.append(preprocessed_sentence)

In [73]:
print(corpus[:5])            #?가 보이는데 잘못된건가?, 아 ?, !는 빼고구나 잘 된거같다.

['<start> now i ve heard there was a secret chord <end>', '<start> that david played , and it pleased the lord <end>', '<start> but you don t really care for music , do you ? <end>', '<start> it goes like this <end>', '<start> the fourth , the fifth <end>']


In [75]:
def tokenize(corpus):     
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,    # 이전에 unk가 많이 나와서 좀 늘렸음
        filters=' ',
        oov_token="<unk>")  
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)  
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post') 
    
    print(tensor,tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)               ## 토큰으로 잘 바뀌었다. 2로 시작하는거 보니 잘되었군

[[   2   50    4 ...    0    0    0]
 [   2   15 2971 ...    0    0    0]
 [   2   33    7 ...   46    3    0]
 ...
 [   2    4  117 ...    0    0    0]
 [   2  258  195 ...   12    3    0]
 [   2    7   34 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7f84961b3dc0>


### 메모리 할당 / 데이터 셋 분리

In [76]:
src_input = tensor[:, :-1]  
tgt_input = tensor[:, 1:]    

BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE
VOCAB_SIZE = tokenizer.num_words + 1

In [77]:
dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input)) 
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>

In [63]:
##여기까지하고 분류하는게 맞겠지?
from sklearn.model_selection import train_test_split

In [78]:
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size=0.2, random_state=37)

In [79]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)

Source Train: (124981, 14)
Target Train: (124981, 14)


대충 성공적이다
### 모델에 맞춰 훈련시키기

In [81]:
class lyricist(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256
hidden_size = 1024
model = lyricist(tokenizer.num_words + 1, embedding_size , hidden_size)

In [82]:
for enc_train, dec_train in dataset.take(1): break
model(enc_train)

<tf.Tensor: shape=(256, 14, 12001), dtype=float32, numpy=
array([[[-1.32260684e-04,  1.76336165e-04,  2.23136449e-05, ...,
          1.98939990e-04,  6.49734138e-05, -5.33336861e-05],
        [-8.88010545e-05,  4.12185793e-04, -1.97884801e-04, ...,
          1.09123110e-04,  2.05781507e-05, -2.32958730e-04],
        [-9.07630019e-05,  5.77206258e-04, -4.07557236e-04, ...,
          1.45193017e-05,  6.13477096e-05, -2.86252412e-04],
        ...,
        [ 5.88754134e-04, -1.97822286e-04,  2.10699000e-04, ...,
         -5.38915803e-04,  2.47444957e-04, -4.06241015e-04],
        [ 5.44317125e-04,  8.14986925e-05,  4.71767649e-04, ...,
         -2.89131858e-04,  3.13071156e-04, -3.55269643e-04],
        [ 4.09154600e-04,  4.59551520e-04,  5.55857201e-04, ...,
         -3.73169933e-05,  4.86185862e-04, -3.34214972e-04]],

       [[-1.32260684e-04,  1.76336165e-04,  2.23136449e-05, ...,
          1.98939990e-04,  6.49734138e-05, -5.33336861e-05],
        [-2.06493511e-04,  1.20087410e-04, -3

In [83]:
model.summary()

Model: "lyricist"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  3072256   
_________________________________________________________________
lstm_4 (LSTM)                multiple                  5246976   
_________________________________________________________________
lstm_5 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense_2 (Dense)              multiple                  12301025  
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________


In [84]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8496071040>

In [88]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    while True:
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [90]:
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=30)

'<start> i love you , i love you <end> '

In [91]:
generate_text(model, tokenizer, init_sentence="<start> i", max_len=30)

'<start> i m not gonna be a poet <end> '

In [92]:
generate_text(model, tokenizer, init_sentence="<start> i hate", max_len=30)

'<start> i hate to see the <unk> <end> '

In [93]:
generate_text(model, tokenizer, init_sentence="<start> you love", max_len=30)

'<start> you love me , you know , you know <end> '

In [100]:
generate_text(model, tokenizer, init_sentence="<start> don t look back", max_len=30)

'<start> don t look back , i m not gonna leave <end> '

## 결과 분석

1. 운좋게 파라미터값을 수정하지 않았는데도 기준치를 통과했다. 학습에 시간이 오래걸려서 다행이었다.

2. 적절한 언어를 내놓는게 신기하긴 하지만 결과는 좀 아쉬운 편... 비슷한 단어만 계속 말하는 수준이다.

3. nlp를 처음 접해봐서 조금 충격적이다. 글쓰는 것이 아무리 예술의 영역이라도 일정한 form이 있기 때문에
  언젠간 작가란 직업이 사라질수도 있겠다는 생각이 든다. 인공지능의 가진 양면을 고민해보게 되었다.
  
4. 그치만 좀 이해가 안되는 부분은, 문법을 배제하는 점. 문법은 논리적인 부분이라 인간에게는 어려워도
  컴퓨터에겐 오히려 쉬운 영역일텐데... 어째서 통계에만 의존하는지 잘모르겠다. 
  언어의 사용이라는 측면이 무에서 유를 창조하는거라곤 하지만, 가이드라인정도는 미리 줄 수 있을꺼도 같은데..