# 텍스트 생성하기(RNN)

In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Toy training corpus: three Korean sentences.
# Each sentence is followed by TWO newline characters (an escaped "\n" plus a
# line break in the original literal), so splitting on '\n' yields an empty
# string after every sentence.
text = (
    "경마장에 있는 말이 다쳤다.\n\n"
    "그의 말은 매우 빠르다.\n\n"
    "가는 말이 고운데 왜 오는 말은 미울까\n\n"
)


In [4]:
# Fit a word-level tokenizer on the whole corpus (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size = number of integer-encoded words + 1, because word indices
# start at 1 and index 0 is used for padding.
vocab_size = len(tokenizer.word_index) + 1

print(vocab_size)

14


In [5]:
# Show the word -> integer index mapping learned by the tokenizer.
print(tokenizer.word_index)

{'말이': 1, '말은': 2, '경마장에': 3, '있는': 4, '다쳤다': 5, '그의': 6, '매우': 7, '빠르다': 8, '가는': 9, '고운데': 10, '왜': 11, '오는': 12, '미울까': 13}


In [6]:
# Integer-encode the corpus one line at a time (lines are separated by '\n').
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))

sequences = []
for encoded in encoded_lines:
    # Turn each encoded line into every prefix of length >= 2, so the last
    # element of each prefix can serve as the word to predict.
    # Empty and single-word lines contribute nothing.
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print('data 수', len(sequences))

data 수 12


In [7]:
# Inspect the generated training sequences (lists of word indices).
print(sequences)

[[3, 4], [3, 4, 1], [3, 4, 1, 5], [6, 2], [6, 2, 7], [6, 2, 7, 8], [9, 1], [9, 1, 10], [9, 1, 10, 11], [9, 1, 10, 11, 12], [9, 1, 10, 11, 12, 2], [9, 1, 10, 11, 12, 2, 13]]


In [8]:
# Pad to the length of the longest sequence instead of a hard-coded value, so
# the cell keeps working when the corpus changes (7 for the current corpus).
maxlen = max(len(sequence) for sequence in sequences)

# With its default settings pad_sequences fills with 0 on the left, which
# keeps the target word in the last column of every row.
sequences = pad_sequences(sequences, maxlen=maxlen)
print(sequences)

[[ 0  0  0  0  0  3  4]
 [ 0  0  0  0  3  4  1]
 [ 0  0  0  3  4  1  5]
 [ 0  0  0  0  0  6  2]
 [ 0  0  0  0  6  2  7]
 [ 0  0  0  6  2  7  8]
 [ 0  0  0  0  0  9  1]
 [ 0  0  0  0  9  1 10]
 [ 0  0  0  9  1 10 11]
 [ 0  0  9  1 10 11 12]
 [ 0  9  1 10 11 12  2]
 [ 9  1 10 11 12  2 13]]


In [9]:
# Make sure we are working with a 2-D NumPy array (rows = padded sequences).
sequences = np.array(sequences)

# Every column except the last is the model input (X);
# the last column is the word to predict (y).
X, y = sequences[:, :-1], sequences[:, -1]

In [11]:
# One-hot encode the target word indices into vocab_size classes.
# NOTE: y is overwritten, so re-running this cell on its own would encode the
# already one-hot array a second time; re-create y from sequences first.
y = to_categorical(y, num_classes=vocab_size)
print(y)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, SimpleRNN

In [16]:
# Next-word classifier: embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential([
    # input_dim = vocab_size, output_dim = 10
    Embedding(vocab_size, 10),
    SimpleRNN(32),
    # Kept from the original architecture ahead of the classification layer.
    Flatten(),
    Dense(vocab_size, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.1667 - loss: 2.6340
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1667 - loss: 2.6204
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.2500 - loss: 2.6069
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.2500 - loss: 2.5933
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.2500 - loss: 2.5795
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.2500 - loss: 2.5655
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.3333 - loss: 2.5511
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.3333 - loss: 2.5363
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1e8bbd1b970>

In [20]:
def sentence_generation(model, tokenizer, current_word, n, max_len=6):
    """Generate up to ``n`` words after ``current_word`` by greedy prediction.

    At every step the text generated so far is encoded and padded, the model
    scores the next word, and the highest-scoring word is appended.

    Args:
        model: Trained model; ``model.predict`` must return one row of
            per-word-index scores for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text that the generated words are appended to.
        n: Maximum number of words to generate.
        max_len: Length the encoded context is padded/truncated to. The
            default of 6 matches the width of the training input ``X``
            (sequences padded to 7, minus the target column).

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``n`` words are produced if the model predicts an
        index that has no word (e.g. the padding index 0).
    """
    # Build the index -> word lookup once instead of scanning word_index on
    # every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    context = current_word
    generated = []

    for _ in range(n):
        # Encode and pad the text generated so far.
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_len)

        # Pick the most probable next word index (as a plain int, not an array).
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary entry for this index: stop rather than append an
            # unrelated word.
            break

        # Feed the prediction back in as context for the next step.
        context = context + ' ' + word
        generated.append(word)

    # Seed text followed by the predicted words.
    return ' '.join([current_word, *generated])

경마장에 있는 말이 다쳤다


In [27]:
# Generate and print 3 words following the seed word '그의'.
print(sentence_generation(model, tokenizer, '그의', 3))

그의 말은 매우 빠르다
