Colab Jupyter Notebook Version.

In [None]:
import re
import requests
from bs4 import BeautifulSoup


def get_poem():
    """Download poems 1-50 from the Yonsei Yoon Dong-ju site and save them as text files.

    For every ID from 1 to 50 the page ``poem.asp?ID=<id>`` is fetched, the
    ``<div id="con">`` element is located, its ``<br>`` tags are turned into
    newlines, and the text between the first ``</p>`` and the first ``<a`` is
    written as UTF-8 to ``poems/poemNN.txt`` (``NN`` is the zero-padded ID).
    The target directory must already exist.

    Returns:
        tuple[list[str], list[str]]: the titles and the poem bodies, in ID
        order. Earlier versions collected these lists but discarded them;
        callers that ignore the return value are unaffected.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the timeout.
        AttributeError: if a page lacks the ``div#con`` or ``p#title`` element.
    """
    titles, poems = [], []
    for i in range(1, 51):
        # Without a timeout a single stalled connection would block the crawl forever.
        response = requests.get(
            'https://yoondongju.yonsei.ac.kr/poet/poem.asp?ID=' + str(i),
            timeout=30,
        )

        # Decode the EUC-KR bytes once (undecodable bytes are dropped) and give
        # the parser text, so it never has to re-guess the encoding.
        html = response.content.decode('euc-kr', 'ignore')
        bs_obj = BeautifulSoup(html, 'html.parser')
        data = bs_obj.find('div', {'id': 'con'})

        title_text = data.find('p', {'id': 'title'}).text

        # Both <br> and <br/> become line breaks.
        poem_text = re.sub(r'<br/?>', '\n', str(data))
        # Keep what lies between the first closing </p> and the first <a tag.
        # NOTE(review): the "- 2" presumably trims whitespace before the link —
        # confirm against the page markup.
        poem_text = poem_text[poem_text.find('</p>') + 4:poem_text.find('<a') - 2]

        titles.append(title_text)
        poems.append(poem_text)

        # Two-digit, zero-padded file number: poem01.txt ... poem50.txt
        n = str(i).zfill(2)

        with open('./drive/My Drive/Colab/PoemGenerator/poems/poem' + n + '.txt', 'w', encoding='utf-8') as f:
            f.write(poem_text)

    return titles, poems


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


def build_model(vocab_size, max_len):
    """Assemble the uncompiled next-word prediction network.

    The stack is: a 32-dimensional embedding over ``vocab_size`` tokens with
    input length ``max_len - 1``, three 256-unit LSTM layers (the first two
    return full sequences, the last returns only its final output), and a
    softmax dense layer with ``vocab_size`` outputs.

    Args:
        vocab_size: number of token indices; used for the embedding input
            dimension and the output layer width.
        max_len: length of the padded training sequences; the model input is
            one shorter because the last token is used as the target.

    Returns:
        The ``Sequential`` model. Compiling is left to the caller.
    """
    lstm_units = 256
    init = 'glorot_uniform'
    layer_stack = [
        Embedding(vocab_size, 32, input_length=max_len - 1),
        LSTM(lstm_units, return_sequences=True, kernel_initializer=init),
        LSTM(lstm_units, return_sequences=True, kernel_initializer=init),
        LSTM(lstm_units, kernel_initializer=init),
        Dense(vocab_size, activation='softmax', kernel_initializer=init),
    ]
    return Sequential(layer_stack)


def sentence_generation(model, t, current_word, n, max_input_len=73):
    """Extend a seed text by ``n`` words, always taking the highest-scoring prediction.

    On each step the text produced so far is encoded with ``t``, padded or
    truncated at the front to ``max_input_len``, and passed to
    ``model.predict``; the argmax of the prediction is mapped back to a word
    and appended.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns the scores
            for one padded sequence.
        t: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        current_word: seed text the generation starts from.
        n: number of words to generate.
        max_input_len: length the encoded context is padded to. Defaults to
            73, the value that used to be hard-coded; it should match the
            model's input length.

    Returns:
        str: the seed text followed by the generated words, separated by
        single spaces.
    """
    # Build the index -> word table once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in t.word_index.items()}

    context = current_word
    generated = []
    for _ in range(n):
        encoded = t.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_input_len, padding='pre')
        result = int(np.argmax(model.predict(encoded, verbose=0)))
        # An index with no vocabulary entry (e.g. 0, the default pad value)
        # yields an empty word. The previous linear scan fell through and
        # silently used the last word of the vocabulary in that case.
        word = index_to_word.get(result, '')
        context += ' ' + word
        generated.append(word)

    return current_word + ''.join(' ' + word for word in generated)


In [None]:
import os
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint


# Working directories on the mounted Google Drive.
poems_dir = './drive/My Drive/Colab/PoemGenerator/poems'
checkpoint_dir = './drive/My Drive/Colab/PoemGenerator/checkpoints'

# makedirs also creates missing parent folders and is a no-op when the
# directory already exists, so a first run does not fail on a fresh Drive.
os.makedirs(poems_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Crawl the poems only when fewer than 50 files are cached.
if len(os.listdir(poems_dir)) < 50:
    get_poem()

# Read every poem file into a flat list of lines.
text = []
for i in range(1, 51):
    n = str(i).zfill(2)  # poem01.txt ... poem50.txt

    with open(os.path.join(poems_dir, 'poem' + n + '.txt'), 'r', encoding='utf-8') as f:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally used to eat the final character of a last line
        # that has no trailing newline, and appended an extra '' at EOF.
        for line in f:
            text.append(line.rstrip('\n'))

# Tokenizer
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1  # +1: word indices start at 1, 0 is left for padding

# Build training sequences: every prefix of length >= 2 of every encoded line.
sequences = []
for line in text:
    encoded = t.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end + 1] for end in range(1, len(encoded)))

# Reverse vocabulary lookup (index -> word).
idx2word = {index: word for word, index in t.word_index.items()}

max_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

# Split sequences into X (all tokens but the last) and Y (the last token, one-hot).
sequences = np.array(sequences)
X = sequences[:, :-1]
Y = sequences[:, -1]
Y = to_categorical(Y, num_classes=vocab_size)

checkpoint_prefix = os.path.join(checkpoint_dir, 'cpkt')
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

model = build_model(vocab_size, max_len)

# Train only when no checkpoint has been saved yet.
if not os.listdir(checkpoint_dir):
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, Y, batch_size=256, epochs=250, callbacks=[checkpoint_callback], verbose=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))


# Interactive loop: read a start word and a word count, then print the
# generated text. Entering 'x' as the start word stops the loop.
while True:
    word = input('시작 단어 입력(종료: x): ')
    if word == 'x':
        break
    try:
        text_len = int(input('단어 수 입력: '))
    except ValueError:
        # A non-integer count should not end the session; prompt again.
        print('단어 수는 정수로 입력해야 합니다.')
        continue
    print(sentence_generation(model, t, word, text_len))
    print('')
