In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.optimizers import *
# from keras.utils import np_utils
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
from konlpy.corpus import kolaw
from konlpy.tag import Okt

# Read the Korean constitution text bundled with KoNLPy. The context manager
# closes the corpus file handle as soon as the text has been read.
with kolaw.open('constitution.txt') as corpus_file:
    c = corpus_file.read()

# Split the raw text into a list of sentences.
senstents = list(sent_tokenize(c))

print(senstents[3])     # sample sentence (Article 2, clause 1 in the recorded run)
print(len(senstents))   # 357 sentences in the recorded run


제2조 ① 대한민국의 국민이 되는 요건은 법률로 정한다.
357


In [9]:
# ---------- Preprocessing ----------
twitter = Okt()

# POS tags and surface forms that are removed from every sentence.
skip_tags = ('Number', "Foreign")
skip_words = ("제", "조")


def _keep_morphemes(sentence):
    """Return the sentence's morphemes joined by single spaces, minus the skipped ones."""
    return " ".join(
        w for w, t in twitter.pos(sentence)
        if t not in skip_tags and w not in skip_words
    )


doc0 = [_keep_morphemes(s) for s in sent_tokenize(c)]
print(len(doc0))  # 357 in the recorded run
print(doc0[3])    # the sample sentence as space-separated morphemes

# Map every morpheme to an integer id; sentences with fewer than two ids
# cannot form an (input, target) pair and are dropped.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc0)
encoded = tokenizer.texts_to_sequences(doc0)
doc = [ids for ids in encoded if len(ids) >= 2]
print(len(doc))  # 354 in the recorded run
print(doc[3])    # integer ids of one encoded sentence

# The longest model input is the longest sentence minus its final (target) token.
maxlen = max(len(ids) for ids in doc) - 1    # 187 in the recorded run
# One more than the number of known words.
vocab_size = 1 + len(tokenizer.word_index)   # 1165 in the recorded run

print(maxlen, vocab_size)  # 187 1165

357
대한민국 의 국민 이 되는 요건 은 법률 로 정 한다 .
354
[102, 1, 22, 5, 111, 653, 4, 9, 24, 13, 6]
187 1165


In [11]:
############   Data Generation ##################
import  numpy as np

def generate_data(X, maxlen, vocab_size):
    """Yield next-word training arrays, one (inputs, targets) pair per sentence.

    Args:
        X: iterable of sentences, each a sequence of integer token ids.
        maxlen: length every input prefix is padded to via ``pad_sequences``.
        vocab_size: number of classes used for the one-hot targets.

    Yields:
        A tuple ``(inputs_sequence, y)``: the padded prefixes of the sentence
        and the one-hot encoding of the token that follows each prefix.
    """
    for token_ids in X:
        # Each position after the first is a target; its input is everything before it.
        positions = range(1, len(token_ids))
        prefixes = [token_ids[:pos] for pos in positions]
        next_tokens = [token_ids[pos] for pos in positions]
        one_hot = to_categorical(next_tokens, vocab_size)
        padded = sequence.pad_sequences(prefixes, maxlen=maxlen)
        yield (padded, one_hot)

# Show the arrays produced for the first three sentences only; pairing the
# generator with range(3) stops the loop without consuming further sentences.
for i, (x, y) in zip(range(3), generate_data(doc, maxlen, vocab_size)):
    print("i", i)
    print("x", x.shape, "\n", x)
    print("y", y.shape, "\n", y)


# Materialise the per-sentence (inputs, targets) pairs once, then stack them
# along the first axis into the full training arrays.
per_sentence = list(generate_data(doc, maxlen, vocab_size))

X = np.concatenate([inputs for inputs, _ in per_sentence])
Y = np.concatenate([targets for _, targets in per_sentence])

i 0
x (187, 187) 
 [[  0   0   0 ...   0   0 102]
 [  0   0   0 ...   0 102  28]
 [  0   0   0 ... 102  28 602]
 ...
 [  0   0 102 ... 647 155   2]
 [  0 102  28 ... 155   2  20]
 [102  28 602 ...   2  20 180]]
y (187, 1165) 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
i 1
x (6, 187) 
 [[  0   0   0 ...   0   0  45]
 [  0   0   0 ...   0  45 439]
 [  0   0   0 ...  45 439 648]
 [  0   0   0 ... 439 648 102]
 [  0   0   0 ... 648 102   4]
 [  0   0   0 ... 102   4 649]]
y (6, 1165) 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
i 2
x (12, 187) 
 [[  0   0   0 ...   0   0 102]
 [  0   0   0 ...   0 102   1]
 [  0   0   0 ... 102   1 440]
 ...
 [  0   0   0 ...  34 651   4]
 [  0   0   0 ... 651   4  22]
 [  0   0   0 ...   4  22 331]]
y (12, 1165) 
 [[0. 1. 0. ... 0. 0. 0.

In [12]:
# ---------- Model ----------
# Embedding -> LSTM (last state only) -> Dropout -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 100, input_length=maxlen),
    LSTM(100, return_sequences=False),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 187, 100)          116500    
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1165)              117665    
                                                                 
Total params: 314565 (1.20 MB)
Trainable params: 314565 (1.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# ---------- Training ----------
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=["accuracy"])
hist = model.fit(X, Y, epochs=500, batch_size=800, verbose=2)

# Save the model right after training so the result of the long run is on disk
# even if the plotting code below fails.
model.save("rnn_text_gen.hdf5")

import matplotlib.pyplot as plt
# The history key follows the metric name: the training log reports "accuracy",
# while older Keras releases recorded it as "acc". Use whichever is present.
acc_key = 'accuracy' if 'accuracy' in hist.history else 'acc'
plt.plot(hist.history[acc_key])
plt.show()



Epoch 1/500
9/9 - 6s - loss: 6.6710 - accuracy: 0.0194 - 6s/epoch - 625ms/step
Epoch 2/500
9/9 - 5s - loss: 5.8505 - accuracy: 0.0392 - 5s/epoch - 591ms/step
Epoch 3/500
9/9 - 5s - loss: 5.7513 - accuracy: 0.0444 - 5s/epoch - 607ms/step
Epoch 4/500
9/9 - 5s - loss: 5.7326 - accuracy: 0.0492 - 5s/epoch - 582ms/step
Epoch 5/500
9/9 - 5s - loss: 5.7130 - accuracy: 0.0500 - 5s/epoch - 567ms/step
Epoch 6/500
9/9 - 6s - loss: 5.6880 - accuracy: 0.0484 - 6s/epoch - 624ms/step
Epoch 7/500
9/9 - 6s - loss: 5.6701 - accuracy: 0.0506 - 6s/epoch - 631ms/step
Epoch 8/500
9/9 - 6s - loss: 5.6394 - accuracy: 0.0513 - 6s/epoch - 623ms/step
Epoch 9/500
9/9 - 6s - loss: 5.6017 - accuracy: 0.0529 - 6s/epoch - 629ms/step
Epoch 10/500
9/9 - 6s - loss: 5.5684 - accuracy: 0.0522 - 6s/epoch - 617ms/step
Epoch 11/500
9/9 - 6s - loss: 5.5227 - accuracy: 0.0536 - 6s/epoch - 612ms/step
Epoch 12/500
9/9 - 5s - loss: 5.4774 - accuracy: 0.0551 - 5s/epoch - 605ms/step
Epoch 13/500
9/9 - 6s - loss: 5.4397 - accuracy: 

Epoch 104/500
9/9 - 5s - loss: 3.0617 - accuracy: 0.3595 - 5s/epoch - 606ms/step
Epoch 105/500
9/9 - 5s - loss: 3.0606 - accuracy: 0.3507 - 5s/epoch - 597ms/step
Epoch 106/500
9/9 - 5s - loss: 3.0439 - accuracy: 0.3575 - 5s/epoch - 598ms/step
Epoch 107/500
9/9 - 5s - loss: 3.0189 - accuracy: 0.3655 - 5s/epoch - 588ms/step
Epoch 108/500
9/9 - 5s - loss: 2.9969 - accuracy: 0.3681 - 5s/epoch - 582ms/step
Epoch 109/500
9/9 - 5s - loss: 2.9906 - accuracy: 0.3653 - 5s/epoch - 585ms/step
Epoch 110/500
9/9 - 5s - loss: 2.9750 - accuracy: 0.3707 - 5s/epoch - 594ms/step
Epoch 111/500
9/9 - 5s - loss: 2.9533 - accuracy: 0.3674 - 5s/epoch - 582ms/step
Epoch 112/500
9/9 - 5s - loss: 2.9445 - accuracy: 0.3781 - 5s/epoch - 583ms/step
Epoch 113/500
9/9 - 5s - loss: 2.9242 - accuracy: 0.3715 - 5s/epoch - 573ms/step
Epoch 114/500
9/9 - 5s - loss: 2.9211 - accuracy: 0.3769 - 5s/epoch - 588ms/step
Epoch 115/500
9/9 - 5s - loss: 2.9006 - accuracy: 0.3817 - 5s/epoch - 583ms/step
Epoch 116/500
9/9 - 5s - los

9/9 - 5s - loss: 1.8775 - accuracy: 0.5663 - 5s/epoch - 611ms/step
Epoch 206/500
9/9 - 5s - loss: 1.8779 - accuracy: 0.5611 - 5s/epoch - 594ms/step
Epoch 207/500
9/9 - 5s - loss: 1.8683 - accuracy: 0.5644 - 5s/epoch - 573ms/step
Epoch 208/500
9/9 - 5s - loss: 1.8560 - accuracy: 0.5641 - 5s/epoch - 594ms/step
Epoch 209/500
9/9 - 5s - loss: 1.8590 - accuracy: 0.5732 - 5s/epoch - 593ms/step
Epoch 210/500
9/9 - 5s - loss: 1.8342 - accuracy: 0.5781 - 5s/epoch - 545ms/step
Epoch 211/500
9/9 - 5s - loss: 1.8245 - accuracy: 0.5774 - 5s/epoch - 550ms/step
Epoch 212/500
9/9 - 5s - loss: 1.8191 - accuracy: 0.5783 - 5s/epoch - 559ms/step
Epoch 213/500
9/9 - 5s - loss: 1.8225 - accuracy: 0.5774 - 5s/epoch - 570ms/step
Epoch 214/500
9/9 - 5s - loss: 1.8061 - accuracy: 0.5877 - 5s/epoch - 585ms/step
Epoch 215/500
9/9 - 5s - loss: 1.8074 - accuracy: 0.5774 - 5s/epoch - 566ms/step
Epoch 216/500
9/9 - 5s - loss: 1.8149 - accuracy: 0.5689 - 5s/epoch - 566ms/step
Epoch 217/500
9/9 - 5s - loss: 1.7774 - ac

Epoch 307/500
9/9 - 5s - loss: 1.2917 - accuracy: 0.6883 - 5s/epoch - 569ms/step
Epoch 308/500
9/9 - 5s - loss: 1.2686 - accuracy: 0.7012 - 5s/epoch - 568ms/step
Epoch 309/500
9/9 - 5s - loss: 1.2737 - accuracy: 0.6860 - 5s/epoch - 572ms/step
Epoch 310/500
9/9 - 5s - loss: 1.2582 - accuracy: 0.6952 - 5s/epoch - 575ms/step
Epoch 311/500
9/9 - 5s - loss: 1.2585 - accuracy: 0.6967 - 5s/epoch - 545ms/step
Epoch 312/500
9/9 - 5s - loss: 1.2677 - accuracy: 0.6963 - 5s/epoch - 577ms/step
Epoch 313/500
9/9 - 5s - loss: 1.2459 - accuracy: 0.6950 - 5s/epoch - 588ms/step
Epoch 314/500
9/9 - 5s - loss: 1.2437 - accuracy: 0.6964 - 5s/epoch - 588ms/step
Epoch 315/500
9/9 - 5s - loss: 1.2495 - accuracy: 0.6974 - 5s/epoch - 567ms/step
Epoch 316/500
9/9 - 5s - loss: 1.2507 - accuracy: 0.6981 - 5s/epoch - 578ms/step
Epoch 317/500
9/9 - 5s - loss: 1.2594 - accuracy: 0.6944 - 5s/epoch - 576ms/step
Epoch 318/500
9/9 - 5s - loss: 1.2291 - accuracy: 0.7010 - 5s/epoch - 560ms/step
Epoch 319/500
9/9 - 5s - los

9/9 - 5s - loss: 0.9665 - accuracy: 0.7599 - 5s/epoch - 584ms/step
Epoch 409/500
9/9 - 5s - loss: 0.9635 - accuracy: 0.7589 - 5s/epoch - 561ms/step
Epoch 410/500
9/9 - 5s - loss: 0.9468 - accuracy: 0.7655 - 5s/epoch - 552ms/step
Epoch 411/500
9/9 - 5s - loss: 0.9641 - accuracy: 0.7643 - 5s/epoch - 563ms/step
Epoch 412/500
9/9 - 5s - loss: 0.9562 - accuracy: 0.7656 - 5s/epoch - 577ms/step
Epoch 413/500
9/9 - 5s - loss: 0.9508 - accuracy: 0.7661 - 5s/epoch - 578ms/step
Epoch 414/500
9/9 - 5s - loss: 0.9490 - accuracy: 0.7674 - 5s/epoch - 561ms/step
Epoch 415/500
9/9 - 5s - loss: 0.9532 - accuracy: 0.7636 - 5s/epoch - 560ms/step
Epoch 416/500
9/9 - 5s - loss: 0.9397 - accuracy: 0.7665 - 5s/epoch - 579ms/step
Epoch 417/500
9/9 - 5s - loss: 0.9296 - accuracy: 0.7694 - 5s/epoch - 570ms/step
Epoch 418/500
9/9 - 5s - loss: 0.9361 - accuracy: 0.7672 - 5s/epoch - 570ms/step
Epoch 419/500
9/9 - 5s - loss: 0.9291 - accuracy: 0.7693 - 5s/epoch - 577ms/step
Epoch 420/500
9/9 - 5s - loss: 0.9279 - ac

In [None]:
# ---------- Load the saved model ----------
from keras.models import load_model
model = load_model("rnn_text_gen.hdf5")

# Prompt words, already split on spaces.
# NOTE(review): the preprocessed corpus shows this sentence ending in "정 한다"
# rather than "정한 다"; only the leading words are used below - confirm intent.
word_list = '대한민국 의 국민 이 되는 요건 은 법률 로 정한 다 .'.split(" ")

# Inverse of tokenizer.word_index: integer id -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Encode the first two prompt words and pad them to the model's input length.
prompt_ids = [tokenizer.word_index[w] for w in word_list[:2]]
x = sequence.pad_sequences([prompt_ids], maxlen=maxlen)

# Predicted distribution over the vocabulary for the single prompt.
p = model.predict(x)[0]

# Class ids ordered from most to least probable.
idx = np.argsort(p)[::-1]

for i in idx[:5]:
    print(reverse_word_map[i])

def predict_word(i, n=1):
    """Print the n most probable next words after the first i words of ``word_list``.

    Args:
        i: number of leading words of the module-level ``word_list`` used as the prompt.
        n: number of candidates to print, most probable first.

    Returns:
        None. One line per candidate is printed: the quoted prompt, the
        candidate word and its probability in percent.
    """
    prompt_words = word_list[:i]
    vocab = tokenizer.word_index
    # Words the tokenizer never indexed (e.g. the "." in word_list, which has no
    # id in the encoded corpus) are skipped instead of raising KeyError.
    prompt_ids = [vocab[w] for w in prompt_words if w in vocab]
    x = sequence.pad_sequences([prompt_ids], maxlen=maxlen)
    p = model.predict(x)[0]
    idx = np.argsort(p)[::-1]  # class ids, most probable first
    for j in idx[:n]:
        # Class 0 is the padding value and has no entry in reverse_word_map,
        # so fall back to a placeholder rather than failing on the lookup.
        word = reverse_word_map.get(j, "<pad>")
        print('"', " ".join(prompt_words), '"', word, " (p={:4.2f}%)".format(100 * p[j]))


# predict_word prints its own report and returns nothing, so it is called
# directly; wrapping it in print() only added a stray "None" after each report.
for prompt_length in (1, 2, 3):
    predict_word(prompt_length, n=3)
