In [1]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
from keras.optimizers import RMSprop

import numpy as np
import random
import sys

ModuleNotFoundError: No module named 'keras'

#### 셰익스피어 작품을 읽는다

In [2]:
# Read the whole corpus as a single lower-cased string. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open('shakespeare_final.txt') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# The sorted set of distinct characters is the vocabulary used by every
# later cell (one-hot width, output layer size, index lookups).
characters = sorted(set(text))
print('total chars:', len(characters))
print(characters)

corpus length: 581432
total chars: 61
['\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~']


In [3]:
# Forward lookup: character -> integer id (its position in the sorted vocabulary).
char2indices = {symbol: position for position, symbol in enumerate(characters)}
# Reverse lookup: integer id -> character, used to decode sampled indices.
indices2char = dict(enumerate(characters))
print(char2indices)
print()
print(indices2char)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '<': 26, '>': 27, '?': 28, '@': 29, '[': 30, ']': 31, '_': 32, 'a': 33, 'b': 34, 'c': 35, 'd': 36, 'e': 37, 'f': 38, 'g': 39, 'h': 40, 'i': 41, 'j': 42, 'k': 43, 'l': 44, 'm': 45, 'n': 46, 'o': 47, 'p': 48, 'q': 49, 'r': 50, 's': 51, 't': 52, 'u': 53, 'v': 54, 'w': 55, 'x': 56, 'y': 57, 'z': 58, '|': 59, '~': 60}

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '&', 6: "'", 7: '(', 8: ')', 9: '*', 10: ',', 11: '-', 12: '.', 13: '/', 14: '0', 15: '1', 16: '2', 17: '3', 18: '4', 19: '5', 20: '6', 21: '7', 22: '8', 23: '9', 24: ':', 25: ';', 26: '<', 27: '>', 28: '?', 29: '@', 30: '[', 31: ']', 32: '_', 33: 'a', 34: 'b', 35: 'c', 36: 'd', 37: 'e', 38: 'f', 39: 'g', 40: 'h', 41: 'i', 42: 'j', 43: 'k', 44: 'l', 45: 'm', 46: 'n', 47: 'o', 48: 'p', 49: 'q', 50: 'r', 51: 's'

#### cut the text in semi-redundant sequences of maxlen characters

In [4]:
# Slice the corpus into overlapping windows: every `step` characters take a
# window of `maxlen` characters as the input and the character immediately
# after it as the target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 193798


In [5]:
# Preview the first ten input windows and the target character of each one.
preview_count = 10
print(sentences[:preview_count])
print(next_chars[:preview_count])

['the project gutenberg ebook of the compl', ' project gutenberg ebook of the complete', 'oject gutenberg ebook of the complete wo', 'ct gutenberg ebook of the complete works', 'gutenberg ebook of the complete works of', 'enberg ebook of the complete works of wi', 'erg ebook of the complete works of willi', ' ebook of the complete works of william ', 'ook of the complete works of william sha', ' of the complete works of william shakes']
['e', ' ', 'r', ' ', ' ', 'l', 'a', 's', 'k', 'p']


#### Converting indices into vectorized format

In [6]:
# One-hot encode the training data.
#   X[i, t, k] is True when character t of window i has vocabulary id k.
#   y[i, k]    is True when the character following window i has id k.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(characters)), dtype=bool)
y = np.zeros((len(sentences), len(characters)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char2indices[char]] = True
    y[i, char2indices[next_chars[i]]] = True

#### Model Building

In [7]:
# Character-level language model: one LSTM layer with 128 units reads a window
# of `maxlen` one-hot vectors, and a softmax-activated dense layer outputs one
# probability per vocabulary character.
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(characters))))

model.add(Dense(len(characters)))

model.add(Activation('softmax'))
# NOTE(review): newer Keras releases name this argument `learning_rate`
# instead of `lr` -- confirm against the installed version before renaming.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               97280     
_________________________________________________________________
dense_1 (Dense)              (None, 61)                7869      
_________________________________________________________________
activation_1 (Activation)    (None, 61)                0         
Total params: 105,149
Trainable params: 105,149
Non-trainable params: 0
_________________________________________________________________
None


#### Function to convert prediction into index
[0.4, 0.6]을 아래 함수로 변환하면 ...<br>
1. metric = 1.0 이면 [0.4, 0.6] - 불변
2. metric = 0.2 이면 [0.12, 0.88] - 차이가 더 커짐
3. metric = 1.5 이면 [0.43, 0.57] - 차이가 작아짐

In [18]:
# Convert a softmax probability vector (one entry per vocabulary character)
# into a sampled character index.
def pred_indices(preds, metric = 1.0):
    """Sample an index from `preds` after rescaling the distribution.

    Each probability is raised to the power ``1 / metric`` and the result is
    renormalised; one index is then drawn at random from that distribution.
    With ``0 < metric < 1`` the gap between probabilities widens, with
    ``metric > 1`` it narrows, and ``metric == 1`` leaves `preds` unchanged.

    Args:
        preds: array-like of probabilities.
        metric: non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index, as returned by ``np.argmax``.

    Raises:
        ValueError: if ``metric`` is zero.
    """
    if metric == 0:
        raise ValueError('metric must be non-zero')
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which maps back to probability 0 after exp(); silence
    # the divide-by-zero warning that NumPy would otherwise emit for it.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / metric
    # Shift by the maximum before exponentiating. The shift cancels out in the
    # normalisation, but it keeps the largest term at exp(0) == 1, so a small
    # `metric` can no longer underflow every term to 0 and produce 0/0 = NaN.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)
    # Draw a single one-hot sample and return the position of its 1.
    probs = np.random.multinomial(1, preds, 1)
    return np.argmax(probs)

#### Train & Evaluate the Model

In [31]:
# Train for a single pass over the one-hot encoded windows.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=1,
)

Epoch 1/1


<keras.callbacks.History at 0x17eff301a90>

#### 문장 1개를 선택하고 이후에 나타날 문자 1개를 예측한다.

In [20]:
# Pick one random window of `maxlen` characters from the corpus.
last_start = len(text) - maxlen - 1
start_index = random.randint(0, last_start)
sentence = text[start_index: start_index + maxlen]
sentence

'\n  celia. let us sit and mock the good h'

In [21]:
# Predict the character that follows the selected window.
# One-hot encode the window as a batch holding a single sample.
x = np.zeros((1, maxlen, len(characters)))
for position, symbol in enumerate(sentence):
    x[0, position, char2indices[symbol]] = 1.

# Take the first row of the prediction output: the softmax vector for the next character.
preds = model.predict(x, verbose=0)[0]
# Sample an index with metric 0.2 and decode it back to a character.
next_index = pred_indices(preds, 0.2)
pred_char = indices2char[next_index]
print(pred_char)

o


#### 문장 1개를 선택하고 이후에 나타날 문자 400개를 연속으로 예측한다.

In [32]:
# Pick one random window of `maxlen` characters to seed the generation.
start_index = random.randint(0, len(text) - maxlen - 1)
sentence = text[start_index: start_index + maxlen]

# `generated` accumulates the seed followed by every predicted character.
generated = '' + sentence
print('----- Generating with seed: "' + sentence + '"\n')

diversity = 0.2
for i in range(400):
    # One-hot encode the current window as a batch holding a single sample.
    x = np.zeros((1, maxlen, len(characters)))
    for position, symbol in enumerate(sentence):
        x[0, position, char2indices[symbol]] = 1.

    # Softmax vector for the next character, then a sampled index decoded to a character.
    preds = model.predict(x, verbose=0)[0]
    next_index = pred_indices(preds, diversity)
    pred_char = indices2char[next_index]

    # Slide the window: drop the oldest character and append the prediction.
    generated = generated + pred_char
    sentence = sentence[1:] + pred_char

    print(pred_char, end='')

----- Generating with seed: "  nor taste, nor smell, desire to be inv"

ertents to the stands with the stand the stands to the world the harts the stands,
    and the stand to the stand the searent,
    that i will not the stand the world the graise,
    the world the wast the seed to and the storther,
    the stand the beartent of the stand the stand the way,
    and the self the stand the stand the stand the startent,
    the stand the stand the stand the world in t

#### 원래 책에 있는 소스 프로그램
iteration이 증가할수록 (학습량이 증가할수록) 생성된 문장의 품질이 어떻게 달라지는지 비교해 보자.

In [23]:
# Alternate one epoch of training with text generation so the generated text
# can be compared as training progresses.
for iteration in range(1, 30):
    print('-' * 40)
    print('Iteration', iteration)

    # The same model keeps training: one more epoch on every pass.
    model.fit(X, y, batch_size=128, epochs=1)

    # One seed position per iteration, shared by all diversity settings.
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in (0.2, 0.7, 1.2):

        print('\n----- diversity:', diversity)

        # Restart from the same seed window for each diversity value.
        sentence = text[start_index: start_index + maxlen]
        generated = '' + sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            # One-hot encode the current window as a batch holding a single sample.
            x = np.zeros((1, maxlen, len(characters)))
            for position, symbol in enumerate(sentence):
                x[0, position, char2indices[symbol]] = 1.

            # Sample the next character from the rescaled softmax output.
            preds = model.predict(x, verbose=0)[0]
            next_index = pred_indices(preds, diversity)
            pred_char = indices2char[next_index]

            # Slide the window forward by one character.
            generated = generated + pred_char
            sentence = sentence[1:] + pred_char

            # Write and flush each character immediately.
            sys.stdout.write(pred_char)
            sys.stdout.flush()
        print("\nOne combination completed \n")

----------------------------------------
Iteration 1
Instructions for updating:
Use tf.cast instead.
Epoch 1/1

----- diversity: 0.2
----- Generating with seed: " wife,
  the world will be thy widow and"
 wife,
  the world will be thy widow and the world the foor and the world the world the part the read
    the commessend the world the some the mostle and the prosisers and the world so sure the world my may the thement the world my mostlespeare the thee,
    the world the send the comes to messarant
    the restress and the women the world the commended to myselveres
    which the make world the world the world the proselt your marrand
One combination completed 


----- diversity: 0.7
----- Generating with seed: " wife,
  the world will be thy widow and"
 wife,
  the world will be thy widow and portant
  dutter. ender ard worth not a prose show hearerrain time
    a to most uresarain your oundicervion shall by the seiver
    the had the greadediand i but the hast,
    out when then my

KeyboardInterrupt: 