In [0]:
import os

# Create the directory that the TensorBoard callback logs into.
# exist_ok=True keeps this cell safe to re-run (a plain `mkdir` errors out
# once the directory already exists).
os.makedirs("log_dir", exist_ok=True)

In [0]:
# Install Optuna into the notebook runtime; it is imported in the next cell.
!pip install optuna

In [5]:
import keras
from keras import layers
import numpy as np
import random
import sys
import optuna

'''
Download the sample corpus
'''
path = keras.utils.get_file(
    "nietzsche.txt",
    origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt"
)
# Read through a context manager so the file handle is closed right away, and
# pin the encoding so the result does not depend on the platform default
# (assumes the corpus is UTF-8 -- TODO confirm against the downloaded file).
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
print("Corpus length:", len(text))


Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
Corpus length: 600893


In [6]:
'''
Vectorize the character sequences
'''
maxlen = 60         # length of each extracted sequence, in characters
step = 3            # start a new sequence every 3 characters
sentences = []      # extracted input sequences
next_chars = []     # targets: the character that follows each sequence

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i+maxlen])
    next_chars.append(text[i+maxlen])

print("Number of sequences:", len(sentences))

# Sorted list of the distinct characters that occur in the corpus
chars = sorted(set(text))
print("Unique character:", len(chars))

# Map every character to its position in `chars`.
# enumerate() avoids the repeated linear scans of chars.index().
char_indicies = {char: index for index, char in enumerate(chars)}

print("Vectorization...")

# One-hot encode the sequences into boolean arrays:
#   x[i, t, c] is set when character t of sequence i is chars[c]
#   y[i, c]    is set when the character following sequence i is chars[c]
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indicies[char]] = 1
    y[i, char_indicies[next_chars[i]]] = 1


Number of sequences: 200278
Unique character: 57
Vectorization...


In [0]:
'''
Build the model
'''
# Checkpoint: write the model to my_model.h5, monitoring the training loss and
# keeping only the best one seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="my_model.h5",
    monitor='loss',
    save_best_only=True,
)

# TensorBoard: log into log_dir with histogram_freq=0 and image writing on.
tensorboard_cb = keras.callbacks.TensorBoard(
    log_dir="log_dir",
    histogram_freq=0,
    write_images=True,
)

callbacks_list = [checkpoint_cb, tensorboard_cb]

# A single 128-unit LSTM over (maxlen, len(chars)) one-hot inputs, followed by
# a softmax layer with one output per character.
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation="softmax"),
])
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

def sample(preds, temperature=1.0):
    """Draw one index from ``preds`` after temperature rescaling.

    The probabilities are re-weighted as ``exp(log(p) / temperature)`` and
    renormalized, then a single index is drawn from the resulting
    distribution with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities (cast to float64 internally).
        temperature: Rescaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it. ``0`` is treated as the
            limiting case and returns the arg-max without any random draw.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype("float64")

    # Dividing by a zero temperature would turn every probability into NaN;
    # the limit of the rescaled distribution is a greedy pick.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) == -inf is a valid input here (it maps back to probability 0),
    # so silence the divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: the normalized result is
    # mathematically unchanged, but at low temperatures this keeps exp() from
    # underflowing every entry to 0 (which would yield NaN probabilities).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probabs = np.random.multinomial(1, preds, 1)
    return np.argmax(probabs)

In [51]:
'''
Train the language model and sample text from it
'''
# Train for 60 epochs, with 30% of the samples held out for validation.
model.fit(
    x, y,
    batch_size=2048,
    epochs=60,
    callbacks=callbacks_list,
    validation_split=0.3
)

# Pick a random maxlen-character window of the corpus as the seed text.
start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]
print('---Generating with seed: "'+ seed_text +'"')

# Try a range of sampling temperatures.
for temperature in [0.5, 1.0]:
    print("------temperature:", temperature)
    print("")

    # Restart from the announced seed for every temperature; otherwise later
    # temperatures would continue from the tail of the previous run's output.
    generated_text = seed_text
    sys.stdout.write(generated_text)
    print("")

    # Generate 400 characters.
    for i in range(400):
        # One-hot encode the current maxlen-character window.
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indicies[char]] = 1.

        # Sample the next character from the model's predicted distribution.
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]

        # Slide the window: append the new character, drop the oldest one.
        generated_text += next_char
        generated_text = generated_text[1:]

        sys.stdout.write(next_char)
        sys.stdout.flush()

    # Terminate the generated passage so the next header starts on its own line.
    print("")


Train on 140194 samples, validate on 60084 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
---Generating with seed: "ort which religion and like mythological monstrosities
are c"
------temperature: 0.5

ort which religion and like mythological monstrosities
are c
ommonplaces also of the spirit when in what stroked the good oper