In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import sys
import time
from tensorflow import keras
import os

In [3]:
# Corpus source: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
# Read through a context manager so the file handle is closed as soon as the
# read finishes, and pin the encoding so decoding does not depend on the
# platform's locale default.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

1115394


In [11]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
# A character's id is its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
# Encode the whole corpus as an integer array, one id per character.
text_as_intarr = np.array(list(map(char2idx.__getitem__, text)))
print(vocab)
print(char2idx)
print(text_as_intarr[:50])

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 4

In [12]:
def split_input_target(text):
    """Split a sliceable sequence into an (input, target) pair shifted by one.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    inputs = text[:-1]
    targets = text[1:]
    return inputs, targets


print(split_input_target('abcde'))

('abcd', 'bcde')


In [20]:
# Sizes used by the input pipeline.
seq_length = 100
batch_size = 64
shuffle_size = 10000

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_intarr)
# First group ids into chunks of seq_length + 1 (the extra id lets
# split_input_target yield input and target sequences of seq_length each),
# then shuffle the chunks and group them into training batches.
seq_dataset = (
    char_dataset
    .batch(seq_length + 1, drop_remainder=True)
    .map(split_input_target)
    .shuffle(shuffle_size)
    .batch(batch_size, drop_remainder=True)
)
# Print two (input, target) batches as a sanity check.
for d1, d2 in seq_dataset.take(2):
    print(d1)
    print(d2)


tf.Tensor(
[[50 44  6 ... 47 56  6]
 [63  1 58 ... 50 43  7]
 [51  5 42 ... 58  1  5]
 ...
 [ 1 41 39 ... 53 57 57]
 [33 54 53 ... 56 53 54]
 [45 39 47 ... 39 49  6]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[44  6  1 ... 56  6  0]
 [ 1 58 56 ... 43  7 61]
 [ 5 42  1 ...  1  5 58]
 ...
 [41 39 52 ... 57 57  1]
 [54 53 52 ... 53 54 43]
 [39 47 52 ... 49  6  1]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[ 1 58 46 ...  0 18 56]
 [61 47 58 ... 50 42  5]
 [46 43  1 ... 53 56 57]
 ...
 [52 10  0 ... 13 51 43]
 [ 1 58 53 ... 52  1 43]
 [ 1 40 43 ... 61 39 56]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[58 46 53 ... 18 56 53]
 [47 58 46 ... 42  5 57]
 [43  1 51 ... 56 57  5]
 ...
 [10  0 20 ... 51 43 52]
 [58 53  1 ...  1 43 50]
 [40 43 43 ... 39 56 42]], shape=(64, 100), dtype=int64)


In [34]:
# Model hyperparameters.
# Note: embedding_dim = 128 with rnn_units = 512 was tried and gave very poor results.
vocab_size = len(vocab)
batch_size = 64
embedding_dim = 256
rnn_units = 1024
def build_model(vocab_size, embedding_size, rnn_units, batch_size):
    """Build an Embedding -> stateful LSTM -> Dense sequential model.

    Args:
        vocab_size: Input dimension of the embedding and number of units in
            the final Dense layer.
        embedding_size: Output dimension of the embedding layer.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension of the input; the second input
            dimension is left as None.

    Returns:
        An uncompiled keras Sequential model. The Dense layer has no
        activation, and the LSTM uses return_sequences=True, so the model
        emits one vector of vocab_size raw scores per time step.
    """
    embedding = keras.layers.Embedding(
        vocab_size,
        embedding_size,
        batch_input_shape=[batch_size, None],
    )
    recurrent = keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer="glorot_normal",
    )
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Training model, built with the training batch size.
model = build_model(
    vocab_size=vocab_size,
    embedding_size=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)

# Array form of the vocabulary, used to turn ids back into characters.
idx2char = np.array(vocab)

In [35]:
# Push a single batch through the (still untrained) model and show the output shape.
input_example_batch, target_example_batch = next(iter(seq_dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(example_batch_predictions.shape)

# Random sampling (as opposed to greedy): draw one id per time step of the
# first sequence from the distribution defined by its logits.
sample_indices = tf.random.categorical(
    logits=example_batch_predictions[0],
    num_samples=1,
)
print(sample_indices)

# Drop the trailing num_samples axis so there is one id per time step.
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

print("Input: ", repr(" ".join(idx2char[input_example_batch[0]])))
print()
print("Output: ", repr(" ".join(idx2char[target_example_batch[0]])))
print()
print("Predictions: ", repr(" ".join(idx2char[sample_indices])))

(64, 100, 65)
tf.Tensor(
[[10]
 [35]
 [26]
 [28]
 [34]
 [49]
 [42]
 [62]
 [ 9]
 [28]
 [38]
 [33]
 [ 3]
 [21]
 [40]
 [45]
 [23]
 [38]
 [52]
 [20]
 [10]
 [14]
 [26]
 [55]
 [64]
 [ 3]
 [23]
 [24]
 [22]
 [23]
 [43]
 [16]
 [19]
 [15]
 [40]
 [17]
 [46]
 [ 9]
 [60]
 [10]
 [ 2]
 [15]
 [ 7]
 [40]
 [ 2]
 [23]
 [16]
 [44]
 [47]
 [17]
 [33]
 [ 6]
 [62]
 [10]
 [35]
 [43]
 [53]
 [56]
 [13]
 [ 8]
 [42]
 [54]
 [20]
 [17]
 [49]
 [35]
 [43]
 [16]
 [ 7]
 [13]
 [30]
 [50]
 [10]
 [36]
 [64]
 [53]
 [41]
 [30]
 [45]
 [20]
 [58]
 [44]
 [40]
 [ 0]
 [ 4]
 [30]
 [48]
 [ 7]
 [60]
 [20]
 [16]
 [15]
 [42]
 [52]
 [ 7]
 [48]
 [40]
 [36]
 [59]
 [14]], shape=(100, 1), dtype=int64)
tf.Tensor(
[10 35 26 28 34 49 42 62  9 28 38 33  3 21 40 45 23 38 52 20 10 14 26 55
 64  3 23 24 22 23 43 16 19 15 40 17 46  9 60 10  2 15  7 40  2 23 16 44
 47 17 33  6 62 10 35 43 53 56 13  8 42 54 20 17 49 35 43 16  7 13 30 50
 10 36 64 53 41 30 45 20 58 44 40  0  4 30 48  7 60 20 16 15 42 52  7 48
 40 36 59 14], shape=(100,), dtype=int64)

In [36]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly on logits.

    The model's last layer has no softmax, so it returns logits rather than a
    probability distribution; that is why from_logits=True is passed.
    """
    return keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )


model.compile(loss=loss, optimizer='adam')

In [37]:
output_dir = "./text_generation_checkpoints"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no gap
# between the check and the creation, and missing parent directories are
# created as well.
os.makedirs(output_dir, exist_ok=True)
# '{epoch}' is filled in by the callback, giving one weights file set per epoch.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)
epochs = 20
history = model.fit(seq_dataset, epochs=epochs, callbacks=[checkpoint_callback])

Train for 172 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
# Rebuild the same architecture with a batch size of 1 for generation, then
# restore the weights saved by the most recent training checkpoint.
applied_model = build_model(
    vocab_size=vocab_size,
    embedding_size=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
applied_model.load_weights(tf.train.latest_checkpoint(output_dir))
applied_model.build(tf.TensorShape([1, None]))

In [39]:
def generate_text(model, start_text, gen_text_num, temperature=0.5):
    """Generate gen_text_num characters that continue start_text.

    Args:
        model: Stateful model called with a (1, sequence) batch of character
            ids; its output is indexed as (batch, time step, vocabulary) logits.
        start_text: Non-empty seed string. Every character must be a key of
            the module-level char2idx mapping (a KeyError is raised otherwise).
        gen_text_num: Number of characters to generate.
        temperature: Positive divisor applied to the logits before sampling.
            Values above 1 flatten the distribution (more random output);
            values below 1 sharpen it (closer to greedy). Defaults to 0.5.

    Returns:
        The generated characters joined into one string. The seed text itself
        is not included.

    Raises:
        ValueError: If start_text is empty or temperature is not positive.
    """
    if not start_text:
        raise ValueError("start_text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char2idx[c] for c in start_text]
    input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension

    text_generated = []
    # Clear recurrent state left over from any previous call.
    model.reset_states()

    for _ in range(gen_text_num):
        predict = model(input_eval)
        # Only the last time step predicts the next character, so sample from
        # that step alone; the result keeps the (batch=1, vocab) shape.
        predict = predict[:, -1, :] / temperature
        # Random sampling suits text generation better than greedy decoding:
        # ids are drawn according to the softmax probabilities (e.g. a: 0.8,
        # b: 0.15, c: 0.05 yields 'a' about 80% of the time), whereas greedy
        # decoding would always pick 'a'.
        predict_id = tf.random.categorical(predict, num_samples=1)[0, 0].numpy()
        text_generated.append(idx2char[predict_id])
        # The LSTM is stateful, so only the newly predicted id is fed back in.
        input_eval = tf.expand_dims([predict_id], 0)
    return "".join(text_generated)

# Generate 500 characters seeded with "All: " and display them.
test = generate_text(applied_model, start_text="All: ", gen_text_num=500)
print(test)

I shall entreat me speak.

CLIFFORD:
How must he does not presently the people
Your providest heart my penitent that be made.

GLOUCESTER:
What is thy name?

CORIOLANUS:
The king shall meet thee, thou shalt still have
Our torts of stone, that we may be meet,
And therefore I'll not be so valiant husband.
A grave beloved, that thought there was my wanton piece of spirits than a present and sheep-worthy love.

VINCENTIO:
As if that news, my lord.

DUKE VINCENTIO:
I know him for this business.

AUFI
