In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow version, the interpreter version, and the version of
# every major library used in this notebook.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print("{} {}".format(lib.__name__, lib.__version__))

2.2.0
sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)
matplotlib 2.1.2
numpy 1.19.1
pandas 0.22.0
sklearn 0.19.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [2]:
# Source corpus: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
# Read the corpus inside a context manager so the file handle is closed as soon
# as the read finishes instead of being left open for the garbage collector.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Corpus size in characters, followed by a short preview.
print(len(text))
print(text[0:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Preprocessing plan:
# 1. generate the vocabulary
# 2. build the char -> id mapping
# 3. convert the text data into id data
# 4. build input/output pairs shifted by one step: abcd -> bcd<eos>

# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = sorted({ch for ch in text})
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
# Map every character to its integer id, i.e. its position in the sorted vocabulary.
char_2idx = dict(zip(vocab, range(len(vocab))))
print(char_2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [5]:
# Inverse mapping (id -> char): position i of the array holds the character with id i.
idx2char = np.array(vocab)
print(idx2char)
# Look up characters by id; the index can be a Python list or a NumPy array of ids.
print(idx2char[[0,1,2]])
print(idx2char[np.array([0,1,2])])

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
['\n' ' ' '!']
['\n' ' ' '!']


In [6]:
# Encode the whole corpus as an array of character ids, then compare the first
# ten ids with the first ten characters they stand for.
text_as_int = np.array(list(map(char_2idx.__getitem__, text)))
print(text_as_int[:10])
print(text[0:10])

[18 47 56 57 58  1 15 47 58 47]
First Citi


In [7]:
def split_input_target(id_text):
    """
    Split a sequence into an (input, target) pair shifted by one step.

    abcde -> abcd, bcde

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the element that
    follows input[i] in the original sequence.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)  # this dataset is character-level; below it is converted into a sequence-level one
seq_length = 100
# Turn the character dataset into a dataset of sequences.
# The +1 is needed because splitting a sequence into input/target shortens it by one
# (a length-5 sequence yields length-4 input and target), so to end up with
# length-100 sequences we batch 101 characters at a time.
# drop_remainder=True: if the last batch is too short, it is discarded.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Peek at the first two elements of the character-level dataset.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
    
# Peek at the first two sequences, both as ids and decoded back into text.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))

tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [8]:
# Turn every sequence of seq_length + 1 ids into an (input, target) pair.
# NOTE(review): this rebinds `seq_dataset` to a transformed version of itself, so
# re-running this cell without first re-running the cell that created
# `seq_dataset` applies the map again to the already-split dataset.
seq_dataset = seq_dataset.map(split_input_target)

# Show the first two (input, target) pairs; each target is its input shifted by one position.
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [9]:
batch_size = 64
buffer_size = 10000

# Shuffle with a buffer of `buffer_size` elements, then group the (input, target)
# pairs into batches of `batch_size`; drop_remainder=True is passed for the final short batch.
# NOTE(review): `seq_dataset` is rebound to a transformed version of itself here as
# well, so re-running only this cell batches the already-batched dataset.
seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size, drop_remainder=True)

In [10]:
vocab_size = len(vocab)
embedding_dim = 256 # the vocabulary is fairly small, so a relatively large embedding dimension is used
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Build the character-level model: Embedding -> stateful SimpleRNN -> Dense.

    Args:
        vocab_size: passed to the Embedding layer as its vocabulary size and to
            the final Dense layer as its number of units.
        embedding_dim: output dimension of the Embedding layer.
        rnn_units: number of units of the SimpleRNN layer.
        batch_size: fixed batch dimension of the input; the time dimension is
            left as None.

    Returns:
        An uncompiled keras Sequential model. The final Dense layer has no
        activation function, so the model outputs logits.
    """
    embedding = keras.layers.Embedding(
        vocab_size, embedding_dim,
        batch_input_shape=[batch_size, None])
    recurrent = keras.layers.SimpleRNN(
        units=rnn_units,
        stateful=True,
        recurrent_initializer='glorot_uniform',
        return_sequences=True)  # the predicted output is a sequence as well
    # No activation on the last layer: its output is logits.
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Instantiate the training model with the training batch size and print its layer summary.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Run a single batch through the model (before any training) to check the shape of its output.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)
    

(64, 100, 65)


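Output shape (64, 100, 65) — 64: batch size (batch_size), 100: sequence length, 65: number of predicted classes (the vocabulary size).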

In [12]:
# random sampling.
# Generate a piece of text by sampling at random from the distribution predicted above.
# greedy (always take the most probable class) vs. random (sample from the distribution):
# the greedy strategy can only ever produce a single sequence, whereas random sampling can produce many different ones.
sample_indices = tf.random.categorical(
    logits=example_batch_predictions[0], num_samples=1) # num_samples = 1 here; it can be set higher to sample several sequences
print(sample_indices)
# (100, 65) -> (100, 1): each of the 100 positions gets one sample drawn from the 65 classes;
# the squeeze below then drops the trailing axis, giving shape (100,)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[12]
 [53]
 [53]
 [23]
 [56]
 [17]
 [17]
 [25]
 [30]
 [13]
 [37]
 [23]
 [15]
 [10]
 [45]
 [16]
 [42]
 [16]
 [46]
 [15]
 [19]
 [35]
 [24]
 [33]
 [13]
 [53]
 [54]
 [59]
 [21]
 [12]
 [32]
 [31]
 [47]
 [64]
 [49]
 [36]
 [36]
 [30]
 [33]
 [40]
 [53]
 [20]
 [51]
 [14]
 [39]
 [29]
 [ 2]
 [45]
 [31]
 [20]
 [13]
 [ 6]
 [38]
 [64]
 [11]
 [26]
 [47]
 [ 4]
 [20]
 [34]
 [ 2]
 [ 3]
 [15]
 [39]
 [24]
 [ 4]
 [15]
 [ 5]
 [33]
 [60]
 [51]
 [ 2]
 [ 4]
 [44]
 [10]
 [26]
 [60]
 [39]
 [38]
 [39]
 [26]
 [37]
 [21]
 [ 4]
 [ 0]
 [34]
 [18]
 [22]
 [46]
 [10]
 [61]
 [37]
 [25]
 [16]
 [32]
 [61]
 [62]
 [50]
 [46]
 [ 9]], shape=(100, 1), dtype=int64)
tf.Tensor(
[12 53 53 23 56 17 17 25 30 13 37 23 15 10 45 16 42 16 46 15 19 35 24 33
 13 53 54 59 21 12 32 31 47 64 49 36 36 30 33 40 53 20 51 14 39 29  2 45
 31 20 13  6 38 64 11 26 47  4 20 34  2  3 15 39 24  4 15  5 33 60 51  2
  4 44 10 26 60 39 38 39 26 37 21  4  0 34 18 22 46 10 61 37 25 16 32 61
 62 50 46  9], shape=(100,), dtype=int64)


In [13]:
# Decode the first example of the batch (input and target) and the ids sampled
# from the model's output for it back into text.
print("Input: ", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Output: ", repr("".join(idx2char[target_example_batch[0]])))
print()
print("Predictions: ", repr("".join(idx2char[sample_indices])))

Input:  't sea?\n\nISABELLA:\nI have heard of the lady, and good words went with her name.\n\nDUKE VINCENTIO:\nShe '

Output:  ' sea?\n\nISABELLA:\nI have heard of the lady, and good words went with her name.\n\nDUKE VINCENTIO:\nShe s'

Predictions:  "?ooKrEEMRAYKC:gDdDhCGWLUAopuI?TSizkXXRUboHmBaQ!gSHA,Zz;Ni&HV!$CaL&C'Uvm!&f:NvaZaNYI&\nVFJh:wYMDTwxlh3"


In [14]:
# Define the loss function of the model.
# Previously the loss was applied after an activation function and could be given
# by its string name "sparse_categorical_crossentropy"; here the model has no
# final activation, so from_logits=True is set.
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    crossentropy = keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return crossentropy

model.compile(optimizer="adam", loss=loss)
# Sanity check on the example batch: loss() returns one value per position,
# so the mean is taken to get a single number.
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.184099


In [15]:
output_dir = "./text_generation_checkpoints"
# makedirs(exist_ok=True) creates the directory only when it is missing, without
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(output_dir, exist_ok=True)
# The '{epoch}' placeholder is filled in by the callback, so each epoch writes
# its own checkpoint (ckpt_1, ckpt_2, ...).
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# save_weights_only=True: only the weights are stored, not the full model.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

epochs = 100
history = model.fit(seq_dataset, epochs=epochs, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 99/100
Epoch 100/100


In [16]:
# Show the path prefix of the most recent checkpoint found in output_dir.
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints/ckpt_100'

In [17]:
# Load the model from the checkpoint
model2 = build_model(vocab_size,
                     embedding_dim,
                     rnn_units,
                     batch_size=1) # generate only one sentence at a time
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.build(tf.TensorShape([1, None])) # a single variable-length input
# Upper case denotes a sequence, lower case denotes a single character
# start ch sequence A,
# A -> model -> b
# A.append(b) -> B
# B(Ab) -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [20]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """
    Generate text one character at a time from a stateful character model.

    The seed string is fed to the model once; afterwards only the most recently
    sampled character is fed back in, because the stateful RNN keeps its hidden
    state between calls (s, x -> rnn -> s', y).

    Args:
        model: model called on ids of shape [1, length] that returns logits of
            shape [1, length, vocab_size]; it must provide reset_states().
        start_string: non-empty seed text; every character must be a key of
            char_2idx.
        num_generate: number of characters to generate after the seed.
        temperature: positive number the logits are divided by before sampling.
            1.0 (the default) samples from the unmodified distribution; values
            below 1 make the output more predictable, values above 1 more random.

    Returns:
        start_string followed by the num_generate generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character that is not in char_2idx.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an obscure shape or indexing error inside the model call.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char_2idx[ch] for ch in start_string]  # characters -> 1-D list of ids
    input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension: [1, None]

    text_generated = []
    # Start from a clean hidden state so earlier calls do not leak into this one.
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated
        # 3. update input_eval

        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # Drop the batch dimension (batch_size is 1).
        # predictions: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # The default temperature skips the division so the default behaviour
        # is exactly the unscaled sampling.
        if temperature != 1.0:
            predictions = predictions / temperature
        # One id is sampled per input position (a b c -> b c d); only the last
        # one is the newly predicted character.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # Feed only the new character back in; the RNN state carries the history.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# Generate text with the restored batch-size-1 model, seeded with "All: ".
new_text = generate_text(model2, "All: ")
print(new_text)

All: I must confess so long amazed me! this one people years. Through enr of my wordward.
Gentlemen, too much alone.

GREGORIO:
Bear with minument must be exceed
Was slave my weakness that we died,
He stay before his head well please?

CORIOLANUS:
Mispeak of love!
But, sir, I with tears; for all of us offended t butt, and from remains
And look'd for himself perforce quarrel half our glorious Tybalt marr'd and ambroker tears are come; in purposedate so proper babe.
And there was book how wears from Rome again
Without counteoushis wars and the sea of me.

PAULINA:
Why, wherefore? sore! loss have of my bounself of me;
For I have been consider thee.

Gag it, the quoch being hold his pamb my,
And bear their torments than bound to visit you not ontend.

Provost:
I told you, Ifore
As to be so boldness of my king by me, as yours, when man an' unrayor this be sworn breed.
Stand, ere thou wast thine,
Who hadst as gone, sir?

COMINIUS:
If it be so, the old more strength,
Which within your requess