In [None]:
import tensorflow as tf
import numpy as np
import os
import time

# 讀取資料

In [None]:
# Download the Shakespeare text corpus used throughout this notebook.
# NOTE(review): the original note described this file as the play Coriolanus;
# text sampled further down includes speakers from other plays
# (e.g. KING RICHARD III), so the file appears to hold more than one play.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [None]:
# Read the downloaded file as raw bytes and decode it as UTF-8.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# Report the total number of characters in the corpus.
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [None]:
# Preview the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
# Collect the distinct characters of the corpus, sorted, to form the vocabulary.
vocab = sorted(set(text))
# The message printed below reads "{} distinct characters".
print('{} 個不重複的文字'.format(len(vocab)))

65 個不重複的文字


In [None]:
# Display the full sorted vocabulary list.
vocab

['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

# 資料前處理

In [None]:
# Character -> integer id, assigned in vocabulary order.
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
# Integer id -> character, stored as an array so it can be indexed by id arrays.
idx2char = np.array(vocab)

# The whole corpus encoded as an array of integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [None]:
# Pretty-print the first 20 character -> id pairs of the lookup table.
# Slicing the key list (rather than zipping with a range counter) avoids
# binding a throwaway `_` name in the notebook's global namespace.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [None]:
# Show the first 10 characters of the corpus next to their integer ids.
# (The printed labels read "text:" and "ids:".)
print('文字：',text[:10],'代號：',text_as_int[:10])

文字： First Citi 代號： [18 47 56 57 58  1 15 47 58 47]


In [None]:
# Length, in characters, of each model input sequence.
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)
# Build the raw material for training inputs and prediction targets:
# a dataset of single character ids, grouped into chunks of seq_length + 1
# ids (any incomplete trailing chunk is dropped).
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and print the first five chunks as text.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [None]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair shifted by one position.

    Args:
        chunk: A sliceable sequence of character ids.

    Returns:
        A tuple ``(input_text, target_text)``: the chunk without its last
        element, and the chunk without its first element, so that
        ``target_text[i]`` is the element following ``input_text[i]``.
    """
    return chunk[:-1], chunk[1:]

# Convert every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Each input character is used to predict the character that follows it.
# Decode and print the first (input, target) pair; the loop variables are
# left defined for later cells.
for input_example, target_example in  dataset.take(1):
    print('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [None]:
# Walk through the first five time steps of the example pair, printing the
# input id/character and the id/character the model is expected to output.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))
# Taking "First" as an example: 'F' is used to predict 'i', and 'i' to predict 'r'.

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


In [None]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer.
BUFFER_SIZE = 10000
# Shuffle the (input, target) pairs and group them into fixed-size batches,
# dropping the incomplete final batch. The pipeline is rebuilt from
# `sequences` rather than from the current `dataset`, so re-running this cell
# does not batch an already-batched dataset a second time.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# 建立模型

In [None]:
# Vocabulary size: the number of distinct characters (65 for this corpus).
vocab_size = len(vocab)
# Dimension of the character embedding vectors.
embedding_dim = 256
# Number of units in the recurrent (RNN) layer.
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size of the model input; the sequence length
            is left unspecified.

    Returns:
        A ``tf.keras.Sequential`` model. The final Dense layer has no
        activation, so the model outputs one score per vocabulary entry for
        every time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # stateful=True is why the batch size has to be fixed up front.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    scores = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, scores])

In [None]:
# Build the training model for batches of BATCH_SIZE sequences.
# `vocab_size` is passed (rather than recomputing len(vocab)) so this call
# uses the same value as the later batch-size-1 rebuild for text generation.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [None]:
# Run the untrained model on one batch to check the output shape.
# The loop variables are left defined for later cells.
# (The printed annotation reads "(batch size, sequence length, vocabulary size)".)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (批次大小、序列長度、字詞的數量)")

(64, 100, 65) # (批次大小、序列長度、字詞的數量)


In [None]:
# Sample one character id per time step from the model scores of the first
# sequence in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array of ids.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [None]:
# Display the sampled character ids.
sampled_indices

array([ 8,  5, 40, 61, 41, 35, 17,  7, 49, 18, 19, 16, 62, 54,  9, 57,  4,
       10, 32, 18, 53, 35, 48, 18, 24, 32,  2,  5, 27, 48, 37, 53, 57, 51,
       36, 29,  2, 18, 53,  0, 57, 31, 64, 10, 30, 23, 27, 60, 34, 29, 14,
       15, 48, 10, 19, 30, 16, 53, 14, 15, 37, 36, 47, 22, 16, 27, 23,  3,
        5, 26,  6, 53, 17, 18, 27, 13, 64, 59, 57,  1, 10, 32, 12, 18, 40,
       41,  5,  5, 64, 59, 11, 44, 54, 45, 38, 19,  9, 35, 56, 21])

In [None]:
# Decode the first input sequence of the batch and the ids sampled from the
# untrained model. (The printed labels read "Input:" and
# "Next predicted characters:".)
print("輸入: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("下一個預測的文字: \n", repr("".join(idx2char[sampled_indices ])))

輸入: 
 'kill him?\n\nBUCKINGHAM:\nMy lord, your promise for the earldom,--\n\nKING RICHARD III:\nRichmond! When la'

下一個預測的文字: 
 ".'bwcWE-kFGDxp3s&:TFoWjFLT!'OjYosmXQ!Fo\nsSz:RKOvVQBCj:GRDoBCYXiJDOK$'N,oEFOAzus :T?Fbc''zu;fpgZG3WrI"


In [None]:
# Directory where training checkpoints are stored.
checkpoint_dir = './training_checkpoints'
# Checkpoint path template; it contains an `{epoch}` placeholder.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit below; configured to save weights only.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    Args:
        labels: Integer class ids (the target character ids).
        logits: Unnormalized model outputs; ``from_logits=True`` tells the
            loss that no softmax has been applied yet.

    Returns:
        The result of ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
# Compile the model with the Adam optimizer and the loss defined above,
# then train for 10 epochs with the checkpoint callback attached.
model.compile(optimizer='adam', loss=loss)
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 文字生成

In [None]:
# Restore the trained weights into a model rebuilt for batch size 1, which is
# what text generation feeds it.
# The checkpoint is looked up once, before `model` is rebound, so a missing
# checkpoint is reported clearly and the trained model is not discarded.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [None]:
#建立文字生成函式
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time, starting from ``start_string``.

    The start string is fed to the model as a single batch of size 1; after
    that, each sampled character is fed back as the next input. The model's
    recurrent state is reset once before generation and is expected to carry
    over between calls (the models in this notebook are built with
    ``stateful=True``).

    Args:
        model: Model accepting an id tensor of shape ``(1, sequence_length)``
            and returning per-step scores over the vocabulary.
        start_string: Non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the scores before sampling.
            Lower values make the text more predictable; higher values make
            it more surprising.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character not in ``char2idx``.
    """
    # Validate up front: an empty seed or a non-positive temperature would
    # otherwise only fail (or silently misbehave) deep inside the sampling loop.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {!r}'.format(temperature))

    # Encode the seed text and add the batch dimension.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores, then sample; only the sample for the last time
        # step is used as the next character.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Generate and print text seeded with "First Citizen".
print(generate_text(model, start_string=u"First Citizen"))

First Citizen:
If would not pata'll better benied of our troop-late
to harry my coward propperose to my tender: even so like a ve,
To his consent for their counself's roaricute
To mons the matter for put thy son thou hast.
Nowling methown is ripely fathers' lord
revelse your ready too let have been, and than lianlike him.

BENVOLIO:
The queen is caming steals tas this to-day; mistose, open those world receive
The bran of Ross? I will too leave?

RIVERS:
My lord, how now!

PETRUCHIO:
If she wind
Sith fair better death tell The fools.

SAMPSON:
This is a banishment doth to you but hath.

LEONTES:
You are for wants one day.

BUCKINGHAM:
Why, 'tis crookned as Edward scorn'd
Should nd't.

PETRUTH:
Cut,--then would I do request; it is,
So can alike purpose, we will promite thigh--

ISABELLA:
Away, thou art a worse to the streets.

KING RICHARD II:
Not say that enome; and all their swords
Than blackly and dangers and straws writing where
The pronounced beneficious motas swift and deigny:
As I

In [None]:
# Ways to improve the results: train for more epochs.
# The network can also be adjusted, e.g. by adding RNN/LSTM layers.
# The temperature parameter can be tuned as well.