## 语言模型数据集（周杰伦专辑歌词）

接下来我们介绍如何预处理一个语言模型数据集，并将其转换成字符级循环神经网络所需要的输入格式。为此，我们收集了周杰伦从第一张专辑《Jay》到第十张专辑《跨时代》中的歌词，应用循环神经网络来训练一个语言模型。当模型训练好后，我们就可以用这个模型来创作歌词。

In [1]:
# read data
import tensorflow as tf
import random
import zipfile
import numpy as np
with zipfile.ZipFile('./data/jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')

# show
corpus_chars[:80]


'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每天在想想想想著你\n这样的甜蜜\n让我开始乡相信命运\n感谢地心引力\n让我碰到你\n漂亮'

### 这个数据集有6万多个字符。为了打印方便，我们把换行符替换成空格，然后仅使用前1万个字符来训练模型。

In [2]:
corpus_chars = corpus_chars.replace('\n',' ').replace('\r',' ')
corpus_chars = corpus_chars[0:10000]
len(corpus_chars)

10000

### 建立字符索引
我们将每个字符映射成一个从0开始的连续整数，又称索引，来方便之后的数据处理。为了得到索引，我们将数据集里所有不同字符取出来，然后将其逐一映射到索引来构造词典。接着，打印vocab_size，即词典中不同字符的个数，又称词典大小。

In [3]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char,i) for i , char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1027

In [4]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:26]
print('chars: ',"".join([idx_to_char[idx] for idx in sample]))


chars:  想要有直升机 想要和你飞到宇宙去 想要和你融化在一起


In [5]:
# 隨機採樣, 每次隨機採一個 批量大小來做訓練, # utils
def data_iter_random(corpus_indices, batch_size, num_steps,ctx= None):
    # -1,輸出索引相應輸入索引+1
    num_examples = (len(corpus_indices) -1) // num_steps
    print("num_example: \n",num_examples)
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)
    
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]
    
    for i in range(epoch_size):
        # read  random sample which size is batch_size once
        i = i * batch_size
        batch_indices = example_indices[i: i + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield np.array(X,ctx), np.array(Y,ctx)
        


In [6]:
# test 相邻的两个随机小批量在原始序列上的位置不一定相毗邻。
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X:',X , '\nY:',Y,'\n')

    

num_example: 
 4
X: [[18 19 20 21 22 23]
 [ 6  7  8  9 10 11]] 
Y: [[19 20 21 22 23 24]
 [ 7  8  9 10 11 12]] 

X: [[ 0  1  2  3  4  5]
 [12 13 14 15 16 17]] 
Y: [[ 1  2  3  4  5  6]
 [13 14 15 16 17 18]] 



In [7]:
# 相邻采样 utils
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = np.array(corpus_indices)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0:batch_size*batch_len].reshape((batch_size,batch_len))
    epoch_size = (batch_len - 1) //num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:,i: i + num_steps]
        Y = indices[:,i+1:i+num_steps+1]
        yield X, Y 

for X, Y in data_iter_consecutive(my_seq,batch_size=2, num_steps=6):
    print('X:', X,'\nY:',Y)
    

X: [[ 0  1  2  3  4  5]
 [15 16 17 18 19 20]] 
Y: [[ 1  2  3  4  5  6]
 [16 17 18 19 20 21]]
X: [[ 6  7  8  9 10 11]
 [21 22 23 24 25 26]] 
Y: [[ 7  8  9 10 11 12]
 [22 23 24 25 26 27]]


# 循环神经网络的从零开始实现

我们将从零开始实现一个基于字符级循环神经网络的语言模型，并在周杰伦专辑歌词数据集上训练一个模型来进行歌词创作。首先，我们读取周杰伦专辑歌词数据集：

In [8]:
# data load
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as f
import numpy as np
import sys
import time
sys.path.append("..") 

def load_data_jay_lyrics():
    """加载周杰伦歌词数据集"""
    with zipfile.ZipFile('./data/jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = load_data_jay_lyrics()

In [9]:
# one hot utils
def to_onehot(X,size):
     return [tf.one_hot(x,size,dtype=tf.float32) for x in X.T]

#test 
X = np.arange(10).reshape((2,5))
inputs = to_onehot(X,vocab_size)
len(inputs),inputs[0].shape

(5, TensorShape([2, 1027]))

In [10]:
# initial parameter
num_inputs, num_hiddens,num_outputs = vocab_size, 256, vocab_size

def get_params():
    
    def _one(shape):
        return tf.Variable(tf.random.normal(shape=shape,stddev=0.01,mean=0,dtype=tf.float32))

    # hidder layer
    W_xh = _one((num_inputs,num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = tf.Variable(tf.zeros(num_hiddens),dtype=tf.float32)

    #output layer
    W_hq = _one((num_hiddens, num_outputs))
    b_q = tf.Variable(tf.zeros(num_outputs), dtype=tf.float32)
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    return params
    

In [11]:
# Define Model

## initail hidden  state
def init_rnn_state(batch_size, num_hiddens):
    return (tf.zeros(shape=(batch_size, num_hiddens)), )

# define caculate hidden state and output
def rnn(inputs, state, params):
    # inputs and outputs 皆為 num_steps個形狀為(batch_size, vocab_siez)的矩陣
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    H2 = state
    print((H.shape, H.shape))
    outputs = []
    for X in inputs:
        X = tf.reshape(X,[-1,W_xh.shape[0]])
        H = tf.tanh(tf.matmul(X, W_xh) + tf.matmul(H, W_hh) + b_h)
        Y = tf.matmul(H,W_hq) + b_q
        outputs.append(Y)
    return outputs,(H,)

## TEST
state = init_rnn_state(X.shape[0], num_hiddens)
inputs = to_onehot(X, vocab_size)
params = get_params()
outputs, state_new = rnn(inputs,state,params)
print(len(outputs), outputs[0].shape, state_new[0].shape)

(TensorShape([2, 256]), TensorShape([2, 256]))
5 (2, 1027) (2, 256)


In [12]:
# prefict 
def predict_rnn(prefix,num_chars, rnn, params, init_run_state, num_hiddens, vocab_size,
                idx_to_char, char_to_idx):
    state = init_rnn_state(1,num_hiddens)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # 將上一時間步的輸出作為當前時間步輸入
        X = tf.convert_to_tensor(to_onehot(np.array([output[-1]]),vocab_size),dtype=tf.float32)
        print("before X ",X.shape)
        X = tf.reshape(X,[1,-1])
        print('\n reshape X ',X.shape)
        #計算輸出 跟 隱藏狀態
        (Y,state) = rnn(X, state, params)
        #下一個時間步的輸入是prefix里的字符或者當前的最佳預測字符
        if t < len(prefix) -1 :
            output.append(char_to_idx[prefix[t+1]])
        else:
            output.append(int(np.array(tf.argmax(Y[0],axis=1))))
    #print(output)
    return ''.join([idx_to_char[i] for i in output]) 

#test
print(predict_rnn('分开',10,rnn,params,init_rnn_state, num_hiddens,vocab_size,idx_to_char, char_to_idx))


before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
分开确仔被宠广忧纳告

# Clip gradient

循环神经网络中较容易出现梯度衰减或梯度爆炸。为了应对梯度爆炸，我们可以裁剪梯度（clip gradient）。假设我们把所有模型参数梯度的元素拼接成一个向量 $\boldsymbol{g}$，并设裁剪的阈值是$\theta$。裁剪后的梯度

$$ \min\left(\frac{\theta}{\|\boldsymbol{g}\|}, 1\right)\boldsymbol{g}$$

的$L_2$范数不超过$\theta$。

In [13]:
def grad_clipping(grads,theta):
    norm = np.array([0])
    for i in range(len(grads)):
        norm +=tf.math.reduce_sum(grads[i]**2)
    norm = np.sqrt(norm).item()
    new_gradient=[]
    if norm > theta:
        for grad in grads:
            new_gradient.append(grad * theta / norm)
    else:
        for grad in grads:
            new_gradient.append(grad)
    
    return new_gradient

    

In [16]:
# train function
import math
def train_and_predict_rnn(rnn, get_params,init_rnn_state,num_hiddens,
                          vocab_size,corpus_indices, idx_to_char,
                          char_to_idx,is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period, 
                          pred_len, predfixes):
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
    params = get_params()
    #loss = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    
    for epoch in range(num_epochs):
        if not is_random_iter:  # 如使用相邻采样，在epoch开始时初始化隐藏状态
            state = init_rnn_state(batch_size, num_hiddens)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps)
        for X, Y in data_iter:
            if is_random_iter:  # 如使用随机采样，在每个小批量更新前初始化隐藏状态
                state = init_rnn_state(batch_size, num_hiddens)
            #else:  # 否则需要使用detach函数从计算图分离隐藏状态
                #for s in state:
                    #s.detach()
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(params)
                inputs = to_onehot(X, vocab_size)
                # outputs有num_steps个形状为(batch_size, vocab_size)的矩阵
                (outputs, state) = rnn(inputs, state, params)
                # 拼接之后形状为(num_steps * batch_size, vocab_size)
                outputs = tf.concat(outputs, 0)
                # Y的形状是(batch_size, num_steps)，转置后再变成长度为
                # batch * num_steps 的向量，这样跟输出的行一一对应
                y = Y.T.reshape((-1,))
                #print(Y,y)
                y=tf.convert_to_tensor(y,dtype=tf.float32)
                # 使用交叉熵损失计算平均分类误差
                l = tf.reduce_mean(tf.losses.sparse_categorical_crossentropy(y,outputs))
                #l = loss(y,outputs)
                #print("loss",np.array(l))
                
            grads = tape.gradient(l, params)
            grads=grad_clipping(grads, clipping_theta)  # 裁剪梯度
            optimizer.apply_gradients(zip(grads, params))
            #sgd(params, lr, 1 , grads)  # 因为误差已经取过均值，梯度不用再做平均
            l_sum += np.array(l).item() * len(y)
            n += len(y)

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            #print(params)
            for prefix in prefixes:
                print(prefix)
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size,  idx_to_char, char_to_idx))
        

现在我们可以训练模型了。首先，设置模型超参数。
我们将根据前缀“分开”和“不分开”分别创作长度为50个字符（不考虑前缀长度）的一段歌词。
我们每过50个迭代周期便根据当前训练的模型创作一段歌词。

In [17]:
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 0.01, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['想要', '想要和']
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                      vocab_size, corpus_indices, idx_to_char,
                      char_to_idx, False, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
 - 想要和                                                  
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShap

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
 - 想要                                                  
想要和
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 102

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 25

(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
(TensorShape([32, 256]), TensorShape([32, 256]))
epoch 250, perplexity 2631.857599, time 0.58 sec
想要
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256]), TensorShape([1, 256]))
before X  (1, 1027)

 reshape X  (1, 1027)
(TensorShape([1, 256])