## Lesson 14 - RNN Poem Generator & Chatbot



### Table of Contents
* [Train the poem generator model using TensorFlow](#rnn-poems)
* [Generate Poems Chatbot](#poems-chatbot)


<a id="rnn-poems"></a>
## Train the poem generator model using TensorFlow

#### train.py

In [1]:
import os
import numpy as np
import tensorflow as tf

from src.RNNPoet.poetry_porcess import *
from src.RNNPoet.gen_poetry import *

# Session configuration shared by this cell; allow_growth is enabled on the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Module-level session. train() below opens its own session with the same config.
sess = tf.Session(config=config)

# Batch size read by train() when it builds the placeholders and counts batches.
batch_size = 64
# NOTE(review): not passed to poetry_process() in this cell, which is called with its own defaults.
poetry_file = './data/poems/poems.txt'

def train(words,poetry_vector,x_batches,y_batches):
    """Train the poem model and checkpoint it under ``./model/RNNPoems``.

    Builds the training graph with ``rnn_model``, resumes from the newest
    checkpoint in the model directory when there is one, and runs up to
    epoch 50. A checkpoint is written once at the end of every 5th epoch and
    of the final epoch; loss summaries are written to ``./logs`` for every
    batch of every 5th epoch. Ctrl-C saves a checkpoint tagged with the
    interrupted epoch so the next run restarts from it.

    Relies on the module-level ``batch_size``, ``config`` and ``rnn_model``.

    :param words: vocabulary; only its length is used (model vocabulary size).
    :param poetry_vector: encoded poems; only its length is used, to derive
        the number of batches per epoch.
    :param x_batches: input batches, indexed by batch number.
    :param y_batches: target batches, indexed like ``x_batches``.
    """
    input_data = tf.placeholder(tf.int32,[batch_size,None])
    output_targets = tf.placeholder(tf.int32,[batch_size,None])
    end_points = rnn_model(len(words),input_data=input_data,output_data=output_targets,batch_size=batch_size)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(),tf.local_variables_initializer())
    merge = tf.summary.merge_all()

    model_dir = "./model/RNNPoems"
    save_path = os.path.join(model_dir,"poetry")
    epochs = 50
    n_chunk = len(poetry_vector) // batch_size
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(model_dir,exist_ok=True)

    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter('./logs',sess.graph)
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(model_dir)
        if checkpoint:
            saver.restore(sess,checkpoint)
            print("## restore from the checkpoint {0}".format(checkpoint))
            # Checkpoints are named "poetry-<epoch>"; resume from that epoch.
            start_epoch += int(checkpoint.split('-')[-1])
        print('## start training...')

        # Bound up front so the interrupt handler can always refer to it.
        epoch = start_epoch
        try:
            for epoch in range(start_epoch,epochs):
                log_this_epoch = epoch % 5 == 0
                for n in range(n_chunk):
                    feed = {input_data: x_batches[n],output_targets: y_batches[n]}
                    loss,_,_ = sess.run([
                        end_points['total_loss'],
                        end_points['last_state'],
                        end_points['train_op'],
                    ],feed_dict=feed)
                    print('Epoch: %d, batch: %d, training loss: %.6f' % (epoch,n,loss))
                    if log_this_epoch:
                        result = sess.run(merge,feed_dict=feed)
                        writer.add_summary(result,epoch * n_chunk + n)
                # One checkpoint per epoch (not per batch); also keep the
                # weights of the very last epoch, which would otherwise be lost.
                if log_this_epoch or epoch == epochs - 1:
                    saver.save(sess,save_path,global_step=epoch)
        except KeyboardInterrupt:
            print('## Interrupt manually, try saving checkpoint for now...')
            saver.save(sess,save_path,global_step=epoch)
            print('## Last epoch were saved, next time will start from epoch {}.'.format(epoch))
        finally:
            # Flush pending summaries and release the event file.
            writer.close()


# Script entry point: load and encode the corpus with poetry_process()'s defaults.
if __name__ == "__main__":
    words,poetry_vector,to_num,x_batches,y_batches = poetry_process()
    # The training call is left disabled here; uncomment it to actually train.
    #train(words, poetry_vector, x_batches, y_batches)

  from ._conv import register_converters as _register_converters


唐詩數量: 34646


<img src="images/RNNPoet_TensorFlow_model.jpg">

#### LSTM_model.py

In [2]:
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import numpy as np
np.set_printoptions(threshold=np.inf)

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
"""
import tensorflow as tf

def rnn_model(num_of_word,input_data,output_data=None,rnn_size=128,num_layers=2,batch_size=128):
    """Build the stacked-LSTM character model and return its graph endpoints.

    :param num_of_word: vocabulary size; the embedding table and the output
        layer are created with ``num_of_word + 1`` entries.
    :param input_data: integer tensor of token ids fed to the embedding lookup.
    :param output_data: integer tensor of target ids (labels). When given, the
        loss and training op are built; when ``None`` only the softmax
        prediction is built.
    :param rnn_size: hidden size of each LSTM cell, also the embedding width.
    :param num_layers: number of stacked layers.
    :param batch_size: batch size of the zero initial state in training mode;
        without labels the initial state is always built for a batch of 1.
    :return: dict of endpoints. Training mode: ``initial_state``, ``output``,
        ``train_op``, ``total_loss``, ``loss``, ``last_state``. Inference mode:
        ``initial_state``, ``last_state``, ``prediction``.
    """
    end_points = {}

    # Construct the RNN. BasicRNNCell or GRUCell (tf.contrib.rnn) are the
    # alternatives the original author left commented out here.
    cell_fun = tf.contrib.rnn.BasicLSTMCell

    cell = cell_fun(rnn_size,state_is_tuple=True)
    # NOTE(review): the same cell object is repeated for every layer, which
    # appears to tie the weights across layers. Creating one cell per layer
    # would change the set of variables and make existing checkpoints
    # unloadable, so it is deliberately left as is -- confirm before changing.
    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers,state_is_tuple=True)

    # With labels (training) start from a batch-sized zero state, otherwise
    # from a single-sequence zero state.
    if output_data is not None:
        initial_state = cell.zero_state(batch_size,tf.float32)
    else:
        initial_state = cell.zero_state(1,tf.float32)

    # Embed the token ids.
    embedding = tf.get_variable('embedding',initializer=tf.random_uniform(
        [num_of_word + 1,rnn_size],-1.0,1.0))
    inputs = tf.nn.embedding_lookup(embedding,input_data)

    outputs,last_state = tf.nn.dynamic_rnn(cell,inputs,initial_state=initial_state)
    # Flatten the batch and time axes so one matmul projects every step.
    output = tf.reshape(outputs,[-1,rnn_size])

    weights = tf.Variable(tf.truncated_normal([rnn_size,num_of_word + 1]))
    bias = tf.Variable(tf.zeros(shape=[num_of_word + 1]))
    logits = tf.nn.bias_add(tf.matmul(output,weights),bias=bias)

    end_points['initial_state'] = initial_state
    end_points['last_state'] = last_state

    if output_data is None:
        end_points['prediction'] = tf.nn.softmax(logits)
        return end_points

    # The sparse op consumes the integer ids directly: same per-step loss as
    # one-hot labels with the dense (deprecated) op, without materialising a
    # [steps, vocabulary] one-hot matrix.
    labels = tf.reshape(output_data,[-1])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits)
    total_loss = tf.reduce_mean(loss)
    train_op = tf.train.AdamOptimizer(0.01).minimize(total_loss)
    tf.summary.scalar('loss',total_loss)

    end_points['output'] = output
    end_points['train_op'] = train_op
    end_points['total_loss'] = total_loss
    end_points['loss'] = loss
    return end_points

#### gen_poetry.py

In [4]:
import numpy as np
import tensorflow as tf
 
from src.RNNPoet.LSTM_model import rnn_model

# Session configuration shared by the generators below; allow_growth is enabled on the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Module-level session. gen_poetry() and generate_poet() open their own sessions with the same config.
sess = tf.Session(config=config)

def to_word(predict, vocabs):
    """Sample one entry of ``vocabs`` with probability proportional to ``predict``.

    :param predict: array of non-negative weights (any shape; it is flattened
        by the cumulative sum). It may hold more entries than ``vocabs``.
    :param vocabs: indexable vocabulary.
    :return: the sampled vocabulary entry; an index that falls past the end of
        ``vocabs`` is clamped to the last entry.
    """
    cumulative = np.cumsum(predict)
    total = np.sum(predict)
    # rand() yields a plain float, avoiding int() on a 1-element array.
    sample = int(np.searchsorted(cumulative, np.random.rand() * total))
    # ">=": index len(vocabs) is already out of range for vocabs.
    if sample >= len(vocabs):
        sample = len(vocabs) - 1
    return vocabs[sample]

def gen_poetry(words, to_num, model_dir='./model', max_len=200):
    """Interactively generate a poem from a start sentence typed by the user.

    The model is built in a private graph so the function can be called
    repeatedly; building it in the shared default graph fails on the second
    call because the ``embedding`` variable already exists.

    :param words: vocabulary, indexable by token id.
    :param to_num: callable mapping one character to its token id.
    :param model_dir: directory searched for the latest checkpoint.
    :param max_len: upper bound on the poem length, so a model that never
        emits the end marker ``'E'`` cannot loop forever.
    :return: the poem (start sentence followed by the sampled characters).
    :raises ValueError: if ``model_dir`` holds no checkpoint.
    """
    batch_size = 1
    print('儲存模型為: {}'.format(model_dir))
    with tf.Graph().as_default():
        input_data = tf.placeholder(tf.int32, [batch_size, None])
        end_points = rnn_model(len(words), input_data=input_data, batch_size=batch_size)
        saver = tf.train.Saver(tf.global_variables())
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        fetches = [end_points['prediction'], end_points['last_state']]

        with tf.Session(config=config) as sess:
            sess.run(init_op)

            checkpoint = tf.train.latest_checkpoint(model_dir)
            if checkpoint is None:
                raise ValueError('no checkpoint found in {!r}'.format(model_dir))
            saver.restore(sess, checkpoint)

            def step(token, state=None):
                """Feed one character; return (prediction, new state)."""
                feed = {input_data: np.array(to_num(token)).reshape(1, 1)}
                if state is not None:
                    feed[end_points['initial_state']] = state
                return sess.run(fetches, feed_dict=feed)

            # Prime the network with the begin marker from the zero state.
            predict, last_state = step('B')

            start = input('請輸入起始句: ')
            pieces = []
            # Typing the end marker alone yields an empty poem, as before.
            if start != 'E':
                # Feed the start sentence one character at a time; a whole
                # sentence passed to to_num() would be one unknown token.
                for char in start:
                    pieces.append(char)
                    predict, last_state = step(char, last_state)
                while len(pieces) < max_len:
                    word = to_word(predict, words)
                    if word == 'E':
                        break
                    pieces.append(word)
                    predict, last_state = step(word, last_state)

    poem_ = ''.join(pieces)
    print(poem_)
    return poem_

def generate_poet(start_with, words, to_num, style_words="誰謂傷心畫不成，畫人心逐世人情。"):
    """Generate a poem that begins with ``start_with``.

    The network state is primed with the begin marker ``'B'``, then with
    ``style_words`` (if any), then with the start characters; after that
    characters are sampled until the end marker ``'E'`` or until 200 steps
    (priming of the start characters included) have been taken.

    The model is built in a private graph so the function can be called
    repeatedly (e.g. once per request); building it in the shared default
    graph fails on the second call because ``embedding`` already exists.

    :param start_with: string the poem must begin with.
    :param words: vocabulary, indexable by token id.
    :param to_num: callable mapping one character to its token id.
    :param style_words: text fed before the start characters to condition the
        state; it is not part of the returned poem.
    :return: ``start_with`` followed by the sampled characters.
    :raises ValueError: if ``./model`` holds no checkpoint.
    """
    batch_size = 1
    max_len = 200

    with tf.Graph().as_default():
        input_data = tf.placeholder(tf.int32, [batch_size, None])
        end_points = rnn_model(len(words), input_data=input_data, batch_size=batch_size)
        saver = tf.train.Saver(tf.global_variables())
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        fetches = [end_points['prediction'], end_points['last_state']]

        with tf.Session(config=config) as sess:
            sess.run(init_op)

            checkpoint = tf.train.latest_checkpoint('./model')
            if checkpoint is None:
                raise ValueError("no checkpoint found in './model'")
            saver.restore(sess, checkpoint)

            def step(token, state=None):
                """Feed one character; return (prediction, new state)."""
                feed = {input_data: np.array(to_num(token)).reshape(1, 1)}
                if state is not None:
                    feed[end_points['initial_state']] = state
                return sess.run(fetches, feed_dict=feed)

            predict, last_state = step('B')
            for word in (style_words or ''):
                predict, last_state = step(word, last_state)

            start_words = list(start_with)
            result = list(start_words)
            primed = start_words[:max_len]
            for word in primed:
                predict, last_state = step(word, last_state)

            # Sample from the prediction that followed the last fed character.
            # The previous code fed the last start character a second time
            # and threw the priming prediction away.
            for _ in range(max_len - len(primed)):
                w = to_word(predict, words)
                if w == 'E':
                    break
                result.append(w)
                predict, last_state = step(w, last_state)

    poem = ''.join(result)
    print(poem)
    return poem

#### poetry_porcess.py

In [5]:
import numpy as np
from collections import Counter

def poetry_process(batch_size=64,
                   poetry_file='./data/poems/poems.txt'):
    """Load the poem corpus, build the vocabulary and cut it into batches.

    Each usable line has the form ``title:content``. A line is dropped when it
    does not split into exactly two parts on ``':'``, when the content holds
    one of ``_ ( （ 《 [``, or when the content (spaces removed) is shorter than
    5 or longer than 79 characters. Kept poems are wrapped as
    ``'B' + content + 'E'``.

    :param batch_size: number of poems per batch; a trailing partial batch is
        discarded.
    :param poetry_file: path of the UTF-8 corpus file.
    :return: tuple ``(words, poetry_vector, to_num, x_batches, y_batches)``:
        ``words`` is the vocabulary ordered by descending frequency with the
        padding symbol ``' '`` appended last; ``poetry_vector`` the poems as
        lists of ids; ``to_num`` maps a character to its id (unknown
        characters map to ``len(words)``); ``x_batches`` / ``y_batches`` are
        lists of int32 arrays of shape (batch_size, longest poem in the
        batch), where ``y`` is ``x`` shifted left by one step.
    """
    banned = ('_', '(', '（', '《', '[')
    poetrys = []
    with open(poetry_file,'r',encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(':')
            # Exactly one ':' is required. This is the same filter the old
            # blanket try/except applied, so the vocabulary is unchanged.
            if len(parts) != 2:
                continue
            content = parts[1].replace(' ','')  # strip blanks
            if any(mark in content for mark in banned):
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            poetrys.append('B' + content + 'E')

    print('唐詩數量:',len(poetrys))

    # Count character occurrences across all poems.
    counter = Counter()
    for poetry in poetrys:
        counter.update(poetry)
    # Most frequent first; the sort is stable, so ties keep first-seen order.
    count_pairs = sorted(counter.items(),key=lambda x: -x[1])
    # Append ' ' as the padding symbol. Building the tuple this way also
    # works for a corpus with no usable poem.
    words = tuple(word for word,_ in count_pairs) + (' ',)

    # Mapping: character -> id
    word_num_map = dict(zip(words,range(len(words))))

    def to_num(word):
        return word_num_map.get(word,len(words))

    poetry_vector = [list(map(to_num,poetry)) for poetry in poetrys]

    pad_id = word_num_map[' ']
    n_chunk = len(poetry_vector) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        batch = poetry_vector[start_index:start_index + batch_size]
        length = max(map(len,batch))  # longest poem in this batch
        # Rows shorter than the longest poem stay filled with the padding id.
        xdata = np.full((batch_size,length),pad_id,np.int32)
        for row,vector in enumerate(batch):
            xdata[row,:len(vector)] = vector
        # Targets are the inputs shifted left by one; the last column is kept:
        #   xdata [6,2,4,6,9] -> ydata [2,4,6,9,9]
        ydata = np.copy(xdata)
        ydata[:,:-1] = xdata[:,1:]
        x_batches.append(xdata)  # (n_chunk, batch, length)
        y_batches.append(ydata)
    return words,poetry_vector,to_num,x_batches,y_batches

#### test.py

In [6]:
import os
import numpy as np
import tensorflow as tf
from collections import Counter

# Session configuration shared by this cell; allow_growth is enabled on the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Module-level session. gen_poetry() below opens its own session with the same config.
sess = tf.Session(config=config)

# NOTE(review): the functions in this cell take batch size and corpus path as their own
# parameters/defaults; confirm whether these two globals are still needed.
batch_size = 64
poetry_file = './data/poems/poems.txt'

def to_word(predict, vocabs):
    """Sample one entry of ``vocabs`` with probability proportional to ``predict``.

    :param predict: array of non-negative weights (any shape; it is flattened
        by the cumulative sum). It may hold more entries than ``vocabs``.
    :param vocabs: indexable vocabulary.
    :return: the sampled vocabulary entry; an index that falls past the end of
        ``vocabs`` is clamped to the last entry.
    """
    cumulative = np.cumsum(predict)
    total = np.sum(predict)
    # rand() yields a plain float, avoiding int() on a 1-element array.
    sample = int(np.searchsorted(cumulative, np.random.rand() * total))
    # ">=": index len(vocabs) is already out of range for vocabs.
    if sample >= len(vocabs):
        sample = len(vocabs) - 1
    return vocabs[sample]

def gen_poetry(words, to_num, model_dir='./model', max_len=200):
    """Interactively generate a poem from a start sentence typed by the user.

    The model is built in a private graph so the function can be called
    repeatedly; building it in the shared default graph fails on the second
    call because the ``embedding`` variable already exists.

    :param words: vocabulary, indexable by token id.
    :param to_num: callable mapping one character to its token id.
    :param model_dir: directory searched for the latest checkpoint.
    :param max_len: upper bound on the poem length, so a model that never
        emits the end marker ``'E'`` cannot loop forever.
    :return: the poem (start sentence followed by the sampled characters).
    :raises ValueError: if ``model_dir`` holds no checkpoint.
    """
    batch_size = 1
    print('儲存模型為: {}'.format(model_dir))
    with tf.Graph().as_default():
        input_data = tf.placeholder(tf.int32, [batch_size, None])
        end_points = rnn_model(len(words), input_data=input_data, batch_size=batch_size)
        saver = tf.train.Saver(tf.global_variables())
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        fetches = [end_points['prediction'], end_points['last_state']]

        with tf.Session(config=config) as sess:
            sess.run(init_op)

            checkpoint = tf.train.latest_checkpoint(model_dir)
            if checkpoint is None:
                raise ValueError('no checkpoint found in {!r}'.format(model_dir))
            saver.restore(sess, checkpoint)

            def step(token, state=None):
                """Feed one character; return (prediction, new state)."""
                feed = {input_data: np.array(to_num(token)).reshape(1, 1)}
                if state is not None:
                    feed[end_points['initial_state']] = state
                return sess.run(fetches, feed_dict=feed)

            # Prime the network with the begin marker from the zero state.
            predict, last_state = step('B')

            start = input('請輸入起始句: ')
            pieces = []
            # Typing the end marker alone yields an empty poem, as before.
            if start != 'E':
                # Feed the start sentence one character at a time; a whole
                # sentence passed to to_num() would be one unknown token.
                for char in start:
                    pieces.append(char)
                    predict, last_state = step(char, last_state)
                while len(pieces) < max_len:
                    word = to_word(predict, words)
                    if word == 'E':
                        break
                    pieces.append(word)
                    predict, last_state = step(word, last_state)

    poem_ = ''.join(pieces)
    print(poem_)
    return poem_

def poetry_process(batch_size=64,
                   poetry_file='./data/poems/poems.txt'):
    """Load the poem corpus, build the vocabulary and cut it into batches.

    Each usable line has the form ``title:content``. A line is dropped when it
    does not split into exactly two parts on ``':'``, when the content holds
    one of ``_ ( （ 《 [``, or when the content (spaces removed) is shorter than
    5 or longer than 79 characters. Kept poems are wrapped as
    ``'B' + content + 'E'``.

    :param batch_size: number of poems per batch; a trailing partial batch is
        discarded.
    :param poetry_file: path of the UTF-8 corpus file.
    :return: tuple ``(words, poetry_vector, to_num, x_batches, y_batches)``:
        ``words`` is the vocabulary ordered by descending frequency with the
        padding symbol ``' '`` appended last; ``poetry_vector`` the poems as
        lists of ids; ``to_num`` maps a character to its id (unknown
        characters map to ``len(words)``); ``x_batches`` / ``y_batches`` are
        lists of int32 arrays of shape (batch_size, longest poem in the
        batch), where ``y`` is ``x`` shifted left by one step.
    """
    banned = ('_', '(', '（', '《', '[')
    poetrys = []
    with open(poetry_file,'r',encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(':')
            # Exactly one ':' is required. This is the same filter the old
            # blanket try/except applied, so the vocabulary is unchanged.
            if len(parts) != 2:
                continue
            content = parts[1].replace(' ','')  # strip blanks
            if any(mark in content for mark in banned):
                continue
            if len(content) < 5 or len(content) > 79:
                continue
            poetrys.append('B' + content + 'E')

    print('唐詩數量:',len(poetrys))

    # Count character occurrences across all poems.
    counter = Counter()
    for poetry in poetrys:
        counter.update(poetry)
    # Most frequent first; the sort is stable, so ties keep first-seen order.
    count_pairs = sorted(counter.items(),key=lambda x: -x[1])
    # Append ' ' as the padding symbol. Building the tuple this way also
    # works for a corpus with no usable poem.
    words = tuple(word for word,_ in count_pairs) + (' ',)

    # Mapping: character -> id
    word_num_map = dict(zip(words,range(len(words))))

    def to_num(word):
        return word_num_map.get(word,len(words))

    poetry_vector = [list(map(to_num,poetry)) for poetry in poetrys]

    pad_id = word_num_map[' ']
    n_chunk = len(poetry_vector) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        batch = poetry_vector[start_index:start_index + batch_size]
        length = max(map(len,batch))  # longest poem in this batch
        # Rows shorter than the longest poem stay filled with the padding id.
        xdata = np.full((batch_size,length),pad_id,np.int32)
        for row,vector in enumerate(batch):
            xdata[row,:len(vector)] = vector
        # Targets are the inputs shifted left by one; the last column is kept:
        #   xdata [6,2,4,6,9] -> ydata [2,4,6,9,9]
        ydata = np.copy(xdata)
        ydata[:,:-1] = xdata[:,1:]
        x_batches.append(xdata)  # (n_chunk, batch, length)
        y_batches.append(ydata)
    return words,poetry_vector,to_num,x_batches,y_batches

def rnn_model(num_of_word,input_data,output_data=None,rnn_size=128,num_layers=2,batch_size=128):
    """Build the stacked-LSTM character model and return its graph endpoints.

    :param num_of_word: vocabulary size; the embedding table and the output
        layer are created with ``num_of_word + 1`` entries.
    :param input_data: integer tensor of token ids fed to the embedding lookup.
    :param output_data: integer tensor of target ids (labels). When given, the
        loss and training op are built; when ``None`` only the softmax
        prediction is built.
    :param rnn_size: hidden size of each LSTM cell, also the embedding width.
    :param num_layers: number of stacked layers.
    :param batch_size: batch size of the zero initial state in training mode;
        without labels the initial state is always built for a batch of 1.
    :return: dict of endpoints. Training mode: ``initial_state``, ``output``,
        ``train_op``, ``total_loss``, ``loss``, ``last_state``. Inference mode:
        ``initial_state``, ``last_state``, ``prediction``.
    """
    end_points = {}

    # Construct the RNN. BasicRNNCell or GRUCell (tf.contrib.rnn) are the
    # alternatives the original author left commented out here.
    cell_fun = tf.contrib.rnn.BasicLSTMCell

    cell = cell_fun(rnn_size,state_is_tuple=True)
    # NOTE(review): the same cell object is repeated for every layer, which
    # appears to tie the weights across layers. Creating one cell per layer
    # would change the set of variables and make existing checkpoints
    # unloadable, so it is deliberately left as is -- confirm before changing.
    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers,state_is_tuple=True)

    # With labels (training) start from a batch-sized zero state, otherwise
    # from a single-sequence zero state.
    if output_data is not None:
        initial_state = cell.zero_state(batch_size,tf.float32)
    else:
        initial_state = cell.zero_state(1,tf.float32)

    # Embed the token ids.
    embedding = tf.get_variable('embedding',initializer=tf.random_uniform(
        [num_of_word + 1,rnn_size],-1.0,1.0))
    inputs = tf.nn.embedding_lookup(embedding,input_data)

    outputs,last_state = tf.nn.dynamic_rnn(cell,inputs,initial_state=initial_state)
    # Flatten the batch and time axes so one matmul projects every step.
    output = tf.reshape(outputs,[-1,rnn_size])

    weights = tf.Variable(tf.truncated_normal([rnn_size,num_of_word + 1]))
    bias = tf.Variable(tf.zeros(shape=[num_of_word + 1]))
    logits = tf.nn.bias_add(tf.matmul(output,weights),bias=bias)

    end_points['initial_state'] = initial_state
    end_points['last_state'] = last_state

    if output_data is None:
        end_points['prediction'] = tf.nn.softmax(logits)
        return end_points

    # The sparse op consumes the integer ids directly: same per-step loss as
    # one-hot labels with the dense (deprecated) op, without materialising a
    # [steps, vocabulary] one-hot matrix.
    labels = tf.reshape(output_data,[-1])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits)
    total_loss = tf.reduce_mean(loss)
    train_op = tf.train.AdamOptimizer(0.01).minimize(total_loss)
    tf.summary.scalar('loss',total_loss)

    end_points['output'] = output
    end_points['train_op'] = train_op
    end_points['total_loss'] = total_loss
    end_points['loss'] = loss
    return end_points

In [7]:
# Run this cell only once.
# NOTE(review): presumably because running it again would try to create the model
# variables (e.g. 'embedding') a second time in the same graph -- confirm.
# Load the corpus and vocabulary with poetry_process()'s defaults.
words,poetry_vector,to_num,x_batches,y_batches = poetry_process()

# Build the inference model (no labels passed) for one sequence at a time; the
# resulting saver/init_op/end_points globals are read by generate_poet() below.
batch_size = 1
input_data = tf.placeholder(tf.int32, [batch_size, None])
end_points = rnn_model(len(words), input_data=input_data, batch_size=batch_size)
saver = tf.train.Saver(tf.global_variables())
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

唐詩數量: 34646

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


In [8]:
def generate_poet(start_with, style_words="誰謂傷心畫不成，畫人心逐世人情。"):
    """Generate a poem that begins with ``start_with`` using the notebook-level model.

    Uses the globals built once in the notebook (``input_data``,
    ``end_points``, ``saver``, ``init_op``, ``config``, ``words``,
    ``to_num``) and restores the latest checkpoint from ``./model/RNNPoems``.
    The state is primed with the begin marker ``'B'``, then ``style_words``
    (if any), then the start characters; after that characters are sampled
    until the end marker ``'E'`` or until 200 steps (priming of the start
    characters included) have been taken.

    :param start_with: string the poem must begin with.
    :param style_words: text fed before the start characters to condition the
        state; it is not part of the returned poem.
    :return: ``start_with`` followed by the sampled characters.
    :raises ValueError: if ``./model/RNNPoems`` holds no checkpoint.
    """
    max_len = 200
    fetches = [end_points['prediction'], end_points['last_state']]

    with tf.Session(config=config) as sess:
        sess.run(init_op)
        checkpoint = tf.train.latest_checkpoint('./model/RNNPoems')
        if checkpoint is None:
            raise ValueError("no checkpoint found in './model/RNNPoems'")
        saver.restore(sess, checkpoint)

        def step(token, state=None):
            """Feed one character; return (prediction, new state)."""
            feed = {input_data: np.array(to_num(token)).reshape(1, 1)}
            if state is not None:
                feed[end_points['initial_state']] = state
            return sess.run(fetches, feed_dict=feed)

        predict, last_state = step('B')
        for word in (style_words or ''):
            predict, last_state = step(word, last_state)

        start_words = list(start_with)
        result = list(start_words)
        primed = start_words[:max_len]
        for word in primed:
            predict, last_state = step(word, last_state)

        # Sample from the prediction that followed the last fed character.
        # The previous code fed the last start character a second time and
        # threw the priming prediction away.
        for _ in range(max_len - len(primed)):
            w = to_word(predict, words)
            if w == 'E':
                break
            result.append(w)
            predict, last_state = step(w, last_state)
    return ''.join(result)

### 自動產生詩詞

In [9]:
# Demo: generate a poem from a fixed opening, passing an explicit style text.
start_with = '少小離家老大回'
r = generate_poet(start_with, style_words="大漠孤煙直，長河落日圓。")
print(r)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./model/RNNPoems\poetry-45
少小離家老大回淚，青春多雨白團團。


In [16]:
# Demo: same style text as above with a different opening.
start_with = '天龍國大戰南北'
r = generate_poet(start_with, style_words="大漠孤煙直，長河落日圓。")
print(r)

INFO:tensorflow:Restoring parameters from ./model/RNNPoems\poetry-45
天龍國大戰南北，貔虎功夷瑞遍雕。膏圓駘蕩火仙手，紅杏輕含秋色青。應擁薊門秋雨曲，妒吹金蹙鑒梨枝。


<a id="poems-chatbot"></a>
## Generate Poems Chatbot

In [19]:
# coding=utf-8
import datetime
from urllib.parse import quote
from random import randint

from flask import Flask, render_template, request, make_response
from flask import jsonify

import sys
import time  
import hashlib
import threading
import datetime

import os
import numpy as np
import tensorflow as tf

from src.RNNPoet.poetry_porcess import *
from src.RNNPoet.gen_poetry import *

# Session configuration for this cell; allow_growth is enabled on the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): this module-level session is not referenced by the code in this cell.
sess = tf.Session(config=config)

# NOTE(review): neither value is passed to poetry_process() below, which runs with
# its own defaults (batch_size=64, './data/poems/poems.txt').
batch_size = 64
poetry_file = '../data/poems/poems.txt'
# Load the corpus once at start-up; words and to_num are used by start_requests().
words,poetry_vector,to_num,x_batches,y_batches = poetry_process()

def start_requests(start_with, style_words="誰謂傷心畫不成，畫人心逐世人情"):
    """Generate a poem for ``start_with`` and record the outcome in ``items``.

    ``items`` is a module-level list that must exist before this is called;
    one dict with the keys ``poems`` and ``updatetime`` is appended per call.

    :param start_with: opening text; it needs more than 3 characters.
    :param style_words: style text forwarded to ``generate_poet`` (it used to
        be ignored in favour of a hard-coded value).
    :return: the generated poem, or ``""`` when ``start_with`` is too short.
    """
    ret_poem = ""
    if len(start_with) > 3:
        ret_poem = generate_poet(start_with, words, to_num, style_words=style_words)
        message = str(ret_poem)
    else:
        message = "start words need more characters."
    items.append({
        "poems": message,
        "updatetime": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    })
    return str(ret_poem)

def heartbeat():
    """Print a timestamped heartbeat line and re-arm itself to run again in 60 seconds."""
    print (time.strftime('%Y-%m-%d %H:%M:%S - heartbeat', time.localtime(time.time())))
    timer = threading.Timer(60, heartbeat)
    # Daemon thread: a timer that re-arms itself forever must not keep the
    # interpreter alive once the main thread has finished.
    timer.daemon = True
    timer.start()
# Schedule the first heartbeat 60 seconds after start-up.
timer = threading.Timer(60, heartbeat)
timer.daemon = True
timer.start()

app = Flask(__name__,static_url_path="/static") 
@app.route('/message', methods=['POST'])
def reply():
    """Handle a chat message: generate a poem from the posted ``msg`` field.

    :return: JSON ``{'text': ...}``. The text is the poem wrapped in a white
        ``<font>`` tag, or ``'No Data input'`` when no poem was produced
        (``start_requests`` returns an empty string for too-short input).
    """
    # Local import so this block does not depend on the cell's import list.
    from html import escape

    start_with = request.form['msg']
    ret_poem = start_requests(start_with, "大漠孤煙直，長河落日圓。")
    print(items)

    # The old check compared the already-wrapped HTML with ' ', so it could
    # never match; test the poem itself instead.
    if not ret_poem.strip():
        return jsonify( { 'text': 'No Data input' } )

    # The poem starts with the user's own text, so escape it before embedding
    # it in markup.
    res_msg = "<font color=white>" + escape(ret_poem) + "</font>"
    return jsonify( { 'text': res_msg } )

@app.route("/")
def index():
    """Render and return the ``index.html`` template for the root URL."""
    page = render_template("index.html")
    return page

# Start the app.
# The start-up lines are left disabled, so running this as a script does nothing here.
# To serve, uncomment both: start_requests() and reply() read the module-level `items` list.
if (__name__ == "__main__"):
    pass
    #items = []
    #app.run(host = '127.0.0.1', port = 8898)

唐詩數量: 34646


<img src="images/poems_chatbot.png">

### Homework
- 以三字經進行訓練 (./data/poems/three.txt)
- 部署成聊天機器人