In [None]:
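# Predict the next Chinese character.
# 1. one word in, one word out
# 2. multiple words in, one word out.
# Model performance shows that two-word input beats one-word input.
# Using only 10,000 bus-stop records, one-word input reaches 60% and two-word input reaches 66%.
# Using 10,000 ordinary records, one-word input reaches 26% and two-word input reaches 34%.
# This shows that n-grams really do work; in addition, more epochs and more training data improve the results.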

import numpy as np
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, LSTM, Embedding
import sys
sys.path.append("../common/")
import chinese_preprocess
import probability_utils
import chinese_poi_helper
import nlp_utils

In [None]:
def find_topn_prob_word(y_distribution, tokenizer):
    """
    Return the highest-probability words together with their probabilities.

    Only the first row of ``y_distribution`` is inspected. The ranking and
    the index-to-word mapping are both delegated to ``probability_utils``.

    :param y_distribution: model output, indexable so that ``[0]`` is the
        probability array of the first sample
    :param tokenizer: tokenizer passed through to
        ``probability_utils.map_indexes_to_word``
    :return: the value produced by ``probability_utils.map_indexes_to_word``
        (callers in this file index it and unpack entries as
        ``(word, probability)``)
    """
    first_row = y_distribution[0]
    # The helper returns (values, indexes); only the indexes are needed here.
    _top_values, top_indexes = probability_utils.top_k_values_and_indexes(first_row)
    return probability_utils.map_indexes_to_word(top_indexes, tokenizer, first_row)


def predict_one_word_with_prob(model, tokenizer, word, length=1):
    """
    Run the model on ``word`` and report the top predicted words.

    The text is converted to token ids with the tokenizer, only the last
    ``length`` ids are kept, and the resulting single-sample batch is passed
    to ``model.predict``. The input and the prediction are printed.

    :param model: object exposing ``predict(array, verbose=0)``
    :param tokenizer: object exposing ``texts_to_sequences``
    :param word: input text to predict from
    :param length: number of trailing token ids fed to the model
    :return: the result of ``find_topn_prob_word`` for the model output
    """
    token_ids = tokenizer.texts_to_sequences([word])[0][-length:]
    # Add a leading batch axis so the model receives exactly one sample.
    batch = np.array(token_ids)[np.newaxis, :]
    topn_word = find_topn_prob_word(model.predict(batch, verbose=0), tokenizer)
    print("input - {}".format(word))
    print("predict - ")
    print(topn_word)
    return topn_word


def generate_seq_with_one_word(model, tokenizer, seed_text, n_words):
    """
    Generate a sequence by repeatedly predicting from the latest word only.

    In practice the words predicted further down the sequence are not very
    reliable.

    :param model: model passed to ``predict_one_word_with_prob``
    :param tokenizer: tokenizer passed to ``predict_one_word_with_prob``
    :param seed_text: starting text; also the first piece of the result
    :param n_words: how many words to generate
    :return: the seed followed by the generated words, joined by spaces
    """
    current = seed_text
    generated = seed_text
    # Generate a fixed number of words, each from the previous output alone.
    for _ in range(n_words):
        current, _probability = predict_one_word_with_prob(model, tokenizer, current)[0]
        generated = ' '.join((generated, current))
    return generated


def generate_seq_with_multiple_word(model, tokenizer, seed_text, n_words, length):
    """
    Generate a sequence by repeatedly predicting from the last ``length`` words.

    In practice the words predicted further down the sequence are not very
    reliable.

    Each step passes the whole text generated so far to
    ``predict_one_word_with_prob``, which keeps only the last ``length``
    token ids. That trailing slice acts as the sliding window, so the model
    always receives the most recent context.

    Earlier versions fed only the single predicted word back in (leaving the
    model with fewer than ``length`` tokens after the first step) and
    mutated the module-level ``encoded`` training data. Neither happens now.

    :param model: model passed to ``predict_one_word_with_prob``
    :param tokenizer: tokenizer passed to ``predict_one_word_with_prob``
    :param seed_text: starting text, words separated by spaces
    :param n_words: how many words to generate
    :param length: number of trailing words the model takes as input
    :return: the seed followed by the generated words, joined by spaces
    """
    result = seed_text
    for _ in range(n_words):
        # The top-ranked entry is (word, probability); only the word is used.
        out_word, _probability = predict_one_word_with_prob(
            model, tokenizer, result, length)[0]
        # Appending grows the context; the callee trims it to `length` tokens.
        result = result + ' ' + out_word
    return result


def gen_X_y(encoded, length=1, num_classes=None):
    """
    Build training inputs and one-hot targets from encoded sequences.

    ``nlp_utils.generate_ngram_multiple`` produces the n-gram rows. In each
    row every column but the last is the input and the last column is the
    target: X = [word_int1, word_int2, ...] -> y = word_int.

    :param encoded: integer-encoded text passed to
        ``nlp_utils.generate_ngram_multiple``
    :param length: n-gram length argument forwarded to the helper
        (presumably the number of input words per sample; confirm against
        ``nlp_utils``)
    :param num_classes: width of the one-hot targets. When ``None`` the
        module-level ``vocab_size`` is used, which matches the earlier
        behaviour of this function.
    :return: tuple ``(X, y)`` with ``y`` one-hot encoded
    """
    if num_classes is None:
        # Backward-compatible fallback to the global set by the script below.
        num_classes = vocab_size
    sequences = np.array(nlp_utils.generate_ngram_multiple(encoded, length))
    X, y = sequences[:, :-1], sequences[:, -1]
    # Targets are one-hot because the model's final output is one-hot shaped too.
    y = to_categorical(y, num_classes=num_classes)
    return X, y


def build_and_train_model(vocab_size, length, X, y, epochs=20, embedding_dim=10, lstm_units=32):
    """
    Build an Embedding -> LSTM -> softmax Dense model and fit it on (X, y).

    From the RNN (LSTM) point of view, feeding several characters as input
    simply means the hidden state carries more information.

    :param vocab_size: vocabulary size; used for the embedding input
        dimension and for the width of the softmax output layer
    :param length: number of tokens per input sample (``input_length``)
    :param X: training inputs passed to ``model.fit``
    :param y: training targets passed to ``model.fit``
    :param epochs: number of training epochs (default 20, as before)
    :param embedding_dim: embedding output dimension (default 10, as before)
    :param lstm_units: number of LSTM units (default 32, as before)
    :return: the trained model
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=length))
    model.add(LSTM(lstm_units))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit an extra "None" line.
    model.summary()
    # compile network
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(X, y, epochs=epochs, verbose=1)
    return model


# Step 1: load the raw text data via the project helper and show the first
# two entries as a quick sanity check.
print("#1 load data")
chinese_poi_name = chinese_poi_helper.prepare_chinese_poi_raw_data_for_task(True)
print(chinese_poi_name[0:2])
# Step 2: encode the text as integers. The helper returns the tokenizer, the
# encoded data and the vocabulary size, all of which are used by the
# training and prediction code below.
print("#2 transform to int")
tokenizer, encoded, vocab_size = chinese_preprocess.encode_chinese_to_int(chinese_poi_name)
print(encoded[0:2])
print("vocab_size : {}".format(vocab_size))
# Show the first raw entry next to the tokenizer's encoding of it.
print(chinese_poi_name[0])
print(tokenizer.texts_to_sequences([chinese_poi_name[0]]))

# Experiment 1: one character in, one character out.
length_one = 1
print("#3 train with {}".format(length_one))
X1, y1 = gen_X_y(encoded, length_one)
model1 = build_and_train_model(vocab_size, length_one, X1, y1)
model1.summary()
test_word1 = ['公', '机', '阳']
# Check the prediction for each single-character input.
for word in test_word1:
    predict_one_word_with_prob(model1, tokenizer, word)
# Check sequence generation starting from each single character.
for word in test_word1:
    print(word)
    print(generate_seq_with_one_word(model1, tokenizer, word, 1))


# Experiment 2: two characters in, one character out.
length_two = 2
X2, y2 = gen_X_y(encoded, length_two)
model2 = build_and_train_model(vocab_size, length_two, X2, y2)
model2.summary()
# Check predicting one character from two input characters.
for word in (chinese_preprocess.seg_chinese_single("公安"), "机 关", "阳 光"):
    predict_one_word_with_prob(model2, tokenizer, word, 2)
# Check sequence generation starting from two characters.
test_word2 = ['公 安', '机 关', '阳 光']
for word in test_word2:
    print(word)
    print(generate_seq_with_multiple_word(model2, tokenizer, word, 1, 2))