In [None]:
# Predict the next word from the current word(s).
# 1. one word in, one word out
# 2. multiple words in, multiple words out.

from numpy import array

from tensorflow.python.keras import backend as k
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense, LSTM, Embedding

import itertools
 
def generate_seq_with_one_word(model, tokenizer, seed_text, n_words):
    """Generate text with a one-word-in, one-word-out model.

    Starting from ``seed_text``, the model is asked for the most likely next
    word; that word is appended to the result and becomes the next input, so
    several words can be predicted in a row.

    Args:
        model: Trained model whose ``predict`` returns one row of class
            scores per input sample.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text to start from; only its last known word is used as
            the model input.
        n_words: Maximum number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early when the current input contains no known word.
    """
    # Build the index -> word table once instead of scanning word_index on
    # every iteration.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text, result = seed_text, seed_text
    for _ in range(n_words):
        # Encode the current input as integers.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # No known word to condition on, so nothing can be predicted.
            break
        # The model consumes one token per sample: a batch of shape (1, 1).
        batch = array(encoded[-1:]).reshape(1, 1)
        # ``predict_classes`` no longer exists in recent TensorFlow releases;
        # take the arg-max over the predicted class scores instead.
        yhat = int(model.predict(batch, verbose=0).argmax(axis=-1)[0])
        # Map the predicted index back to its word ('' when unknown).
        out_word = index_to_word.get(yhat, '')
        # The predicted word becomes the next input.
        in_text, result = out_word, result + ' ' + out_word
    return result

def generate_seq_with_multiple_word(model, tokenizer, seed_text, n_words, length):
    """Generate text with a multiple-words-in, one-word-out model.

    The last ``length`` tokens of ``seed_text`` form the initial context.
    After each prediction the context slides forward by one token, so several
    words can be predicted in a row.

    Args:
        model: Trained model whose ``predict`` returns one row of class
            scores per input sample.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Space-separated words to start from.
        n_words: Number of words to generate.
        length: Number of context tokens the model expects (at least 1).

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces,
        or ``None`` (after printing a message) when ``seed_text`` encodes to
        fewer than ``length`` tokens.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be at least 1")
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = seed_text
    # texts_to_sequences returns one token list per input text; take the
    # token list of the single seed text, then keep its last `length` tokens.
    window = list(tokenizer.texts_to_sequences([seed_text])[0][-length:])
    if len(window) < length:
        print("context length is smaller than required length")
        return
    for _ in range(n_words):
        # One sample holding `length` tokens: a batch of shape (1, length).
        batch = array(window).reshape(1, length)
        # ``predict_classes`` no longer exists in recent TensorFlow releases;
        # take the arg-max over the predicted class scores instead.
        yhat = int(model.predict(batch, verbose=0).argmax(axis=-1)[0])
        # Map the predicted class back to its word ('' when unknown).
        out_word = index_to_word.get(yhat, '')
        result = result + ' ' + out_word
        # Slide the context: drop the oldest token, append the prediction.
        window = window[1:] + [yhat]
    return result


def load_chinese(file, line_num=100000, encoding="utf-8"):
    """Read lines of Chinese text and space-separate their characters.

    Each line is stripped, every "-" character is dropped, and the remaining
    characters are joined with single spaces so that a whitespace tokenizer
    (e.g. ``fit_on_texts``) treats every character as one word.

    Args:
        file: Path of the text file to read.
        line_num: Stop after this many lines have been read. A value below 1
            never matches a line number, so the whole file is read.
        encoding: Text encoding of the file. Passed explicitly so the result
            does not depend on the platform's default encoding.

    Returns:
        A list with one space-separated string per line read.
    """
    sentence_array = []
    with open(file, encoding=encoding) as infile:
        for line_index, line in enumerate(infile, start=1):
            chars = [ch for ch in line.strip() if ch != "-"]
            sentence_array.append(" ".join(chars))
            if line_index == line_num:
                break
    return sentence_array

def encode_to_int(chinese_poi_name):
    """Encode space-separated Chinese texts as integer sequences.

    A fresh ``Tokenizer`` is fitted on ``chinese_poi_name`` and then used to
    convert the same texts to integer sequences, which serve as the neural
    network input.

    Returns:
        A ``(tokenizer, encoded, vocab_size)`` tuple, where ``vocab_size`` is
        the number of entries in ``tokenizer.word_index`` plus one.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(chinese_poi_name)
    sequences = fitted.texts_to_sequences(chinese_poi_name)
    # Word indices start at 1 (index 0 is reserved by the Keras Tokenizer),
    # hence the extra slot.
    return fitted, sequences, 1 + len(fitted.word_index)

def generate_word_pair(encoded):
    """Build the training pairs for the one-word-in, one-word-out network.

    Args:
        encoded: Iterable of integer token sequences (lists).

    Returns:
        A list of ``(current_token, next_token)`` tuples covering every pair
        of adjacent tokens inside each sequence. Pairs never span two
        sequences. The total count is printed as a side effect.
    """
    sequences = []
    for tokens in encoded:
        # Zipping a sequence with itself shifted by one yields adjacent pairs.
        sequences.extend(zip(tokens, tokens[1:]))
    print('Total Sequences: %d' % len(sequences))
    return sequences

def generate_multiple_word_seq(encoded, length=1):
    """Build sliding windows of ``length`` tokens for the multi-word network.

    Windows are taken inside each sequence only, never across two sequences.
    Example: ``encoded = [[1, 2, 3], [4, 5, 6]]`` with ``length = 2`` gives
    ``[1, 2] [2, 3] [4, 5] [5, 6]``.

    Returns:
        The list of windows. The total count is printed as a side effect.
    """
    windows = []
    for tokens in encoded:
        total = len(tokens)
        start = 0
        # Advance the window start until a full window no longer fits.
        while start < total and start + length <= total:
            windows.append(tokens[start:start + length])
            start += 1
    print('Total Sequences: %d' % len(windows))
    return windows

def gen_X_y(encoded, length=1, num_classes=None):
    """Build training arrays: X = [word_int1, word_int2, ...] -> y = word_int.

    Every sample consists of ``length`` consecutive context tokens (one row
    of X) and the token that follows them (one-hot encoded row of y).

    Args:
        encoded: Iterable of integer token sequences.
        length: Number of context tokens per sample.
        num_classes: Width of the one-hot target vectors. When omitted, the
            module-level ``vocab_size`` is used.

    Returns:
        A ``(X, y)`` tuple: X has ``length`` columns, y has ``num_classes``
        columns.

    Raises:
        ValueError: If no sequence is long enough to yield a sample.
    """
    if num_classes is None:
        # Backward compatible with callers that rely on the global
        # ``vocab_size``.
        num_classes = vocab_size
    # A sample needs `length` context tokens plus one target token, so the
    # sliding window must be one token longer than the context.
    word_seq = generate_multiple_word_seq(encoded, length + 1)
    if not word_seq:
        raise ValueError(
            "no sequence has at least %d tokens; cannot build samples" % (length + 1)
        )
    print(word_seq[0:10])
    sequences = array(word_seq)
    # All columns but the last are the context; the last column is the target.
    X, y = sequences[:, :-1], sequences[:, -1]
    # One-hot targets are what the 'categorical_crossentropy' loss used in
    # build_and_train_model expects.
    y = to_categorical(y, num_classes=num_classes)
    print(y[0:5])
    return X, y

def build_and_train_model(vocab_size, length=1, X=None, y=None):
    """Build, compile and fit the Embedding -> LSTM -> softmax network.

    Args:
        vocab_size: Number of classes; used as the embedding input size and
            as the width of the softmax output layer.
        length: Number of tokens in every input sample.
        X: Training inputs. Required; the ``None`` default only exists so
            that it can legally follow the defaulted ``length`` parameter.
        y: Training targets, one-hot encoded. Required, like ``X``.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``X`` or ``y`` is not supplied.
    """
    if X is None or y is None:
        raise ValueError("X and y training arrays are required")
    model = Sequential()
    model.add(Embedding(vocab_size, 10, input_length=length))
    model.add(LSTM(50))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()
    # compile network
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(X, y, epochs=50, verbose=1)
    return model
    

In [None]:
# The preprocessing is the same for both models.
chinese_poi_name = load_chinese("./input_data/poi_1000000", 10000)
print(chinese_poi_name[:2])
tokenizer, encoded, vocab_size = encode_to_int(chinese_poi_name)
print(encoded[:2])
print("vocab_size :", vocab_size)
# Inspect the first sample: its text, its type, and its integer encoding.
print(chinese_poi_name[0])
print(type(chinese_poi_name[0]))
print(tokenizer.word_index["铁"])
print(tokenizer.texts_to_sequences(chinese_poi_name[:1]))

In [None]:
# Bare expression kept from the notebook; it only shows the vocabulary when
# it is the last statement of a cell.
tokenizer.word_index

# Try out the one-word-in model.
test_word1 = ["公", "机", "阳"]

length_one = 1
X1, y1 = gen_X_y(encoded, length_one)
model1 = build_and_train_model(vocab_size, length_one, X1, y1)
# Iterate over the list and model defined in this cell (test_word1, model1).
for word in test_word1:
    print(generate_seq_with_one_word(model1, tokenizer, word, 1))


In [None]:

# Try out the multiple-words-in model.
test_word2 = ["公 交", "机 关", "阳 光"]

length_two = 2
X2, y2 = gen_X_y(encoded, length_two)
# The model input length must match the context length used to build X2.
model2 = build_and_train_model(vocab_size, length_two, X2, y2)
for word in test_word2:
    print(generate_seq_with_multiple_word(model2, tokenizer, word, 1, length_two))
