In [None]:
import json
import pypinyin
import numpy as np
from gensim.models import Word2Vec

## Load

In [155]:
print('Loading...')

# Single-character frequency table; its keys define the candidate vocabulary.
# Context managers make sure the file handles are closed after parsing.
with open('singleCharacterFrequency.json', 'r') as single_file:
    characterFrequency = json.load(single_file)
characters = list(characterFrequency.keys())

# Pinyin final / initial of every vocabulary character, aligned with `characters`.
char_finals = [ pypinyin.pinyin(char, style=pypinyin.Style.FINALS)[0][0] for char in characters]
char_initials = [ pypinyin.pinyin(char, style=pypinyin.Style.INITIALS)[0][0] for char in characters]

with open('tupleCharacterFrequency.json', 'r') as tuple_file:
    tupleCharacterFrequency = json.load(tuple_file)

# The empty-string key stores the sum of all single-character frequencies.
# The sum is computed before the key is written: accumulating into
# characterFrequency[''] while iterating over the same dict also visits the
# '' key itself and adds the running total to itself.
characterFrequency[''] = sum(
    freq for char, freq in characterFrequency.items() if char != '')

# Merge the second table so that one dict answers lookups for both key kinds.
characterFrequency.update(tupleCharacterFrequency)

model = Word2Vec.load('word2vec_model')

print('Loading complete!')

Loading dictionary...
Loading complete!


In [201]:
def similarity(char_x, char_y):
    '''Return the word2vec similarity between two characters.

    char_x, char_y: the two keys to compare in the loaded ``model``.

    Returns the value reported by the model, or 0 when either key is not in
    the model's vocabulary (the lookup raises KeyError in that case).
    '''
    # Word vectors are exposed as `model.wv`; calling `similarity` directly on
    # the Word2Vec object is the legacy spelling that newer gensim releases no
    # longer provide. Fall back to the model itself if `wv` is missing.
    vectors = getattr(model, 'wv', model)
    try:
        return vectors.similarity(char_x, char_y)
    except KeyError:
        # Only an out-of-vocabulary key is treated as "no similarity"; any
        # other failure (e.g. a missing model) should surface instead of
        # silently turning every score into 0.
        return 0
    
def similarity_to_input(line):
    '''Score every vocabulary character against the whole input line.

    line: iterable of characters to compare against.

    Returns a list aligned with ``characters``; each entry is the sum of
    ``similarity(candidate, line_char)`` over all characters of ``line``.
    '''
    totals = []
    for candidate in characters:
        per_char_scores = [similarity(candidate, line_char) for line_char in line]
        totals.append(np.sum(per_char_scores))
    return totals

In [240]:
def predict_phrase(line):
    '''Predict phrases that satisfy the rhyme of ``line``, best score first.

    The line is converted to pinyin with pypinyin. For every position, a
    candidate character must have the same final as the character of ``line``
    at that position and a different initial. Candidates are chained with a
    max-product dynamic program: the score of a first character is its entry
    in ``characterFrequency``; each following character multiplies the best
    predecessor score by ``characterFrequency[prev + char] /
    characterFrequency[prev]``. Back-pointers are kept so the best phrase
    ending in each final character can be rebuilt.

    line: the previous line as text, ex: '倚老卖老'

    Returns a list of phrases (str) ordered by descending score. Only phrases
    with a score greater than 0 are returned; the list is empty when ``line``
    is empty or no chain of candidates exists.
    '''
    finals = [li[0] for li in pypinyin.pinyin(line, style=pypinyin.Style.FINALS)]
    initials = [li[0] for li in pypinyin.pinyin(line, style=pypinyin.Style.INITIALS)]

    phrase_length = len(finals)
    if phrase_length == 0:
        # Nothing to rhyme with; avoids indexing finals[0] below.
        return []
    vocab_size = len(characters)

    # probability[k][i]: best score of a phrase prefix ending with characters[i]
    # at position k. path[k][i]: index of the predecessor on that best prefix,
    # -1 when there is none.
    probability = np.zeros([phrase_length, vocab_size], dtype='float')
    path = np.full([phrase_length, vocab_size], -1, dtype='int')
    for idx, char in enumerate(characters):
        if char_finals[idx] == finals[0] and char_initials[idx] != initials[0]:
            probability[0][idx] = characterFrequency[char]

    for k in range(1, phrase_length):
        # Only predecessors with a non-zero score can contribute a non-zero
        # product, so the transition is evaluated for those alone.
        prev_indices = np.flatnonzero(probability[k - 1] > 0)
        if prev_indices.size == 0:
            # No prefix survived up to position k - 1: no phrase can be built.
            return []
        prev_chars = [characters[i] for i in prev_indices]
        prev_scores = probability[k - 1][prev_indices]

        for idx, char in enumerate(characters):
            if char_finals[idx] == finals[k] and char_initials[idx] != initials[k]:
                with_prev_freq = np.array(
                        [ float(characterFrequency.get(prev + char, 0)) / characterFrequency.get(prev, 1)
                          for prev in prev_chars ])

                # Evaluate the product once and reuse it for both the best
                # score and the back-pointer.
                scores = prev_scores * with_prev_freq
                best = int(np.argmax(scores))
                if scores[best] > 0:
                    probability[k][idx] = scores[best]
                    path[k][idx] = prev_indices[best]

    def path2phrase(k, idx):
        # Walk the back-pointers from position k down to 0, prepending each
        # character; a missing pointer means there is no complete phrase.
        phrase = ''
        while k >= 0:
            if idx == -1: return None
            phrase = characters[idx] + phrase
            idx = path[k][idx]
            k -= 1
        return phrase

    last = phrase_length - 1
    return [path2phrase(last, idx)
            for idx in np.argsort(probability[last])[::-1]
            if probability[last][idx] > 0]

In [None]:
# Interactive session: read a line, print the predicted phrases, repeat.
# NOTE(review): the prompt asks for pinyin, but the recorded run below feeds
# Chinese characters to predict_phrase - confirm the intended wording.
while True:
    try:
        line = input("Please input pinyin:\n").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly.
        break
    if not line:
        # A blank line is the way to leave the loop.
        break
    result = predict_phrase(line)

    print(result)

Please input pinyin:
倚老卖老
['持召开招', '持召开到', '持召开好', '是要改造', '细胞胎宝', '持召开高', '细胞胎爆', '持召开考', '知道还要', '持召开报', '持召开道', '持召开跑', '持召开刀', '持召开讨', '持召开导', '持召开药', '持召开包', '细胞胎保', '机号牌照', '持召开凿', '至少外贸', '机号牌号', '知道菜肴', '细胞胎早', '持召开炮', '细胞胎潮', '细胞胎毛', '持召开超', '日早在逃', '提高海岛', '持召开豪', '日早在奥', '日早在朝', '日早在澳', '细胞胎抛', '细胞胎噪', '持召开脑', '持召开倒', '持召开淘', '细胞胎冒', '日早在少', '提高海涛', '日报摘帽', '自豪宅套', '自豪宅遭', '细胞胎盗', '细胞胎抱', '持召开摇', '洗脑袋妖', '细胞胎罩', '知道还邀', '制造百草', '是要再找', '日早在操', '持召开搞', '细胞胎闹', '细胞胎暴', '细胞胎膏', '持召开告', '洗脑袋泡', '日早在靠', '细胞胎槽', '细胞胎薄', '是要改稿', '知道菜炒', '持召开扫', '日早在召', '细胞胎陶', '细胞胎胞', '细胞胎烧', '日早在赵', '提高海豹', '是好莱堡', '日早在桃', '持召开窑', '细胞白皓', '持召开茂', '是要哀悼', '持召开灶', '日早在韶', '日早在邵', '至少外貌', '指导带扰', '持召开曹', '持召开蚝', '持召开哨', '持召开烤', '日早在矛', '食药材熬', '制造百兆', '指导带钥', '日早在腰', '细胞胎羔', '细胞胎肇', '提高海藻', '持召开茅', '洗脑袋掏', '是要再遥', '制造财猫', '日早在浩', '是要再绕', '日早在稻', '自豪宅耗', '持召开咬', '细胞白芍', '日早在毫', '日早在绍', '日早在姚', '持召开抄', '日早在稍', '日早在昭', '细胞代庖', '指导蔡皋', '食药材窈', '持召开篙', '持召开刨', '细胞白袍', '日早在枣', '日早在吵', '日早在耀', '食药材煲', '持召开沼