In [1]:
import json
import pypinyin
import numpy as np
from gensim.models import Word2Vec
import thulac

## Load

In [2]:
print('Loading...')

# Single-character table: keys are the vocabulary, values are summed and divided below.
with open('singleCharacterFrequency.json', 'r', encoding='utf-8') as single_file:
    characterFrequency = json.load(single_file)
characters = list(characterFrequency)

# Final and initial of every vocabulary character, index-aligned with `characters`.
char_finals = [pypinyin.pinyin(char, style=pypinyin.Style.FINALS)[0][0] for char in characters]
char_initials = [pypinyin.pinyin(char, style=pypinyin.Style.INITIALS)[0][0] for char in characters]

with open('tupleCharacterFrequency.json', 'r', encoding='utf-8') as tuple_file:
    tupleCharacterFrequency = json.load(tuple_file)

# Total over all single characters, stored under the empty-string key.
# The '' entry itself is excluded from the sum: a loop that inserts '' first and
# then iterates over every key would add the running total to itself and double it.
characterFrequency[''] = sum(freq for char, freq in characterFrequency.items() if char != '')

# Merge the multi-character entries so that `prev + char` lookups hit the same dict.
characterFrequency.update(tupleCharacterFrequency)

# with_prev_freq[i][j] = frequency(characters[j] + characters[i]) / frequency(characters[j]),
# i.e. the weight of characters[i] following characters[j]; unseen pairs weigh 0.
with_prev_freq = [
    np.array([float(characterFrequency.get(prev + char, 0)) / characterFrequency.get(prev, 1)
              for prev in characters])
    for char in characters
]

print('Loading complete!')

Loading...
Loading complete!


## 2-gram

In [3]:
def predict_phrase_2gram(line, num_view=20):
    '''Predict the highest-scoring phrases that rhyme with the given line.

    A candidate has one vocabulary character per pinyin item of ``line``. Each
    character must have the same final as the item at its position, and the
    first character must also have a different initial than the first item.
    A candidate's score is characterFrequency of its first character times the
    with_prev_freq weight of every following step; for each ending character
    only the best-scoring chain is kept (max-product dynamic programming).

    line: the previous line as Chinese text, ex: '风景'
    num_view: the maximum number of phrases returned
    returns: list of phrases, best score first; empty when ``line`` is empty
        or no chain has a positive score
    '''
    if not line:
        # Nothing to rhyme with; indexing finals[0] below would raise IndexError.
        return []

    finals = [li[0] for li in pypinyin.pinyin(line, style=pypinyin.Style.FINALS)]
    initials = [li[0] for li in pypinyin.pinyin(line, style=pypinyin.Style.INITIALS)]

    phrase_length = len(finals)
    if phrase_length == 0:
        return []
    vocab_size = len(characters)

    # probability[k][idx]: best score of a chain of k + 1 characters ending in characters[idx].
    # path[k][idx]: index of the previous character on that chain, -1 when there is none.
    probability = np.zeros([phrase_length, vocab_size], dtype='float')
    path = np.zeros([phrase_length, vocab_size], dtype='int') - 1
    for idx, char in enumerate(characters):
        if char_finals[idx] == finals[0] and char_initials[idx] != initials[0]:
            probability[0][idx] = characterFrequency[char]

    for k in range(1, phrase_length):
        for idx in range(vocab_size):
            if char_finals[idx] == finals[k]:
                # Score of reaching characters[idx] from every possible predecessor;
                # computed once and reused for both the maximum and its position.
                scores = probability[k - 1] * with_prev_freq[idx]
                best_prev = np.argmax(scores)
                probability[k][idx] = scores[best_prev]
                if probability[k][idx] > 0:
                    path[k][idx] = best_prev

    def path2phrase(k, idx):
        '''Follow the back-pointers from position k to 0 and return the phrase, or None if the chain is broken.'''
        phrase = ''
        while k >= 0:
            if idx == -1: return None
            phrase = characters[idx] + phrase
            idx = path[k][idx]
            k -= 1
        return phrase

    last = phrase_length - 1
    ranking = np.argsort(probability[last])[::-1][:num_view]
    return [path2phrase(last, idx) for idx in ranking if probability[last][idx] > 0]

In [5]:
if __name__ == '__main__':
    # Sample lines of two, two, four and seven characters for the 2-gram predictor.
    for sample_line in ('风景', '灿烂', '倚老卖老', '心境高雅韵如风'):
        print(predict_phrase_2gram(sample_line))

['曾经', '生命', '证明', '梦醒', '梦境', '冷静', '能听', '能停', '冷清', '更精', '能轻', '能令', '冷冰', '生情', '能平', '能幸', '生病', '成行', '成名', '灯影']
['慢慢', '看看', '淡淡', '漫漫', '蔓延', '弯弯', '单单', '蓝蓝', '满满', '闪闪', '晚安', '山万', '满山', '反反', '晚餐', '泛滥', '阑珊', '扮演', '贪婪', '缠缠']
['知道爱到', '知道还要', '知道还好', '只要再找', '知道爱早', '只要太少', '知道在脑', '只要再高', '只要再逃', '只要再抱', '只要再靠', '起摇摆摇', '知道在草', '只要再遥', '知道彩照', '知道该抛', '知道外套', '知道该告', '知道爱老', '知道该保']
['民警报答允许能', '民警报答允许更', '民警报答允许生', '民警报答允许等', '民警报答允许梦', '民警报答允许曾', '民警报答允许风', '民警报答允许承', '民警报答允许成', '民警报答允许正', '民警报答允许冷', '民警报答允许恒', '民警报答允许挣', '民警报答允许朋', '尽情到那春雨声', '民警报答驯服整', '民警报答允许剩', '民警报答允许胜', '民警报答允许灯', '民警报答允许疯']


## Semantically similar words that satisfy the rhyme

In [2]:
def all_rhyme(word_x, word_y):
    '''Tell whether two words rhyme position by position.

    The words rhyme when they have the same number of characters, the finals
    at every position are equal, and their first initials differ.

    word_x, word_y: the two words to compare
    returns: True when the words rhyme, False otherwise
    '''
    if len(word_x) != len(word_y):
        return False

    finals_x = [li[0] for li in pypinyin.pinyin(word_x, style=pypinyin.Style.FINALS)]
    finals_y = [li[0] for li in pypinyin.pinyin(word_y, style=pypinyin.Style.FINALS)]

    # pypinyin keeps a run of non-Chinese characters as a single item, so these
    # lists can be shorter than the words. Comparing the lists as a whole (rather
    # than indexing them by character position) avoids an IndexError in that case
    # and rejects words whose item counts differ.
    if finals_x != finals_y:
        return False
    if not finals_x:
        # Two empty words: there is nothing that could disagree.
        return True

    # Only the first position has a constraint on the initial: it must differ.
    initial_x = pypinyin.pinyin(word_x, style=pypinyin.Style.INITIALS)[0][0]
    initial_y = pypinyin.pinyin(word_y, style=pypinyin.Style.INITIALS)[0][0]
    return initial_x != initial_y

In [315]:
# Word2Vec model loaded from the relative path 'word2vec_model'; used below for similarity ranking.
model = Word2Vec.load('word2vec_model')
# THULAC segmenter used to split a line into words.
# NOTE(review): seg_only=True presumably turns off part-of-speech tagging — confirm against the THULAC docs.
cut = thulac.thulac(seg_only=True)

def predict_phrase_embedding(line, num_view=20):
    '''Suggest rhyming words that are close in meaning to each word of a line.

    The line is split into words with ``cut``. For each word the model's whole
    vocabulary is ranked by similarity, and the first ``num_view`` candidates
    accepted by ``all_rhyme`` are kept.

    line: the previous line as Chinese text, ex: '风景'
    num_view: the maximum number of candidates kept per word
    returns: dict mapping each segmented word to its candidates, most similar
        first; the list is empty when the word is not in the model's
        vocabulary or its lookup fails (a warning is issued in the latter case)
    '''
    import warnings

    if not line:
        return {}
    segments = cut.cut(line)
    if not segments:
        # No segments: taking element [0] of the transposed result would raise IndexError.
        return {}
    words = [segment[0] for segment in segments]

    d = {}
    for word in words:
        if word in d:
            # Repeated word in the line; its candidates are already computed.
            continue
        matches = []
        try:
            ranked = model.wv.most_similar(word, topn=len(model.wv.vocab))
            for candidate, _similarity in ranked:
                # Stop as soon as enough rhymes are found instead of checking the whole vocabulary.
                if len(matches) >= num_view:
                    break
                if all_rhyme(candidate, word):
                    matches.append(candidate)
        except KeyError:
            # Expected case: the word is not in the model's vocabulary.
            matches = []
        except Exception as exc:
            # Stay best-effort for this word, but do not hide the failure.
            warnings.warn('predict_phrase_embedding: lookup for %r failed: %r' % (word, exc),
                          RuntimeWarning)
            matches = []
        d[word] = matches

    return d

Model loaded succeed


In [316]:
if __name__ == '__main__':
    # Same sample lines as the 2-gram demo, run through the embedding-based predictor.
    for sample_line in ('风景', '灿烂', '倚老卖老', '心境高雅韵如风'):
        print(predict_phrase_embedding(sample_line))

{'风景': ['冷清', '澄庆', '生命', '梦请', '梦醒', '盛情', '恒星', '横行', '更醒', '梦听', '狰狞', '生平', '盛行', '圣灵', '灯影', '生灵', '成名', '生情', '诚请', '正定']}
{'灿烂': ['蜿蜒', '冉冉', '黯淡', '烂漫', '繁衍', '蔓延', '暗淡', '婉婉', '漫延', '闪闪', '严寒', '赞叹', '烟但', '淡淡', '完满', '蓝看', '斑斑', '沾满', '阑珊', '三万']}
{'倚老卖老': []}
{'心境': ['引擎', '金星', '贫病', '尽兴', '金陵', '尽庆', '民情', '阴影', '品性', '琴令', '临行', '亲情', '阴晴', '隐型', '隐形', '尽情', '拼命', '进行', '琴请'], '高雅韵': [], '如': ['复', '斧', '初', '絮', '诉', '拂', '余', '惧', '互', '铸', '瀑', '雾', '于', '语', '玉', '覆', '逐', '浮', '须', '取'], '风': ['梦', '蓬', '冷', '嘭', '腾', '捧', '承', '昇', '鲠', '吭', '勐', '迸', '砰', '乘', '正', '怦', '碰', '晟', '证', '僧']}


In [244]:
if __name__ == '__main__':
    # Interactive demo: read a line, print its 2-gram predictions, repeat.
    # The loop ends cleanly on an empty line or when input is interrupted
    # (Ctrl-C, end of input, kernel interrupt) instead of ending in a traceback.
    # NOTE(review): the prompt asks for pinyin, but predict_phrase_2gram passes its
    # input through pypinyin, so Chinese text is what yields results — confirm the wording.
    while True:
        try:
            line = input("Please input pinyin:\n")
        except (KeyboardInterrupt, EOFError):
            break
        if not line:
            break
        result = predict_phrase_2gram(line)
        print(result)

Please input pinyin:
你好
['提高', '日报', '制造', '知道', '指导', '是要', '至少', '医药', '提到', '医保', '一套', '思考', '自贸', '依靠', '细胞', '日早', '持召', '是老', '执照', '起草']


KeyboardInterrupt: 