In [54]:
import pandas as pd
import re
from collections import Counter, defaultdict, OrderedDict
from nltk import ngrams

In [55]:
htdl = pd.read_csv('.\\data\\ChnSentiCorp_htl_all.csv')
htdl.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [56]:
def preprocess(sentence):
    if not isinstance(sentence, str):
        return ''
    sentence = sentence.strip()
    sentence = re.sub(r'\s', '', sentence)
    return list(sentence)

In [57]:
sents = list(htdl['review'].apply(preprocess))

In [58]:
sents[0]

['距',
 '离',
 '川',
 '沙',
 '公',
 '路',
 '较',
 '近',
 ',',
 '但',
 '是',
 '公',
 '交',
 '指',
 '示',
 '不',
 '对',
 ',',
 '如',
 '果',
 '是',
 '"',
 '蔡',
 '陆',
 '线',
 '"',
 '的',
 '话',
 ',',
 '会',
 '非',
 '常',
 '麻',
 '烦',
 '.',
 '建',
 '议',
 '用',
 '别',
 '的',
 '路',
 '线',
 '.',
 '房',
 '间',
 '较',
 '为',
 '简',
 '单',
 '.']

In [59]:
seq = '人工智慧自然語言導論'
bigcrm = ngrams(seq, n=2, pad_left=True, pad_right=True)
trigrams = ngrams(seq, n=3, pad_left=True, pad_right=True)

In [60]:
print('bigcrm')
for w1, w2 in bigcrm:
    print('w_n-2', w1)
    print('w_n-1', w2)
    print('---')

bigcrm
w_n-2 None
w_n-1 人
---
w_n-2 人
w_n-1 工
---
w_n-2 工
w_n-1 智
---
w_n-2 智
w_n-1 慧
---
w_n-2 慧
w_n-1 自
---
w_n-2 自
w_n-1 然
---
w_n-2 然
w_n-1 語
---
w_n-2 語
w_n-1 言
---
w_n-2 言
w_n-1 導
---
w_n-2 導
w_n-1 論
---
w_n-2 論
w_n-1 None
---


In [61]:
print('trigrams')
for w1, w2, w3 in trigrams:
    print('w_n-2', w1)
    print('w_n-1', w2)
    print('w_n', w3)
    print('---')

trigrams
w_n-2 None
w_n-1 None
w_n 人
---
w_n-2 None
w_n-1 人
w_n 工
---
w_n-2 人
w_n-1 工
w_n 智
---
w_n-2 工
w_n-1 智
w_n 慧
---
w_n-2 智
w_n-1 慧
w_n 自
---
w_n-2 慧
w_n-1 自
w_n 然
---
w_n-2 自
w_n-1 然
w_n 語
---
w_n-2 然
w_n-1 語
w_n 言
---
w_n-2 語
w_n-1 言
w_n 導
---
w_n-2 言
w_n-1 導
w_n 論
---
w_n-2 導
w_n-1 論
w_n None
---
w_n-2 論
w_n-1 None
w_n None
---


In [62]:
model_two_words = defaultdict(lambda: defaultdict(lambda: 0))
for sent in sents:
    for w1, w2 in ngrams(sent, n=2, pad_left=True, pad_right=True):
        model_two_words[w1][w2] += 1

In [63]:
for w1 in model_two_words:
    total_count = float(sum(model_two_words[w1].values()))
    for w2 in model_two_words:
        model_two_words[w1][w2] /= total_count
    # sort
    # model[w1] = OrderedDict(sorted(model[w1].items(), key=lambda x: x[1], reverse=True))]
model_two_words['暖']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'气': 0.366120218579235,
             '.': 0.01092896174863388,
             '和': 0.09836065573770492,
             '。': 0.04918032786885246,
             '空': 0.0273224043715847,
             '器': 0.02185792349726776,
             '风': 0.10382513661202186,
             '色': 0.00546448087431694,
             '，': 0.08743169398907104,
             '花': 0.00546448087431694,
             '的': 0.04371584699453552,
             '我': 0.00546448087431694,
             '．': 0.00546448087431694,
             '在': 0.00546448087431694,
             '）': 0.00546448087431694,
             '！': 0.01092896174863388,
             '闲': 0.00546448087431694,
             '舒': 0.00546448087431694,
             '感': 0.00546448087431694,
             ',': 0.01092896174863388,
             '、': 0.00546448087431694,
             '热': 0.00546448087431694,
             '一': 0.00546448087431694,
             '2': 0.00546448087431694,
     

In [64]:
def gen_word_by_one_word(model, start_word, max_len=30):
    pred_sent = list()
    next_word = start_word
    for i in range(max_len):
        pred_sent.append(next_word)
        next_word = list(model[next_word].keys())[0]
    return ''.join(pred_sent)

print(gen_word_by_one_word(model_two_words, '暖'))

暖气好。酒店应该重视一下这个问题了。酒店应该重视一下这个问题


In [None]:
model_three_words = defaultdict(lambda: defaultdict(lambda: 0))
for sent in sents:
    for w1, w2, w3 in ngrams(sent, n=3, pad_left=True, pad_right=True):
        model_three_words[(w1, w2)][w3] += 1
model_three_words

In [None]:
for w1_w2 in model_three_words:
    total_count = float(sum(model_three_words[w1_w2].values()))
    for w3 in model_three_words[w1_w2]:
        model_three_words[w1_w2][w3] /= total_count # type: ignore
    # sort
    # model[w1] = OrderedDict(sorted(model[w1].items(), key=lambda x: x[1], reverse=True))]
model_three_words[('缺', '点')]

In [91]:
def gen_word_by_two_word(model, start_word, max_len=30):
    pred_sent = list()
    next_word = start_word
    for i in range(max_len):
        pred_sent += list(next_word)
        input = tuple(pred_sent[-2:])
        next_word = list(model[input])[0]
    return ''.join(pred_sent)
print(gen_word_by_two_word(model_three_words, ('缺', '点')))

缺点：液晶电视固定在墙上，不大好找，但还好北京热心同胞很多~宾


pip install jpype1==0.7.0
pip install pyhanlp
手動配置: https://github.com/hankcs/pyhanlp/wiki/%E6%89%8B%E5%8A%A8%E9%85%8D%E7%BD%AE