# 自製智能中文選字系統  (2)

確認版本為 python3

## 資料前處理

In [1]:
import re

In [2]:
def prepocess_line(line):
    """Return every maximal run of CJK ideographs found in ``line``.

    Only characters in the range U+4E00..U+9FFF are kept; anything else
    (Latin letters, digits, punctuation, whitespace) acts as a separator.

    Args:
        line: A text string, typically one line of the corpus.

    Returns:
        A list of strings, one per contiguous run of matching characters,
        in the order they appear. Empty when nothing matches.
    """
    han_runs = re.compile(r'[\u4E00-\u9FFF]+', flags=re.UNICODE)
    return han_runs.findall(line)

In [3]:
# Collect the Chinese-only runs of every line in the corpus file.
segments = []
with open('wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterate the file handle directly so lines are streamed one at a time.
    for line in fr:
        segments.extend(prepocess_line(line))

## 斷詞

In [4]:
import jieba

In [5]:
# Sanity check: tokenize one sample segment with jieba's search-mode cutter.
list(jieba.cut_for_search(segments[6001]))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Windows\Temp\jieba.cache
Loading model cost 0.735 seconds.
Prefix dict has been built successfully.


['所以', '僅用', '於', '還原', '一些', '貴重', '的', '化合', '化合物']

In [6]:
# Flatten the jieba search-mode tokens of every segment into one list.
cut_segments = []
for seg in segments:
    # cut_for_search yields the tokens lazily; extend() consumes them directly.
    cut_segments.extend(jieba.cut_for_search(seg))

## 使用斷詞的結果來作Ngram

In [7]:
from collections import Counter

class Counters:
    """Frequency tables of contiguous n-grams for every order from 1 to ``n``.

    ``counters[i]`` (also reachable as ``self[i]``) is a ``Counter`` mapping
    each length-``i`` slice seen in the fitted segments to its number of
    occurrences. ``counters[0]`` holds a single entry ``{'': total}`` where
    ``total`` is the sum of all order-1 counts.
    """

    def __init__(self, n):
        """Create empty tables for orders ``0..n``.

        Args:
            n: Highest n-gram order to count.
        """
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Accumulate n-gram counts from ``segments`` into the tables.

        Args:
            segments: Iterable of sliceable sequences (strings in this
                notebook). It is traversed exactly once, so a generator
                works as well as a list.

        Calling ``fit`` again adds to the existing counts and refreshes the
        order-0 total.
        """
        orders = range(1, self.n + 1)
        for segment in segments:
            for order in orders:
                # Counter.update only touches the new n-grams; the previous
                # `+=` re-scanned the whole accumulated table per segment.
                self.counters[order].update(self._skip(segment, order))

        base_count = sum(self.counters[1].values())
        self.counters[0] = Counter({'': base_count})

    def __getitem__(self, k):
        """Return the ``Counter`` for order ``k``."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every contiguous slice of length ``n`` from ``segment``.

        Yields nothing when ``segment`` is shorter than ``n`` (the range
        below is then empty).
        """
        assert n > 0
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [8]:
# Build the n-gram count tables for orders 1 through 5 from the jieba tokens.
counters = Counters(n=5)
counters.fit(cut_segments)

In [9]:
class Ngram:
    """Maximum-likelihood n-gram model backed by precomputed count tables.

    ``major_counter`` holds the counts of order ``n`` and ``minor_counter``
    the counts of order ``n - 1``; a conditional probability is the ratio of
    an n-gram count to the count of its leading ``n - 1`` characters.
    """

    def __init__(self, n, counters):
        """Bind the model to the tables of order ``n`` and ``n - 1``.

        Args:
            n: Order of this model; must not exceed ``counters.n``.
            counters: Object exposing ``n`` and indexable by order.
        """
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Rank the characters that may follow ``prefix``.

        Args:
            prefix: Text typed so far; needs at least ``n - 1`` characters.
            top_k: Number of candidates to return; any value ``<= 0``
                returns all of them.

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending order.
        """
        context_len = self.n - 1
        assert len(prefix) >= context_len

        # Order 1 has no context; otherwise use the last n-1 characters.
        context = prefix[-context_len:] if self.n > 1 else ''
        context_count = self.minor_counter[context]

        ranked = sorted(
            (
                (count / context_count, gram[-1])
                for gram, count in self.major_counter.items()
                if gram.startswith(context)
            ),
            reverse=True,
        )
        if top_k > 0:
            return ranked[:top_k]
        return ranked

    def get_proba_dict(self, prefix=''):
        """Return ``{character: probability}`` for every candidate after ``prefix``."""
        table = {}
        for prob, word in self.predict_proba(prefix, top_k=-1):
            table[word] = prob
        return table

In [10]:
# One model per order available in the count tables (1 .. counters.n), so the
# list stays in sync if the order passed to Counters is changed.
ngrams = [Ngram(order, counters) for order in range(1, counters.n + 1)]

## 使用Smoothing of Language Models來建立第二版選字系統

In [11]:
from tqdm.auto import tqdm

In [12]:
class ChineseWordRecommenderV2:
    """Next-character recommender that smooths n-gram estimates.

    ``predict_proba`` offers two smoothing modes:

    * ``'Interpolation'`` -- recursive linear interpolation across every
      n-gram order the prefix is long enough for.
    * ``'back-off'`` -- Katz-style back-off from the bigram table to the
      unigram table, with Good-Turing discounting of counts ``<= k``.

    Each element of ``ngrams`` must expose ``major_counter`` (a ``Counter``
    keyed by n-gram strings) and ``get_proba_dict(prefix)``; ``ngrams[i]`` is
    expected to be the model of order ``i + 1``.
    """

    _MODES = ('Interpolation', 'back-off')

    def __init__(self, ngrams):
        self.ngrams = ngrams

    def predict_proba(self, prefix='', top_k=5, mode='back-off',
                      interpolation_lambda=0.99, k=5):
        """Rank candidate next characters for ``prefix``.

        Args:
            prefix: Text typed so far (may be empty).
            top_k: Number of candidates to return; ``<= 0`` returns all.
            mode: ``'back-off'`` or ``'Interpolation'``.
            interpolation_lambda: Weight of the higher-order estimate at each
                interpolation step (``'Interpolation'`` mode only).
            k: Counts above ``k`` are not discounted (``'back-off'`` mode only).

        Returns:
            A list of ``(probability, character)`` tuples, highest first.

        Raises:
            ValueError: If ``mode`` is not one of the supported names.
        """
        if mode not in self._MODES:
            raise ValueError(
                'unknown mode {!r}; expected one of {!r}'.format(mode, self._MODES)
            )

        if mode == 'back-off':
            # With a prefix we back off from bigrams (idx 1) to unigrams;
            # without one only the unigram table (idx 0) is available.
            idx = 1 if prefix else 0
            counter = self.ngrams[idx].major_counter
            nr_map = self._get_nr(counter)
            last_char = prefix[-1] if prefix else ''
            backoff = None
            # Back-off only ever reads tables 0 and idx, so skip the rest.
            models = self.ngrams[:idx + 1]
        else:
            models = self.ngrams[:len(prefix) + 1]
        proba_dicts = [ngram.get_proba_dict(prefix) for ngram in models]

        probas = []
        # A word that appears in a higher-order table must also appear in the
        # unigram table, so the unigram keys enumerate every candidate.
        for word in proba_dicts[0]:
            if mode == 'Interpolation':
                prob = self._get_interpolation_proba(
                    word, proba_dicts, interpolation_lambda, None)
            else:
                # The back-off weight does not depend on `word`; compute it
                # once, the first time an unseen continuation shows up.
                if backoff is None and idx > 0 and counter[last_char + word] == 0:
                    backoff = self._get_backoff_weights(
                        prefix, counter, nr_map, proba_dicts, k, idx)
                prob = self._get_backoff_proba(
                    prefix, word, counter, nr_map, proba_dicts, k, idx, backoff)
            probas.append((prob, word))
        sorted_probas = sorted(probas, reverse=True)

        return sorted_probas[:top_k] if top_k > 0 else sorted_probas

    def _get_interpolation_proba(self, word, proba_dicts, interp_lambda, idx=None):
        """Interpolate the estimate of order ``idx + 1`` with all lower orders.

        ``idx`` defaults to the highest table in ``proba_dicts``. At each
        level the result is ``lambda * p_idx + (1 - lambda) * p_lower``; the
        recursion bottoms out at the unigram table (``idx == 0``).
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        if idx == 0:
            return proba_dicts[idx].get(word, 0.)
        lower = self._get_interpolation_proba(word, proba_dicts, interp_lambda, idx=idx-1)
        return interp_lambda * proba_dicts[idx].get(word, 0.) + (1 - interp_lambda) * lower

    def _get_nr(self, counter):
        """Return ``{r: number of distinct keys whose count is r}`` for ``counter``."""
        nr_map = {}
        for cnt in counter.values():
            nr_map[cnt] = nr_map.get(cnt, 0) + 1
        return nr_map

    def _get_discount(self, r, nr_map, k):
        """Good-Turing discount ratio for an n-gram observed ``r`` times.

        Falls back to ``1.0`` (no discount) when the counts-of-counts needed
        by the formula are missing or make it divide by zero, which happens
        on small corpora; previously those cases raised an exception.
        """
        try:
            tmp = (k + 1) * nr_map[k + 1] / nr_map[1]
            return ((r + 1) * nr_map[r + 1] / (r * nr_map[r]) - tmp) / (1 - tmp)
        except (KeyError, ZeroDivisionError):
            return 1.0

    def _get_backoff_weights(self, prefix, counter, nr_map, proba_dicts, k, idx):
        """Compute the back-off weight shared by every unseen continuation.

        Returns:
            ``(alpha, lower_total)`` where ``alpha`` is the probability mass
            left over by the seen continuations divided by the lower-order
            mass of the unseen ones, and ``lower_total`` is the sum of all
            counts in the lower-order table.
        """
        lower_counter = self.ngrams[idx - 1].major_counter
        seen_p_katz = 0
        seen_ml_cnt = 0
        # Every key of proba_dicts[idx] is a seen continuation, so the nested
        # call always takes one of the r > 0 branches.
        for w in proba_dicts[idx]:
            seen_p_katz += self._get_backoff_proba(
                prefix, w, counter, nr_map, proba_dicts, k, idx)
            seen_ml_cnt += lower_counter[w]
        lower_total = sum(lower_counter.values())
        alpha = (1 - seen_p_katz) / (1 - seen_ml_cnt / lower_total)
        return alpha, lower_total

    def _get_backoff_proba(self, prefix, word, counter, nr_map, proba_dicts,
                           k=5, idx=None, backoff=None):
        """Back-off probability of ``word`` following the last char of ``prefix``.

        Args:
            prefix: Context text; only its last character is used.
            word: Candidate next character.
            counter: Count table of order ``idx + 1``.
            nr_map: Counts-of-counts of ``counter`` (see ``_get_nr``).
            proba_dicts: Maximum-likelihood tables indexed by ``idx``.
            k: Counts above ``k`` are trusted and left undiscounted.
            idx: Index of the table matching ``counter``; defaults to ``1``
                when there is a prefix and ``0`` otherwise, as in
                ``predict_proba``.
            backoff: Optional ``(alpha, lower_total)`` pair from
                ``_get_backoff_weights``; computed on demand when omitted.

        Returns:
            The smoothed probability as a float.
        """
        prefix = prefix[-1] if prefix else ''
        if idx is None:
            idx = 1 if prefix else 0

        r = counter[prefix + word]
        if r > k:
            # Frequent n-gram: keep the maximum-likelihood estimate.
            return proba_dicts[idx][word]
        if r > 0:
            return self._get_discount(r, nr_map, k) * proba_dicts[idx][word]
        if idx == 0:
            # No lower-order table exists to back off to.
            return 0.0

        if backoff is None:
            backoff = self._get_backoff_weights(
                prefix, counter, nr_map, proba_dicts, k, idx)
        alpha, lower_total = backoff
        c_katz = alpha * self.ngrams[idx - 1].major_counter.get(word, 0)
        return c_katz / lower_total

In [13]:
# Wrap the list of n-gram models in the smoothing recommender.
model = ChineseWordRecommenderV2(ngrams)

In [16]:
# Example query: the ten most likely characters after '法' with back-off smoothing.
probs = model.predict_proba('法', top_k=10, mode='back-off')
probs

[(0.05230496453900709, '國'),
 (0.044326241134751775, '律'),
 (0.030141843971631204, '院'),
 (0.028948136090029063, '的'),
 (0.020390070921985817, '語'),
 (0.01152482269503546, '蘭'),
 (0.010638297872340425, '系'),
 (0.009135433334423064, '一'),
 (0.008169332363511292, '在'),
 (0.007978723404255319, '定')]

## Demo

In [17]:
import ipywidgets as widgets

# Input box plus a label that shows the suggested next characters.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    """Refresh the label with the top-10 back-off candidates for the new text."""
    suggestions = model.predict_proba(change.new, top_k=10, mode='back-off')
    candidates = (word for _, word in suggestions)
    label.value = ' ' + '\t'.join(candidates)

# Re-run the prediction whenever the textarea's value changes.
text.observe(func, names='value')

Label(value='')

Textarea(value='')