# 语言模型

## Ngram

一个由 l 个词构成的句子的概率计算公式为：

$$ \begin{aligned}
p(s) &= p(w_1)\,p(w_2|w_1)\,p(w_3|w_1 w_2) \cdots p(w_l|w_1 \cdots w_{l-1}) \\
     &= \prod_{i=1}^l p(w_i | w_1 \cdots w_{i-1}) \\
     &\approx \prod_{i=1}^l p(w_i | w_{i-1})
\end{aligned} $$

上面最后一行的近似是二元模型（bigram）的情况：假设每个词只依赖于它前面的一个词。

### 代码实现

In [1]:
from collections import OrderedDict, defaultdict, deque, Counter
import json
import random 
import nltk
from nltk import ngrams, FreqDist

In [2]:
# Toy example: whitespace-tokenise one sentence and count its bigrams.
words = "It's good to see you you.".split()
# ngrams(words, 2) slides a window of two consecutive tokens over the list.
words_ngram = list(ngrams(words, 2))
# FreqDist tallies how many times each bigram tuple occurs.
words_freq = FreqDist(words_ngram)

print('words:', words)
print('ngram:', words_ngram)
print('freq:\n')
# Bare expression: the notebook displays it as the cell's result.
words_freq

words: ["It's", 'good', 'to', 'see', 'you', 'you.']
ngram: [("It's", 'good'), ('good', 'to'), ('to', 'see'), ('see', 'you'), ('you', 'you.')]
freq:



FreqDist({("It's", 'good'): 1,
          ('good', 'to'): 1,
          ('see', 'you'): 1,
          ('to', 'see'): 1,
          ('you', 'you.'): 1})

### 语言模型

In [3]:
import itertools
import re
import jieba
import more_itertools

In [5]:
def read_data(path):
    """Lazily read a text file, yielding one stripped line at a time.

    The file is decoded explicitly as UTF-8 so that the Chinese corpus is
    read the same way on every platform instead of depending on the
    locale's default encoding.

    Args:
        path: Path of the text file to read.

    Yields:
        Each line of the file with leading and trailing whitespace
        (including the newline) removed. Blank lines are yielded as ''.
    """
    with open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            yield line.strip()

def segment_data(data):
    """Split every line of *data* into sentences.

    For each input line one iterator is yielded; it produces the line's
    sentences with their terminating punctuation kept. Text after the last
    terminator (possibly the empty string) is produced as a final item.
    """
    # The capturing group makes split() return the delimiters as well:
    # [text, delim, text, delim, ..., tail].
    splitter = re.compile(r'(。|！|？)')
    for line in data:
        pieces = splitter.split(line)
        bodies = pieces[0::2]
        terminators = pieces[1::2]
        # Re-attach each delimiter to the text before it; the tail has no
        # delimiter, so it is paired with ''.
        paired = itertools.zip_longest(bodies, terminators, fillvalue='')
        yield (body + mark for body, mark in paired)

def jieba_cut(sentences):
    """Tokenise sentences with jieba and mark each sentence start.

    *sentences* is an iterable of iterables of sentence strings. Empty
    sentences are skipped. Every remaining sentence is yielded as one
    space-separated token string that begins with the '$$' marker.
    """
    for text in itertools.chain.from_iterable(sentences):
        if not text:
            continue
        # '$$' acts as the start-of-sentence token.
        tokens = ['$$']
        tokens.extend(jieba.cut(text))
        yield ' '.join(tokens)

def get_ngram(sentences, n=2):
    """Yield, for each space-separated sentence string, its n-gram iterable.

    Each sentence is split on whitespace and passed to ``ngrams`` together
    with *n*; the result of that call is yielded unchanged.
    """
    yield from (ngrams(text.split(), n) for text in sentences)
        
def count_ngram(ngram, n=2):
    """Count how often each word follows each context.

    Works for n-grams of any order: the last element of every tuple is the
    predicted word and everything before it is the context.

    Args:
        ngram: Iterable of n-gram tuples.
        n: Nominal n-gram order. Kept for backward compatibility; the
            context length is taken from each tuple itself.

    Returns:
        A ``defaultdict(Counter)`` mapping context -> Counter of following
        words. A one-word context (bigram) is stored as the bare word, as
        before; any other context is stored as a tuple of words.
    """
    counts = defaultdict(Counter)
    for *context, value in ngram:
        # Keep bigram keys as plain words so existing lookups such as
        # counts['$$'] keep working.
        key = context[0] if len(context) == 1 else tuple(context)
        counts[key][value] += 1
    return counts

def count_to_prob(dct):
    """Convert per-context counts into per-context probabilities.

    Args:
        dct: Mapping of context -> Counter (or dict) of word counts.

    Returns:
        A copy of *dct* (same mapping type) whose values are new Counters
        holding ``count / total`` for every word of that context. The input
        mapping and its Counters are left unmodified.
    """
    # copy() keeps the mapping type (e.g. defaultdict and its factory), but
    # it is shallow: the inner Counters must be rebuilt, not divided in
    # place, or the caller's counts would be overwritten.
    prob_dct = dct.copy()
    for context, count in dct.items():
        total = sum(count.values())
        prob_dct[context] = Counter(
            {word: freq / total for word, freq in count.items()}
        )
    return prob_dct

def generate_word(context):
    """Sample the next word for *context* from the global ``ngram_prob``.

    A uniform random number is compared against the running sum of the
    context's word probabilities; the first word whose cumulative
    probability exceeds it is returned.

    Args:
        context: Key into ``ngram_prob`` (the preceding word).

    Returns:
        The sampled word, or ``None`` when the context has no known
        continuation.
    """
    cumulative = 0
    threshold = random.random()
    word = None
    # .get() avoids inserting an empty entry into a defaultdict model when
    # an unseen context is looked up.
    for word, prob in (ngram_prob.get(context) or {}).items():
        cumulative += prob
        if cumulative > threshold:
            return word
    # Floating-point rounding can leave the total slightly below the
    # threshold; fall back to the last candidate instead of returning None.
    return word

def generate_sentence(word):
    """Yield successive words produced by ``generate_word``, starting after *word*.

    Generation stops once the most recent word is falsy. That final falsy
    value is itself yielded before the generator ends; nothing is yielded
    when *word* is falsy to begin with.
    """
    current = word
    while True:
        if not current:
            return
        current = generate_word(current)
        yield current

In [6]:
# Build the bigram model from the corpus. Every stage up to `ngram` is a lazy
# iterator; the file is only read when count_ngram consumes the stream.
data = read_data('data/YGZ-rain.md')
# One iterator of sentences per input line.
sentences = segment_data(data)
# Flattened: one '$$'-prefixed, space-separated token string per sentence.
sentences = jieba_cut(sentences)
# Merge the per-sentence bigram iterables into a single stream of tuples.
ngram = itertools.chain.from_iterable(get_ngram(sentences, n=2))
# context word -> Counter of the words that follow it.
ngram_counts = count_ngram(ngram)
# `ngram_prob` is the global model that generate_word reads.
ngram_prob = count_to_prob(ngram_counts)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/df/g_gpcyqx3j35s2yvr5k2tysc0000gn/T/jieba.cache
Loading model cost 1.050 seconds.
Prefix dict has been built succesfully.


In [7]:
# Start from '$$', the start-of-sentence marker that jieba_cut prepends.
g_sentence = generate_sentence('$$')
# generate_sentence also yields the falsy value that ends its loop; the
# [0:-1] slice drops that last item so join() only sees real words.
sentence = ''.join(more_itertools.islice_extended(g_sentence, 0, -1))
print(sentence)

而究竟在伞上尝凉凉甜甜的雨水，等他听台风台雨在虚无之间罢了。


#### 3 个词时候的探索

In [34]:
# Trigrams (n=3) of a sentence with a repeated phrase, so some trigrams occur twice.
test = list(ngrams("I don't master regular expression don't master regular expression".split(), 3))
# Bare expression: the notebook displays the list as the cell's result.
test

[('I', "don't", 'master'),
 ("don't", 'master', 'regular'),
 ('master', 'regular', 'expression'),
 ('regular', 'expression', "don't"),
 ('expression', "don't", 'master'),
 ("don't", 'master', 'regular'),
 ('master', 'regular', 'expression')]

In [33]:
# Count trigram continuations: (first two words) -> Counter of third words.
d = defaultdict(Counter)
# Starred unpacking: `i` collects every word but the last (as a list),
# `j` is the last word of the trigram.
for *i, j in test:
    # Lists are unhashable, so the context is converted to a tuple for the key.
    d[tuple(i)][j] += 1
    print(i, j)
    
d

['I', "don't"] master
["don't", 'master'] regular
['master', 'regular'] expression
['regular', 'expression'] don't
['expression', "don't"] master
["don't", 'master'] regular
['master', 'regular'] expression


defaultdict(collections.Counter,
            {('I', "don't"): Counter({'master': 1}),
             ("don't", 'master'): Counter({'regular': 2}),
             ('expression', "don't"): Counter({'master': 1}),
             ('master', 'regular'): Counter({'expression': 2}),
             ('regular', 'expression'): Counter({"don't": 1})})

## 平滑

平滑的应用