# Tokenizer with ngram model
### 1. text preprocessing with the simpleified-chinese text corpus
### 2. test the perplexity with the text corpus with unigram, bigram, trigram
### 3. implementation the viterbi algorithm with unigram, bigram, trigram.  
corpus from https://pan.baidu.com/s/1YJkY48u6DN1HUirb-_7cTw pw: smkk  
### sourcefile:   
language_Model.py  
preprocess.py
### corpus file:
msr_training.utf8  
msr_test.utf8

## Section1
load the data set and do some text preprocessing

In [2]:
from pathlib import Path
def load_data(data_dir):
    """Load train and test corpora from a directory.
    Directory must contain two files: train.txt and test.txt.
    Newlines will be stripped out. 
    Args:
        data_dir (Path) -- pathlib.Path of the directory to use. 
    Returns:
        The train and test sets, as lists of sentences.
    """
    train_path = data_dir.joinpath('msr_training.utf8').absolute().as_posix()
    test_path  = data_dir.joinpath('msr_test.utf8').absolute().as_posix()

    with open(train_path, 'r', encoding = 'utf8') as f:
        train = [l.strip() for l in f.readlines()]
    with open(test_path, 'r', encoding = 'utf8') as f:
        test = [l.strip() for l in f.readlines()]
    return train, test

In [3]:
# load data from the current directory.
train, test = load_data(Path('.'))

In [4]:
# preprocessing
import nltk
import re

SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"
# init the toktoktokenizer
toktok = nltk.tokenize.ToktokTokenizer()
word_tokenize = toktok.tokenize

In [96]:
# add sentence token
def add_sentence_tokens(sentences, n):
    """Wrap each sentence in SOS and EOS tokens.
    For n >= 2, n-1 SOS tokens are added, otherwise only one is added.
    Args:
        sentences (list of str): the sentences to wrap.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        List of sentences with SOS and EOS tokens wrapped around them.
    """
    sos = SOS * (n-1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]

In [97]:
def replace_singletons(tokens):
    """Replace tokens which appear only once in the corpus with <UNK>.
    
    Args:
        tokens (list of str): the tokens comprising the corpus.
    Returns:
        The same list of tokens with each singleton replaced by <UNK>.
    
    """
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]

In [98]:
def sentence_clean(sentences, pattern = ''):
    """clean the characters in the input sentences.
    
    Args:
        sentences (list of str): the sentences to preprocess.
        pattern (str): the regular expression to be filted from the sentence.
    Returns:
        The cleaned sentences, filted with the specific pattern.
    """
    # by default we will filt the \n characters.
    cleaner = lambda x : x.replace('\n', '')
    if pattern != '':
        pa = re.compile(pattern)
        pattern_filter = lambda x : re.sub(pa, '', x)
        return [pattern_filter(cleaner(sent)) for sent in sentences]
    else:
        return [cleaner(sent) for sent in sentences]

def preprocess(sentences, n):
    """Add SOS/EOS/UNK tokens to given sentences and tokenize.
    Args:
        sentences (list of str): the sentences to preprocess.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        The preprocessed sentences, tokenized by words.
    """
    sentences = add_sentence_tokens(sentences, n)
    sentences = sentence_clean(sentences)
    tokens = word_tokenize(' '.join(sentences))
    tokens = replace_singletons(tokens)
    return tokens

In [99]:
sentences = ['“  一点  外语  知识  、  数理化  知识  也  没有  ，  还  攀  什么  高峰  ？', 
                '“  在  我们  做  子女  的  眼  中  ，  他  是  一个  严厉  的  父亲  ，  同时  又是  一个  充满  爱心  的  父亲  。']
print(preprocess(sentences, 2))

['<s>', '“', '<UNK>', '<UNK>', '知识', '<UNK>', '<UNK>', '知识', '<UNK>', '<UNK>', '，', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '</s>', '<s>', '“', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '的', '<UNK>', '<UNK>', '，', '<UNK>', '<UNK>', '一个', '<UNK>', '的', '父亲', '，', '<UNK>', '<UNK>', '一个', '<UNK>', '<UNK>', '的', '父亲', '<UNK>', '</s>']


## Section2  
1) Train the ngram model and evaluate the model's basic function  
2) Test the perplexity of the ngram model with the test data set.

In [5]:
from Language_Model import LanguageModel

In [6]:
# create the ngram model
print("Loading {}-gram model...".format(3))
lm = LanguageModel(train, 3, laplace=0.1)
print("Vocabulary size: {}".format(len(lm.vocab)))

Loading 3-gram model...
Vocabulary size: 46811


In [7]:
# test the perplexity
perplexity = lm.perplexity(test)
print("Model perplexity: {:.3f}".format(perplexity))
print("")

Model perplexity: 61.735



In [108]:
# we are going to train the unigram, bigram, and trigram with and output the perplexity.
for i in range(1, 4):
    print("Training {}-gram model...".format(i))
    mode = LanguageModel(train, i, laplace=0.1)
    print("Vocabulary size: {}".format(len(mode.vocab)))
    perplexity = mode.perplexity(test)
    print("Model perplexity: {:.3f}".format(perplexity))
    print("")

Training 1-gram model...
Vocabulary size: 46811
Model perplexity: 43.386

Training 2-gram model...
Vocabulary size: 46811
Model perplexity: 29.768

Training 3-gram model...
Vocabulary size: 46811
Model perplexity: 61.735



In [8]:
testGram = ['风险投', '的', '目的']
print(lm.getScore(testGram))
print('<s>' in lm.vocab)

8.230666007379586
True


In [42]:
# test unigram count
print(lm._convert_oov(['风险投']))

None


## Section3

Implementation the viterbi algorithm with the n-gram model.

In [14]:
def generateGraph(input_str, dic_words):
    """ Generate the link graph of the input sentence base on the dictionary.
    Args:
        input_str (str): the sentence to preprocess.
        dic_words (dict): the dictionary stored the words.
    Returns:
        The generated graph structure. each element of the list store the info
        with respected to the inbound node.
    """
    # init the graph
    graph = [[] for i in range(len(input_str))]
    for i in range(len(input_str)):
        for j in range(i, len(input_str)):
            substr = input_str[i:(j + 1)]
            if substr in dic_words:
                # add a inbound edge and weight for the graph
                graph[j].append(i - 1)
            elif j == i:
                # default weight for single character
                graph[j].append(i - 1)
    return graph
# test code
print(generateGraph("经常有意见分歧", lm.vocab))

[[-1], [-1, 0], [0, 1], [1, 2], [2, 3], [4], [4, 5]]


In [18]:
# ngram padding function
def generateNGram(input_list, n):
    """ Generate the nGram from the input list.
    Args:
        input_list (str): the list of the adjacent words.
        n (int): the output level of the gram
    Returns:
        The generated n-tuple of the ngram model
    """
    # if the input len less or equal than n. we do an index slice.
    if len(input_list) >= n:
        return input_list[-n:]
    ngram = ['<s>'] * (n - len(input_list))
    ngram.extend(input_list)
    return ngram
# unit test
print(generateNGram(['a','b'], 3))    

['<s>', 'a', 'b']


In [33]:
## TODO 请编写word_segment_viterbi函数来实现对输入字符串的分词
def word_segment_viterbi(input_str, lm, n):
    """
    1. 基于输入字符串，词典，以及给定的unigram概率来创建DAG(有向图）。
    2. 编写维特比算法来寻找最优的PATH
    3. 返回分词结果
    
    input_str: 输入字符串   输入格式：“今天天气好”
    best_segment: 最好的分词结果  输出格式：["今天"，"天气"，"好"]
    """
    # boundary check  
    if len(input_str) <= 0:
        return [input_str]    
    
    # 1. generate the directed graph.
    graph = generateGraph(input_str, lm.vocab)
    
    # TODO： 第二步： 利用维特比算法来找出最好的PATH， 这个PATH是P(sentence)最大或者 -log P(sentence)最小的PATH。
    #              hint: 思考为什么不用相乘: p(w1)p(w2)...而是使用negative log sum:  -log(w1)-log(w2)-...
    # init the dp
    N = len(input_str)
    path = [-1] * N # store the previous node.
    dp = [0] * N    # store the dynamic program array.
    cutlist = [[]] * N
            
    for i in range(N):
        # iterate for each edge
        dp[i] = 10e20
        for e in graph[i]:
            currentCut = cutlist[e][-(n - 1):] # deep copy
            currentCut.append(input_str[e + 1:i + 1])
            currentCut = generateNGram(currentCut, n)
            score = dp[e] + lm.getScore(currentCut)
            if score < dp[i]:
                dp[i] = score
                path[i] = e
                cutlist[i] = currentCut

    # TODO: 第三步： 根据最好的PATH, 返回最好的切分
    idx = N - 1
    best_segment = []
    while idx != -1:
        best_segment.append(input_str[path[idx] + 1:idx + 1])
        idx = path[idx]

    best_segment.reverse()
    return best_segment   

In [38]:
print(word_segment_viterbi("经馨", lm, 3))
print(word_segment_viterbi("经常有意见分歧?", lm, 3))
print(word_segment_viterbi("北京的天气，真好啊", lm, 3))
print(word_segment_viterbi("今天的课程内容很有意思", lm, 3))
print(word_segment_viterbi("", lm, 3))

['经', '馨']
['经常', '有意', '见', '分歧', '?']
['北京', '的', '天气', '，', '真', '好', '啊']
['今天', '的', '课程', '内容', '很', '有意思']
['']


In [39]:
def tokenized_viterbi(text, lm, n):
    """ text tokenized interface.
    Args:
        text (str): the input text.
        lm (languageModel): the input ngram language model
        n (int): the level of the n-gram
    Returns:
        The tokenized list of the input text
    """
    tokenized = [word_segment_viterbi(sent.strip(), lm, 3) for sent in text.split(' ')]
    return tokenized

In [40]:
print(tokenized_viterbi("北京的天气，真好啊   今天的课程内容很有意思", lm, 3))

[['北京', '的', '天气', '，', '真', '好', '啊'], [''], [''], ['今天', '的', '课程', '内容', '很', '有意思']]
