In [1]:
import functools
import re

import kenlm
from tqdm import tqdm
global orig_len

# Joint Probability Based Tokenizer

A greedy tokenizer works for most cases; however, it can produce undesirable segmentations because it prefers longer chunks.
We propose a joint approach to sub-word segmentation that considers both the source and the target sentences.

A translator takes a source sentence $\mathbf{S}$ segmented as $\mathbf{S} = s_0 s_1 \dots s_n$ and a target sentence $\mathbf{T}$ segmented as $\mathbf{T} = t_0 t_1 \dots t_m$, where each $s_i$ and $t_j$ is a segment.

We want to find the optimal segmentation $\mathbf{S}^*$ of $\mathbf{S}$ and the optimal segmentation $\mathbf{T}^*$ of $\mathbf{T}$. Mathematically:
\begin{align}
\label{eq1}
    \mathbf{S}^*, \mathbf{T}^* = \underset{{s_i \in \mathbf{S}, t_j \in \mathbf{T}}}{\operatorname{argmax}} P(\mathbf{S}, \mathbf{T})
\end{align}
where $P(\mathbf{S}, \mathbf{T})$ is the joint probability of sequences. 

We assume that the prior probabilities, which are $P(\mathbf{S})$ and $P(\mathbf{T})$, are language model based probabilities. 

In [2]:
def dp_tokenizer(sentence):
    """Segment ``sentence`` with the memoized ``segment`` search.

    Stores ``len(sentence)`` in the module-level ``orig_len`` (read by
    ``Pwords`` when it penalizes OOV tokens) and returns ``segment(sentence)``.

    Args:
        sentence: Text to segment.

    Returns:
        The list of pieces produced by ``segment``.
    """
    global orig_len
    sentence_len = len(sentence)
    # segment() memoizes on the text alone, but the scores it maximizes depend
    # on orig_len. Entries cached for an input of a different length were
    # ranked with a different penalty, so drop them instead of reusing them.
    memo_cache = getattr(segment, "cache", None)
    if memo_cache is not None and globals().get("orig_len") != sentence_len:
        memo_cache.clear()
    orig_len = sentence_len
    return segment(sentence)

In [3]:
# Load the KenLM language model that Pwords() queries for scores.
# The path is relative, so it is resolved against the current working directory.
model = kenlm.Model("sim_train.klm")

In [4]:
def memo(f):
    """Memoize function f, whose args must all be hashable.

    Args:
        f: Function to wrap. It is called with positional arguments only.

    Returns:
        A wrapper that calls ``f`` at most once per distinct argument tuple
        and afterwards returns the stored result. The wrapper keeps ``f``'s
        name and docstring, and exposes the underlying dict as ``.cache`` so
        callers can inspect or clear it.
    """
    cache = {}

    # wraps() copies __name__/__doc__ so help() and tracebacks show the real
    # function instead of the anonymous wrapper.
    @functools.wraps(f)
    def fmemo(*args):
        if args not in cache:
            cache[args] = f(*args)
        return cache[args]

    fmemo.cache = cache
    return fmemo

In [5]:
def splits(text, start=0, L=20):
    """Enumerate the ways of cutting ``text`` into a prefix and a suffix.

    Args:
        text: Sequence to cut.
        start: Smallest prefix length to produce.
        L: Largest prefix length to produce (capped at ``len(text)``).

    Returns:
        A list of ``(prefix, suffix)`` tuples, ordered by increasing prefix
        length.
    """
    longest = min(len(text), L)
    pairs = []
    for cut in range(start, longest + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs

We chose the Viterbi algorithm to segment the given sentence. The scoring function is obtained from the constructed language models.

Optimal segmentation depends on the following:
<ol>
    <li>language model score of source sentence of a candidate segment.</li>
    <li>language model score of target sentence of a candidate segment.</li>
    <li>item mapping conversions from source segment to target segment</li>
</ol>

In [6]:
@memo
def segment(text):
    """Find the segmentation of ``text`` that scores highest under ``Pwords``.

    Every leading piece offered by ``splits(text, 1)`` is combined with the
    best segmentation of the remaining text, and the highest-scoring
    combination is returned. Results are cached per ``text`` by ``@memo``.

    Returns:
        A list of pieces, or an empty list when ``text`` is empty.
    """
    if text:
        options = ([head] + segment(tail) for head, tail in splits(text, 1))
        return max(options, key=Pwords)
    return []

To discourage out-of-vocabulary (OOV) segments in the output, we impose a penalty on each OOV segment: its language-model score is multiplied by $\alpha \times \frac{\texttt{len(segment)}}{\texttt{len(sentence)}}$, where $\alpha$ is `penalty_constant`.

In [7]:
# Multiplier (alpha in the penalty formula) that Pwords() applies to the
# language-model score of tokens flagged as OOV.
penalty_constant = 15.0

In [8]:
def Pwords(words):
    """Score a candidate segmentation with the language model.

    The pieces are joined with single spaces and passed to
    ``model.full_scores``; the per-token scores it yields are added up.
    A token flagged as OOV contributes
    ``penalty_constant * score * len(token) / orig_len`` instead of its raw
    score, where ``orig_len`` is the module-level input length.

    Args:
        words: List of string pieces forming one candidate segmentation.

    Returns:
        The accumulated score (higher is better for ``max`` in ``segment``).
    """
    sentence = " ".join(words)
    # Token text for each scored position; the extra last slot is the
    # end-of-sentence marker.
    # NOTE(review): assumes full_scores yields one entry per whitespace token
    # followed by one for '</s>' - confirm against the kenlm version in use.
    tokens = sentence.split() + ['</s>']
    total = 0
    for position, (prob, _ngram_len, is_oov) in enumerate(model.full_scores(sentence)):
        if not is_oov:
            total += prob
            continue
        share = len(tokens[position]) / orig_len
        total += penalty_constant * prob * share
    return total

In [9]:
import string

# Character-class ranges for ASCII letters and digits (intentionally left
# unescaped so the '-' keeps its range meaning).
alphanumerics = 'a-zA-Z0-9'
known_stops = '。。…！？'
known_punctuation = '／（）、，。：「」…。『』！？《》“”；’ ‘【】·〔〕'
eng_punct = string.punctuation
# Matches a run of characters that must NOT go through the language-model
# segmenter. The literal character sets are escaped before being placed in
# the class: unescaped, string.punctuation's "\]" is read as an escaped ']',
# which silently drops the backslash from the class, and its '[' triggers a
# "possible nested set" FutureWarning.
avoid = re.compile("([%s%s%s%s]+)" % (
    alphanumerics,
    re.escape(known_stops),
    re.escape(known_punctuation),
    re.escape(eng_punct),
))

# Tokenize Sentence
Tokenize a sentence and return the list of tokens.

In [10]:
def tokenize_sentence(sentence):
    """Split ``sentence`` into tokens.

    The sentence is first cut with the ``avoid`` pattern. Fragments that
    match ``avoid`` are emitted one character at a time; every other
    fragment is segmented by ``dp_tokenizer``.

    Args:
        sentence: Text to tokenize.

    Returns:
        A flat list of token strings in their original order.
    """
    tokens = []
    # avoid has a capturing group, so split() keeps the matched runs as
    # fragments alongside the text between them.
    for fragment in avoid.split(sentence):
        if not fragment:
            # split() yields '' next to a leading/trailing/adjacent match;
            # it contributes no tokens, so don't bother the segmenter with it.
            continue
        if avoid.search(fragment):
            tokens.extend(fragment)  # one token per character
        else:
            tokens.extend(dp_tokenizer(fragment))
    return tokens

Test sentence and output.

In [13]:
# Sample input mixing Chinese text, Latin letters and punctuation; `a` holds the resulting token list.
sentence = "姚松炎、周庭势被「DQ」? 泛民质疑，政府再取消参选人资格涉政治筛选，要求律政司司长郑若骅解释法律理据。 有报道指，据全国人大常委会就《基本法》第一百零四条进行的释法，代表泛民参选立法会港岛及九龙西补选的香港众志周庭和被「DQ」前议员姚松炎，势被取消参选资格。律政司表示，法律政策专员黄惠冲将于稍后时间与泛民议员会面，确实时间待定。 民主派议员前晚在律政中心外静坐要求与律政司司长郑若骅会面不果后，昨在立法会召开记者招待会，要求郑就撤销参选人资格的理据，及其给予选举主任的法律意见作出详细交代。公民党议员郭荣铿批评，郑不向公众交代的做法是「冇承担，冇责任」的表现，不能只把责任交托予公务员。"
a = tokenize_sentence(sentence)

In [14]:
# Show the token list produced for the sample sentence.
print(a)

['姚', '松', '炎', '、', '周', '庭', '势', '被', '「', 'D', 'Q', '」', '?', ' ', '泛', '民', '质疑', '，', '政府', '再', '取消', '参选人', '资格涉', '政治', '筛选', '，', '要求', '律政司', '司长郑', '若骅', '解释', '法律', '理据', '。', ' ', '有', '报道', '指', '，', '据', '全国人大常委会', '就', '《', '基本法', '》', '第一', '百', '零', '四条', '进行', '的', '释', '法', '，', '代表', '泛民', '参选立', '法会', '港岛及', '九龙西', '补选的香', '港众', '志周', '庭和被', '「', 'D', 'Q', '」', '前', '议员', '姚', '松', '炎', '，', '势', '被', '取消', '参选', '资格', '。', '律政司', '表示', '，', '法律', '政策', '专员黄', '惠冲', '将于', '稍后时', '间与', '泛民', '议员会面', '，', '确实', '时间', '待定', '。', ' ', '民主派', '议员前', '晚在', '律政', '中心外', '静坐要', '求与', '律政', '司司', '长郑', '若骅', '会面不', '果后', '，', '昨在', '立法会', '召开', '记者招待会', '，', '要求', '郑', '就', '撤销', '参选人', '资格', '的', '理据', '，', '及其', '给予', '选举', '主任', '的', '法律', '意见', '作出详', '细交代', '。', '公民', '党', '议员', '郭', '荣铿', '批评', '，', '郑', '不', '向', '公众', '交代', '的', '做法', '是', '「', '冇', '承担', '，', '冇', '责任', '」', '的', '表现', '，', '不能', '只', '把', '责任', '交', '托予', '公务员', '。']
