In [1]:
import sys
import numpy as np

from pprint import pprint

# Max Score Tokenizer (for spacing errors)

### Example

In [2]:
# Toy word-score table used by the example cells.
score = {
    '파스': 0.3,
    '파스타': 0.7,
    '좋아요': 0.2,
    '좋아': 0.5,
}

# Example sentence written without any spaces.
sent = '파스타가좋아요'

In [3]:
# Enumerate every 2- or 3-character window of the sentence as a candidate word.
# Each entry is (subtoken, begin, end, word score); words missing from `score` get 0.
subtokens = []

for begin in range(len(sent)):
    for width in (2, 3):
        end = begin + width

        # Keep only windows that fit inside the sentence.
        if end <= len(sent):
            piece = sent[begin:end]
            subtokens.append((piece, begin, end, score.get(piece, 0)))

# Rank candidates from highest to lowest score; the sort is stable, so
# candidates with equal scores stay in the order they were generated.
subtokens.sort(key=lambda cand: cand[3], reverse=True)
pprint(subtokens)

[('파스타', 0, 3, 0.7),
 ('좋아', 4, 6, 0.5),
 ('파스', 0, 2, 0.3),
 ('좋아요', 4, 7, 0.2),
 ('스타', 1, 3, 0),
 ('스타가', 1, 4, 0),
 ('타가', 2, 4, 0),
 ('타가좋', 2, 5, 0),
 ('가좋', 3, 5, 0),
 ('가좋아', 3, 6, 0),
 ('아요', 5, 7, 0)]


In [4]:
# Take the top-ranked candidate off the front of the list and unpack it
# into (word, begin, end, score).
best = subtokens.pop(0)
word, b, e, s = best

# Accepted tokens are collected in this list.
results = []

print('subtoken = %s\n' % word)
pprint(subtokens)

subtoken = 파스타

[('좋아', 4, 6, 0.5),
 ('파스', 0, 2, 0.3),
 ('좋아요', 4, 7, 0.2),
 ('스타', 1, 3, 0),
 ('스타가', 1, 4, 0),
 ('타가', 2, 4, 0),
 ('타가좋', 2, 5, 0),
 ('가좋', 3, 5, 0),
 ('가좋아', 3, 6, 0),
 ('아요', 5, 7, 0)]


In [5]:
results.append((word, b, e, s))

# Indices of the remaining candidates whose [start, end) span intersects
# the span [b, e) of the word that was just accepted.
removals = [
    idx
    for idx, (_word, start, end, _score) in enumerate(subtokens)
    if start < e and b < end
]

print('파스타와 겹치는 부분')
for idx in removals:
    print(subtokens[idx])

print('\n중복된 subtokens을 지운 뒤')
# Delete from the back so that the earlier indices stay valid.
for idx in removals[::-1]:
    del subtokens[idx]

pprint(subtokens)

파스타와 겹치는 부분
('파스', 0, 2, 0.3)
('스타', 1, 3, 0)
('스타가', 1, 4, 0)
('타가', 2, 4, 0)
('타가좋', 2, 5, 0)

중복된 subtokens을 지운 뒤
[('좋아', 4, 6, 0.5),
 ('좋아요', 4, 7, 0.2),
 ('가좋', 3, 5, 0),
 ('가좋아', 3, 6, 0),
 ('아요', 5, 7, 0)]


# MaxScore Tokenizer

In [6]:
def tokenize(sent, score, max_len=3):
    """Segment an unspaced sentence by greedily keeping the best-scoring substrings.

    Every substring of ``sent`` that is 2 to ``max_len`` characters long is a
    candidate word and is scored with ``score.get(substring, 0)``. Candidates
    are accepted from the highest score down, skipping any candidate that
    overlaps an already accepted one. Stretches of the sentence that no
    accepted candidate covers are returned as extra tokens with score 0.

    Args:
        sent: The string to segment.
        score: Mapping from word to score (anything with a ``get`` method);
            substrings that are not present score 0.
        max_len: Longest candidate length in characters. Values below 2
            produce no candidates, so the whole sentence comes back as one
            token.

    Returns:
        A list of ``(word, begin, end, score, length)`` tuples ordered by
        ``begin``. Consecutive tokens are contiguous and together cover the
        whole sentence. When there are no candidates at all (sentence shorter
        than 2 characters, or ``max_len`` < 2) the list holds the single token
        ``(sent, 0, len(sent), 0, len(sent))``.
    """

    def initialize(sent, score, max_len=3):
        """Return every candidate (word, begin, end, score, length), best score first."""
        subtokens = []

        for b in range(len(sent)):
            for r in range(2, max_len + 1):
                e = b + r
                if e > len(sent):
                    # r only grows, so every longer window overruns as well.
                    break

                subtoken = sent[b:e]
                subtokens.append((subtoken, b, e, score.get(subtoken, 0), r))

        # Sort by score only. The sort is stable (also with reverse=True), so
        # equal-score candidates keep generation order: earlier begin first,
        # and for the same begin the shorter candidate first.
        return sorted(subtokens, key=lambda x: x[3], reverse=True)

    def _tokenize(subtokens, n_chars):
        """Greedily accept ranked candidates that do not overlap earlier picks."""
        results = []
        # occupied[i] is True once character i belongs to an accepted token.
        occupied = [False] * n_chars

        for token in subtokens:
            b, e = token[1], token[2]
            if any(occupied[b:e]):
                # Overlaps a better-ranked token that was already accepted.
                continue
            results.append(token)
            occupied[b:e] = [True] * (e - b)

        # Sort by begin point
        return sorted(results, key=lambda x: x[1])

    def postprocess(sent, results):
        """Insert score-0 tokens for the characters no accepted token covers."""
        filled = []
        cursor = 0  # end position of the text emitted so far

        for token in results:
            b = token[1]
            if cursor < b:
                # Uncovered stretch before this token (leading or middle gap).
                filled.append((sent[cursor:b], cursor, b, 0, b - cursor))
            filled.append(token)
            cursor = token[2]

        if cursor < len(sent):
            # Uncovered stretch after the last token.
            filled.append((sent[cursor:], cursor, len(sent), 0, len(sent) - cursor))

        return filled

    subtokens = initialize(sent, score, max_len)
    if not subtokens:
        # Same 5-field layout as every other token so callers can unpack uniformly.
        return [(sent, 0, len(sent), 0, len(sent))]

    results = _tokenize(subtokens, len(sent))
    results = postprocess(sent, results)

    return results

In [7]:
# Run the tokenizer on the example sentence with the toy score table
# (max_len is left at its default of 3).
tokenize(sent, score)

[('파스타', 0, 3, 0.7, 3),
 ('가', 3, 4, 0, 1),
 ('좋아', 4, 6, 0.5, 2),
 ('요', 6, 7, 0, 1)]