In [1]:
import sys
import os
import time

In [2]:
class TextSpliter(object):
    def __init__(self, corpus_path, encoding='utf8', debug=False, verbose=False, max_load_word_length=4):
        self.debug = debug
        self.verbose = verbose
        self.dict = {}
        self.dict2 = {}
        self.max_word_length = 1
        begin_time = time.time()
        self.__log__('start load corpus from %s' % corpus_path, 'msg')
        # 加载语料
        with open(corpus_path, 'r', encoding=encoding) as f:
            for l in f:
                wds = l.strip().split('  ')
                l.replace('[', '')
                l.replace(']', '')
                last_wd = ''
                for i in range(1, len(wds)): # 下标从1开始，因为每行第一个词是标签
                    try:
                        wd, type = wds[i].split('/')
                    except:
                        self.__log__('word load error, %s' % wds[i], 'debug')
                        continue
                    if len(wd) == 0 or len(wd) > max_load_word_length or not wd.isalpha():
                        continue
                    if wd not in self.dict:
                        self.dict[wd] = 0
                        if len(wd) > self.max_word_length:
                            # 更新最大词长度
                            self.max_word_length = len(wd)
                            self.__log__('max_word_length=%d, word is %s' %(self.max_word_length, wd), 'debug')
                    self.dict[wd] += 1
                    if last_wd:
                        if last_wd+':'+wd not in self.dict2:
                            self.dict2[last_wd+':'+wd] = 0
                        self.dict2[last_wd+':'+wd] += 1
                    last_wd = wd
                    
        self.words_cnt = 0
        self.words2_cnt = 0
        max_c = 0
        for wd in self.dict:
            self.words_cnt += self.dict[wd]
            if self.dict[wd] > max_c:
                max_c = self.dict[wd]
                wdxx = wd
        for wd in self.dict2:
            self.words2_cnt += self.dict2[wd]
        self.__log__('load corpus finished, %d words in dict and frequency is %d, %d words in dict2 frequency is %d' % (len(self.dict),len(self.dict2), self.words_cnt, self.words2_cnt), 'msg')
        self.__log__('%f seconds elapsed' % (time.time()-begin_time), 'msg')
    
    def split(self, text):
        sentence = ''
        result = ''
        for ch in text:
            if not ch.isalpha():
                result += self.__split_sentence__(sentence) + ' ' + ch + ' '
                sentence = ''
            else:
                sentence += ch
        return result.strip(' ')
    
    def __get_a_split__(self, cur_split, i):
        if i >= len(self.cur_sentence):
            self.split_set.append(cur_split)
            return
        j = min(self.max_word_length, len(self.cur_sentence) - i + 1)
        while j > 0:
            if j == 1 or self.cur_sentence[i:i+j] in self.dict:
                self.__get_a_split__(cur_split + [self.cur_sentence[i:i+j]], i+j)
                if j == 2:
                    break
            j -= 1
    
    def __get_cnt__(self, dictx, key):
        try:
            return dictx[key] + 1
        except KeyError:
            return 1
    
    def __get_word_probablity__(self, wd, pioneer=''):
        if pioneer == '':
            return self.__get_cnt__(self.dict, wd) / self.words_cnt
        return self.__get_cnt__(self.dict2, pioneer + ':' + wd) / self.__get_cnt__(self.dict, pioneer)
    
    def __calc_probability__(self, sequence):
        probability = 1
        pioneer = ''
        for wd in sequence:
            probability *= self.__get_word_probablity__(wd, pioneer)
            pioneer = wd
        return probability
    
    def __split_sentence__(self, sentence):
        if len(sentence) == 0:
            return ''
        self.cur_sentence = sentence.strip()
        self.split_set = []
        self.__get_a_split__([], 0)
        self.__log__(sentence + str(len(self.split_set)),  'debug')
        max_probability = 0
        for splitx in self.split_set:
            probability = self.__calc_probability__(splitx)
            self.__log__(str(splitx)+ ' - ' +str(probability), 'debug')
            if probability > max_probability:
                max_probability = probability
                best_split = splitx
        return ' '.join(best_split)
    
    def __log__(self, msg, level='info'):
        if level == 'info' and self.verbose:
            print (msg)
        elif level == 'debug' and self.debug:
            print (msg)
        elif level == 'error':
            print (msg, file=sys.stderr)
        elif level == 'msg':
            print (msg)

In [3]:
btime = time.time()
base_path = '.'# os.path.dirname(os.path.realpath(__file__))
spliter = TextSpliter(os.path.join(base_path, '199801.txt'), debug=False, verbose=True)
print ('time elapsed %f' % (time.time() - btime))

start load corpus from ./199801.txt
load corpus finished, 49689 words in dict and frequency is 463624, 912534 words in dict2 frequency is 893149
1.461231 seconds elapsed
time elapsed 1.461527
