In [6]:
import pandas as pd
import jieba
import jieba.posseg as pseg

class JiebaSegmentor:
    """Convenience wrapper around jieba that returns results as DataFrames.

    On construction the main jieba dictionary is set to ``dict_path`` and
    every path in ``userdict`` is loaded as an additional user dictionary.
    Note that this configures the *module-level* jieba state, so it affects
    every other user of jieba in the same process.
    """

    # Names returned by taiwan_country(); pseg_lcut() forces the POS tag of
    # any token that exactly equals one of these to u'ns'.
    _TAIWAN_COUNTRY = (
        u'臺北', u'台北', u'基隆', u'臺中', u'台中', u'臺南', u'台南', u'高雄',
        u'宜蘭', u'桃園', u'新竹', u'苗栗', u'彰化', u'南投', u'嘉義', u'雲林',
        u'屏東', u'臺東', u'台東', u'花蓮', u'澎湖',
    )
    # Hashed copy so the per-token membership test in pseg_lcut() is O(1)
    # instead of rebuilding and scanning a list for every token.
    _TAIWAN_COUNTRY_SET = frozenset(_TAIWAN_COUNTRY)

    def __init__(self, dict_path, userdict=None):
        """Store the dictionary paths and initialise jieba with them.

        Args:
            dict_path: path passed to ``jieba.set_dictionary``.
            userdict: optional iterable of paths, each passed to
                ``jieba.load_userdict`` in order. Defaults to no user
                dictionaries.
        """
        self.__dict_path = dict_path
        # Copy instead of keeping a reference: avoids the shared
        # mutable-default pitfall and later mutation by the caller.
        self.__userdict = list(userdict) if userdict is not None else []
        self.dictionary_init()

    def dictionary_init(self):
        """Point jieba at the main dictionary, then load each user dictionary.

        Each user dictionary path is printed before it is loaded.
        """
        jieba.set_dictionary(self.__dict_path)
        for path in self.__userdict:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print(path)
            jieba.load_userdict(path)

    def taiwan_country(self):
        """Return a new list of the names that pseg_lcut() tags as u'ns'."""
        return list(self._TAIWAN_COUNTRY)

    def get_names(self, input_text):
        """Return the tokens of ``input_text`` whose POS flag is 'nr'.

        The flag comparison is case-insensitive. Tokens are returned in the
        order jieba yields them; duplicates are kept.
        """
        return [word for word, flag in pseg.cut(input_text)
                if flag.lower() == 'nr']

    def lcut(self, input_text):
        """Segment ``input_text`` and return a DataFrame with a ``word`` column."""
        return pd.DataFrame({"word": list(jieba.lcut(input_text))})

    def pseg_lcut(self, input_text):
        """Segment and POS-tag ``input_text``.

        Returns a DataFrame with ``word`` and ``tag`` columns, one row per
        token. Two tag overrides are applied on top of jieba's result:

        * a token that equals an entry of taiwan_country() is tagged u'ns';
        * any other token longer than one character that jieba tagged u'x'
          is retagged u'n'.
        """
        words = []
        tags = []
        for word, tag in pseg.lcut(input_text):
            if word in self._TAIWAN_COUNTRY_SET:
                tag = u'ns'
            elif len(word) > 1 and tag == u'x':
                # Single-character u'x' tokens (e.g. punctuation) keep
                # their tag; only multi-character ones are retagged.
                tag = u'n'
            words.append(word)
            tags.append(tag)
        return pd.DataFrame({"word": words, "tag": tags})


In [7]:
# Directory holding the jieba dictionary files.
# NOTE(review): this is a machine-specific absolute path; change it here
# (one place) when running the notebook elsewhere.
JIEBA_DATA_DIR = "/home/charles/dataset/jieba"

# Main dictionary (first constructor argument).
jieba_dict_path1 = JIEBA_DATA_DIR + "/dict_taiwan.txt"
# Additional dictionaries (second constructor argument, loaded in this order).
# NOTE(review): dict.txt.big / dict.txt.small look like full jieba
# dictionaries rather than user dictionaries — confirm that layering them
# on top of the main dictionary is intended.
jieba_dict_path2 = JIEBA_DATA_DIR + "/userdict.txt"
jieba_dict_path3 = JIEBA_DATA_DIR + "/dict.txt.big"
jieba_dict_path4 = JIEBA_DATA_DIR + "/dict.txt.small"
js = JiebaSegmentor(jieba_dict_path1, [jieba_dict_path2, jieba_dict_path3, jieba_dict_path4])

Building prefix dict from /home/charles/dataset/jieba/dict_taiwan.txt ...
DEBUG:jieba:Building prefix dict from /home/charles/dataset/jieba/dict_taiwan.txt ...
Loading model from cache /tmp/jieba.u48306fa201322dcccc3d0c62898fbadc.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u48306fa201322dcccc3d0c62898fbadc.cache
Loading model cost 0.646 seconds.
DEBUG:jieba:Loading model cost 0.646 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


/home/charles/dataset/jieba/userdict.txt
/home/charles/dataset/jieba/dict.txt.big
/home/charles/dataset/jieba/dict.txt.small


In [4]:
# POS-tagged segmentation demo. 花蓮 is listed in taiwan_country(), so
# pseg_lcut() forces its tag to 'ns'.
# NOTE(review): this cell's execution count (In [4]) is lower than the cells
# that define the class and `js` above it — restart the kernel and run all
# cells before sharing so the recorded outputs match the code.
js.pseg_lcut('花蓮太魯閣號跑好快')

Unnamed: 0,tag,word
0,ns,花蓮
1,n,太魯閣號
2,v,跑
3,a,好
4,a,快


In [8]:
# Same sentence through plain segmentation: lcut() returns only a `word`
# column, without POS tags.
js.lcut('花蓮太魯閣號跑好快')

Unnamed: 0,word
0,花蓮
1,太魯閣號
2,跑
3,好
4,快


In [10]:
# Sentence with a person name (蔡英文, tagged 'nr' in the recorded output)
# and a mixed Latin/CJK token (A咖).
js.pseg_lcut('一二零年蔡英文是政治界的A咖大老')

Unnamed: 0,tag,word
0,n,一二零年
1,nr,蔡英文
2,v,是
3,n,政治界
4,uj,的
5,n,A咖
6,a,大
7,a,老


In [19]:
# Sentence with a place name from taiwan_country() (台北, forced to 'ns')
# and a street name (迪化街).
js.pseg_lcut('一九零六年台北的未來在迪化街強烈懷疑')

Unnamed: 0,tag,word
0,n,一九零六年
1,ns,台北
2,uj,的
3,t,未來
4,p,在
5,ns,迪化街
6,a,強烈
7,v,懷疑


In [11]:
# NOTE(review): the recorded output tags 迪士尼 as 'nr' (the tag get_names()
# filters on), so get_names() would report it as a name — presumably a
# dictionary entry; verify it is intended.
js.pseg_lcut('我要下週一要帶小孩去東京迪士尼')

Unnamed: 0,tag,word
0,r,我
1,v,要
2,l,下週一
3,v,要
4,v,帶
5,n,小孩
6,v,去
7,ns,東京
8,nr,迪士尼


In [21]:
# 臺東 is in taiwan_country() and is forced to 'ns'.
# NOTE(review): the recorded output also tags 美麗 and 海灣 as 'ns'; neither
# is in taiwan_country(), so those tags presumably come from the loaded
# dictionaries — verify the dictionary entries.
js.pseg_lcut('臺東三仙台美麗的海灣')

Unnamed: 0,tag,word
0,ns,臺東
1,n,三仙台
2,ns,美麗
3,uj,的
4,ns,海灣


In [22]:
# Sentence with no place or person names; none of its tokens are in
# taiwan_country().
js.pseg_lcut('蛙式要怎樣才游的快')

Unnamed: 0,tag,word
0,n,蛙式
1,v,要
2,r,怎樣
3,d,才
4,v,游
5,uj,的
6,a,快


In [23]:
# Short sentence; 臉書 is kept as a single token in the recorded output.
js.pseg_lcut('給我你的臉書')

Unnamed: 0,tag,word
0,p,給
1,r,我
2,r,你
3,uj,的
4,n,臉書


In [14]:
# 花蓮 is in taiwan_country() and is forced to 'ns'.
# NOTE(review): the recorded output tags 瑞穗 as 'nr', so get_names() would
# report it as a name; it is not in taiwan_country(), so the tag presumably
# comes from the loaded dictionaries — verify.
js.pseg_lcut('花蓮瑞穗農場喝牛奶擠牛奶')

Unnamed: 0,tag,word
0,ns,花蓮
1,nr,瑞穗
2,n,農場
3,v,喝牛奶
4,v,擠
5,n,牛奶


In [13]:
# NOTE(review): the recorded output tags 太陽 as 'ns'; it is not in
# taiwan_country(), so the tag presumably comes from the loaded
# dictionaries — verify the dictionary entry.
js.pseg_lcut('太陽的後裔好難看哦')

Unnamed: 0,tag,word
0,ns,太陽
1,uj,的
2,n,後裔
3,a,好
4,v,難看
5,zg,哦


In [25]:
# Runs the county/city names through the tagger. The single-character
# separator '、' keeps its 'x' tag because pseg_lcut() only retags 'x'
# tokens longer than one character. 臺北市 itself is not in taiwan_country()
# (only 臺北 is), so its 'ns' tag is not produced by the override.
js.pseg_lcut('臺北市、基隆、臺中、臺南、高雄、臺北、宜蘭、桃園、新竹、苗栗、彰化、臺中、南投、臺南、嘉義、雲林、高雄、屏東、臺東、花蓮、澎湖')

Unnamed: 0,tag,word
0,ns,臺北市
1,x,、
2,ns,基隆
3,x,、
4,ns,臺中
5,x,、
6,ns,臺南
7,x,、
8,ns,高雄
9,x,、
