# 使用中文飯店評論資料集練習斷詞

In [9]:
import pandas as pd
import paddle

In [10]:
pd_corpus = pd.read_csv('ChnSentiCorp_htl_all.csv')
pd_corpus.head(10)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"
5,1,总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象
6,1,价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...
7,1,不错，在同等档次酒店中应该是值得推荐的！
8,1,入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味，房间内较新。房间大小合适，卫生间设备齐...
9,1,1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。2。早餐还可以，只是品种不是很多。3。...


In [11]:
pd_postive = pd_corpus[pd_corpus['label'] == 1]
pd_negative = pd_corpus[pd_corpus['label'] == 0]
print('Total: {}, Postive: {}, Negative: {}'.format(len(pd_corpus), len(pd_postive), len(pd_negative)))

Total: 7766, Postive: 5322, Negative: 2444


In [12]:
pd_corpus.dropna(inplace=True)

In [13]:
import jieba

# 建構jieba斷詞函數

建構可將讀入的dataframe的文本資料，外加一欄為cut的review欄斷詞結果

In [14]:
class JiebaCutingClass(object):
    '''class to use jieba to parse corpus from dataframe and cut the corpus
    parameters
    ---------
    key to cut: str
        the dataframe key to parse the sentence for jieba cutting
    dic: str
        the dictionary to use for jieba, default is None (use default dictionary)
    userdict: str
        the user defined dictionary to use for jieba, default is None
    '''
    def __init__(self, key_to_cut:str, dic:str = None, userdict:str = None):
        if dic is not None:
            jieba.set_dictionary(dic)
        if userdict is not None:
            jieba.load_userdict(userdict)
        
        self.key_to_cut = key_to_cut
        # padding 開啟
        paddle.enable_static()
        jieba.enable_paddle()
    
    @staticmethod
    def cut_single_sentence(sentence, use_paddle = False, use_full = False, use_search = False):
        if use_search:
            out = jieba.cut_for_search(sentence)
        
        else:
            out = jieba.cut(sentence, use_paddle = use_paddle, cut_all = use_full)
        return out

    def cut_corpus(self, corpus: pd.DataFrame, mode: str) -> pd.DataFrame:
        '''Method to read and cut sentence from dataframe and append another column named cut

        Parameters
        ----------------
        corpus: pd.DataFrame
            Input corpus in DataFrame
        mode: str
            Jieba mode to be used
        Return
        ----------------
        out: pd.DataFrame
            Output corpus in DataFrame
        
        '''
        # checking valid mode
        if mode not in ['paddle', 'full', 'precise', 'search']:
            raise TypeError(f'only support "paddle","full","precise","search" mode, but get {mode}')

        # cut the corpus based on mode 
        if mode == 'paddle':
            out = self._paddle_cut(corpus)
        elif mode == 'full':
            out = self._full_cut(corpus)
        elif mode == 'precise':
            out = self._precise_cut(corpus)
        elif mode == 'search':
            out = self._search_cut(corpus)
        
        return out

    def _paddle_cut(self, corpus):
        '''paddle mode
        '''
        jieba.enable_paddle()
        out = []
        for single_review in corpus[self.key_to_cut]:
            out.append([word for word in self.cut_single_sentence(single_review, use_paddle= True)])

        corpus['cut'] = out

        return corpus

    def _full_cut(self, corpus):
        '''full mode
        '''
        out = []
        for single_review in corpus[self.key_to_cut]:
            out.append([word for word in self.cut_single_sentence(single_review, use_full= True)])
        
        corpus['out'] = out
        
        return corpus

    def _precise_cut(self, corpus):
        '''precise mode
        '''
        out = []
        for single_review in corpus[self.key_to_cut]:
            out.append([word for word in self.cut_single_sentence(single_review)])
        
        corpus['cut'] = out

        return corpus

    def _search_cut(self, corpus):
        '''search mode
        '''
        out = []
        for single_review in corpus[self.key_to_cut]:
            out.append([word for word in self.cut_single_sentence(single_review, use_search= True)])
        
        corpus['cut'] = out
        return corpus

使用建構好的斷詞物件對文本進行斷詞

In [7]:
tokenizer = JiebaCutingClass(key_to_cut= 'review')
# 為了避免處理時間過久，這裡用前50個進行斷詞
pd_cut = tokenizer.cut_corpus(pd_corpus.loc[:50, :], mode = 'precise')
pd_cut.head(10)

Paddle enabled successfully......
DEBUG 2022-05-11 16:37:32,061 _compat.py:47] Paddle enabled successfully......
Building prefix dict from the default dictionary ...
DEBUG 2022-05-11 16:37:32,063 __init__.py:113] Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kt/j0fpnwf96017z868368dzkrw0000gn/T/jieba.cache
DEBUG 2022-05-11 16:37:32,064 __init__.py:132] Loading model from cache /var/folders/kt/j0fpnwf96017z868368dzkrw0000gn/T/jieba.cache
Loading model cost 0.557 seconds.
DEBUG 2022-05-11 16:37:32,621 __init__.py:164] Loading model cost 0.557 seconds.
Prefix dict has been built successfully.
DEBUG 2022-05-11 16:37:32,622 __init__.py:166] Prefix dict has been built successfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus

Unnamed: 0,label,review,cut
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较...","[距离, 川沙, 公路, 较近, ,, 但是, 公交, 指示, 不, 对, ,, 如果, 是..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!,"[商务, 大床, 房, ，, 房间, 很大, ，, 床有, 2M, 宽, ，, 整体, 感觉..."
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。,"[早餐, 太, 差, ，, 无论, 去, 多少, 人, ，, 那边, 也, 不加, 食品, ..."
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...,"[宾馆, 在, 小, 街道, 上, ，, 不大好, 找, ，, 但, 还好, 北京, 热心,..."
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风","[CBD, 中心, ,, 周围, 没什么, 店铺, ,, 说, 5, 星, 有点, 勉强, ..."
5,1,总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象,"[总的来说, ，, 这样, 的, 酒店, 配, 这样, 的, 价格, 还, 算, 可以, ，..."
6,1,价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...,"[价格比, 比较, 不错, 的, 酒店, 。, 这次, 免费, 升级, 了, ，, 感谢, ..."
7,1,不错，在同等档次酒店中应该是值得推荐的！,"[不错, ，, 在, 同等, 档次, 酒店, 中, 应该, 是, 值得, 推荐, 的, ！]"
8,1,入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味，房间内较新。房间大小合适，卫生间设备齐...,"[入住, 丽晶, ，, 感觉, 很, 好, 。, 因为, 是, 新, 酒店, ，, 的确, ..."
9,1,1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。2。早餐还可以，只是品种不是很多。3。...,"[1, 。, 酒店, 比较, 新, ，, 装潢, 和, 设施, 还, 不错, ，, 只是, ..."


In [8]:
test_string = '我喜歡花式鋪克'
jieba_cut = JiebaCutingClass(key_to_cut='', dic = 'dict.txt.big')

out_string = jieba_cut.cut_single_sentence(test_string, use_paddle= True)
print(f'Paddle模式： {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string, use_full= True)
print(f'全模式： {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string, use_search= True)
print(f'搜尋模式： {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string)
print(f'精確模式： {[string for string in out_string]}')

Paddle enabled successfully......
DEBUG 2022-05-11 16:37:32,667 _compat.py:47] Paddle enabled successfully......
Building prefix dict from /Users/ching/nlp/dict.txt.big ...
DEBUG 2022-05-11 16:37:32,684 __init__.py:113] Building prefix dict from /Users/ching/nlp/dict.txt.big ...
Loading model from cache /var/folders/kt/j0fpnwf96017z868368dzkrw0000gn/T/jieba.u3bb22c0208e7ce65853115ed11a2f833.cache
DEBUG 2022-05-11 16:37:32,685 __init__.py:132] Loading model from cache /var/folders/kt/j0fpnwf96017z868368dzkrw0000gn/T/jieba.u3bb22c0208e7ce65853115ed11a2f833.cache


Paddle模式： ['我', '喜歡花式', '鋪克']


Loading model cost 0.952 seconds.
DEBUG 2022-05-11 16:37:33,637 __init__.py:164] Loading model cost 0.952 seconds.
Prefix dict has been built successfully.
DEBUG 2022-05-11 16:37:33,638 __init__.py:166] Prefix dict has been built successfully.


全模式： ['我', '喜歡', '花式', '鋪', '克']
搜尋模式： ['我', '喜歡', '花式', '鋪克']
精確模式： ['我', '喜歡', '花式', '鋪克']
