In [None]:
!pip install pycantonese jieba requests pandas

# Default Segmentation (PyCantonese vs. Jieba)

In [2]:
import pycantonese

# Segment the sample sentence with PyCantonese using its default settings;
# the returned value is displayed as the cell output.
pycantonese.segment("兒子生性病母倍感安慰")

['兒子', '生性', '病', '母', '倍感', '安慰']

In [3]:
import jieba

# Segment the same sentence with jieba's default dictionary and collect the
# pieces into a list so the cell displays them.
list(jieba.cut("兒子生性病母倍感安慰"))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.730 seconds.
Prefix dict has been built successfully.


['兒子生', '性病', '母', '倍感', '安慰']

# Customize the PyCantonese Segmenter

In [4]:
from pycantonese.word_segmentation import Segmenter

# Build a segmenter that is told to accept "病母" as a word, then segment the
# same sentence with it (passed through the `cls` argument) to compare with
# the default result above.
segmenter = Segmenter(allow={"病母"})
pycantonese.segment("兒子生性病母倍感安慰", cls=segmenter)

['兒子', '生性', '病母', '倍感', '安慰']

# Customize Jieba Using 粵典詞表使用頻率 (words.hk Word-Frequency List)

In [5]:
import jieba
import pandas as pd
import requests
import json

In [6]:
def wget(url, encoding='utf8', timeout=30):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to fetch.
        encoding: Codec used to decode the raw response bytes.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Pass ``None`` to wait indefinitely (the
            previous behaviour).

    Returns:
        The response body decoded to ``str`` with ``encoding``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without a timeout, requests waits forever on an unresponsive server,
    # which would hang the notebook cell.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Decode the raw bytes explicitly so the caller-supplied encoding is used.
    return r.content.decode(encoding)

def download_word_frequency():
    """Download the words.hk word-count JSON and return it as a DataFrame.

    The payload is assumed to be a JSON object mapping each word to a single
    count value (TODO confirm against the live endpoint).

    Returns:
        A DataFrame indexed by word with one ``count`` column, sorted by
        ``count`` in descending order.
    """
    # words.hk (粵典) word-list usage frequency
    source_url = 'https://words.hk/faiman/analysis/existingwordcount.json'
    counts_by_word = json.loads(wget(url=source_url))
    # Build a single 'count' row with one column per word, then flip it so
    # that every word becomes a row.
    single_row = pd.DataFrame(counts_by_word, index=['count'])
    per_word = single_row.T
    return per_word.sort_values(by='count', ascending=False)
    
def save_word_frequency(df, filename):
    """Write the word counts in ``df`` to ``filename``, one entry per line.

    Each line has the form ``<word> <count>``. Asterisks are removed from
    every word first, and only words that still have more than one character
    afterwards are written.

    Args:
        df: DataFrame indexed by word (``str``) with a ``count`` column.
        filename: Path of the text file to create or overwrite.
    """
    # Write UTF-8 explicitly: the words are Chinese text, and the platform
    # default encoding may be unable to encode them or may produce a file
    # that a UTF-8 reader cannot load.
    with open(filename, "w", encoding="utf-8") as file:
        for word, count in df["count"].items():
            cleaned = word.replace("*", "")
            # Filter on the cleaned word so that stripping asterisks can
            # never leave a single-character (or empty) entry in the file.
            if len(cleaned) > 1:
                file.write(f'{cleaned} {count}\n')

In [7]:
# Download the 粵典 word counts and write them to a local dictionary file.
save_word_frequency(download_word_frequency(), "粵典_userdict.txt")

# Load the generated file into jieba as a user dictionary.
jieba.load_userdict("粵典_userdict.txt")

# Segment the sample sentence again, now with the extra entries loaded.
list(jieba.cut("兒子生性病母倍感安慰"))

['兒子', '生性', '病母', '倍感', '安慰']

# Other Examples

In [12]:
# Sample sentences used to compare the two segmenters side by side.
examples = [
    "又無珍珠又無波霸你點做生意架",
    "我要波波波波波波波波波波波霸",
    "就算你無你無你無你無珍珠奶茶",
    "咁你淨系俾波霸我都無所謂啦",
    "油尖旺金毛玲最怕有閃電",
    "再見不到也許人生少不免",
    "金毛玲何事秋風悲畫扇",
    "房租必須是一次性交一年的",
    "為食女立志做醫生",
    "梁烈唯封口交無線處理",
    "獅子山下體現香港精神",
    "我們中出了一個叛徒",
]

# Print both segmentations for each sentence, followed by a divider line.
for sentence in examples:
    pycantonese_words = pycantonese.segment(sentence)
    jieba_words = list(jieba.cut(sentence))
    print("PyCantonese:", pycantonese_words)
    print("      Jieba:", jieba_words)
    print('-' * 50)


PyCantonese: ['又', '無', '珍珠', '又', '無', '波霸', '你', '點做', '生意', '架']
      Jieba: ['又', '無', '珍珠', '又', '無波霸', '你點', '做生意', '架']
--------------------------------------------------
PyCantonese: ['我', '要', '波', '波', '波', '波', '波', '波', '波', '波', '波', '波', '波霸']
      Jieba: ['我要', '波波波', '波波', '波波波', '波波波', '霸']
--------------------------------------------------
PyCantonese: ['就算', '你', '無', '你', '無', '你', '無', '你', '無', '珍珠奶茶']
      Jieba: ['就算', '你', '無', '你', '無', '你', '無', '你', '無', '珍珠奶茶']
--------------------------------------------------
PyCantonese: ['咁', '你', '淨', '系', '俾', '波霸', '我', '都', '無所謂', '啦']
      Jieba: ['咁', '你', '淨系', '俾', '波霸', '我', '都', '無', '所謂', '啦']
--------------------------------------------------
PyCantonese: ['油尖旺', '金毛', '玲', '最', '怕', '有', '閃電']
      Jieba: ['油尖旺', '金毛', '玲', '最怕', '有', '閃電']
--------------------------------------------------
PyCantonese: ['再見', '不到', '也許', '人生', '少不免']
      Jieba: ['再見', '不到', '也許', '人生', '少不免']
-----------------------