In [1]:
##########################################################
# Configuration: type in the base name of your zht
# (Traditional Chinese) document.
# The input file must have the extension ".zht.txt" and
# must be encoded in UTF-8 (without BOM).
#
base      = r'songshu.all'
# "yes" or "no": generate part-of-speech (POS) tags.
# NOTE(review): the segmentation cell in this notebook always asks
# NLPIR for POS tags and does not read this flag -- confirm whether
# it is still needed.
POS       = "yes"
# "yes" or "no": whether to load a user-defined dictionary.
user_dict = "yes"
# User dictionary file (an alternative used previously: "userdict.txt").
dict_file = "DSSP.userdict.txt"
#
# The segmentation cell writes its result to
# base + ".zht.seg.pos.txt".
##########################################################

In [4]:
"""
Updated on Fri 2017-03-30
Written for:
Translation Technology
Graduate Program in Translation & Interpretation
National Taiwan University
Written by: RubenTsui@gmail.com
Note: This script performs the following:
    (1) Convert zht to zhs with OpenCC-Python (pure Python) available from
            https://github.com/yichen0831/opencc-python 
    (2) Segment zhs text with NLPIR:
            http://ictclas.nlpir.org/
        However, we use the Python wrapper available from:
            http://pynlpir.readthedocs.io/en/latest/
    (3) Tokenize the original zht text based on the segmentation
        patterns of the converted (simplified) text from (2)
"""
import time

import pynlpir
from opencc import OpenCC

start_time = time.time()

# 't2s' converts Traditional Chinese to Simplified Chinese; the text is
# converted before being handed to NLPIR for segmentation.
openCC = OpenCC('t2s')

# Only tokens carrying one of these POS tags are written as "word_tag";
# every other token is written bare.
TAGGED_POS = ('name', 'office', 'place', 'era')


def _align_tokens(text, segments):
    """Cut `text` into pieces whose lengths mirror the tokens in `segments`.

    Args:
        text: the original (Traditional Chinese) line, already stripped.
        segments: list of (token, pos) pairs produced by segmenting the
            converted (Simplified Chinese) version of `text`.

    Returns:
        A list of (piece_of_text, pos) pairs, one per entry of `segments`.

    NOTE(review): this assumes the converted text and the concatenated
    tokens have the same length as `text` -- confirm for the converter
    and segmenter versions in use.
    """
    aligned = []
    offset = 0
    for token, pos in segments:
        end = offset + len(token)
        aligned.append((text[offset:end], pos))
        offset = end
    return aligned


# Convert the user dictionary to Simplified Chinese so that its entries
# match the converted text that NLPIR will see.
USERDICT = (user_dict.lower() == "yes")
if USERDICT:
    userdict = dict_file
    userdict_zhs = userdict + ".zhs"
    with open(userdict, "r", encoding='utf-8') as udi, \
            open(userdict_zhs, 'w', encoding='utf-8', newline="\n") as udo:
        for linet in udi:
            udo.write(openCC.convert(linet.strip()) + "\n")

zht     = base + ".zht.txt"          # input: Traditional Chinese text
zht_seg = base + ".zht.seg.pos.txt"  # output: segmented text with POS tags

sep = " "  # separator of Chinese tokens (space by default)
n = 0      # number of input lines read (stays 0 for an empty file)

# begin segmentation
pynlpir.open(encoding="utf-8")
try:
    if USERDICT:
        pynlpir.nlpir.ImportUserDict(bytes(userdict_zhs, 'utf-8'))

    # `with` guarantees both files are closed even if segmentation fails
    # part-way through the input.
    with open(zht, "r", encoding='utf-8') as fit, \
            open(zht_seg, 'w', encoding='utf-8', newline="\n") as fot:
        for n, linet in enumerate(fit, start=1):
            # Strip once and use the same stripped text for both the
            # conversion and the slicing below; slicing the unstripped
            # line would shift every token when the line starts with
            # whitespace.
            text = linet.strip()
            if text == '':  # keep empty lines as empty lines
                fot.write("\n")
                continue

            lines = openCC.convert(text)
            # pos_names=None: keep the tag exactly as NLPIR reports it
            lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None)

            # Re-cut the zht text following the zhs segmentation pattern
            tok_pos = [
                f"{word}_{pos}" if pos in TAGGED_POS else word
                for word, pos in _align_tokens(text, lines_seg)
            ]
            fot.write(sep.join(tok_pos) + "\n")

            if n % 1000 == 0:
                print(f"processed {n} lines...")
finally:
    # Always release the NLPIR session, even after an error.
    pynlpir.close()

print("All done! Total no. of lines processed = {num}.".format(num=n))
elapsed_time = round(time.time() - start_time, 2)
print("Elapsed time: {sec} sec".format(sec=elapsed_time))
print("Your segmented file with POS tags is: '{f}'".format(f=zht_seg))


processed 1000 lines...
processed 2000 lines...
processed 3000 lines...
processed 4000 lines...
processed 5000 lines...
processed 6000 lines...
processed 7000 lines...
processed 8000 lines...
processed 9000 lines...
processed 10000 lines...
processed 11000 lines...
All done! Total no. of lines processed = 11656.
Elapsed time: 19.02 sec
Your segmented file with POS tags is: 'songshu.all.zht.seg.pos.txt'


In [44]:
# Debug probe: show the current value of the line counter `n` that the
# segmentation loop increments once per input line.
n

10711

In [23]:
# Debug probe: check that `lines` (the converted text of the most recently
# processed input line) is a str.
isinstance(lines, str)

True

In [36]:
# Manually close the segmented-output file handle `fot`.
# NOTE(review): presumably needed after an interrupted run of the
# segmentation cell -- confirm, then consider deleting this cell.
fot.close()

In [46]:
# Debug probe: show the current value of `linet`, the variable holding the
# input line in the segmentation loop.
linet

'\n'

In [None]:
# Debug probe: re-run NLPIR segmentation with POS tagging on `lines`, the
# converted text of the most recently processed input line.
lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None) 

In [32]:
# Debug probe: show the current value of `linet`, the variable holding the
# input line in the segmentation loop.
linet

'太宗遣荊州典籤邵宰乘驛還江陵，經過襄陽，袁顗馳書報琬，勸勿解甲，并奉表勸子勛即位。郢州承子勛初檄，及聞太宗定大事，即解甲下標。繼聞尋陽不息，而顗又響應，郢府行事錄事參軍荀卞之大懼，慮為琬所咎責，即遣諮議領中兵參軍鄭景玄率軍馳下，并送軍糧。琬乃稱說符瑞，造乘輿御服，云松滋縣生豹自來，柴桑縣送竹有「來奉天子」字，又云青龍見東淮，白鹿出西岡。令顧昭之撰為瑞命記。立宗廟，設壇埸，矯作崇憲太后璽，令羣僚上偽號於子勛。泰始二年正月七日，即位於尋陽城，改景和二年為義嘉元年。以安陸王子綏為司徒、驃騎將軍、揚州刺史，尋陽王子房車騎將軍，臨海王子頊衞將軍，並開府儀同三司。邵陵王子元撫軍將軍。其日雲雨晦合，行禮忘稱萬歲。取子勛所乘車，除脚以為輦，置偽殿之西，其夕有鳩棲其中，鴞集其幰。又有禿鶖集城上。子綏拜司徒日，雷電晦冥，震其黃閤柱，鴟尾墮地，又有鴟棲其帳上。以鄧琬為左將軍、尚書右僕射，張悅領軍將軍、吏部尚書，征虜將軍如故，進袁顗號安北將軍，加尚書左僕射。臨川內史張淹為侍中。府主簿顧昭之、武昌太守劉弼並為黃門侍郎，廬江太守王子仲委郡奔尋陽，亦為黃門侍郎。鄱陽內史丘景先，廬陵內史殷損、西陽太守謝稚、後軍府記室參軍孫詵、長沙內史孔靈產、參軍事沈伯玉、荀道林並為中書侍郎。荀卞之為尚書左丞，府主簿江乂為右丞。府主簿蕭寶欣為通直郎。琬大息粹、悅息洵並正員郎，粹領衞尉，洵弟洌司徒主簿。建武將軍、領軍主、晉熙太守閻湛之加寧朔將軍。廬陵內史王僧胤為祕書丞。桂陽太守劉卷為尚書殿中郎。褚靈嗣、潘欣之、沈光祖，中書通事舍人。餘諸州郡，並加爵號。\n'

In [48]:
import pandas as pd

In [49]:
# Load the test spreadsheet into a DataFrame and display it.
# The displayed frame has the columns `phrase` and `type`.
df = pd.read_excel('sort_test.xlsx')
df

Unnamed: 0,phrase,type
0,德輿,name
1,寄奴,name
2,漢高帝,name
3,綏輿里,place
4,楚元王,office
5,晉哀帝,name
6,隆安,era
7,安帝,name
8,會稽,place
9,衞將軍,office


In [54]:
# Order the rows by the character length of `phrase`, shortest first.
phrase_len = df["phrase"].str.len()
s = phrase_len.sort_values().index  # row labels in length order
df = df.reindex(index=s)
df

Unnamed: 0,phrase,type
6,隆安,era
10,謝琰,name
7,安帝,name
1,寄奴,name
0,德輿,name
8,會稽,place
5,晉哀帝,name
2,漢高帝,name
9,衞將軍,office
4,楚元王,office


In [55]:
# Order the rows by `type`.
# A stable algorithm is requested explicitly: the default 'quicksort' gives
# no guarantee about the relative order of rows with the same `type`, so any
# ordering established beforehand (e.g. by phrase length) could be scrambled
# within each type.
df = df.sort_values('type', ascending=True, kind='mergesort')
df

Unnamed: 0,phrase,type
6,隆安,era
10,謝琰,name
7,安帝,name
1,寄奴,name
0,德輿,name
5,晉哀帝,name
2,漢高帝,name
9,衞將軍,office
4,楚元王,office
8,會稽,place
