In [None]:
##########################################################
# Instructions:
#   (a) File extension of your input file must be ".zht.txt";
#   (b) Enter the "base name" of your zht document into the variable 'base':
#         e.g. If your file is 'abc.zht.txt', just enter 'abc'
#   (c) Input file must be encoded in UTF-8 (without BOM)
#   (d) Use UNIX newline characters (LF)
#
# These variables are read by the segmentation cell further down, so this
# cell must be run before it.
base      = "oldc3"
POS       = "yes" # generate part-of-speech (POS) tags: "yes" or "no"
user_dict = "no" # "yes" or "no": whether to use a user-defined dictionary
dict_file = "userdict.txt"
pos_names = None     # options: None, 'parent', 'child', or 'all'
                     # See http://pynlpir.readthedocs.io/en/latest/api.html
                     #   for the explanations
                     # Also consult "ICTPOS3.0.doc" for the POS tag definitions
# Output file is "<base>.zht.seg.pos.txt" when POS is "yes" and
# "<base>.zht.seg.txt" when POS is "no".
##########################################################

In [None]:
!python -m pip install pynlpir
!pynlpir update

In [None]:
"""
Updated on Fri 2017-06-04
Written for:
Translation Technology/Computer-Aided Translation
Graduate Program in Translation & Interpretation
National Taiwan University
Written by: Ruben Tsui (RubenTsui@gmail.com)

Note: This script performs the following:
    (1) Convert zht to zhs with OpenCC-Python (pure Python) available from
            https://github.com/yichen0831/opencc-python 
    (2) Segment zhs text with NLPIR:
            http://ictclas.nlpir.org/
        However, we use the Python wrapper available from:
            http://pynlpir.readthedocs.io/en/latest/
    (3) Tokenize original zht text based on the segmentation
        patterns of the converted (simplified) text from (2)
"""
import time

import pynlpir
from opencc import OpenCC


def _split_like(text, seg_tokens):
    """Cut `text` into consecutive pieces with the same lengths as `seg_tokens`.

    `seg_tokens` are the tokens NLPIR produced for the Simplified version of
    `text`. The i-th returned piece has len(seg_tokens[i]) characters, taken
    from `text` left to right. If `text` runs out early the remaining pieces
    are empty strings; characters left over after the last token are dropped.
    """
    pieces = []
    start = 0
    for tok in seg_tokens:
        end = start + len(tok)
        pieces.append(text[start:end])
        start = end
    return pieces


start_time = time.time()

# 't2s' converts Traditional Chinese to Simplified Chinese; the text is
# segmented in its Simplified form.
openCC = OpenCC('t2s')

# Normalise both yes/no switches the same way so that "Yes"/"YES" behave like
# "yes". Anything other than "yes" is treated as "no".
USERDICT = (user_dict.lower() == "yes")
USE_POS = (POS.lower() == "yes")

if USERDICT:
    # The user dictionary is converted to Simplified Chinese as well, because
    # that is the form of the text handed to the segmenter.
    userdict = dict_file
    userdict_zhs = userdict + ".zhs"
    with open(userdict, "r", encoding='utf-8') as udi, \
            open(userdict_zhs, 'w', encoding='utf-8', newline="\n") as udo:
        for entry in udi:
            udo.write(openCC.convert(entry.strip()) + "\n")

zht = base + ".zht.txt"
zht_seg = base + (".zht.seg.pos.txt" if USE_POS else ".zht.seg.txt")

sep = " "       # separator of Chinese tokens (space by default)
n = 0           # number of input lines read (blank lines included)
lines_seg = []  # last segmentation result; kept at module level because a
                # later cell displays it

# begin segmentation
pynlpir.open(encoding="utf-8")
try:
    if USERDICT:
        pynlpir.nlpir.ImportUserDict(bytes(userdict_zhs, 'utf-8'))

    with open(zht, "r", encoding='utf-8') as fit, \
            open(zht_seg, 'w', encoding='utf-8', newline="\n") as fot:
        for n, linet in enumerate(fit, start=1):
            # Strip once and use the SAME stripped string for conversion and
            # for slicing, so that token offsets line up even when the line
            # has leading whitespace.
            text_t = linet.strip()
            if text_t == '':  # blank line: keep it blank in the output
                fot.write("\n")
                continue

            # Assumes the t2s conversion keeps the character count unchanged,
            # so offsets in the Simplified text are valid for the Traditional
            # text too -- TODO confirm for the OpenCC config in use.
            lines = openCC.convert(text_t)
            lines_seg = pynlpir.segment(lines, pos_tagging=USE_POS,
                                        pos_names=pos_names)

            if USE_POS:
                # With POS tagging each item is a (token, tag) pair.
                words = [tok for tok, _ in lines_seg]
                pos_tags = [tag for _, tag in lines_seg]
                tokens = _split_like(text_t, words)
                tok_pos = ["{}_{}".format(x, y)
                           for x, y in zip(tokens, pos_tags)]
                fot.write(sep.join(tok_pos) + "\n")
            else:
                # Without POS tagging each item is the token string itself.
                tokens = _split_like(text_t, lines_seg)
                fot.write(sep.join(tokens) + "\n")
finally:
    # Always release the NLPIR handle, even if reading/segmenting failed.
    pynlpir.close()

print("All done! Total no. of lines processed = {num}.".format(num=n))
elapsed_time = round(time.time() - start_time, 2)
print("Elapsed time: {sec} sec".format(sec=elapsed_time))
if USE_POS:
    print("Your segmented file with POS tags is: '{f}'".format(f=zht_seg))
else:
    print("Your segmented file is: '{f}'".format(f=zht_seg))


In [None]:
# Debugging aid: display whatever `lines_seg` currently holds. The name is
# assigned by the segmentation cell, so that cell must have been run first.
lines_seg