# Chinese Word Segmentation

The training/test data is publicly avaialbe here: http://sighan.cs.uchicago.edu/bakeoff2005/

## Prepare Training Data

In [1]:
raw_train = []
raw_test = []
with open("data/as_training.utf8", encoding="utf8") as fin:
    for line in fin:
        raw_train.append(line.strip().split("　"))   # It is a full white space

with open("data/as_testing_gold.utf8", encoding="utf8") as fin:
    for line in fin:
        raw_test.append(line.strip().split("　"))   # It is a full white space

print("Number of sentences in the training data: %d" % len(raw_train))
print("Number of sentences in the test data: %d" % len(raw_test))


Number of sentences in the training data: 708953
Number of sentences in the test data: 14432


## Use jieba

In [2]:
import jieba

print(list(jieba.cut("".join(raw_test[0]))))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 0.564 seconds.
Prefix dict has been built successfully.


['許多', '社區長', '青學苑', '多', '開設', '有書法', '、', '插花', '、', '土風', '舞班', '，']


# Convert Traditional Chinese characters to Simplified

In [3]:
from hanziconv.hanziconv import HanziConv

print(list(jieba.cut(HanziConv.toSimplified("".join(raw_test[0])))))

['许多', '社区', '长青', '学苑', '多', '开设', '有', '书法', '、', '插花', '、', '土风舞', '班', '，']


# Split sentence to tokens

In [4]:
def restore(text, toks):
    results = []
    offset = 0
    for tok in toks:
        results.append(text[offset:offset + len(tok)])
        offset += len(tok)
    return results

text = "".join(raw_test[0])
print(text)
print(restore(text, list(jieba.cut(HanziConv.toSimplified(text)))))

許多社區長青學苑多開設有書法、插花、土風舞班，
['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞', '班', '，']


## Build Our Own Model

### Convert a list of words to a sequence of tags


In [5]:
def words_to_tags(words):
    tags = []
    for word in words:
        if len(word) == 1:
            tags.append('S')
        else:
            for i in range(len(word)):
                if i == 0:
                    tags.append('L')
                elif i == len(word) - 1:
                    tags.append('R')
                else:
                    tags.append('M')
    return tags
    
train_X = []
train_Y = []

test_X = []
test_Y = []

for sent in raw_train:
    train_X.append(list("".join(sent)))  # Make the unsegmented sentence as a sequence of characters
    train_Y.append(words_to_tags(sent))
    
for sent in raw_test:
    test_X.append(list("".join(sent)))  # Make the unsegmented sentence
    test_Y.append(words_to_tags(sent))
    
print(test_X[0])
print(test_Y[0])


['許', '多', '社', '區', '長', '青', '學', '苑', '多', '開', '設', '有', '書', '法', '、', '插', '花', '、', '土', '風', '舞', '班', '，']
['L', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'R', 'S', 'L', 'M', 'M', 'R', 'S']


## Create a CRF model for word segmentation 

###  add word position information and bigram trigram features

In [6]:
def my_feature(sent, position):
    feature_dict = {'bias' : 1,
                    'if_start_word' : 1 if position==0 else 0,
                    'if_end_word' : 1 if position==len(sent)-1 else 0,
                    'bigram_start_with_x': ''.join(sent[position:position+2]),
                    'bigram_end_with_x': ''.join(sent[position-1:position+1]) if position > 0 else '<SOS>'+sent[position] ,
                    'trigram_strat_with_x': ''.join(sent[position:position+3]),
                    'trigram_middle_with_x': ''.join(sent[position-1:position+2]) if position >0 else '<SOS>'+''.join(sent[position:position+2])
                   }
    if position > 1 :
         feature_dict['trigram_end_with_x'] = ''.join(sent[position-2:position+1]) 
    elif position > 0 :
        feature_dict['trigram_end_with_x'] = '<SOS>' + ''.join(sent[position-1:position+1])
    else :
        feature_dict['trigram_end_with_x'] = '<SOS>' + '<2OS>' + sent[position]
    return feature_dict


# Collect surround word and add above features 

In [7]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def extract_sent_features(x):
    sent_features = []
    for i in range(len(x)):
        tmp = {}
        tmp.update(extract_char_features(x, i))
        tmp.update(my_feature(x, i))
        sent_features.append(tmp)    
     
    return sent_features

  
def extract_char_features(sent, position):
    char_features = {}
    for i in range(-3, 4):
        if len(sent) > position + i >= 0:
            char_features['char_at_%d' % i] = sent[position + i]
    return char_features

crf_tagger = sklearn_crfsuite.CRF(algorithm='lbfgs', min_freq=20, max_iterations=300, verbose=True)

In [8]:
from tqdm.auto import tqdm

In [9]:
extract_sent_features(train_X[0])

[{'char_at_0': '時',
  'char_at_1': '間',
  'char_at_2': '：',
  'bias': 1,
  'if_start_word': 1,
  'if_end_word': 0,
  'bigram_start_with_x': '時間',
  'bigram_end_with_x': '<SOS>時',
  'trigram_strat_with_x': '時間：',
  'trigram_middle_with_x': '<SOS>時間',
  'trigram_end_with_x': '<SOS><2OS>時'},
 {'char_at_-1': '時',
  'char_at_0': '間',
  'char_at_1': '：',
  'bias': 1,
  'if_start_word': 0,
  'if_end_word': 0,
  'bigram_start_with_x': '間：',
  'bigram_end_with_x': '時間',
  'trigram_strat_with_x': '間：',
  'trigram_middle_with_x': '時間：',
  'trigram_end_with_x': '<SOS>時間'},
 {'char_at_-2': '時',
  'char_at_-1': '間',
  'char_at_0': '：',
  'bias': 1,
  'if_start_word': 0,
  'if_end_word': 1,
  'bigram_start_with_x': '：',
  'bigram_end_with_x': '間：',
  'trigram_strat_with_x': '：',
  'trigram_middle_with_x': '間：',
  'trigram_end_with_x': '時間：'}]

In [10]:
feature_X = []
for x in tqdm(train_X):
    feature_X.append(extract_sent_features(x))
crf_tagger.fit(feature_X, train_Y)

HBox(children=(IntProgress(value=0, max=708953), HTML(value='')))




loading training data to CRFsuite: 100%|█████████████████████████████████████| 708953/708953 [03:23<00:00, 3475.78it/s]



Feature generation
type: CRF1d
feature.minfreq: 20.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 289847
Seconds required: 54.157

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=8.62  loss=9268754.59 active=289679 feature_norm=1.00
Iter 2   time=4.36  loss=7087119.33 active=289847 feature_norm=2.72
Iter 3   time=4.27  loss=6792615.70 active=289847 feature_norm=4.77
Iter 4   time=4.29  loss=5857999.87 active=289847 feature_norm=5.28
Iter 5   time=8.52  loss=5125039.60 active=289847 feature_norm=6.27
Iter 6   time=8.63  loss=4400907.48 active=289847 feature_norm=7.50
Iter 7   time=8.61  loss=3888464.28 active=289847 feature_norm=9.20
Iter 8   time=4.51  loss=3496175.15 active=289847 feature_norm=11.64
Iter 9   time=4.28  loss=3036294.16 active=289847 f

Iter 231 time=4.29  loss=553674.15 active=289847 feature_norm=240.91
Iter 232 time=4.48  loss=552944.68 active=289847 feature_norm=241.08
Iter 233 time=4.22  loss=551903.46 active=289847 feature_norm=241.28
Iter 234 time=4.45  loss=551159.10 active=289847 feature_norm=241.36
Iter 235 time=4.31  loss=550774.49 active=289847 feature_norm=241.40
Iter 236 time=4.38  loss=550128.56 active=289847 feature_norm=241.48
Iter 237 time=4.29  loss=549170.11 active=289847 feature_norm=241.72
Iter 238 time=4.40  loss=548250.87 active=289847 feature_norm=241.90
Iter 239 time=4.26  loss=547882.65 active=289847 feature_norm=241.89
Iter 240 time=4.26  loss=547533.59 active=289847 feature_norm=241.91
Iter 241 time=8.44  loss=547371.02 active=289847 feature_norm=241.95
Iter 242 time=4.26  loss=547025.73 active=289847 feature_norm=241.99
Iter 243 time=4.32  loss=545617.16 active=289847 feature_norm=242.18
Iter 244 time=4.28  loss=544725.21 active=289847 feature_norm=242.35
Iter 245 time=4.26  loss=543863.77

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
    max_linesearch=None, min_freq=20, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=True)

In [11]:
def segment(sent):
    tags = crf_tagger.predict_single(extract_sent_features(list(sent)))
    tokens = []
    tok = ""
    for ch, tag in zip(list(sent), tags):
        if tag in ['S', 'L'] and tok != "":
            tokens.append(tok)
            tok = ""
        tok += ch
    if tok:
        tokens.append(tok)
    return tokens
            
print(segment("法國總統馬克宏已到現場勘災，初步傳出火警可能與目前聖母院的維修工程有關。"))
    

['法國', '總統', '馬克宏', '已', '到', '現場', '勘災', '，', '初步', '傳出', '火警', '可能', '與', '目前', '聖母院', '的', '維修', '工程', '有關', '。']


## Evaluation

### Scorer for CWS

In [12]:
def compare(actual_toks, pred_toks):
    i = 0
    j = 0
    p = 0
    q = 0
    tp = 0
    fp = 0
    while i < len(actual_toks) and j < len(pred_toks):
        if p == q:
            if actual_toks[i] == pred_toks[j]:
                tp += 1
            else:
                fp += 1
            p += len(actual_toks[i])
            q += len(pred_toks[j])
            i += 1
            j += 1
        elif p < q:
            p += len(actual_toks[i])
            i += 1
        else:
            fp += 1
            q += len(pred_toks[j])
            j += 1
    return tp, fp, len(actual_toks)
    
def score(actual_sents, pred_sents):
    tp = 0
    fp = 0
    total = 0
    for actual_toks, pred_toks in zip(actual_sents, pred_sents):
        tp_, fp_, total_ = compare(actual_toks, pred_toks)
        tp += tp_
        fp += fp_
        total += total_
    recall = float(tp) / total
    precision = float(tp) / (tp + fp)
    f1 = 2.0 * recall * precision / (recall + precision)
    return recall, precision, f1        

# Testing
# My model acheived F-score up to  0.93

In [13]:
pred = []
actual = []
for sent in raw_test:
    pred.append(segment("".join(sent)))
    actual.append(sent)
print(actual[0])
print(pred[0])

print(score(actual, pred))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
['許多', '社區', '長青', '學苑', '多', '開', '設有', '書法', '、', '插花', '、', '土風舞班', '，']
(0.9362155020551967, 0.9246997414272251, 0.9304219906872698)


# Compared with jieba F-score 0.82

In [14]:
pred = []
actual = []
fout = open("jieba.out", "w")
for sent in raw_test:
    text = "".join(sent)
    r = list(jieba.cut(HanziConv.toSimplified(text)))
    r = restore(text, r)
    fout.write(" ".join(r) + "\n")
    pred.append(r)
    actual.append(sent)
print(actual[0])
print(pred[0])

print(score(actual, pred))

['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞班', '，']
['許多', '社區', '長青', '學苑', '多', '開設', '有', '書法', '、', '插花', '、', '土風舞', '班', '，']
(0.8149262738957396, 0.8293466352378739, 0.8220732208967503)
