# CRF-Cut: Sentence Segmentation
---
This notebook combine 3 datasets (ted, orchid and fake review) to train a model and validate separated datasets

The result of CRF-Cut is trained by datasets are as follows:

| dataset_train              | dataset_validate | E_f1-score |
|----------------------------|------------------|------------|
| Ted                        | Ted              | 0.72       |
| Orchid                     | Orchid           | 0.77       |
| Fake review                | Fake review      | 0.97       |
| Ted + Orchid + Fake review | Ted              | 0.72       |
| Ted + Orchid + Fake review | Orchid           | 0.69       |
| Ted + Orchid + Fake review | Fake review      | 0.97       |

We sample 25% from each dataset to train and validate because it does not have memory enough.

In [1]:
!cd data/checkpoint; unzip ted_fake.zip

The system cannot find the path specified.


In [2]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import json
import pandas as pd
import numpy as np
import re
import pycrfsuite
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from ast import literal_eval
from tqdm import tqdm
pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')

In [3]:
orchid = pd.read_csv('data/orchid_corpus/orchid97.crp.utf',sep='\t',header=None)
orchid.columns = ['text']
#remove weird words
orchid['first_char'] = orchid.text.map(lambda x: x[0])
orchid = orchid[(orchid.first_char!='%')&(orchid.first_char!='#')][['text']]
#get word,pos
orchid['word'] = orchid.text.map(lambda x: x.split('/')[0])
orchid['word'] = orchid.word.map(lambda x: ' ' if (x=='<space>')|(x=='') else x)
orchid['pos'] = orchid.text.map(lambda x: x.split('/')[1] if len(x.split('/'))==2 else None)
#labels
orchid['lab'] = orchid.apply(lambda row: 'E' if row['text']=='//' else 'I',1)
orchid = orchid[(orchid.lab=='E')|(~orchid.pos.isna())].reset_index(drop=True)

In [4]:
%%time
ted_all_sentences = np.load('data/checkpoint/ted-all-sentences.npy') 
fake_review_all_sentences = np.load('data/checkpoint/fake-review-all-sentences.npy') 

Wall time: 503 ms


In [5]:
# Sample from 3 datasets
np.random.seed(42)
ratio = .40
ted_sample = np.random.choice(ted_all_sentences, int(len(ted_all_sentences) * ratio))
orchid_sample = orchid.iloc[:int(len(orchid) * ratio)]
fake_review_sample = np.random.choice(fake_review_all_sentences, int(len(fake_review_all_sentences) * ratio))

In [6]:
print(f"Length of TED: {len(ted_sample)}")
print(f"Length of orchid: {len(orchid_sample)}")
print(f"Length of fake review: {len(fake_review_sample)}")

Length of TED: 617
Length of orchid: 146325
Length of fake review: 86993


In [7]:
def assign_word_lab(all_sentences):
    all_tuples = []
    for i in tqdm(range(len(all_sentences)), total=len(all_sentences)):
        tuples = []
        for s in all_sentences[i].split('|'):
            s_lst = word_tokenize(s)
            for j in range(len(s_lst)):
                lab = 'E' if j==len(s_lst)-1 else 'I'
                tuples.append((s_lst[j],lab))
        all_tuples.append(tuples)
    return all_tuples

In [8]:
%%time
ted_all_tuples = assign_word_lab(ted_sample)
orchid_all_tuples = [(row['word'],row['lab']) for i,row in orchid_sample.iterrows()]
fake_review_all_tuples = assign_word_lab(fake_review_sample)

100%|████████████████████████████████████████████████████████████████████████████████| 617/617 [00:12<00:00, 50.24it/s]
100%|██████████████████████████████████████████████████████████████████████████| 86993/86993 [00:58<00:00, 1487.10it/s]

Wall time: 1min 25s





In [9]:
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","น่า", "จ้ะ","จ้า","จ๋า", "ขา","ฮะ", "ละครับ", "ละ","ละนะ" "ละค่ะ"#ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ", "ซิน่ะ",  "ซิ", "ซี","ซี่",
          "บ้าง","หมด","ทีเดียว","เดียว","เถอะ", "เถิด" ,"เถอะน่า", "เถอะน่ะ", "หรอก",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด",
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น"]

def extract_features(doc, window=2, max_n_gram=3):
    doc_features = []
    #paddings for word and POS
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    doc_ender = []
    doc_starter = []
    #add enders
    for i in range(len(doc)):
        if doc[i] in enders:
            doc_ender.append('ender')
        else:
            doc_ender.append('normal')
    #add starters
    for i in range(len(doc)):
        if doc[i] in starters:
            doc_starter.append('starter')
        else:
            doc_starter.append('normal')
    #for each word
    for i in range(window, len(doc)-window):
        #bias term
        word_features = ['bias'] 
        
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                ender_ =  f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f'ender_{feature_position}={ender_}']
                starter_ =  f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f'starter_{feature_position}={starter_}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [10]:
%%time
# ted
#target
ted_y = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    ted_y.append(temp)

#features
ted_x_pre = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    ted_x_pre.append(temp)
ted_x = []
for x_ in tqdm(ted_x_pre, total=len(ted_x_pre)):
    ted_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|██████████████████████████████████████████████████████████████████████████████| 617/617 [00:00<00:00, 3964.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 617/617 [00:00<00:00, 3877.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 617/617 [00:29<00:00, 21.23it/s]

Wall time: 29.4 s





In [11]:
%%time
# orchid
#target
orchid_y = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_y.append(l)
#features
orchid_x_pre = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_x_pre.append(w)
orchid_x = extract_features(orchid_x_pre, window=2, max_n_gram = 3) 

100%|█████████████████████████████████████████████████████████████████████| 146325/146325 [00:00<00:00, 2328860.27it/s]
100%|█████████████████████████████████████████████████████████████████████| 146325/146325 [00:00<00:00, 2256200.03it/s]


Wall time: 3.35 s


In [12]:
# fake review
#target
fake_review_y = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    fake_review_y.append(temp)

#features
fake_review_x_pre = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    fake_review_x_pre.append(temp)
fake_review_x = []
for x_ in tqdm(fake_review_x_pre, total=len(fake_review_x_pre)):
    fake_review_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|█████████████████████████████████████████████████████████████████████████| 86993/86993 [00:01<00:00, 55274.92it/s]
100%|████████████████████████████████████████████████████████████████████████| 86993/86993 [00:00<00:00, 103225.29it/s]
100%|███████████████████████████████████████████████████████████████████████████| 86993/86993 [02:11<00:00, 661.74it/s]


In [13]:
# Split train and test set at 80/20 proportion
ted_x_train, ted_x_test, ted_y_train, ted_y_test = train_test_split(ted_x, ted_y, test_size=0.2, random_state=1412)
idx = int(len(orchid_x)*0.8)
orchid_x_train, orchid_x_test = orchid_x[:idx], orchid_x[idx:]
orchid_y_train, orchid_y_test = orchid_y[:idx], orchid_y[idx:]
fake_review_x_train, fake_review_x_test, fake_review_y_train, fake_review_y_test \
    = train_test_split(fake_review_x, fake_review_y, test_size=0.2, random_state=1412)

In [14]:
%%time
# Train model
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in tqdm(zip(ted_x_train, ted_y_train), total=len(ted_y_train)):
    trainer.append(xseq, yseq)
    
trainer.append(orchid_x_train, orchid_y_train)

for xseq, yseq in tqdm(zip(fake_review_x_train, fake_review_y_train), total=len(fake_review_y_train)):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1,
    'c2': 0,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('models/datasets-crf2.model')

100%|████████████████████████████████████████████████████████████████████████████████| 493/493 [00:48<00:00, 10.23it/s]
100%|███████████████████████████████████████████████████████████████████████████| 69594/69594 [04:53<00:00, 237.16it/s]


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 8625929
Seconds required: 85.691

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2267068.948768
Feature norm: 1.000000
Error norm: 2301725.534986
Active features: 1739954
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 19.158

***** Iteration #2 *****
Loss: 1985409.774762
Feature norm: 0.877057
Error norm: 2210195.890448
Active features: 1308541
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.457

***** Iteration #3 *****
Loss: 1892598.892880
Feature norm: 0.391655
Error norm: 6587168.652968
Active features: 495670
Line search trials: 3
Line search step: 0.25000

***** Iteration #42 *****
Loss: 186820.107672
Feature norm: 28.028682
Error norm: 49257.734551
Active features: 240443
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.038

***** Iteration #43 *****
Loss: 183324.393825
Feature norm: 29.948303
Error norm: 74983.756697
Active features: 236739
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.044

***** Iteration #44 *****
Loss: 178623.656556
Feature norm: 31.362475
Error norm: 36220.599678
Active features: 231744
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.045

***** Iteration #45 *****
Loss: 174921.590399
Feature norm: 33.795208
Error norm: 75704.555730
Active features: 225762
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.901

***** Iteration #46 *****
Loss: 170180.311830
Feature norm: 35.887530
Error norm: 38760.791812
Active features: 221645
Line search trials: 1
Line sear

***** Iteration #81 *****
Loss: 129390.697835
Feature norm: 127.202524
Error norm: 4727.058048
Active features: 96179
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.544

***** Iteration #82 *****
Loss: 129279.953045
Feature norm: 128.207909
Error norm: 2568.139618
Active features: 95224
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.742

***** Iteration #83 *****
Loss: 129170.006841
Feature norm: 129.296368
Error norm: 2833.608242
Active features: 94642
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.122

***** Iteration #84 *****
Loss: 129077.403791
Feature norm: 130.126987
Error norm: 1713.864304
Active features: 93993
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.530

***** Iteration #85 *****
Loss: 128994.041234
Feature norm: 130.919907
Error norm: 2627.321867
Active features: 93578
Line search trials: 1
Line search ste

***** Iteration #121 *****
Loss: 128163.184802
Feature norm: 139.319032
Error norm: 1081.303935
Active features: 83859
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.028

***** Iteration #122 *****
Loss: 128158.667945
Feature norm: 139.380487
Error norm: 1329.308807
Active features: 83754
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.623

***** Iteration #123 *****
Loss: 128154.026771
Feature norm: 139.430172
Error norm: 1176.357886
Active features: 83664
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.883

***** Iteration #124 *****
Loss: 128149.326140
Feature norm: 139.482921
Error norm: 1336.876706
Active features: 83563
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.122

***** Iteration #125 *****
Loss: 128144.941057
Feature norm: 139.530072
Error norm: 1225.456557
Active features: 83468
Line search trials: 1
Line search

***** Iteration #160 *****
Loss: 128066.697296
Feature norm: 140.205619
Error norm: 615.568041
Active features: 82127
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.973

***** Iteration #161 *****
Loss: 128066.582630
Feature norm: 140.224143
Error norm: 1204.843290
Active features: 82117
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.244

***** Iteration #162 *****
Loss: 128064.796927
Feature norm: 140.242209
Error norm: 511.193337
Active features: 82069
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.644

***** Iteration #163 *****
Loss: 128064.461258
Feature norm: 140.264275
Error norm: 897.043757
Active features: 82035
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.265

***** Iteration #164 *****
Loss: 128062.327885
Feature norm: 140.281754
Error norm: 424.216605
Active features: 82044
Line search trials: 1
Line search st

***** Iteration #199 *****
Loss: 128030.992357
Feature norm: 140.748551
Error norm: 539.408837
Active features: 81468
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.988

***** Iteration #200 *****
Loss: 128030.354775
Feature norm: 140.761304
Error norm: 970.384294
Active features: 81463
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.052

***** Iteration #201 *****
Loss: 128029.598508
Feature norm: 140.772301
Error norm: 345.474943
Active features: 81446
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.955

***** Iteration #202 *****
Loss: 128028.997873
Feature norm: 140.788504
Error norm: 586.967291
Active features: 81437
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.492

***** Iteration #203 *****
Loss: 128028.217957
Feature norm: 140.798046
Error norm: 473.240074
Active features: 81437
Line search trials: 1
Line search ste

***** Iteration #241 *****
Loss: 128012.046444
Feature norm: 141.067444
Error norm: 122.524589
Active features: 80904
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.276

***** Iteration #242 *****
Loss: 128011.611732
Feature norm: 141.070543
Error norm: 347.757952
Active features: 80899
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.592

***** Iteration #243 *****
Loss: 128011.225634
Feature norm: 141.075751
Error norm: 727.187076
Active features: 80868
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 8.717

***** Iteration #244 *****
Loss: 128010.806333
Feature norm: 141.078413
Error norm: 323.639193
Active features: 80866
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.109

***** Iteration #245 *****
Loss: 128010.459189
Feature norm: 141.080398
Error norm: 343.243951
Active features: 80864
Line search trials: 1
Line search st

***** Iteration #283 *****
Loss: 128001.285086
Feature norm: 141.285749
Error norm: 289.733726
Active features: 80427
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.711

***** Iteration #284 *****
Loss: 128001.186948
Feature norm: 141.294661
Error norm: 474.772783
Active features: 80425
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.742

***** Iteration #285 *****
Loss: 128000.792934
Feature norm: 141.301681
Error norm: 467.594472
Active features: 80414
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.280

***** Iteration #286 *****
Loss: 128000.508405
Feature norm: 141.307781
Error norm: 293.114482
Active features: 80402
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.141

***** Iteration #287 *****
Loss: 128000.200785
Feature norm: 141.314492
Error norm: 407.329727
Active features: 80402
Line search trials: 1
Line search st

***** Iteration #323 *****
Loss: 127990.921765
Feature norm: 141.476726
Error norm: 199.476948
Active features: 80235
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.450

***** Iteration #324 *****
Loss: 127990.893338
Feature norm: 141.476928
Error norm: 697.595708
Active features: 80221
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 28.263

***** Iteration #325 *****
Loss: 127990.673912
Feature norm: 141.477656
Error norm: 292.370647
Active features: 80227
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.272

***** Iteration #326 *****
Loss: 127990.614681
Feature norm: 141.478463
Error norm: 690.877844
Active features: 80220
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.282

***** Iteration #327 *****
Loss: 127990.386277
Feature norm: 141.479075
Error norm: 198.464362
Active features: 80217
Line search trials: 1
Line search st

In [15]:
# ted
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf2.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(ted_x_test, total=len(ted_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in ted_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:09<00:00, 13.51it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.65      0.75      0.70     10261
           I       0.99      0.98      0.99    238204

    accuracy                           0.97    248465
   macro avg       0.82      0.87      0.84    248465
weighted avg       0.98      0.97      0.97    248465



In [16]:
# orchid
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
y_pred = tagger.tag(orchid_x_test)

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in orchid_y_test for tag in row])

print("Validate orchid dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

Validate orchid dataset
              precision    recall  f1-score   support

           E       0.76      0.65      0.70      1729
           I       0.98      0.99      0.98     27536

    accuracy                           0.97     29265
   macro avg       0.87      0.82      0.84     29265
weighted avg       0.97      0.97      0.97     29265



In [17]:
# fake review
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(fake_review_x_test, total=len(fake_review_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in fake_review_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|███████████████████████████████████████████████████████████████████████████| 17399/17399 [00:42<00:00, 407.23it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.98      0.96      0.97     77577
           I       1.00      1.00      1.00   1056133

    accuracy                           1.00   1133710
   macro avg       0.99      0.98      0.98   1133710
weighted avg       1.00      1.00      1.00   1133710

