# CRF-Cut: Sentence Segmentation
---
This notebook combine 3 datasets (ted, orchid and fake review) to train a model and validate separated datasets

The result of CRF-Cut is trained by datasets are as follows:

| dataset_train              | dataset_validate | E_f1-score |
|----------------------------|------------------|------------|
| Ted                        | Ted              | 0.72       |
| Orchid                     | Orchid           | 0.77       |
| Fake review                | Fake review      | 0.97       |
| Ted + Orchid + Fake review | Ted              | 0.72       |
| Ted + Orchid + Fake review | Orchid           | 0.69       |
| Ted + Orchid + Fake review | Fake review      | 0.97       |

We sample 25% from each dataset to train and validate because it does not have memory enough.

In [1]:
!cd data/checkpoint; unzip ted_fake.zip

The system cannot find the path specified.


In [2]:
#adapted from @bact at https://colab.research.google.com/drive/1hdtmwTXHLrqNmDhDqHnTQGpDVy1aJc4t
import json
import pandas as pd
import numpy as np
import re
import pycrfsuite
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from ast import literal_eval
from tqdm import tqdm
pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')

In [3]:
orchid = pd.read_csv('data/orchid_corpus/orchid97.crp.utf',sep='\t',header=None)
orchid.columns = ['text']
#remove weird words
orchid['first_char'] = orchid.text.map(lambda x: x[0])
orchid = orchid[(orchid.first_char!='%')&(orchid.first_char!='#')][['text']]
#get word,pos
orchid['word'] = orchid.text.map(lambda x: x.split('/')[0])
orchid['word'] = orchid.word.map(lambda x: ' ' if (x=='<space>')|(x=='') else x)
orchid['pos'] = orchid.text.map(lambda x: x.split('/')[1] if len(x.split('/'))==2 else None)
#labels
orchid['lab'] = orchid.apply(lambda row: 'E' if row['text']=='//' else 'I',1)
orchid = orchid[(orchid.lab=='E')|(~orchid.pos.isna())].reset_index(drop=True)

In [4]:
%%time
ted_all_sentences = np.load('data/checkpoint/ted-all-sentences.npy') 
fake_review_all_sentences = np.load('data/checkpoint/fake-review-all-sentences.npy') 

Wall time: 313 ms


In [5]:
# Sample from 3 datasets
np.random.seed(42)
ratio = .50
ted_sample = np.random.choice(ted_all_sentences, int(len(ted_all_sentences) * ratio))
orchid_sample = orchid.iloc[:int(len(orchid) * ratio)]
fake_review_sample = np.random.choice(fake_review_all_sentences, int(len(fake_review_all_sentences) * ratio))

In [6]:
print(f"Length of TED: {len(ted_sample)}")
print(f"Length of orchid: {len(orchid_sample)}")
print(f"Length of fake review: {len(fake_review_sample)}")

Length of TED: 771
Length of orchid: 182907
Length of fake review: 108741


In [7]:
def assign_word_lab(all_sentences):
    all_tuples = []
    for i in tqdm(range(len(all_sentences)), total=len(all_sentences)):
        tuples = []
        for s in all_sentences[i].split('|'):
            s_lst = word_tokenize(s)
            for j in range(len(s_lst)):
                lab = 'E' if j==len(s_lst)-1 else 'I'
                tuples.append((s_lst[j],lab))
        all_tuples.append(tuples)
    return all_tuples

In [8]:
%%time
ted_all_tuples = assign_word_lab(ted_sample)
orchid_all_tuples = [(row['word'],row['lab']) for i,row in orchid_sample.iterrows()]
fake_review_all_tuples = assign_word_lab(fake_review_sample)

100%|████████████████████████████████████████████████████████████████████████████████| 771/771 [00:19<00:00, 39.89it/s]
100%|████████████████████████████████████████████████████████████████████████| 108741/108741 [01:21<00:00, 1326.68it/s]

Wall time: 2min 2s





In [9]:
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","น่า", "จ้ะ","จ้า","จ๋า", "ขา","ฮะ", "ละครับ", "ละ","ละนะ" "ละค่ะ"#ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ", "ซิน่ะ",  "ซิ", "ซี","ซี่",
          "บ้าง","หมด","ทีเดียว","เดียว","เถอะ", "เถิด" ,"เถอะน่า", "เถอะน่ะ", "หรอก",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด",
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น"]

def extract_features(doc, window=2, max_n_gram=3):
    doc_features = []
    #paddings for word and POS
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    doc_ender = []
    doc_starter = []
    #add enders
    for i in range(len(doc)):
        if doc[i] in enders:
            doc_ender.append('ender')
        else:
            doc_ender.append('normal')
    #add starters
    for i in range(len(doc)):
        if doc[i] in starters:
            doc_starter.append('starter')
        else:
            doc_starter.append('normal')
    #for each word
    for i in range(window, len(doc)-window):
        #bias term
        word_features = ['bias'] 
        
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                ender_ =  f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f'ender_{feature_position}={ender_}']
                starter_ =  f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f'starter_{feature_position}={starter_}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [10]:
%%time
# ted
#target
ted_y = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    ted_y.append(temp)

#features
ted_x_pre = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    ted_x_pre.append(temp)
ted_x = []
for x_ in tqdm(ted_x_pre, total=len(ted_x_pre)):
    ted_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|██████████████████████████████████████████████████████████████████████████████| 771/771 [00:00<00:00, 3055.57it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 771/771 [00:00<00:00, 2518.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 771/771 [00:45<00:00, 16.85it/s]

Wall time: 46.3 s





In [11]:
%%time
# orchid
#target
orchid_y = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_y.append(l)
#features
orchid_x_pre = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_x_pre.append(w)
orchid_x = extract_features(orchid_x_pre, window=2, max_n_gram = 3) 

100%|█████████████████████████████████████████████████████████████████████| 182907/182907 [00:00<00:00, 1780557.77it/s]
100%|█████████████████████████████████████████████████████████████████████| 182907/182907 [00:00<00:00, 2736688.09it/s]


Wall time: 4.26 s


In [12]:
# fake review
#target
fake_review_y = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    fake_review_y.append(temp)

#features
fake_review_x_pre = []
for t in tqdm(fake_review_all_tuples, total=len(fake_review_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    fake_review_x_pre.append(temp)
fake_review_x = []
for x_ in tqdm(fake_review_x_pre, total=len(fake_review_x_pre)):
    fake_review_x.append(extract_features(x_, window=2, max_n_gram = 3))

100%|██████████████████████████████████████████████████████████████████████| 108741/108741 [00:01<00:00, 106998.64it/s]
100%|███████████████████████████████████████████████████████████████████████| 108741/108741 [00:02<00:00, 46995.77it/s]
100%|█████████████████████████████████████████████████████████████████████████| 108741/108741 [04:12<00:00, 429.97it/s]


In [13]:
# Split train and test set at 80/20 proportion
ted_x_train, ted_x_test, ted_y_train, ted_y_test = train_test_split(ted_x, ted_y, test_size=0.2, random_state=1412)
idx = int(len(orchid_x)*0.8)
orchid_x_train, orchid_x_test = orchid_x[:idx], orchid_x[idx:]
orchid_y_train, orchid_y_test = orchid_y[:idx], orchid_y[idx:]
fake_review_x_train, fake_review_x_test, fake_review_y_train, fake_review_y_test \
    = train_test_split(fake_review_x, fake_review_y, test_size=0.2, random_state=1412)

In [14]:
%%time
# Train model
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in tqdm(zip(ted_x_train, ted_y_train), total=len(ted_y_train)):
    trainer.append(xseq, yseq)
    
trainer.append(orchid_x_train, orchid_y_train)

for xseq, yseq in tqdm(zip(fake_review_x_train, fake_review_y_train), total=len(fake_review_y_train)):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1,
    'c2': 0,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('models/datasets-crf2.model')

100%|████████████████████████████████████████████████████████████████████████████████| 616/616 [01:35<00:00,  6.46it/s]
100%|███████████████████████████████████████████████████████████████████████████| 86992/86992 [06:51<00:00, 211.16it/s]


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 9959403
Seconds required: 103.151

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2839206.176629
Feature norm: 1.000000
Error norm: 2882790.820663
Active features: 2120671
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 20.293

***** Iteration #2 *****
Loss: 2486822.803932
Feature norm: 0.877192
Error norm: 2768189.985005
Active features: 1602963
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.973

***** Iteration #3 *****
Loss: 2367238.338858
Feature norm: 0.391730
Error norm: 8228260.903886
Active features: 604622
Line search trials: 3
Line search step: 0.2500

***** Iteration #40 *****
Loss: 246881.691297
Feature norm: 25.326125
Error norm: 69499.947237
Active features: 309151
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.797

***** Iteration #41 *****
Loss: 243434.367930
Feature norm: 26.512847
Error norm: 106923.754216
Active features: 303309
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.165

***** Iteration #42 *****
Loss: 238383.775667
Feature norm: 27.691559
Error norm: 71388.935672
Active features: 297255
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.844

***** Iteration #43 *****
Loss: 234500.547246
Feature norm: 29.021962
Error norm: 98057.630048
Active features: 293692
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.269

***** Iteration #44 *****
Loss: 229471.038845
Feature norm: 30.379953
Error norm: 61707.952362
Active features: 289843
Line search trials: 1
Line sea

***** Iteration #80 *****
Loss: 160790.032620
Feature norm: 136.751947
Error norm: 930.937580
Active features: 127475
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 11.631

***** Iteration #81 *****
Loss: 160535.815005
Feature norm: 137.211920
Error norm: 2754.220134
Active features: 126422
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 11.386

***** Iteration #82 *****
Loss: 160282.966344
Feature norm: 141.770084
Error norm: 24216.130643
Active features: 122772
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 11.593

***** Iteration #83 *****
Loss: 160058.011164
Feature norm: 142.317690
Error norm: 5850.721347
Active features: 123274
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 11.606

***** Iteration #84 *****
Loss: 159976.794646
Feature norm: 142.831911
Error norm: 1286.238363
Active features: 122792
Line search trials: 1
Line se

***** Iteration #120 *****
Loss: 157596.792675
Feature norm: 158.715537
Error norm: 1239.393744
Active features: 104265
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.450

***** Iteration #121 *****
Loss: 157581.184501
Feature norm: 158.766432
Error norm: 1838.146963
Active features: 104202
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.340

***** Iteration #122 *****
Loss: 157576.619907
Feature norm: 158.836014
Error norm: 1648.665270
Active features: 104076
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.688

***** Iteration #123 *****
Loss: 157562.314444
Feature norm: 158.930366
Error norm: 1970.694692
Active features: 103859
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.967

***** Iteration #124 *****
Loss: 157546.814720
Feature norm: 159.017856
Error norm: 2702.514115
Active features: 103754
Line search trials: 1
Li

***** Iteration #160 *****
Loss: 157386.745787
Feature norm: 160.172686
Error norm: 1020.698000
Active features: 101413
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.498

***** Iteration #161 *****
Loss: 157384.019164
Feature norm: 160.183751
Error norm: 839.445070
Active features: 101347
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.980

***** Iteration #162 *****
Loss: 157381.041701
Feature norm: 160.196470
Error norm: 1266.155218
Active features: 101273
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.498

***** Iteration #163 *****
Loss: 157378.681576
Feature norm: 160.212244
Error norm: 1145.531255
Active features: 101205
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.987

***** Iteration #164 *****
Loss: 157375.616360
Feature norm: 160.225083
Error norm: 875.923434
Active features: 101133
Line search trials: 1
Line s

***** Iteration #200 *****
Loss: 157311.089397
Feature norm: 160.757998
Error norm: 470.590416
Active features: 99734
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.964

***** Iteration #201 *****
Loss: 157309.998068
Feature norm: 160.775568
Error norm: 696.449100
Active features: 99720
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.138

***** Iteration #202 *****
Loss: 157308.669035
Feature norm: 160.800485
Error norm: 642.849495
Active features: 99693
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.470

***** Iteration #203 *****
Loss: 157307.466207
Feature norm: 160.832190
Error norm: 796.050534
Active features: 99653
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.051

***** Iteration #204 *****
Loss: 157306.297006
Feature norm: 160.849998
Error norm: 528.903083
Active features: 99614
Line search trials: 1
Line search s

***** Iteration #240 *****
Loss: 157277.401024
Feature norm: 161.674711
Error norm: 361.081800
Active features: 98605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.305

***** Iteration #241 *****
Loss: 157276.928827
Feature norm: 161.685686
Error norm: 394.026618
Active features: 98597
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.802

***** Iteration #242 *****
Loss: 157276.266714
Feature norm: 161.698002
Error norm: 511.278888
Active features: 98602
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.240

***** Iteration #243 *****
Loss: 157275.847928
Feature norm: 161.708670
Error norm: 652.487473
Active features: 98570
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.771

***** Iteration #244 *****
Loss: 157275.275615
Feature norm: 161.722547
Error norm: 590.848584
Active features: 98557
Line search trials: 1
Line search s

***** Iteration #279 *****
Loss: 157262.860019
Feature norm: 161.972039
Error norm: 413.221715
Active features: 98172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.888

***** Iteration #280 *****
Loss: 157262.583294
Feature norm: 161.976343
Error norm: 540.296259
Active features: 98169
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.969

***** Iteration #281 *****
Loss: 157262.339241
Feature norm: 161.983977
Error norm: 690.080171
Active features: 98150
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 12.020

***** Iteration #282 *****
Loss: 157262.061227
Feature norm: 161.987526
Error norm: 789.377187
Active features: 98126
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.347

***** Iteration #283 *****
Loss: 157261.686760
Feature norm: 161.994357
Error norm: 623.220375
Active features: 98110
Line search trials: 1
Line search s

***** Iteration #318 *****
Loss: 157251.372020
Feature norm: 162.154662
Error norm: 507.105604
Active features: 97942
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.383

***** Iteration #319 *****
Loss: 157251.122441
Feature norm: 162.162852
Error norm: 668.564715
Active features: 97929
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.945

***** Iteration #320 *****
Loss: 157250.777720
Feature norm: 162.162974
Error norm: 558.526552
Active features: 97926
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.777

***** Iteration #321 *****
Loss: 157250.512834
Feature norm: 162.172902
Error norm: 625.683450
Active features: 97944
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 10.106

***** Iteration #322 *****
Loss: 157250.076524
Feature norm: 162.175164
Error norm: 526.871424
Active features: 97919
Line search trials: 1
Line search st

***** Iteration #358 *****
Loss: 157241.483582
Feature norm: 162.252283
Error norm: 554.998888
Active features: 97770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.688

***** Iteration #359 *****
Loss: 157241.223716
Feature norm: 162.255189
Error norm: 230.621652
Active features: 97774
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.629

***** Iteration #360 *****
Loss: 157241.127712
Feature norm: 162.256627
Error norm: 585.522608
Active features: 97770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.605

***** Iteration #361 *****
Loss: 157240.982339
Feature norm: 162.260535
Error norm: 710.261156
Active features: 97764
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.613

***** Iteration #362 *****
Loss: 157240.604951
Feature norm: 162.261568
Error norm: 149.594156
Active features: 97774
Line search trials: 1
Line search step

***** Iteration #398 *****
Loss: 157234.425867
Feature norm: 162.333748
Error norm: 702.514690
Active features: 97620
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.521

***** Iteration #399 *****
Loss: 157234.177438
Feature norm: 162.336334
Error norm: 319.355529
Active features: 97615
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.430

***** Iteration #400 *****
Loss: 157234.145421
Feature norm: 162.337262
Error norm: 660.672257
Active features: 97611
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 9.615

L-BFGS terminated with the stopping criteria
Total seconds required for training: 4401.972

Storing the model
Number of active features: 97611 (9959403)
Number of active attributes: 58926 (9760738)
Number of active labels: 2 (2)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0

In [15]:
# ted
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf2.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(ted_x_test, total=len(ted_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in ted_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [00:11<00:00, 13.10it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.66      0.78      0.71     12670
           I       0.99      0.98      0.99    302593

    accuracy                           0.97    315263
   macro avg       0.82      0.88      0.85    315263
weighted avg       0.98      0.97      0.98    315263



In [16]:
# orchid
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
y_pred = tagger.tag(orchid_x_test)

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in orchid_y_test for tag in row])

print("Validate orchid dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

Validate orchid dataset
              precision    recall  f1-score   support

           E       0.73      0.59      0.65      2384
           I       0.97      0.99      0.98     34198

    accuracy                           0.96     36582
   macro avg       0.85      0.79      0.82     36582
weighted avg       0.96      0.96      0.96     36582



In [17]:
# fake review
# Predict (using test set)
tagger = pycrfsuite.Tagger()
tagger.open('models/datasets-crf.model')
# y_pred = [tagger.tag(xseq) for xseq in x_test]
y_pred = []
for xseq in tqdm(fake_review_x_test, total=len(fake_review_x_test)):
    y_pred.append(tagger.tag(xseq))

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in fake_review_y_test for tag in row])

print("Validate TED dataset")
print(classification_report(
    truths, predictions,
    target_names=["E", "I"]))

100%|███████████████████████████████████████████████████████████████████████████| 21749/21749 [00:50<00:00, 428.80it/s]


Validate TED dataset
              precision    recall  f1-score   support

           E       0.98      0.95      0.97     97081
           I       1.00      1.00      1.00   1320628

    accuracy                           1.00   1417709
   macro avg       0.99      0.98      0.98   1417709
weighted avg       1.00      1.00      1.00   1417709

