In [1]:
!pip install sklearn-crfsuite pythainlp

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting pythainlp
  Downloading pythainlp-3.0.5-py3-none-any.whl (11.5 MB)
[K     |████████████████████████████████| 11.5 MB 4.3 MB/s 
[?25hCollecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 46.7 MB/s 
Collecting tinydb>=3.0
  Downloading tinydb-4.6.1-py3-none-any.whl (24 kB)
Installing collected packages: tinydb, python-crfsuite, sklearn-crfsuite, pythainlp
Successfully installed pythainlp-3.0.5 python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tinydb-4.6.1


In [2]:
import json
import pandas as pd
import numpy as np
import re
# import pycrfsuite
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from ast import literal_eval
from tqdm import tqdm

from itertools import chain

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')


In [3]:
!tar -xvf /content/drive/MyDrive/SuperAI_NLP/AIFORTHAI-LST20Corpus.tar.gz

tar: /content/drive/MyDrive/SuperAI_NLP/AIFORTHAI-LST20Corpus.tar.gz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


## Load and Prepare Dataset

Using the LST20 dataset, we combine train and test into one training data and leave validation data for validation.

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 4.1 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 66.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 84.0 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 79.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.9 MB/s 
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
[K     |████████████████████████████████| 94 k

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset

dataset = load_dataset("lst20", data_dir="/content/LST20_Corpus")

Using custom data configuration default-97621787f5b60e1b
Reusing dataset lst20 (/root/.cache/huggingface/datasets/lst20/default-97621787f5b60e1b/0.0.0/e1b2a921fb011578ab43ddbbf789f3c500d62cb2df8ae4ed4b60bae8e4c0d3ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 63310
    })
    validation: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 5620
    })
    test: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 5250
    })
})

In [None]:
# The following script is used to tokenize the text in LST20 according to PythaiNLP

def extraction(datasets):
    fastarray_token = np.array(datasets['tokens'])
    fastarray_topic = np.array(datasets['fname'])
    all_tuples = []
    tuples = []
    for j in tqdm(range(len(fastarray_token)), total=len(fastarray_token)):
        token = word_tokenize("".join((fastarray_token[j])))
        tag = len(token)*['I']
        token.append(' ')
        tag.append('E')
        for i in range(len(token)):
            if token[i] == '_':
                token[i] = ' '
            tuples.append((token[i],tag[i]))
        if j == len(fastarray_token)-1:
            all_tuples.append(tuples)
            tuples = []
            return(all_tuples)
        if fastarray_topic[j] != fastarray_topic[j+1]:
            all_tuples.append(tuples)
            tuples = []
    return(all_tuples)

In [None]:
train_data = extraction(dataset['train'])

test_data = extraction(dataset['test'])

validation_data = extraction(dataset['validation'])

100%|█████████▉| 63309/63310 [00:41<00:00, 1537.73it/s]
100%|█████████▉| 5249/5250 [00:02<00:00, 1770.10it/s]
100%|█████████▉| 5619/5620 [00:03<00:00, 1599.69it/s]


In [None]:
len(train_data)

3794

In [None]:
len(test_data)

483

In [None]:
train_data = train_data + test_data

### Investigate Size of an Article and Sentence

LST20 is a news article, one of the thing that could be a problem is a really long editorial. This could make it slightly difficult for CRF to understand and dealt with it. This is a quick check to see the distribution of length of an article is LST20. We also want to quickly check sentence length according to LST20. This is to make sure that distribution of validation and training are not drastically different.

In [None]:
# training set

size_of_news = []
size_of_sentence = []
ender_word = []
start_word = []

for i in range(len(train_data)):
    size_of_news.append(len(train_data[i]))
    count = 0
    for j in range(len(train_data[i])):
        if j == 0:
            start_word.append(train_data[i][j][0])
        count += 1
        if train_data[i][j][1] == 'E':
            ender_word.append(train_data[i][j-1][0])
            size_of_sentence.append(count)
            count = 0

In [None]:
# validation set

size_of_news_validation = []
size_of_sentence_validation = []
ender_word_validation = []
start_word_validation = []

for i in range(len(validation_data)):
    size_of_news_validation.append(len(validation_data[i]))
    count = 0
    for j in range(len(validation_data[i])):
        if j == 0:
            start_word_validation.append(validation_data[i][j][0])
        count += 1
        if validation_data[i][j][1] == 'E':
            ender_word_validation.append(validation_data[i][j-1][0])
            size_of_sentence_validation.append(count)
            count = 0

In [None]:
print(pd.Series(size_of_news).describe())

count     4277.000000
mean       637.405658
std       1024.786979
min         29.000000
25%        282.000000
50%        441.000000
75%        711.000000
max      51766.000000
dtype: float64


In [None]:
print(pd.Series(size_of_news_validation).describe())

count      474.000000
mean       478.170886
std       1469.430004
min         54.000000
25%        224.000000
50%        302.000000
75%        467.000000
max      29893.000000
dtype: float64


In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(x=size_of_news))
fig.add_trace(go.Histogram(x=size_of_news_validation))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
print(pd.Series(size_of_sentence).describe())

count    68560.000000
mean        39.763477
std         27.940715
min          2.000000
25%         19.000000
50%         34.000000
75%         54.000000
max        909.000000
dtype: float64


In [None]:
print(pd.Series(size_of_sentence_validation).describe())

count    5620.000000
mean       40.329715
std        25.986875
min         2.000000
25%        20.000000
50%        37.000000
75%        55.000000
max       410.000000
dtype: float64


In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=size_of_sentence))
fig.add_trace(go.Histogram(x=size_of_sentence_validation))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Standard Feature Extractor from PyThaiNLP

คำสรรพนาม

Pronoun 1st person
	ฉัน ผม ดิฉัน เรา หนู กู ทางนี้ ด้านนี้ ตัวผม ข้าพเจ้า กระผม อาตมา ข้าพระพุทธเจ้า 
Pronoun 2nd Person
	เขา เธอ พวกเขา พวกเธอ ท่าน คุณ ใต้เท้า พระคุณเจ้า ฝ่าพระบาท แก ใต้ฝ่าละอองธุลีพระบาท
Pronoun 3rd Person
	เขา เธอ พวกเขา พวกเธอ มัน พระองค์ ท่าน คน พวกคน บุคคล
Distance Pronoun
	นี่ นั่น โน่น นี้ นั้น โน้น
Question Pronoun
	ใคร อะไร ที่ไหน ผู้ใด ทำไม ทางไหน อย่างไร 



In [None]:
enders = ["ครับ","ค่ะ","คะ","นะคะ","นะ","จ้ะ","จ้า","จ๋า","ฮะ", #ending honorifics
          #enders
          "ๆ","ได้","แล้ว","ด้วย","เลย","มาก","น้อย","กัน","เช่นกัน","เท่านั้น",
          "อยู่","ลง","ขึ้น","มา","ไป","ไว้","เอง","อีก","ใหม่","จริงๆ",
          "บ้าง","หมด","ทีเดียว","เดียว","บาท","กล่าว",
          #demonstratives
          "นั้น","นี้","เหล่านี้","เหล่านั้น",
          #questions
          "อย่างไร","ยังไง","หรือไม่","มั้ย","ไหน","อะไร","ทำไม","เมื่อไหร่"]
starters = ["ผม","ฉัน","ดิฉัน","ชั้น","คุณ","มัน","เขา","เค้า",
            "เธอ","เรา","พวกเรา","พวกเขา", #pronouns
            #connectors
            "และ","หรือ","แต่","เมื่อ","ถ้า","ใน",
            "ด้วย","เพราะ","เนื่องจาก","ซึ่ง","ไม่",
            "ตอนนี้","ทีนี้","ดังนั้น","เพราะฉะนั้น","ฉะนั้น",
            "ตั้งแต่","ในที่สุด","จากนั้น","แล้ว","เตรียม",'รวมถึง',
            #demonstratives
            "นั้น","นี้","เหล่านี้","เหล่านั้น",'ทั้งนี้']

#Connector Nouns Questions

connector = ['และ' ,'จึง', 'แต่', 'หรือ', 'เพราะ', 'ซึ่ง', 'ที่' ,'เป็น', 'อัน', 'เมื่อ', 'จน', 'เนื่องจาก','ระหว่าง']


# nouns = ["ผม","ฉัน","ดิฉัน","ชั้น","หนู","คุณ","มัน","เขา","เค้า","ข้าพเจ้า"
#             "เธอ","เรา","ท่าน","พวกเธอ","พวกเรา","พวกเขา", #pronouns
#             "นาย","นาง","นางสาว","ตน","ตัวผม","ตัวเอง","คุณ","ท่าน",'นั่น','โน่น','นี้']

# questions = ['ใคร', 'อะไร', 'ที่ไหน', 'ผู้ใด', 'ทำไม', 'ทางไหน','อย่างไร']


# verbs = ['คือ','เป็น','อยู่','ทำ','มี','เผย','กล่าว','เป็น','เดินทาง','กำหนด','ไป','ชี้','เล่าว่า','เริ่ม','เปิด','ปิด','ขอให้','จึง','ถาม','ตอบ']

In [None]:
### Add connector and noun

def extract_features(doc, window=3, max_n_gram=3):
    doc_features = []
    #paddings for word and POS
    doc = ['xxpad' for i in range(window)] + doc + ['xxpad' for i in range(window)]
    doc_ender = []
    doc_starter = []
    #add enders
    for i in range(len(doc)):
        if doc[i] in enders:
            doc_ender.append('ender')
        else:
            doc_ender.append('normal')
    #add starters
    for i in range(len(doc)):
        if doc[i] in starters:
            doc_starter.append('starter')
        else:
            doc_starter.append('normal')
    
    
    ####New#####
    
    doc_connector = []
    doc_tag = []
    #add connecter
    for i in range(len(doc)):
        if doc[i] in connector:
            doc_connector.append('connector')
        else:
            doc_connector.append('normal')    
    #add  pos_tag
    tag = pos_tag(doc)
    for i in range(len(doc)):
        doc_tag.append(tag[i][1])
            
            
    #for each word
    for i in range(window, len(doc)-window):
        #bias term
        word_features = ['bias'] 
        #ngram features
        for n_gram in range(1, min(max_n_gram+1,2+window*2)):
            for j in range(i-window,i+window+2-n_gram):
                feature_position = f'{n_gram}_{j-i}_{j-i+n_gram}'
                word_ = f'{"|".join(doc[j:(j+n_gram)])}'
                word_features += [f'word_{feature_position}={word_}']
                ender_ =  f'{"|".join(doc_ender[j:(j+n_gram)])}'
                word_features += [f'ender_{feature_position}={ender_}']
                starter_ =  f'{"|".join(doc_starter[j:(j+n_gram)])}'
                word_features += [f'starter_{feature_position}={starter_}']
                connector_ =  f'{"|".join(doc_connector[j:(j+n_gram)])}'
                word_features += [f'connector_{feature_position}={connector_}']
                tag_ =  f'{"|".join(doc_tag[j:(j+n_gram)])}'
                word_features += [f'tags_{feature_position}={tag_}']
        
        #append to feature per word
        doc_features.append(word_features)
    return doc_features

In [None]:
len(train_data)

4277

In [None]:
train_data[1]

[('บุก', 'I'),
 ('ยึด', 'I'),
 ('ไม้', 'I'),
 ('เถื่อน', 'I'),
 ('อดีต', 'I'),
 ('ส.ส.', 'I'),
 ('บุรีรัมย์', 'I'),
 ('เตรียม', 'I'),
 ('สร้าง', 'I'),
 ('คฤหาสน์', 'I'),
 ('ทรง', 'I'),
 ('ไทย', 'I'),
 (' ', 'E'),
 ('1', 'I'),
 (' ', 'I'),
 ('กันยายน', 'I'),
 (' ', 'I'),
 ('2550', 'I'),
 (' ', 'I'),
 ('12', 'I'),
 (':', 'I'),
 ('00', 'I'),
 (' ', 'I'),
 ('น.', 'I'),
 (' ', 'E'),
 ('ตำรวจภูธร', 'I'),
 ('จ.', 'I'),
 ('บุรีรัมย์', 'I'),
 ('บุก', 'I'),
 ('ตรวจ', 'I'),
 ('ยึด', 'I'),
 ('ไม้แปรรูป', 'I'),
 ('หวงห้าม', 'I'),
 ('กว่า', 'I'),
 (' ', 'I'),
 ('80', 'I'),
 (' ', 'I'),
 ('แผ่น', 'I'),
 (' ', 'E'),
 ('เตรียม', 'I'),
 ('นำ', 'I'),
 ('ไป', 'I'),
 ('ก่อสร้าง', 'I'),
 ('คฤหาสน์', 'I'),
 ('ทรง', 'I'),
 ('ไทย', 'I'),
 ('ของ', 'I'),
 ('อดีต', 'I'),
 ('ส.ส.', 'I'),
 ('ดัง', 'I'),
 ('พร้อม', 'I'),
 ('นำ', 'I'),
 ('ตัว', 'I'),
 ('ภายในบ้าน', 'I'),
 ('ไป', 'I'),
 ('สอบสวน', 'I'),
 ('เพื่อให้', 'I'),
 ('เจ้าของ', 'I'),
 ('นำ', 'I'),
 ('เอกสาร', 'I'),
 ('หลักฐาน', 'I'),
 ('ที่มา', 'I'),
 ('ของ', 

In [None]:
%%time
# ted
# target
LST20_y = []
for t in tqdm(train_data, total=len(train_data)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    LST20_y.append(temp)

100%|██████████| 4277/4277 [00:00<00:00, 6808.18it/s]

CPU times: user 610 ms, sys: 17.9 ms, total: 628 ms
Wall time: 637 ms





In [None]:
LST20_y

In [None]:
%%time
# features
LST20_x_pre = []
for t in tqdm(train_data, total=len(train_data)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    LST20_x_pre.append(temp)
LST20_x = []
for x_ in tqdm(LST20_x_pre, total=len(LST20_x_pre)):
    LST20_x.append(extract_features(x_, window=3, max_n_gram = 3))

100%|██████████| 4277/4277 [00:00<00:00, 5606.47it/s]
 28%|██▊       | 1209/4277 [02:42<27:17,  1.87it/s]

In [None]:
%%time
# features
LST20_y_eval = []
for t in tqdm(validation_data, total=len(validation_data)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    LST20_y_eval.append(temp)

100%|██████████████████████████████████████████████████████████████████████████████| 474/474 [00:00<00:00, 2553.57it/s]

Wall time: 207 ms





In [None]:
%%time
LST20_x_eval_pre = []
for t in tqdm(validation_data, total=len(validation_data)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    LST20_x_eval_pre.append(temp)
LST20_x_eval = []
for x_ in tqdm(LST20_x_eval_pre, total=len(LST20_x_eval_pre)):
    LST20_x_eval.append(extract_features(x_, window=3, max_n_gram = 3))

100%|██████████████████████████████████████████████████████████████████████████████| 474/474 [00:00<00:00, 7426.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 474/474 [00:23<00:00, 20.09it/s]

Wall time: 23.7 s





### SKlearn_CRF

In [None]:
X_train = []
y_train = []

for xseq, yseq in tqdm(zip(LST20_x, LST20_y), total=len(LST20_x)):
    X_train.append(xseq)
    y_train.append(yseq)

100%|██████████████████████████████████████████████████████████████████████████| 4277/4277 [00:00<00:00, 194372.62it/s]


In [None]:
X_valid = []
y_valid = []

for xseq, yseq in tqdm(zip(LST20_x_eval, LST20_y_eval), total=len(LST20_x_eval)):
    X_valid.append(xseq)
    y_valid.append(yseq)

100%|████████████████████████████████████████████████████████████████████████████████████████| 474/474 [00:00<?, ?it/s]


In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=1, 
    c2=0, 
    max_iterations=1000, 
    all_possible_transitions=True,verbose=True
)

In [None]:
%%time
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|███████████████████████████████████████████| 4277/4277 [08:04<00:00,  8.83it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 11713087
Seconds required: 106.636

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=35.92 loss=679920.38 active=1780886 feature_norm=1.00
Iter 2   time=15.14 loss=643328.48 active=1284043 feature_norm=0.95
Iter 3   time=104.59 loss=365136.90 active=267856 feature_norm=0.54
Iter 4   time=61.79 loss=308537.27 active=296470 feature_norm=0.40
Iter 5   time=15.58 loss=290855.90 active=291926 feature_norm=0.47
Iter 6   time=15.60 loss=261310.85 active=286076 feature_norm=1.01
Iter 7   time=15.73 loss=228658.37 active=348766 feature_norm=1.06
Iter 8   time=16.84 loss=219407.99 active=349026 feature_norm=1.44
Iter 9   time=17.93 loss=209919.92 active=344939 featu

Iter 121 time=15.15 loss=83804.13 active=75133 feature_norm=103.38
Iter 122 time=15.32 loss=83801.21 active=74976 feature_norm=103.41
Iter 123 time=15.43 loss=83798.29 active=74858 feature_norm=103.46
Iter 124 time=16.67 loss=83795.59 active=74687 feature_norm=103.50
Iter 125 time=16.82 loss=83793.14 active=74607 feature_norm=103.54
Iter 126 time=18.76 loss=83790.86 active=74510 feature_norm=103.55
Iter 127 time=17.35 loss=83788.57 active=74457 feature_norm=103.57
Iter 128 time=15.58 loss=83786.24 active=74370 feature_norm=103.59
Iter 129 time=13.96 loss=83784.10 active=74325 feature_norm=103.62
Iter 130 time=13.92 loss=83782.08 active=74285 feature_norm=103.62
Iter 131 time=13.88 loss=83780.04 active=74228 feature_norm=103.65
Iter 132 time=15.21 loss=83777.84 active=74158 feature_norm=103.67
Iter 133 time=13.90 loss=83775.90 active=74066 feature_norm=103.70
Iter 134 time=13.77 loss=83774.21 active=74033 feature_norm=103.72
Iter 135 time=13.91 loss=83772.66 active=73992 feature_norm=10

Iter 246 time=13.83 loss=83707.68 active=71925 feature_norm=105.36
Iter 247 time=14.37 loss=83707.35 active=71940 feature_norm=105.37
Iter 248 time=15.73 loss=83707.06 active=71945 feature_norm=105.38
Iter 249 time=15.09 loss=83706.74 active=71951 feature_norm=105.39
Iter 250 time=13.67 loss=83706.48 active=71938 feature_norm=105.39
Iter 251 time=13.75 loss=83706.17 active=71940 feature_norm=105.41
Iter 252 time=13.79 loss=83705.90 active=71898 feature_norm=105.41
Iter 253 time=13.78 loss=83705.57 active=71922 feature_norm=105.43
Iter 254 time=14.22 loss=83705.30 active=71912 feature_norm=105.43
Iter 255 time=14.73 loss=83705.01 active=71903 feature_norm=105.44
Iter 256 time=15.86 loss=83704.78 active=71892 feature_norm=105.44
Iter 257 time=14.42 loss=83704.48 active=71890 feature_norm=105.45
Iter 258 time=14.14 loss=83704.24 active=71877 feature_norm=105.45
Iter 259 time=13.98 loss=83703.97 active=71885 feature_norm=105.47
Iter 260 time=13.99 loss=83703.74 active=71868 feature_norm=10

Wall time: 1h 43min 24s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=1, c2=0,
    keep_tempfiles=None, max_iterations=1000, verbose=True)

In [None]:
import joblib

joblib_file = './model/crfsklearn_lst20_addedPOSWin3Ng3.pkl'
joblib.dump(crf,joblib_file)

['./model/crfsklearn_lst20_addedPOSWin3Ng3.pkl']

In [None]:
import joblib
joblib_file = './model/crfsklearn_lst20_addedPOSWin3Ng3.pkl'
crf = joblib.load(joblib_file)

In [None]:
labels = list(crf.classes_)
labels
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)

In [None]:
y_pred = crf.predict(X_valid)

In [None]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=sorted_labels, digits=3
))
metrics.flat_f1_score(y_valid, y_pred, 
                      average='weighted', labels=sorted_labels)

              precision    recall  f1-score   support

           E      0.809     0.608     0.694      5620
           I      0.990     0.996     0.993    220960

    accuracy                          0.987    226580
   macro avg      0.900     0.802     0.844    226580
weighted avg      0.986     0.987     0.986    226580



0.9858024019139024

In [None]:
results = []
for i in range(len(y_valid)):
    s=0
    for j in range(len(y_valid[i])):
        results.append({'sentence_idx':f'{str(i).zfill(3)}_{str(s).zfill(3)}',
                        'word':X_valid[i][j][11].split('=')[1],
                        'y':y_valid[i][j],
                        'pred':y_pred[i][j]})
        if y_valid[i][j]=='E': s+=1
result_df = pd.DataFrame(results)[['sentence_idx','word','y','pred']]
result_df['wrong_flag'] = result_df.apply(lambda row: 0 if row.y==row.pred else 1,1)

#space correct
space_df = result_df.copy()
space_df = space_df[space_df.word!=' ']
print(f"Error space correct: {space_df.wrong_flag.mean()} from shape: {space_df.shape}")
print(f"Accuracy space correct: {1 - space_df.wrong_flag.mean():.2f}")

Error space correct: 0.016057653810222873 from shape: (187325, 5)
Accuracy space correct: 0.98


### Validation with Other Dataset (TED, Orchid, Fake Review)

In [None]:
orchid = pd.read_csv('Orchid/orchid97.crp.utf',sep='\t',header=None)
orchid.columns = ['text']
#remove weird words
orchid['first_char'] = orchid.text.map(lambda x: x[0])
orchid = orchid[(orchid.first_char!='%')&(orchid.first_char!='#')][['text']]
#get word,pos
orchid['word'] = orchid.text.map(lambda x: x.split('/')[0])
orchid['word'] = orchid.word.map(lambda x: ' ' if (x=='<space>')|(x=='') else x)
orchid['pos'] = orchid.text.map(lambda x: x.split('/')[1] if len(x.split('/'))==2 else None)
#labels
orchid['lab'] = orchid.apply(lambda row: 'E' if row['text']=='//' else 'I',1)
orchid = orchid[(orchid.lab=='E')|(~orchid.pos.isna())].reset_index(drop=True)

In [None]:
%%time
ted_all_sentences = np.load('LST20/ted-all-sentences.npy') 
fake_review_all_sentences = np.load('LST20/fake-review-all-sentences.npy') # Sample from 3 datasets
# Sample from 3 datasets
np.random.seed(42)
ratio = .25 # sample ratio
ted_sample = np.random.choice(ted_all_sentences, int(len(ted_all_sentences) * ratio))
orchid_sample = orchid.iloc[:int(len(orchid) * ratio)]
# fake_review_sample = np.random.choice(fake_review_all_sentences, int(len(fake_review_all_sentences) * ratio))
fake_review_train, fake_review_test = fake_review_all_sentences[:-39632], fake_review_all_sentences[-39632:]
fake_review_sample = np.random.choice(fake_review_train, int(len(fake_review_all_sentences) * ratio))
fake_review_test_sample = np.random.choice(fake_review_test, int(len(fake_review_test) * ratio))

Wall time: 1.56 s


In [None]:
print(f"Length of TED talk (talk): {len(ted_sample)}")
print(f"Length of orchid (word): {len(orchid_sample)}")
print(f"Length of fake review train (review): {len(fake_review_sample)}")
print(f"Length of fake review test (review): {len(fake_review_test_sample)}")

Length of TED talk (talk): 385
Length of orchid (word): 91453
Length of fake review train (review): 49540
Length of fake review test (review): 9908


In [None]:

%%time
def assign_word_lab(all_sentences):
    all_tuples = []
    for i in tqdm(range(len(all_sentences)), total=len(all_sentences)):
        tuples = []
        for s in all_sentences[i].split('|'):
            s_lst = word_tokenize(s)
            for j in range(len(s_lst)):
                lab = 'E' if j==len(s_lst)-1 else 'I'
                tuples.append((s_lst[j],lab))
        all_tuples.append(tuples)
    return all_tuples

ted_all_tuples = assign_word_lab(ted_sample)
orchid_all_tuples = [(row['word'],row['lab']) for i,row in orchid_sample.iterrows()]
# fake_review_all_tuples = assign_word_lab(fake_review_sample)
fake_review_all_tuples = assign_word_lab(fake_review_sample)
fake_review_test_tuples = assign_word_lab(fake_review_test_sample)

100%|████████████████████████████████████████████████████████████████████████████████| 385/385 [00:07<00:00, 48.77it/s]
100%|██████████████████████████████████████████████████████████████████████████| 49540/49540 [00:34<00:00, 1423.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 9908/9908 [00:05<00:00, 1680.50it/s]

Wall time: 54.5 s





In [None]:
%%time
# ted
# target
ted_y = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    ted_y.append(temp)

# features
ted_x_pre = []
for t in tqdm(ted_all_tuples, total=len(ted_all_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    ted_x_pre.append(temp)
ted_x = []
for x_ in tqdm(ted_x_pre, total=len(ted_x_pre)):
    ted_x.append(extract_features(x_, window=3, max_n_gram = 3))

100%|██████████████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 3170.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 3376.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 385/385 [08:58<00:00,  1.40s/it]

Wall time: 8min 58s





In [None]:
%%time
# orchid
# target
orchid_y = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_y.append(l)
# features
orchid_x_pre = []
for (w, l) in tqdm(orchid_all_tuples, total=len(orchid_all_tuples)):
    orchid_x_pre.append(w)
orchid_x = extract_features(orchid_x_pre, window=3, max_n_gram = 3) 

100%|████████████████████████████████████████████████████████████████████████| 91453/91453 [00:00<00:00, 201239.75it/s]
100%|███████████████████████████████████████████████████████████████████████| 91453/91453 [00:00<00:00, 1312736.77it/s]


Wall time: 9.56 s


In [None]:
fake_review_test_y = []
for t in tqdm(fake_review_test_tuples, total=len(fake_review_test_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(l)
    fake_review_test_y.append(temp)

# features
fake_review_test_x_pre = []
for t in tqdm(fake_review_test_tuples, total=len(fake_review_test_tuples)):
    temp = []
    for (w, l) in t:
        temp.append(w)
    fake_review_test_x_pre.append(temp)
fake_review_test_x = []
for x_ in tqdm(fake_review_test_x_pre, total=len(fake_review_test_x_pre)):
    fake_review_test_x.append(extract_features(x_, window=3, max_n_gram = 3))

100%|███████████████████████████████████████████████████████████████████████████| 9908/9908 [00:00<00:00, 22926.94it/s]
100%|███████████████████████████████████████████████████████████████████████████| 9908/9908 [00:00<00:00, 60038.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 9908/9908 [10:40<00:00, 15.46it/s]


In [None]:
ted_x_train, ted_x_test, ted_y_train, ted_y_test = train_test_split(ted_x, ted_y, test_size=0.2, random_state=1412)
idx = int(len(orchid_x)*0.8)
orchid_x_train, orchid_x_test = orchid_x[:idx], orchid_x[idx:]
orchid_y_train, orchid_y_test = orchid_y[:idx], orchid_y[idx:]
fake_review_x_test = fake_review_test_x
fake_review_y_test = fake_review_test_y

### Validation with TED

In [None]:
# ted
# Predict (using test set)

# Evaluate at word-level
labels = {'E': 0, "I": 1} # classification_report() needs values in 0s and 1s

ted_x_test_extract = []
ted_y_test_extract = []
for xseq, yseq in tqdm(zip(ted_x_test, ted_y_test), total=len(ted_y_test)):
    ted_x_test_extract.append(xseq)
    ted_y_test_extract.append(yseq)

y_pred = crf.predict(ted_x_test_extract)

100%|███████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 76968.88it/s]


In [None]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    ted_y_test_extract, y_pred, labels=sorted_labels, digits=3
))

metrics.flat_f1_score(ted_y_test_extract, y_pred, 
                      average='weighted', labels=sorted_labels)

              precision    recall  f1-score   support

           E      0.410     0.146     0.215      6990
           I      0.963     0.991     0.976    155114

    accuracy                          0.954    162104
   macro avg      0.686     0.568     0.596    162104
weighted avg      0.939     0.954     0.944    162104



0.9435285985162364

### Orchid

In [None]:
# orchid
# Predict (using test set)
orchid_x_test_new = []
orchid_x_test_new.append(orchid_x_test)
orchid_y_test_new = []
orchid_y_test_new.append(orchid_y_test)
y_pred = crf.predict(orchid_x_test_new)

In [None]:
# group B and I results

print(metrics.flat_classification_report(
    orchid_y_test_new, y_pred, labels=sorted_labels, digits=3
))

metrics.flat_f1_score(orchid_y_test_new, y_pred, average='weighted', labels=sorted_labels)

              precision    recall  f1-score   support

           E      0.384     0.157     0.223      1179
           I      0.944     0.983     0.963     17112

    accuracy                          0.929     18291
   macro avg      0.664     0.570     0.593     18291
weighted avg      0.908     0.929     0.915     18291



0.915314345695198

### Fake Review

In [None]:
# fake review
# Predict (using test set)

fake_review_x_test_extract = []
fake_review_y_test_extract = []
for xseq, yseq in tqdm(zip(fake_review_x_test, fake_review_y_test), total=len(fake_review_y_test)):
    fake_review_x_test_extract.append(xseq)
    fake_review_y_test_extract.append(yseq)

y_pred = crf.predict(fake_review_x_test_extract)


100%|█████████████████████████████████████████████████████████████████████████| 9908/9908 [00:00<00:00, 1099599.50it/s]


In [None]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    fake_review_y_test_extract, y_pred, labels=sorted_labels, digits=3
))

metrics.flat_f1_score(fake_review_y_test_extract, y_pred, 
                      average='weighted', labels=sorted_labels)

              precision    recall  f1-score   support

           E      0.754     0.348     0.476     43254
           I      0.952     0.991     0.971    568879

    accuracy                          0.946    612133
   macro avg      0.853     0.670     0.724    612133
weighted avg      0.938     0.946     0.936    612133



0.9364743542471556