In [1]:
from util import preprocess

train_seqs = preprocess.get_sequence("data/newly_tokenized/train_auto_tok.tsv")
dev_seqs   = preprocess.get_sequence("data/newly_tokenized/dev_auto_tok.tsv")
test_seqs   = preprocess.get_sequence("data/newly_tokenized/test_auto_tok.tsv")

In [2]:
train_seqs_wordOnly = [ [e[0] for e in seq] for seq in train_seqs]
dev_seqs_wordOnly = [ [e[0] for e in seq] for seq in dev_seqs]
test_seqs_wordOnly = [ [e[0] for e in seq] for seq in test_seqs]

print(len(train_seqs_wordOnly), len(dev_seqs_wordOnly), len(test_seqs_wordOnly))

63310 5620 5250


In [3]:
from pythainlp.tag import pos_tag

train_seqs_pthai_tags = [ pos_tag(e, corpus="lst20") for e in train_seqs_wordOnly]
dev_seqs_pthai_tags = [ pos_tag(e, corpus="lst20") for e in dev_seqs_wordOnly]
test_seqs_pthai_tags = [ pos_tag(e, corpus="lst20") for e in test_seqs_wordOnly]

print(len(train_seqs_pthai_tags), len(dev_seqs_pthai_tags), len(test_seqs_pthai_tags))

63310 5620 5250


# 1. Create features

In [6]:
def current_word(i, tokens):
    return {'w0 word': tokens[i][0]}

def current_pos(i, tokens):
    return {'w0 pos': tokens[i][1]}

def prev_pos(i, tokens):
    if i == 0:
        return {}
    if i == 1:
        return {'w-1 pos': tokens[i-1][1]}
    return {'w-1 pos': tokens[i-1][1], 'w-2 pos': tokens[i-2][1]}

In [7]:
print(current_word(2, train_seqs_pthai_tags[0]))
print(current_pos(2, train_seqs_pthai_tags[0]))
print(prev_pos(2, train_seqs_pthai_tags[0]))

{'w0 word': 'ลั่น'}
{'w0 pos': 'VV'}
{'w-1 pos': 'NN', 'w-2 pos': 'NN'}


In [8]:
features = {}
features.update(current_pos(3, train_seqs_pthai_tags[0]))
features.update(current_word(3, train_seqs_pthai_tags[0]))
features.update(prev_pos(2, train_seqs_pthai_tags[0]))
print(features)

{'w0 pos': 'VV', 'w0 word': 'เร่ง', 'w-1 pos': 'NN', 'w-2 pos': 'NN'}


In [9]:
# Featurize text into feature sequences
def featurize(feature_function_list, tokens):
    feature_dict_seq = []
    for i in range(len(tokens)): # ทุก token
        feature_dict = {}
        for feature_fn in feature_function_list: # ทุก feature function
            feature_dict.update(feature_fn(i, tokens))
        feature_dict_seq.append(feature_dict)
    return feature_dict_seq
    
def labelize(tokens):
    return [e[1] for e in tokens]
    
feature_function_list = [current_word, current_pos, prev_pos]

In [10]:
# X_train, X_dev, X_test
feature_seq_list = [featurize(feature_function_list, e) for e in train_seqs_pthai_tags]
dev_feature_seq_list = [featurize(feature_function_list, e) for e in dev_seqs_pthai_tags]
test_feature_seq_list = [featurize(feature_function_list, e) for e in test_seqs_pthai_tags]

# Y_train, Y_dev
label_seq_list = [labelize(e) for e in train_seqs]
dev_label_seq_list = [labelize(e) for e in dev_seqs]

# Train CRF

In [12]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF()
crf.fit(feature_seq_list, label_seq_list)



CRF(keep_tempfiles=None)

### save model

In [13]:
import joblib
filename = 'models/crf_3_feat_me_tok.crfsuite'
joblib.dump(crf, filename)

['models/crf_3_feat_me_tok.crfsuite']

# Load trained model

In [16]:
import joblib
filename = 'models/crf_3_feat_me_tok.crfsuite'
loaded_model = joblib.load(filename)

# Inference
#train_pred = loaded_model.predict(feature_seq_list)
dev_pred = loaded_model.predict(dev_feature_seq_list)
test_pred = loaded_model.predict(test_feature_seq_list)

len(dev_pred), len(test_pred)

(5620, 5250)

In [20]:
feature_seq_list[:2]

[[{'w0 word': 'รมว.', 'w0 pos': 'NN'},
  {'w0 word': 'พม.', 'w0 pos': 'NN', 'w-1 pos': 'NN'},
  {'w0 word': 'ลั่น', 'w0 pos': 'VV', 'w-1 pos': 'NN', 'w-2 pos': 'NN'},
  {'w0 word': 'เร่ง', 'w0 pos': 'VV', 'w-1 pos': 'VV', 'w-2 pos': 'NN'},
  {'w0 word': 'ปราบปราม', 'w0 pos': 'VV', 'w-1 pos': 'VV', 'w-2 pos': 'VV'},
  {'w0 word': '"', 'w0 pos': 'PU', 'w-1 pos': 'VV', 'w-2 pos': 'VV'},
  {'w0 word': 'ค้า', 'w0 pos': 'VV', 'w-1 pos': 'PU', 'w-2 pos': 'VV'},
  {'w0 word': 'มนุษย์', 'w0 pos': 'NN', 'w-1 pos': 'VV', 'w-2 pos': 'PU'},
  {'w0 word': '"', 'w0 pos': 'PU', 'w-1 pos': 'NN', 'w-2 pos': 'VV'}],
 [{'w0 word': 'จาก', 'w0 pos': 'PS'},
  {'w0 word': 'การที่', 'w0 pos': 'CC', 'w-1 pos': 'PS'},
  {'w0 word': 'สถานการณ์', 'w0 pos': 'NN', 'w-1 pos': 'CC', 'w-2 pos': 'PS'},
  {'w0 word': 'ด้าน', 'w0 pos': 'NN', 'w-1 pos': 'NN', 'w-2 pos': 'CC'},
  {'w0 word': 'การ', 'w0 pos': 'FX', 'w-1 pos': 'NN', 'w-2 pos': 'NN'},
  {'w0 word': 'ค้า', 'w0 pos': 'VV', 'w-1 pos': 'FX', 'w-2 pos': 'NN'},
  {'

In [None]:
#import sklearn_crfsuite.metrics
#print(sklearn_crfsuite.metrics.flat_classification_report(label_seq_list, train_pred))

In [17]:
def joinSeqTag(seqs, tags):
    output = []
    for i in range(len(seqs)):
        temp = []
        for token, tag in zip(seqs[i], tags[i]):
            if tag != "O":
                temp.append( (token[0],tag) )

        output.append(temp)
        
    return output

test_output = joinSeqTag(test_seqs, test_pred)
dev_output  = joinSeqTag(dev_seqs, dev_pred)

from lst20utils import extract_entities
test_pred_output = [extract_entities(e, post=True)[1] for e in test_output]
dev_pred_output = [extract_entities(e, post=True)[1] for e in dev_output]

# Save result

In [18]:
import json
json.dump(dev_pred_output, open('result/predicted_dev_entities_me_tok_3feat.json', encoding='utf8', mode='w'))
json.dump(test_pred_output, open('result/predicted_test_entities_me_tok_3feat.json', encoding='utf8', mode='w'))


| file      | Description |
| ----------- | ----------- |
| **model_1**: `models/crf_3_feat.crfsuite`   | CRF model, use 3 features, pos tag by corpus        |
| **model_2**: `models/crf_1_feat_me_tok.crfsuite`      | CRF model, use 1 feature (currect token), pos tag by Pythai       |
| **model_3**: `models/crf_3_feat_me_tok.crfsuite`      | CRF model, use 3 features, pos tag by Pythai       |
| `result/test_pred_output_re.json` | output on *test set* by *model_1* |
| `result/predicted_dev_entities.json` | output on *dev set* by *model_1* |
| `result/predicted_train_entities.json` | output on *train set* by *model_1* |
| `result/predicted_test_entities_me_tok_1feat.json` | output on *test set* by *model_2* |
| `result/predicted_dev_entities_me_tok_1feat.json` | output on *dev set* by *model_2* |
| `result/predicted_test_entities_me_tok_3feat.json` | output on *test set* by *model_3* |
| `result/predicted_dev_entities_me_tok_3feat.json` | output on *dev set* by *model_3* |

to evaluate: `python3 evaluate.py data/dev_entities.json result/predicted_dev_entities_me_tok_1feat.jsonn`

# Make prediction on Test set
Inferencing test set needs a bit workaround bc. we need POS and test data from corpus, and mapping from a given test data.

In [29]:
# read test file
with open("data/raw/test_set.txt", "r") as f:
    test_file = f.readlines() 

import re
# tokenize
test_seqs_given = [e.strip("\n") for e in test_file]
test_seqs_given = [re.sub("\|","",e) for e in test_seqs_given]

In [15]:
test_sentences = {}

for i, seq in enumerate(test_seqs):
    tokens = [e[0] for e in seq]
    sentence = ''.join(tokens)
    test_sentences[sentence] = test_pred_output[i]


In [36]:
test_pred_output_re = []
for test_sentence in test_seqs_given:
    test_pred_output_re.append( test_sentences[test_sentence] )

In [38]:
import json
json.dump(test_pred_output_re, open('result/test_pred_output_re.json', encoding='utf8', mode='w'))