In [1]:
from util import preprocess

# Read the train/dev/test splits, in that order, with the project's sequence reader.
train_seqs, dev_seqs, test_seqs = map(
    preprocess.get_sequence,
    (
        "data/newly_tokenized/train_auto_tok.tsv",
        "data/newly_tokenized/dev_auto_tok.tsv",
        "data/newly_tokenized/test_auto_tok.tsv",
    ),
)

In [2]:
""" import json

with open("train_seqs_with_pos.json","r") as f:
    train_seqs = json.load(f)
with open("dev_seqs_with_pos.json","r") as f:
    dev_seqs = json.load(f)
with open("test_seqs_with_pos.json","r") as f:
    test_seqs = json.load(f)
print(len(train_seqs), len(dev_seqs), len(test_seqs)) """

63310 5620 5250


In [3]:
def _words_only(seqs):
    """Return, for every sequence, the list of each token's first field."""
    stripped = []
    for sentence in seqs:
        stripped.append([token[0] for token in sentence])
    return stripped

train_seqs_wordOnly = _words_only(train_seqs)
dev_seqs_wordOnly = _words_only(dev_seqs)
test_seqs_wordOnly = _words_only(test_seqs)

# Sanity check: number of sequences per split.
print(len(train_seqs_wordOnly), len(dev_seqs_wordOnly), len(test_seqs_wordOnly))

63310 5620 5250


# 1. Create features

In [2]:
import json

# Read with an explicit UTF-8 encoding so the result does not depend on the
# platform's default locale encoding (the other file accesses in this notebook
# already pass encoding="utf8").
with open("data/word_entity_pair.json", "r", encoding="utf8") as f:
    word_entity = json.load(f)

from collections import defaultdict

# Group the records: for each record e, e[0] is collected under the key e[1].
# NOTE(review): presumably e is a (word, entity-type) pair, per the file name — confirm.
word_entity_dict = defaultdict(set)

for e in word_entity:
    word_entity_dict[e[1]].add(e[0])

# Convert to a plain dict so later lookups of unknown keys raise KeyError
# instead of silently inserting empty sets.
word_entity_dict = dict(word_entity_dict)
# Number of distinct e[0] values collected under each key.
num_in_type = {k: len(v) for k, v in word_entity_dict.items()}

from util.const import mappingList

# Map every key of word_entity_dict to the mappingList key whose value contains it.
# Keys with no match are left out; if several mappingList entries match, the
# last one in iteration order wins (same as before).
toEntity = {}
for new_entity in word_entity_dict:
    for entity in mappingList:
        if new_entity in mappingList[entity]:
            toEntity[new_entity] = entity


In [3]:
# Flatten the grouped table into word -> mapped tag, skipping groups that have
# no entry in toEntity. A word present in several groups keeps the tag of the
# last group visited.
final_entity_word = {
    word: toEntity[source_type]
    for source_type in word_entity_dict
    if source_type in toEntity
    for word in word_entity_dict[source_type]
}

In [4]:
def current_word(i, tokens):
    """Feature: first field of the token at position ``i``, under key ``'w0 word'``."""
    token = tokens[i]
    return {'w0 word': token[0]}

def current_pos(i, tokens):
    """Feature: last field of the token at position ``i``, under key ``'w0 pos'``."""
    token = tokens[i]
    return {'w0 pos': token[-1]}

def prev_pos(i, tokens):
    """Features: last field of the one or two tokens preceding position ``i``.

    Returns an empty dict at position 0, only ``'w-1 pos'`` at position 1,
    and both ``'w-1 pos'`` and ``'w-2 pos'`` otherwise.
    """
    if i == 0:
        return {}
    features = {'w-1 pos': tokens[i-1][-1]}
    if i != 1:
        features['w-2 pos'] = tokens[i-2][-1]
    return features

import re
def isIn_word_entity(i, tokens):
    """Feature: tag stored in ``final_entity_word`` for the token at position ``i``.

    All whitespace is removed from the token's first field before the lookup.
    Returns ``{"possible-tag": tag}`` on a hit and an empty dict otherwise.
    """
    compact = re.sub(r'\s', '', tokens[i][0])
    if compact not in final_entity_word:
        return {}
    return {"possible-tag" : final_entity_word[compact]}



In [5]:
# Spot-check every feature function on one token of one training sentence.
position = 3
n_sentence = 2971
sample_sentence = train_seqs[n_sentence]
for feature_fn in (current_word, current_pos, prev_pos, isIn_word_entity):
    print(feature_fn(position, sample_sentence))
# Show the raw token itself as the cell's output.
sample_sentence[position]

{'w0 word': 'กล่าว'}
{'w0 pos': 'VV'}
{'w-1 pos': 'PU', 'w-2 pos': 'NN'}
{'possible-tag': 'LOC'}


['กล่าว', 'O', 'VV']

In [6]:
# Featurize text into feature sequences
def featurize(feature_function_list, tokens):
    """Build one feature dict per token position.

    Each function in ``feature_function_list`` is called as ``fn(i, tokens)``
    and its returned dict is merged into that position's dict; on a key clash
    the function later in the list wins.
    """
    def features_at(position):
        merged = {}
        for feature_fn in feature_function_list:  # every feature function
            merged.update(feature_fn(position, tokens))
        return merged

    # every token position
    return [features_at(position) for position in range(len(tokens))]
    
def labelize(tokens):
    """Return the field at index 1 of every token, in order."""
    labels = []
    for token in tokens:
        labels.append(token[1])
    return labels
    
# Feature functions handed to featurize(); they are applied to each token in list order.
feature_function_list = [current_word, current_pos, prev_pos, isIn_word_entity]

In [7]:
# X_train, X_dev, X_test: per split, one list of per-token feature dicts per sentence
feature_seq_list, dev_feature_seq_list, test_feature_seq_list = (
    [featurize(feature_function_list, sentence) for sentence in split]
    for split in (train_seqs, dev_seqs, test_seqs)
)

# Y_train, Y_dev: per split, one list of labels per sentence
label_seq_list = list(map(labelize, train_seqs))
dev_label_seq_list = list(map(labelize, dev_seqs))

# Train CRF

In [11]:
import sklearn_crfsuite
# No arguments are passed, so the estimator runs with the library's default settings.
crf = sklearn_crfsuite.CRF()
# Fit on the training feature/label sequences. This is the cell's last
# expression, so whatever fit() returns is displayed as the cell output.
crf.fit(feature_seq_list, label_seq_list)

<sklearn_crfsuite.estimator.CRF at 0x26270196a70>

### save model

In [12]:
import joblib
# NOTE(review): despite the .crfsuite extension, this file is whatever joblib.dump
# writes for the Python estimator object — presumably not a native CRFsuite
# model file; confirm before opening it with other tools.
filename = 'models/crf_4_feat.crfsuite'
# Persist the fitted estimator; joblib.dump's return value is shown as the cell output.
joblib.dump(crf, filename)

['models/crf_4_feat.crfsuite']

# Load trained model

In [17]:
import joblib
# Same path the save cell writes to.
filename = 'models/crf_4_feat.crfsuite'
# NOTE(review): joblib.load unpickles the file and can run arbitrary code —
# only load model files you produced or trust.
loaded_model = joblib.load(filename)

# Inference
# (prediction on the training split is left disabled)
#train_pred = loaded_model.predict(feature_seq_list)
dev_pred = loaded_model.predict(dev_feature_seq_list)
test_pred = loaded_model.predict(test_feature_seq_list)

# Displayed as the cell output: number of predicted sequences per split.
len(dev_pred), len(test_pred)

(5620, 5250)

In [None]:
#import sklearn_crfsuite.metrics
#print(sklearn_crfsuite.metrics.flat_classification_report(label_seq_list, train_pred))

In [15]:
def joinSeqTag(seqs, tags):
    """Pair each token's first field with its tag, dropping tokens tagged ``"O"``.

    ``seqs[k]`` and ``tags[k]`` are zipped position by position. The result has
    one list per sequence holding ``(first_field, tag)`` tuples; that list is
    empty when every tag in the sequence is ``"O"``.
    """
    return [
        [(token[0], tag) for token, tag in zip(seq, tags[idx]) if tag != "O"]
        for idx, seq in enumerate(seqs)
    ]

# Pair every non-"O" prediction with its word, per split.
test_output = joinSeqTag(test_seqs, test_pred)
dev_output  = joinSeqTag(dev_seqs, dev_pred)

from lst20utils import extract_entities

# For each sentence keep element [1] of what extract_entities(..., post=True) returns.
test_pred_output = [
    extract_entities(tagged_sentence, post=True)[1] for tagged_sentence in test_output
]
dev_pred_output = [
    extract_entities(tagged_sentence, post=True)[1] for tagged_sentence in dev_output
]

# Save result

In [16]:
import json

# Write inside `with` blocks so each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open('result/predicted_dev_entities_4feat.json', encoding='utf8', mode='w') as f:
    json.dump(dev_pred_output, f)
with open('result/predicted_test_entities_4feat.json', encoding='utf8', mode='w') as f:
    json.dump(test_pred_output, f)


| file      | Description |
| ----------- | ----------- |
| **model_1**: `models/crf_3_feat.crfsuite`   | CRF model, use 3 features, pos tag by corpus        |
| **model_2**: `models/crf_1_feat_me_tok.crfsuite`      | CRF model, use 1 feature (current token), pos tag by Pythai       |
| **model_3**: `models/crf_3_feat_me_tok.crfsuite`      | CRF model, use 3 features, pos tag by Pythai       |
| `result/test_pred_output_re.json` | output on *test set* by *model_1* |
| `result/predicted_dev_entities.json` | output on *dev set* by *model_1* |
| `result/predicted_train_entities.json` | output on *train set* by *model_1* |
| `result/predicted_test_entities_me_tok_1feat.json` | output on *test set* by *model_2* |
| `result/predicted_dev_entities_me_tok_1feat.json` | output on *dev set* by *model_2* |
| `result/predicted_test_entities_me_tok_3feat.json` | output on *test set* by *model_3* |
| `result/predicted_dev_entities_me_tok_3feat.json` | output on *dev set* by *model_3* |

# Make prediction on Test set
Running inference on the test set needs a small workaround, because we need the POS tags and test data from the corpus, plus a mapping back to the given test data.

to evaluate: `python evaluate.py data/newly_tokenized/dev_entities.json result/predicted_dev_entities_4feat.json`

In [19]:
# read test file
with open("data/raw/test_set.txt", "r", encoding="utf8") as f:
    test_file = f.readlines()

import re
# Rebuild the plain sentences: drop the trailing newline, then remove every "|"
# (presumably the token separator in the given file — confirm).
# The pattern is a raw string: "\|" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
test_seqs_given = [re.sub(r"\|", "", line.strip("\n")) for line in test_file]

In [20]:
# Index the predictions by sentence text: the key is the concatenation of each
# token's first field, the value is that sentence's predicted entities.
# Identical sentence texts collapse to one key; the last occurrence wins.
test_sentences = {}

for idx, token_rows in enumerate(test_seqs):
    surface = ''.join(row[0] for row in token_rows)
    test_sentences[surface] = test_pred_output[idx]


In [21]:
# Re-order the predictions to follow the given test file; a sentence missing
# from test_sentences raises KeyError.
test_pred_output_re = [test_sentences[given_sentence] for given_sentence in test_seqs_given]

In [22]:
import json

# Use a `with` block so the output file is flushed and closed deterministically
# rather than relying on garbage collection of an anonymous file object.
with open('result/test_pred_output_4feat.json', encoding='utf8', mode='w') as f:
    json.dump(test_pred_output_re, f)