In [2]:
from util import preprocess

# Load the training sequences. Judging by the sample displayed in the next
# cell, each sequence looks like a list of [word, label, POS tag] triples —
# TODO confirm against util.preprocess.get_sequence.
train_seqs = preprocess.get_sequence("data/newly_tokenized/train_auto_tok.tsv")

In [3]:
# Peek at the first training sequence to see what one entry looks like.
train_seqs[0]

[['รมว.', 'B_DES', 'NN'],
 ['พม.', 'B_ORG', 'NN'],
 ['ลั่น', 'O', 'VV'],
 ['เร่ง', 'O', 'VV'],
 ['ปราบปราม', 'O', 'VV'],
 ['"', 'O', 'PU'],
 ['ค้า', 'O', 'VV'],
 ['มนุษย์', 'O', 'NN'],
 ['"', 'O', 'PU']]

# Create features

In [4]:
def current_word(i, tokens):
    """Build the word feature for the token at position ``i``.

    Args:
        i: Index of the token of interest within ``tokens``.
        tokens: Sequence of token records; element 0 of each record is used
            as the word.

    Returns:
        A one-entry dict mapping ``'w0 word'`` to ``tokens[i][0]``.
    """
    word = tokens[i][0]
    return {'w0 word': word}

def current_pos(i, tokens):
    """Build the part-of-speech feature for the token at position ``i``.

    Args:
        i: Index of the token of interest within ``tokens``.
        tokens: Sequence of token records; element 2 of each record is used
            as the POS value.

    Returns:
        A one-entry dict mapping ``'w0 pos'`` to ``tokens[i][2]``.
    """
    pos_tag = tokens[i][2]
    return {'w0 pos': pos_tag}

def prev_pos(i, tokens):
    """Build POS features for up to two tokens preceding position ``i``.

    Args:
        i: Index of the token of interest within ``tokens``.
        tokens: Sequence of token records; element 2 of each record is used
            as the POS value.

    Returns:
        An empty dict when ``i`` is 0, a dict with ``'w-1 pos'`` only when
        ``i`` is 1, and otherwise a dict with both ``'w-1 pos'`` and
        ``'w-2 pos'``.
    """
    context = {}
    if i != 0:
        # There is at least one token to the left.
        context['w-1 pos'] = tokens[i-1][2]
        if i != 1:
            # ...and a second one further left.
            context['w-2 pos'] = tokens[i-2][2]
    return context

In [5]:
# Try each feature function on the token at index 2 of the first sequence.
sample_tokens = train_seqs[0]
for feature_fn in (current_word, current_pos, prev_pos):
    print(feature_fn(2, sample_tokens))

{'w0 word': 'ลั่น'}
{'w0 pos': 'VV'}
{'w-1 pos': 'NN', 'w-2 pos': 'NN'}


In [9]:
# Merge the output of several feature functions into one feature dict.
features = {}
features.update(current_pos(3, train_seqs[0]))
features.update(current_word(3, train_seqs[0]))
# NOTE(review): the two calls above use token index 3 but this one uses 2, so
# the context features describe a different token than 'w0 word'/'w0 pos'.
# This likely should be 3 — confirm and re-run the cell (the recorded output
# would change).
features.update(prev_pos(2, train_seqs[0]))
print(features)

{'w0 pos': 'VV', 'w0 word': 'เร่ง', 'w-1 pos': 'NN', 'w-2 pos': 'NN'}


In [6]:
# Featurize text into feature sequences
def featurize(feature_function_list, tokens):
    """Turn a token sequence into a sequence of feature dicts.

    Args:
        feature_function_list: Iterable of callables invoked as
            ``fn(i, tokens)``, each returning a dict of features.
        tokens: The token sequence to featurize.

    Returns:
        A list with one dict per position in ``tokens``. Each dict is the
        merge of every feature function's output for that position; on a key
        collision the function later in the list wins.
    """
    def features_at(position):
        # Merge the output of every feature function for a single token.
        merged = {}
        for extract in feature_function_list:
            merged.update(extract(position, tokens))
        return merged

    return [features_at(position) for position in range(len(tokens))]
    
# Feature functions applied to every token, in this order.
feature_function_list = [current_word, current_pos, prev_pos]

In [7]:
# One list of feature dicts per training sequence.
feature_seq_list = [featurize(feature_function_list, seq) for seq in train_seqs]

In [9]:
def labelize(tokens):
    """Extract the label of every token in a sequence.

    Args:
        tokens: Sequence of token records; element 1 of each record is used
            as the label.

    Returns:
        A list holding ``record[1]`` for each record, in order.
    """
    labels = []
    for record in tokens:
        labels.append(record[1])
    return labels

In [10]:
# One list of labels per training sequence.
label_seq_list = list(map(labelize, train_seqs))

# Train CRF

In [12]:
import sklearn_crfsuite
# CRF() is constructed with no arguments, i.e. with the library's default settings.
crf = sklearn_crfsuite.CRF()
# Fit on two parallel lists: one feature-dict sequence and one label sequence
# per entry of train_seqs.
crf.fit(feature_seq_list, label_seq_list)