<a href="https://colab.research.google.com/github/slee987/LIS640tmp/blob/main/lis640_week10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1 Set Up Environment

Download CRFsuite to your computer: https://www.chokkan.org/software/crfsuite/
* Windows: https://github.com/downloads/chokkan/crfsuite/crfsuite-0.12_win32.zip
* Linux: https://github.com/downloads/chokkan/crfsuite/crfsuite-0.12-x86_64.tar.gz
* Mac: build your own from source

## 2 Explore the CoNLL 2003 NER dataset

Download the dataset (three files) from Canvas:
* training set: CoNLL2003_ner_train.txt
* validation set: CoNLL2003_ner_valid.txt
* test set: CoNLL2003_ner_test.txt

In [None]:
# utility functions for reading the dataset

def load_dataset(path, encoding=None):
    """Read a CoNLL-style annotated file into a list of sentences.

    Each non-blank line is split on single spaces into exactly four fields.
    Blank lines and lines starting with ``-DOCSTART-`` separate sentences.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default.

    Returns:
        A list of sentences; each sentence is a list of tokens and each
        token is a list of four strings.

    Raises:
        AssertionError: If a token line does not have exactly four
            space-separated fields.
    """
    sents = []
    sent = []
    with open(path, encoding=encoding) as f:
        # Iterate the file lazily instead of materialising every line.
        for lineno, raw in enumerate(f, start=1):
            line = raw.rstrip()
            if not line or line.startswith('-DOCSTART-'):
                # Sentence boundary: flush the tokens collected so far.
                if sent:
                    sents.append(sent)
                    sent = []
                continue
            token = line.split(' ')
            if len(token) != 4:
                # Raised explicitly so the check is not stripped under -O.
                raise AssertionError(
                    '%s:%d: expected 4 fields, got %d'
                    % (path, lineno, len(token))
                )
            sent.append(token)
    # A file without a trailing blank line still ends with a pending
    # sentence; keep it instead of silently dropping it.
    if sent:
        sents.append(sent)
    return sents

# File names of the three dataset splits, resolved relative to the
# current working directory.
path_train = 'CoNLL2003_ner_train.txt'
path_valid = 'CoNLL2003_ner_valid.txt'
path_test = 'CoNLL2003_ner_test.txt'

# Parse each split into a list of sentences.
train = load_dataset(path_train)
valid = load_dataset(path_valid)
test = load_dataset(path_test)

# Report the number of sentences in each split (train, valid, test).
print(*(len(split) for split in (train, valid, test)))

In [None]:
# each sentence is a list of tokens (including the word, its POS tag, NP tag, and NER tag)
# The expression below evaluates to the first sentence of the training split.

train[0]

In [None]:
# Second token (index 1) of the first training sentence: a list of four fields.
train[0][1]

## 3 Feature Extraction

In [None]:
# feature template functions

import re

# w[offset]=? for the ith word in a sentence
def feat_w(sent, i, offset):
    """Return the word-identity feature for the token at ``i + offset``.

    The result is a one-element list ``['w[<offset>]=<word>']``. Positions
    before the sentence start yield ``<S>``; positions at or past the end
    yield ``</S>``.
    """
    position = i + offset
    if position < 0:
        word = '<S>'
    elif position >= len(sent):
        word = '</S>'
    else:
        # Field 0 of a token is the surface word.
        word = sent[position][0]
    return ['w[%d]=%s' % (offset, word)]

# pos[offset]=? for the ith word in a sentence
def feat_pos(sent, i, offset):
    """Return the POS-tag feature for the token at ``i + offset``.

    The result is a one-element list ``['pos[<offset>]=<tag>']``. Positions
    before the sentence start yield ``<S>``; positions at or past the end
    yield ``</S>``.
    """
    position = i + offset
    if position < 0:
        tag = '<S>'
    elif position >= len(sent):
        tag = '</S>'
    else:
        # Field 1 of a token is read as its POS tag.
        tag = sent[position][1]
    return ['pos[%d]=%s' % (offset, tag)]

# character k-gram for the ith word in a sentence
def feat_charngram(sent, i, k):
    """Return every contiguous character k-gram of the ith word.

    Each feature has the form ``'char-<k>gram=<substring>'``, listed from
    left to right. A word shorter than ``k`` yields an empty list.
    """
    word = sent[i][0]
    prefix = 'char-%dgram=' % k
    grams = []
    # Slide a window of width k across the word.
    for start in range(len(word) - k + 1):
        grams.append(prefix + word[start:start + k])
    return grams

# a simple word shape feature: uppercase letter-->X, lowercase-->x, digit-->d
def feat_wordshape(sent, i):
    """Return the word-shape feature of the ith word as ``['shape=<shape>']``.

    ASCII uppercase letters map to ``X``, ASCII lowercase letters to ``x``
    and ASCII digits to ``d``; every other character is kept unchanged.
    """
    shape = sent[i][0]
    # Apply the substitutions in order: uppercase, lowercase, digits.
    for pattern, symbol in (('[A-Z]', 'X'), ('[a-z]', 'x'), ('[0-9]', 'd')):
        shape = re.sub(pattern, symbol, shape)
    return ['shape=' + shape]

In [None]:
# Example: character 2-grams of the first word of the first training sentence.
feat_charngram( train[0], 0, 2 )

In [None]:
# Example: word-shape feature of the first word of the first training sentence.
feat_wordshape( train[0], 0 )

In [None]:
# output features to a file (tab separated)

def output_feature_file(path, sents):
    """Write one tab-separated feature line per token to ``path``.

    Each line holds the token's label (field 3) followed by its features:
    word and POS features at offsets 0, +1, +2, -1, -2, then character
    3-grams, character 4-grams and the word shape. An empty line is
    written after every sentence.
    NOTE(review): this layout is presumably the CRFsuite input format
    used in section 4; confirm against the CRFsuite manual.

    Args:
        path: Output file path; an existing file is overwritten.
        sents: Iterable of sentences, each a list of 4-field tokens.
    """
    # The offset order is part of the output format; do not sort it.
    offsets = (0, 1, 2, -1, -2)

    # The context manager closes the file even if a feature function raises.
    with open(path, 'w') as f:
        for sent in sents:
            for i, token in enumerate(sent):
                label = token[3]

                feats = []
                for offset in offsets:
                    feats += feat_w(sent, i, offset)
                for offset in offsets:
                    feats += feat_pos(sent, i, offset)
                feats += feat_charngram(sent, i, 3)
                feats += feat_charngram(sent, i, 4)
                feats += feat_wordshape(sent, i)

                f.write(label + '\t' + '\t'.join(feats) + '\n')

            # A blank line marks the end of the sentence.
            f.write('\n')

In [None]:
# Write the features of the training sentences to feats_train.txt.
output_feature_file( 'feats_train.txt', train )

In [None]:
# Write the features of the test sentences to feats_test.txt.
output_feature_file( 'feats_test.txt', test )

## 4 Train & Test Models

Download CRFsuite (prebuilt binaries, or build it yourself from source) and go to the /bin folder. There will be an executable file named crfsuite.

Help INFO: crfsuite --help

Training Help INFO: crfsuite learn --help

Testing Help INFO: crfsuite tag --help

Dump model Help INFO: crfsuite dump --help

https://www.chokkan.org/software/crfsuite/