# NER CRF Example

### Install required packages

* nltk
* python-crfsuite

In [None]:
#optional, if you have installed those python modules, you can skip this stsp.
!pip install nltk
!pip install python-crfsuite
!python -m nltk.downloader all

Now we need to import those packages into our code.

In [1]:
import nltk
import pycrfsuite

In [2]:
# List the file ids available in the CoNLL-2000 corpus reader
# (the cell output shows the 'train.txt' and 'test.txt' splits).
nltk.corpus.conll2000.fileids()

['train.txt', 'test.txt']

### Get the training data and testing data. 

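The dataset consists of sentences that have already been tokenized and tagged in IOB (Inside–Outside–Beginning) format. The snippet below shows what IOB tagging looks like for named entities; the CoNLL-2000 corpus used here applies the same scheme to phrase chunks (e.g. `B-NP`, `I-NP`, `B-VP`).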


```
Alex B-PER
is O
going O
to O
Los B-LOC
Angeles I-LOC
```

In [3]:
# Load both CoNLL-2000 splits into lists so they can be indexed and reused.
train_sent = list(nltk.corpus.conll2000.iob_sents('train.txt'))
test_sent = list(nltk.corpus.conll2000.iob_sents('test.txt'))

# Show the first training sentence: a list of 3-item tuples such as
# ('Confidence', 'NN', 'B-NP'), i.e. (token, POS tag, IOB label).
print(train_sent[0])

[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'B-PP'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'B-VP'), ('widely', 'RB', 'I-VP'), ('expected', 'VBN', 'I-VP'), ('to', 'TO', 'I-VP'), ('take', 'VB', 'I-VP'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'B-SBAR'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'B-PP'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'B-ADJP'), ('for', 'IN', 'B-PP'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'B-VP'), ('to', 'TO', 'I-VP'), ('show', 'VB', 'I-VP'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'B-PP'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]


### Define functions that convert a sentence into per-token text features

我們會觀察前後字然後納為特徵 (We also look at the previous and next words and include them as features.)

In [4]:
def word2features(sent, i):
    """Build the list of feature strings for the token at position ``i``.

    Args:
        sent: Sequence of token tuples whose first item is the word and
            whose second item is its POS tag, e.g. ``('pound', 'NN', 'I-NP')``.
        i: Index of the token in ``sent`` to describe.

    Returns:
        A list of strings: features of the token itself, followed by
        features of the previous token (or ``'BOS'`` when ``i`` is the first
        position), followed by features of the next token (or ``'EOS'`` when
        ``i`` is the last position).
    """
    word = sent[i][0]  # token text
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2]
    ]

    # Look at the previous word.
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            # Coarse (2-character) tag of the PREVIOUS token; this must use
            # postag1, not the current token's postag.
            '-1:postag[:2]=' + postag1[:2]
        ])
    else:
        # No previous token: mark beginning of sentence.
        features.append('BOS')

    # Look at the next word.
    if i < len(sent) - 1:
        word2 = sent[i + 1][0]
        postag2 = sent[i + 1][1]
        features.extend([
            '+1:word.lower=' + word2.lower(),
            '+1:word.istitle=%s' % word2.istitle(),
            '+1:word.isupper=%s' % word2.isupper(),
            '+1:postag=' + postag2,
            '+1:postag[:2]=' + postag2[:2]
        ])
    else:
        # No following token: mark end of sentence.
        features.append('EOS')
    return features
    
def sent2features(sent):
    """Return the per-token feature lists for every position in ``sent``."""
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features
    
def sent2labels(sent):
    """Return the label (third tuple item) of every token in ``sent``, in order."""
    labels = []
    for _token, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token text (first tuple item) of every entry in ``sent``, in order."""
    tokens = []
    for text, _pos, _tag in sent:
        tokens.append(text)
    return tokens

### Implement our training function

In [5]:
def train(sentences=None, model_path='mytrain_model'):
    """Train a CRF tagger and write the model to ``model_path``.

    Args:
        sentences: Iterable of sentences, each a list of
            ``(token, pos, label)`` tuples. Defaults to the notebook-level
            ``train_sent`` so that a bare ``train()`` call keeps working.
        model_path: File name the trained model is written to.
    """
    if sentences is None:
        sentences = train_sent

    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params({
        'c1': 1.0,              # coefficient for the L1 penalty
        'c2': 1e-3,             # coefficient for the L2 penalty
        'max_iterations': 50,   # cap on optimizer iterations
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    # Feed the trainer one sentence at a time instead of first building two
    # full lists of features and labels for the whole corpus.
    for sent in sentences:
        trainer.append(sent2features(sent), sent2labels(sent))

    trainer.train(model_path)

In [6]:
# Fit the CRF on the training sentences; the model is written to 'mytrain_model'.
train()

To use the model we just trained, we can call `.open()` on a `pycrfsuite.Tagger` to load it.

In [7]:
def predict(sentence=None, model_path='mytrain_model'):
    """Tag one sentence with the trained model and print the result.

    Prints the sentence's tokens, then the predicted labels, then the
    reference labels taken from the sentence itself.

    Args:
        sentence: A list of ``(token, pos, label)`` tuples. Defaults to
            ``test_sent[3]`` so that a bare ``predict()`` call keeps working.
        model_path: File name of the model to load.
    """
    example_set = test_sent[3] if sentence is None else sentence

    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    try:
        print(' '.join(sent2tokens(example_set)), end='\n\n')
        print("Predicted:", ' '.join(tagger.tag(sent2features(example_set))))
        print("Correct:  ", ' '.join(sent2labels(example_set)))
    finally:
        # Release the loaded model even if tagging or printing fails.
        tagger.close()

In [8]:
# Tag one test sentence and print the predicted labels next to the reference labels.
predict()

Under the existing contract , Rockwell said , it has already delivered 793 of the shipsets to Boeing .

Predicted: B-PP B-NP I-NP I-NP O B-NP B-VP O B-NP B-VP I-VP I-VP B-NP B-PP B-NP I-NP B-PP B-NP O
Correct:   B-PP B-NP I-NP I-NP O B-NP B-VP O B-NP B-VP I-VP I-VP B-NP B-PP B-NP I-NP B-PP B-NP O
