## NER Implementation
Below is the code used to implement a Named Entity Recognizer (NER) application in Python using the CRFsuite library.

In [1]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
import io
import nltk
from nltk import pos_tag
import scipy
import codecs
import sklearn
import pycrfsuite
import pandas as pd
from itertools import chain
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn_crfsuite.metrics import flat_classification_report

print('sklearn version:', sklearn.__version__)
print('Libraries succesfully loaded!')

sklearn version: 0.21.2
Libraries succesfully loaded!


In [2]:
def sent2features_train(sent, feature_func):
    sentences, labels = zip(*sent)
    sentences = list(sentences)
    pos_tagged = pos_tag(sentences)
    return [feature_func(pos_tagged, i) for i in range(len(sent))]

def sent2features(sent, feature_func):
    sentences = [i[0] for i in sent]
    pos_tagged = pos_tag(sentences)
    return [feature_func(pos_tagged, i) for i in range(len(sent))]

def sent2labels(sent):
    return [s[-1] for s in sent]

def sent2tokens(sent):
    return [s[0] for s in sent]

def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly.
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(y_true)
    y_pred_combined = lb.transform(y_pred)
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )
            
def word2simple_features(sent, i):
    '''
    Performs feature extraction on the given sentence input.
    Used as a mapping function by the above functions that convert sentences to features.
    '''
    word = sent[i][0]
    pos_tag = sent[i][1]
    
    features = {
        'bias': 1.0, # This feature is constant for all words.
        'word.lower()': word.lower(), # This feature is the word, ignoring case.
        'word[-2:]': word[-2:], # This feature is the last two characters of the word (i.e. the suffix).
        'word[-3:]': word[-3:], # This feature is the last three characters of the word (also the suffix).
        'word.isupper()': word.isupper(), # This feature is a binary indicator of whether the word is capitalized.
        'word.istitle()': word.istitle(), # This feature is a binary indicator of whether the word is titled.
        'word.isdigit()': word.isdigit(), # This feature is a binary indicator of whether the word is numeric or not.
        'pos_tag': pos_tag
    }
    if i == 0:
        features['BOS'] = True # Mark the beginning of sentence.
    
    if i > 0:
        prev_word = sent[i-1][0]
        prev_pos_tag = sent[i-1][1]
        features.update({
            'prev_word.lower()': prev_word.lower(),
            'prev_word.isupper()': prev_word.isupper(),
            'prev_word.istitle()': prev_word.istitle(),
            'prev_pos_tag': prev_pos_tag
        })
        
    if i == len(sent)-1:
        features['EOS'] = True # Mark the end of sentence.
        
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_pos_tag = sent[i+1][1]
        features.update({
            'next_word.lower()': next_word.lower(),
            'next_word.isupper()': next_word.isupper(),
            'next_word.istitle()': next_word.istitle(),
            'next_pos_tag': next_pos_tag
        })       

    return features

# load data and preprocess
def extract_data(path):
    """
    Extracting data from train file or test file. 
    path - the path of the file to extract
    
    return:
        res - a list of sentences, each sentence is a
              a list of tuples. For train file, each tuple
              contains token and label. For test file, each
              tuple only contains token.
        ids - a list of ids for the corresponding token. This
              is mainly for a Kaggle submission.
    """
    file = io.open(path, mode="r", encoding="utf-8")
    next(file)
    res = []
    ids = []
    sent = []
    for line in file:
        if line != '\n':
            # Each line contains the position ID, the token, and (for the training set) the label.
            parts = line.strip().split(' ')
            sent.append(tuple(parts[1:]))
            ids.append(parts[0])
        else:
            res.append(sent)
            sent = []
           
    return res, ids

In [3]:
%%time
# Load train and test data
train_data, train_ids = extract_data('train')
test_data, test_ids = extract_data('test')

# Load true labels for test data
test_labels = list(pd.read_csv('test_ground_truth').loc[:, 'label'])

print('Train and Test data loaded succesfully!')

# Feature extraction using the word2simple_features function
train_features = [sent2features_train(s, feature_func=word2simple_features) for s in train_data]
train_labels = [sent2labels(s) for s in train_data]
test_features = [sent2features(s, feature_func=word2simple_features) for s in test_data]

trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(train_features, train_labels):
    trainer.append(xseq, yseq)
print('Feature Extraction done!')    

# Explore the extracted features    
sent2features_train(train_data[0], word2simple_features)

Train and Test data loaded succesfully!
Feature Extraction done!
Wall time: 1min 6s


[{'bias': 1.0,
  'word.lower()': 'también',
  'word[-2:]': 'én',
  'word[-3:]': 'ién',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'pos_tag': 'NNP',
  'BOS': True,
  'next_word.lower()': 'el',
  'next_word.isupper()': False,
  'next_word.istitle()': False,
  'next_pos_tag': 'NN'},
 {'bias': 1.0,
  'word.lower()': 'el',
  'word[-2:]': 'el',
  'word[-3:]': 'el',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'pos_tag': 'NN',
  'prev_word.lower()': 'también',
  'prev_word.isupper()': False,
  'prev_word.istitle()': True,
  'prev_pos_tag': 'NNP',
  'next_word.lower()': 'secretario',
  'next_word.isupper()': False,
  'next_word.istitle()': False,
  'next_pos_tag': 'NN'},
 {'bias': 1.0,
  'word.lower()': 'secretario',
  'word[-2:]': 'io',
  'word[-3:]': 'rio',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'pos_tag': 'NN',
  'prev_word.lower()': 'el',
  'prev_word.isupper()': False,


In [4]:
# Explore the model parameters
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [5]:
# Change to training algorithm to pa as it seems to yield better results given experimentation using the data.
trainer.select('pa')

In [6]:
trainer.set_params({
    
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [7]:
%%time
trainer.train('ner-esp.model')

print('Training done :)')

Training done :)
Wall time: 44.7 s


In [8]:
# Make predictions
tagger = pycrfsuite.Tagger()
tagger.open('ner-esp.model')
test_pred = [tagger.tag(xseq) for xseq in test_features]
test_pred = [s for w in test_pred for s in w]

# Print evaluation
print(bio_classification_report(test_pred, test_labels))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.82      0.82      0.82      2039
       I-LOC       0.72      0.78      0.75       702
      B-MISC       0.62      0.69      0.66       786
      I-MISC       0.59      0.72      0.65      1029
       B-ORG       0.84      0.87      0.86      3123
       I-ORG       0.85      0.81      0.83      2313
       B-PER       0.90      0.91      0.91      1881
       I-PER       0.94      0.94      0.94      1627

   micro avg       0.82      0.84      0.83     13500
   macro avg       0.79      0.82      0.80     13500
weighted avg       0.82      0.84      0.83     13500
 samples avg       0.10      0.10      0.10     13500



In [9]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

100 {'num': 100, 'scores': {}, 'loss': 2098.923487, 'feature_norm': 65.791975, 'time': 0.4}


## Check what the classifier has learned

In [10]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
I-ORG  -> I-ORG   1.951016
B-ORG  -> I-ORG   1.909189
I-MISC -> I-MISC  1.701659
B-MISC -> I-MISC  1.554546
I-LOC  -> I-LOC   1.478325
B-LOC  -> I-LOC   1.472591
B-PER  -> I-PER   1.404586
I-PER  -> I-PER   1.315277
O      -> B-ORG   1.136446
O      -> B-MISC  1.097690
O      -> O       1.093353
O      -> B-PER   0.893465
O      -> B-LOC   0.707388
I-PER  -> B-LOC   0.210940
I-MISC -> B-MISC  0.165571

Top unlikely transitions:
B-LOC  -> I-ORG   -0.390828
B-ORG  -> I-LOC   -0.395795
I-ORG  -> I-MISC  -0.398521
I-PER  -> I-MISC  -0.409148
B-ORG  -> B-ORG   -0.418304
I-LOC  -> B-PER   -0.423404
B-MISC -> B-MISC  -0.430383
I-MISC -> I-LOC   -0.435844
I-ORG  -> B-LOC   -0.451822
I-PER  -> B-MISC  -0.539836
I-ORG  -> I-LOC   -0.601203
O      -> I-ORG   -0.910523
O      -> I-MISC  -0.983119
O      -> I-PER   -1.003715
O      -> I-LOC   -1.108540


## Check the state features

In [11]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
2.284351 B-ORG  word.lower():efe-cantabria
1.897555 B-ORG  word.lower():petrobras
1.816511 B-ORG  word.lower():psoe-progresistas
1.811980 B-ORG  word.lower():xfera
1.623314 I-PER  prev_word.lower():antoñete
1.539890 B-LOC  word.lower():líbano
1.525887 B-LOC  prev_word.lower():celebrarán
1.505443 B-ORG  word.lower():terra
1.489606 I-ORG  prev_word.lower():bt
1.410677 B-ORG  word.lower():esquerra
1.401450 B-ORG  word.lower():eu-ecologista
1.394640 B-ORG  word.lower():coag-extremadura
1.381444 B-PER  word.lower():franca
1.380736 I-ORG  prev_word.lower():l
1.356953 B-ORG  word.lower():telefónica
1.350667 O      word.lower():y
1.335575 B-ORG  prev_word.lower():mayorista
1.331495 I-ORG  prev_word.lower():ag
1.319324 B-ORG  word.lower():cámara
1.314306 B-ORG  word.lower():amena

Top negative:
-0.683210 O      prev_word.lower():de
-0.685269 O      word[-3:]:LOS
-0.685784 O      prev_word.lower():cantidad
-0.687172 I-PER  word[-3:]:ico
-0.730285 O      word.lower():061
-0.733810 B