# Training an NER Extractor on the Groningen MEANING BANK

In [46]:
import os
import collections
from nltk.chunk import conlltags2tree, tree2conlltags

In [25]:
corpus_root = "./models/gmb-2.2.0/data/"


def count_ner_tags(corpus_root):
    """
    Count how often each primary NER category occurs in the corpus.

    `corpus_root` = directory that is walked recursively for files ending in '.tags'

    Each non-empty line of a '.tags' file is one token whose annotations are
    tab-separated; the NER label is read from the fourth column. Sentences are
    separated by a blank line. Sub-categories are dropped by keeping only the
    part of the label before the first '-'.

    Returns a collections.Counter mapping category -> number of tokens.
    """
    counts = collections.Counter()

    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if not filename.endswith('.tags'):
                continue

            with open(os.path.join(root, filename), 'rb') as file_handle:
                file_content = file_handle.read().decode('utf-8').strip()

            # Split the sentences according to markup
            for annotated_sentence in file_content.split('\n\n'):
                for annotated_token in annotated_sentence.split('\n'):
                    # An empty file (or a stray blank line) produces an empty
                    # token string; skip it instead of failing on the column lookup.
                    if not annotated_token:
                        continue

                    ner = annotated_token.split('\t')[3]

                    # To get only the primary categories
                    if ner != 'O':
                        ner = ner.split('-')[0]

                    counts[ner] += 1

    return counts


ner_tags = count_ner_tags(corpus_root)

print(ner_tags)

Counter({'O': 1146068, 'geo': 58388, 'org': 48094, 'per': 44254, 'tim': 34789, 'gpe': 20680, 'art': 867, 'eve': 709, 'nat': 300})


The Corpus is annotated with the following Entity Types:

* geo = Geographical Entity
* org = Organization
* per = Person
* gpe = Geopolitical Entity
* tim = Time indicator
* art = Artifact
* eve = Event
* nat = Natural Phenomenon

## Training our own System on the Groningen Meaning Bank Corpus

### First we create a Feature Extractor to prepare the Training Data

In [38]:
import string
from nltk.stem.snowball import SnowballStemmer

def _is_all_caps(token):
    """Return True when `token` contains cased characters and all of them are upper case."""
    return token.isupper()


def _is_capitalized(token):
    """Return True when `token` starts with an ASCII upper-case letter (False if empty)."""
    return bool(token) and token[0] in string.ascii_uppercase


def features(tokens, index, history):
    """
    Build the feature dictionary for a single token of a sentence.

    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags

    Returns a dict of features describing the token itself, the two tokens
    before it, the two tokens after it and the IOB tag predicted for the
    previous token. Positions outside the sentence are filled with the
    '[START1]' / '[START2]' / '[END1]' / '[END2]' placeholders.
    The '*lemma' values are Snowball stems, not dictionary lemmas.
    """

    # Init the stemmer once and reuse it: this function runs for every token,
    # and the stemmer itself does not depend on the input.
    stemmer = getattr(features, '_stemmer', None)
    if stemmer is None:
        stemmer = features._stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # Shift the indexes by 2
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word

    # The membership test must be the predicate, not the filter: filtering
    # first left only True values, so the feature was constantly True.
    # Letters of either case are accepted because case is already captured
    # by the 'capitalized' / 'all-caps' features.
    allascii = all(c in string.ascii_letters for c in word)

    # str.capitalize() yields "Word", so comparing against it detected
    # capitalised words rather than all-caps ones.
    allcaps = _is_all_caps(word)
    capitalized = _is_capitalized(word)

    prevallcaps = _is_all_caps(prevword)
    prevcapitalized = _is_capitalized(prevword)

    nextallcaps = _is_all_caps(nextword)
    nextcapitalized = _is_capitalized(nextword)

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }

### Next we create a few utility functions to help in training

In [55]:
def to_conll_iob(annotated_sentence):
    """
    Rewrite the entity label of every token in proper IOB notation.

    `annotated_sentence` = list of triplets [(w1, t1, label1), ...]
    Pseudo-IOB input:  O, PERSON, PERSON, O, O, LOCATION, O
    Proper IOB output: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O

    The first two items of each triplet are passed through untouched.
    """
    converted = []
    # Raw (un-prefixed) label of the preceding token; None before the first one
    previous_label = None

    for word, pos, label in annotated_sentence:
        iob_label = label
        if label != 'O':
            # A label that repeats the previous one continues the entity,
            # anything else (including the first token) opens a new one
            prefix = 'I-' if label == previous_label else 'B-'
            iob_label = prefix + label
        converted.append((word, pos, iob_label))
        previous_label = label

    return converted


def read_gmb(corpus_root):
    """
    Lazily read the '.tags' files below `corpus_root`, one sentence at a time.

    `corpus_root` = directory that is walked recursively for files ending in '.tags'

    Each non-empty line of a file is one token with tab-separated annotations:
    column 1 is the word, column 2 the POS tag and column 4 the NER label.
    Sentences are separated by a blank line.

    Yields one list per sentence in the form [((w1, t1), iob1), ...].
    Directories and files are visited in sorted order so that repeated runs
    produce the sentences in the same sequence.
    """
    for root, dirs, files in os.walk(corpus_root):
        # os.walk reports entries in arbitrary order; sorting `dirs` in place
        # also fixes the order in which sub-directories are descended into.
        dirs.sort()

        for filename in sorted(files):
            if not filename.endswith(".tags"):
                continue

            # Read the whole file up front so the handle is closed before the
            # first yield instead of staying open while the generator is suspended.
            with open(os.path.join(root, filename), 'rb') as file_handle:
                file_content = file_handle.read().decode('utf-8').strip()

            for annotated_sentence in file_content.split('\n\n'):
                standard_form_tokens = []

                for annotated_token in annotated_sentence.split('\n'):
                    if not annotated_token:
                        continue

                    annotations = annotated_token.split('\t')
                    word, tag, ner = annotations[0], annotations[1], annotations[3]

                    # Keep only the primary category
                    if ner != 'O':
                        ner = ner.split('-')[0]

                    if tag in ('LQU', 'RQU'):   # Make it NLTK compatible
                        tag = "``"

                    standard_form_tokens.append((word, tag, ner))

                # An empty file would otherwise be emitted as an empty sentence
                if not standard_form_tokens:
                    continue

                conll_tokens = to_conll_iob(standard_form_tokens)

                # Make it NLTK Classifier compatible - [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
                # Because the classifier expects a tuple as input, first item input, second the class
                yield [((w, t), iob) for w, t, iob in conll_tokens]
 
# read_gmb is a generator function, so this only creates the generator;
# no corpus file is opened until `reader` is iterated.
reader = read_gmb(corpus_root)

Check the output

In [54]:
# Pull only the first ten sentences from the generator; zip stops as soon as
# range(10) is exhausted, so the rest of the corpus is not read just for a preview.
[sentence for _, sentence in zip(range(10), reader)]

[[(('Thousands', 'NNS'), 'O'),
  (('of', 'IN'), 'O'),
  (('demonstrators', 'NNS'), 'O'),
  (('have', 'VBP'), 'O'),
  (('marched', 'VBN'), 'O'),
  (('through', 'IN'), 'O'),
  (('London', 'NNP'), 'B-geo'),
  (('to', 'TO'), 'O'),
  (('protest', 'VB'), 'O'),
  (('the', 'DT'), 'O'),
  (('war', 'NN'), 'O'),
  (('in', 'IN'), 'O'),
  (('Iraq', 'NNP'), 'B-geo'),
  (('and', 'CC'), 'O'),
  (('demand', 'VB'), 'O'),
  (('the', 'DT'), 'O'),
  (('withdrawal', 'NN'), 'O'),
  (('of', 'IN'), 'O'),
  (('British', 'JJ'), 'B-gpe'),
  (('troops', 'NNS'), 'O'),
  (('from', 'IN'), 'O'),
  (('that', 'DT'), 'O'),
  (('country', 'NN'), 'O'),
  (('.', '.'), 'O')],
 [(('Families', 'NNS'), 'O'),
  (('of', 'IN'), 'O'),
  (('soldiers', 'NNS'), 'O'),
  (('killed', 'VBN'), 'O'),
  (('in', 'IN'), 'O'),
  (('the', 'DT'), 'O'),
  (('conflict', 'NN'), 'O'),
  (('joined', 'VBD'), 'O'),
  (('the', 'DT'), 'O'),
  (('protesters', 'NNS'), 'O'),
  (('who', 'WP'), 'O'),
  (('carried', 'VBD'), 'O'),
  (('banners', 'NNS'), 'O'),
  

## Train the Model and Save it for later use

In [56]:
import pickle
from collections.abc import Iterable

from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI

class NamedEntityChunker(ChunkParserI):
    """Chunk parser that labels named entities with a classifier-based IOB tagger."""

    def __init__(self, train_sents, **kwargs):
        """
        Build the underlying tagger from the training data.

        `train_sents` = iterable of training sentences, handed to the tagger as `train`
        `kwargs`      = extra keyword arguments forwarded unchanged to ClassifierBasedTagger
        """
        assert isinstance(train_sents, Iterable)

        # The module-level `features` function is the feature detector
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.feature_detector,
            **kwargs)

    def parse(self, tagged_sent):
        """
        Tag `tagged_sent` and return the result converted by conlltags2tree.

        `tagged_sent` = the sentence passed on to the tagger's `tag` method
        """
        predictions = self.tagger.tag(tagged_sent)

        # Flatten the tagger output [((w1, t1), iob1), ...]
        # into the preferred list of triplets [(w1, t1, iob1), ...]
        iob_triplets = []
        for (word, pos), iob in predictions:
            iob_triplets.append((word, pos, iob))

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

#### Let's build the Dataset

In [57]:
# Materialise the corpus once, then cut it 90% / 10% in reading order
reader = read_gmb(corpus_root)
data = list(reader)

split_at = int(len(data) * 0.9)
training_samples, test_samples = data[:split_at], data[split_at:]

print('# Training Samples: ', len(training_samples))
print('# Test Samples: ', len(test_samples))

# Training Samples:  55809
# Test Samples:  6201


#### Train the Model

In [58]:
# Constructing the chunker builds its ClassifierBasedTagger inside
# NamedEntityChunker.__init__, with `training_samples` passed as the `train` data.
chunker = NamedEntityChunker(training_samples)

#### Let's try the model

In [60]:
from nltk import pos_tag, word_tokenize

# Tokenize and POS-tag the raw sentence first, then let the chunker parse it
sentence = "George Bush visited Germany this Monday"
tagged_sentence = pos_tag(word_tokenize(sentence))
print(chunker.parse(tagged_sentence))

(S
  (per George/NNP Bush/NNP)
  visited/VBD
  (geo Germany/NNP)
  this/DT
  Monday/NNP)


In [64]:
# Store the Model
# Create the target directory first: open() does not create missing parent
# directories, and failing here would lose the chunker that was just trained.
model_path = './models/001_production/GMB_NERTagger.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(chunker, model_file)