In [1]:
from __future__ import unicode_literals, print_function
import os
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

### Helpers

In [2]:
def save_model(output_dir, nlp):
    """Write ``nlp`` to ``output_dir``, creating the directory when needed.

    Args:
        output_dir: Destination directory (string or path-like). When it is
            ``None`` the function returns without doing anything.
        nlp: Object exposing ``to_disk(path)``; it receives the destination
            as a ``pathlib.Path``.

    Returns:
        None. Prints a confirmation line after saving.
    """
    if output_dir is None:
        return
    output_dir = Path(output_dir)
    # parents=True allows nested destinations whose parent folders do not exist
    # yet; exist_ok=True removes the gap between an existence check and mkdir.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

def load_model(output_dir):
    """Load a pipeline from ``output_dir`` via ``spacy.load`` and return it.

    The location is printed before loading starts.
    """
    print("Loading from", output_dir)
    return spacy.load(output_dir)

def test_model(train_data, nlp):
    for text, _ in train_data:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

def ner_trainer(train_data, config):
    """Train the "ner" pipe of a spaCy pipeline on ``train_data``.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs. ``annotations``
            is a dict; its optional ``"entities"`` value is a sequence of
            tuples whose third item (index 2) is the entity label. The
            caller's object is left untouched: a private copy is shuffled
            between iterations.
        config: Dict with
            ``'n_iter'`` -- number of passes over the data (required);
            ``'model'`` -- value handed to ``spacy.load``. When the key is
            missing or ``None`` a blank ``"en"`` pipeline is created instead.

    Returns:
        The pipeline object after training.

    NOTE(review): the ``create_pipe`` / ``add_pipe(component)`` /
    ``update(texts, annotations)`` call style looks like the spaCy 2.x API --
    confirm against the installed version.
    """
    n_iter = config['n_iter']
    model = config.get('model')

    # Private copy: random.shuffle() below reorders its argument in place, and
    # the caller's list must not change as a side effect of training.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's "ner" component when present, otherwise append one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label seen in the data; examples without an "entities"
    # entry (or with None) simply contribute nothing.
    for _, annotations in examples:
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # begin_training() is called only for a freshly created blank pipeline;
        # a loaded model skips it (presumably to keep its existing weights --
        # confirm against the spaCy docs).
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    return nlp



### Configuration and Training

In [3]:
# Toy training corpus: each example pairs a text with its entity spans, given
# as (start, end, label) character offsets into that text.
train_data = [
    (
        "Who is Shaka Khan?",
        {"entities": [(7, 17, "PERSON")]},
    ),
    (
        "I like London and Berlin.",
        {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]},
    ),
]

# Settings read by ner_trainer: number of training iterations, and the model
# to start from (None means start from a blank pipeline).
config = dict(n_iter=100, model=None)

In [4]:

# Make sure the output directory exists. makedirs(exist_ok=True) replaces the
# separate isdir()/mkdir() steps, so nothing can slip in between check and create.
os.makedirs('model_data', exist_ok=True)

current_dir = os.getcwd()
NEW_MODEL = os.path.join(current_dir, 'model_data')

# Train on the toy data, write the pipeline to disk, then load it back so that
# nlp_model refers to the saved copy rather than the in-memory one.
nlp = ner_trainer(train_data, config)
save_model(NEW_MODEL, nlp)
nlp_model = load_model(NEW_MODEL)


Created blank 'en' model


  **kwargs


Losses {'ner': 9.899998903274536}
Losses {'ner': 9.691761136054993}
Losses {'ner': 9.591479182243347}
Losses {'ner': 9.285689115524292}
Losses {'ner': 8.800917148590088}
Losses {'ner': 8.058760046958923}
Losses {'ner': 8.113943099975586}
Losses {'ner': 7.526875972747803}
Losses {'ner': 7.086795091629028}
Losses {'ner': 7.3712087869644165}
Losses {'ner': 6.647868990898132}
Losses {'ner': 5.628359913825989}
Losses {'ner': 5.779063642024994}
Losses {'ner': 5.494428098201752}
Losses {'ner': 5.153343170881271}
Losses {'ner': 5.164166569709778}
Losses {'ner': 4.061458259820938}
Losses {'ner': 4.878674566745758}
Losses {'ner': 4.588281527161598}
Losses {'ner': 4.756910711526871}
Losses {'ner': 4.621913880109787}
Losses {'ner': 4.877700709737837}
Losses {'ner': 3.6048336401581764}
Losses {'ner': 4.3541263826191425}
Losses {'ner': 4.475983753800392}
Losses {'ner': 4.827014146372676}
Losses {'ner': 3.93739452958107}
Losses {'ner': 3.7484133914113045}
Losses {'ner': 4.811004608869553}
Losses {'ne

### Testing the trained model

In [5]:
# Run the reloaded model over the training texts and print what it extracts.
# Note: these are the same examples used for training, not held-out data.
test_model(train_data=train_data, nlp=nlp_model)

Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities [('Shaka Khan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), ('Khan', 'PERSON', 1), ('?', '', 2)]
