In [23]:
import plac, random, spacy
from pathlib import Path

In [24]:
# Training examples for the entity recognizer. Each item is a
# (text, annotations) pair where annotations['entities'] is a list of
# (start_char, end_char, label) tuples indexing into text.
TRAIN_DATA = [
    ('Who is Shaka Khan?', {
        'entities': [(7, 17, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    }),
    # NOTE(review): offsets 4-10 cover the word "number". If 'CAS' was meant
    # to tag the case count "744", the span would be (19, 22) - confirm.
    ('The number reached 744 in Cagayan Valley.', {
        'entities': [(4, 10, 'CAS')]
    }),
]

# Raw, unannotated article text. It is kept outside TRAIN_DATA because every
# TRAIN_DATA item must be a (text, annotations) pair: main() unpacks each item
# into exactly two values, so a bare string in the list would fail there.
DENGUE_ARTICLE = """\
ISABELA, Philippines – The rainy season has just started.
The municipality of Luna in Isabela province in the northern Philippines already declared state of calamity after their health office recorded 54 cases of dengue in just the month of June.
Dr Claire Francisco, Luna’s health officer, said on Tuesday, June 30, the number of cases reported for the month is significantly higher compared to the numbers reported in the same month in previous years.
Although there is no case of death from dengue yet, the increase in the reported cases of the disease prompted the local government unit to declare a state of calamity to be able to release funds for the purchase of medical equipment.
In April, the Department of Health (DOH) reported 6.49% increase of dengue cases in the country, citing 19,946 cases in the first quarter of 2015.
The number reached 744 in Cagayan Valley, while Calabarzon had the highest with 3,778 and the Autonomous Region of Muslim Mindanao (ARMM) had the lowest with 312.
The health department is expecting the number of dengue cases to peak in the months of July and August."""

In [25]:
def _print_entities(nlp, texts):
    """Run ``nlp`` over each text and print its entities and per-token tags."""
    for text in texts:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline, and train the entity recognizer.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None`` a blank
            'en' pipeline is created instead.
        output_dir: Directory the trained pipeline is written to. When given,
            the saved pipeline is loaded back and run on the training texts.
            When ``None`` nothing is saved.
        n_iter: Number of passes over ``TRAIN_DATA``.

    Prints the accumulated losses after every pass, then the entities and
    token tags predicted for each training text.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels; tolerate examples whose 'entities' key is missing or None
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # module-level list and leak state into later cells and re-runs.
    train_data = list(TRAIN_DATA)
    texts = [text for text, _ in TRAIN_DATA]

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is also called when an existing model
        # was loaded; confirm whether that re-initialises pretrained weights
        # in the spaCy version in use.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    _print_entities(nlp, texts)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _print_entities(nlp2, texts)

In [28]:
# Train with main()'s defaults: blank 'en' model, 100 iterations, nothing saved.
main()
# Expected output after the per-iteration loss dicts. Only the first two
# TRAIN_DATA examples are listed here:
# Entities [('Shaka Khan', 'PERSON')]
# Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
# ('Khan', 'PERSON', 1), ('?', '', 2)]
# Entities [('London', 'LOC'), ('Berlin', 'LOC')]
# Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
# ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

Created blank 'en' model
{'ner': 10.733839458060174}
{'ner': 13.417759239673615}
{'ner': 8.453623229922414}
{'ner': 13.166155847221345}
{'ner': 13.099092297328752}
{'ner': 10.86783561847405}
{'ner': 10.920729713703395}
{'ner': 7.651947411417189}
{'ner': 6.199139598361398}
{'ner': 10.666118910074948}
{'ner': 6.009605101296758}
{'ner': 6.436598177224457}
{'ner': 5.752864415620399}
{'ner': 5.963281960464576}
{'ner': 3.331284738939247}
{'ner': 3.8369747508163776}
{'ner': 2.192544459068151}
{'ner': 2.4059482158870074}
{'ner': 5.3737736658402}
{'ner': 1.9955138574639362}
{'ner': 1.9997916377277383}
{'ner': 3.866743385619974}
{'ner': 1.226710903059622e-06}
{'ner': 3.586977959934833e-08}
{'ner': 8.425989190214598e-09}
{'ner': 1.854983806610145}
{'ner': 0.0066525815629165375}
{'ner': 6.242173822849805e-07}
{'ner': 4.776081213082764e-05}
{'ner': 1.0749665838507797e-05}
{'ner': 1.9691376686299829}
{'ner': 0.05439449723717505}
{'ner': 7.498639216318706e-13}
{'ner': 1.161928365923896e-07}
{'ner': 4