NER, POS, lemmatize, remove stop words

In [2]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from nltk.corpus import gutenberg

nlp = en_core_web_sm.load()
sample_text = gutenberg.raw("sample_text.txt")
sample_text = sample_text.replace('\n',' ')
doc = nlp(str(sample_text))
#print([(x.text, x.label_) for x in doc.ents])
#displacy.render(doc, jupyter=True, style='ent')
#print([(x.orth_, x.pos_, x.lemma_) for x in [y for y in doc if not y.is_stop and y.pos_ != 'PUNCT']])
print(dict([(str(x), x.label_) for x in doc.ents]))

{'Emma Woodhouse': 'PERSON', 'nearly twenty-one years': 'DATE', 'two': 'CARDINAL'}


Custom NER

In [7]:
import spacy
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

def offseter(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

#nlp = spacy.blank('en')
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

#ner.add_label(label)

label = 'CHANCEL'
matcher = PhraseMatcher(nlp.vocab)
for i in ['Angela Merkel', 'Angela', 'Merkel',]:
    matcher.add(label, None, nlp(i))
    
label1 = 'PRESID'
matcher = PhraseMatcher(nlp.vocab)
for i in ['Vladimir Putin', 'Vladimir', 'Putin',]:
    matcher.add(label1, None, nlp(i))

    
ner.add_label(label)
ner.add_label(label1)

res = []
to_train_ents = []
with open('angela_merkel.txt') as am:
    line = True
    while line:
        line = am.readline()
        mnlp_line = nlp(line)
        matches = matcher(mnlp_line)
        res = [offseter(label, mnlp_line, x)
               for x
               in matches]
        to_train_ents.append((line, dict(entities=res)))
        res1 = [offseter(label1, mnlp_line, x)
               for x
               in matches]
        to_train_ents.append((line, dict(entities=res1)))

@plac.annotations(
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path))
def train(new_model_name='angela', output_dir=None):

    optimizer = nlp.begin_training()
    
    other_pipes = [pipe
                   for pipe
                   in nlp.pipe_names
                   if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(20):
            losses = {}
            random.shuffle(to_train_ents)
            for item in to_train_ents:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print(losses)

    if output_dir is None:
        output_dir = "./angela"


    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
        
    nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
        
    random.shuffle(to_train_ents)

    test_text = to_train_ents[1][0]
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

train()

#if __name__ == '__main__':
#plac.call(train)

{'ner': 272.6808154482794}
{'ner': 184.13004564504305}
{'ner': 190.98931958647418}
{'ner': 196.59636726640161}
{'ner': 185.95594599906195}
{'ner': 174.24127839228706}
{'ner': 206.06090385467607}
{'ner': 206.46684956641056}
{'ner': 199.37481365501344}
{'ner': 197.3634408730328}
{'ner': 205.45417466586298}
{'ner': 184.44423763495269}
{'ner': 181.77947196863434}
{'ner': 201.5698425594516}
{'ner': 198.50400538809714}
{'ner': 190.18621065631066}
{'ner': 195.0055002172331}
{'ner': 181.62375770402417}
{'ner': 184.79830548918437}
{'ner': 186.3375208502154}
Entities in 'EU sanctions ineffective?
'


spacy's training code

In [4]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy


# training data
TRAIN_DATA = [
    ('Who is Shakira?', {
        'entities': [(7, 14, 'PERSON')]
    }),
    ('I like Europe and Asia.', {
        'entities': [(7, 13, 'LOC'), (18, 22, 'LOC')]
    })
]

#testing data
TEST_DATA = 'Shakira is a singer. She is popular in both Europe and Asia'


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    #for text, _ in TEST_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        #for text, _ in TEST_DATA:
        doc = nlp2(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

            
main()

#if __name__ == '__main__':
#    plac.call(main)

    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
# ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

Created blank 'en' model
{'ner': 14.903143644332888}
{'ner': 10.633922471025244}
{'ner': 9.301600298995215}
{'ner': 10.423443384062693}
{'ner': 14.36390769481659}
{'ner': 12.244401382727439}
{'ner': 8.81959519952943}
{'ner': 12.92722288224082}
{'ner': 12.006990670118773}
{'ner': 7.36171617410659}
{'ner': 4.611324977855412}
{'ner': 10.10395162478596}
{'ner': 5.712842330405726}
{'ner': 4.599513058737772}
{'ner': 4.366744292342068}
{'ner': 7.081644639928038}
{'ner': 4.705553461870732}
{'ner': 2.64676956914952}
{'ner': 0.0035026795699626178}
{'ner': 2.016619148856186}
{'ner': 5.928474605568221}
{'ner': 2.020657978787887}
{'ner': 6.8011948058400296e-06}
{'ner': 1.1616648474021694e-05}
{'ner': 1.7275064458807496}
{'ner': 0.6595819481465341}
{'ner': 6.090223747378934e-05}
{'ner': 0.20586453416152736}
{'ner': 1.152571531166199e-07}
{'ner': 6.398419354428285e-09}
{'ner': 1.9986677276934854}
{'ner': 2.6686856842062624e-08}
{'ner': 6.361919774896291e-09}
{'ner': 0.0004923625617193417}
{'ner': 5.0