In [1]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [2]:
# Load the 'en_core_web_sm' model and display its pipeline as a list of
# (name, component) pairs
nlp = spacy.load('en_core_web_sm')
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x25c32e67100>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x25c32d38a00>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x25c32d389a0>)]

In [3]:
# Create a custom component
# Fundamentally, such a component takes in a Doc object, processes it, and returns it

def custom_component(doc):
    """Report the text and length of ``doc``, then pass it on unchanged.

    Args:
        doc: The document object handed to this pipeline component.

    Returns:
        The very same ``doc`` object, so the next component can use it.
    """
    text, length = doc.text, len(doc)

    # The trailing '\n' argument leaves an empty line after the text
    print('Doc text:', text, '\n')
    print('Doc length:', length)

    return doc

In [4]:
# Add the component to the pipeline
# Placement options available:
# last = True/False
# first = True/False
# before/after = [name of the component in pipe]

# Remove any copy left over from an earlier run of this cell: adding a
# component whose name is already in the pipeline raises an error, and
# re-adding guarantees the current definition is the one registered
if 'custom_component' in nlp.pipe_names:
    nlp.remove_pipe('custom_component')

nlp.add_pipe(custom_component, before='ner')

# View the pipe
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x25c32e67100>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x25c32d38a00>),
 ('custom_component', <function __main__.custom_component(doc)>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x25c32d389a0>)]

In [5]:
# Run the pipeline by creating a doc object; custom_component prints the
# doc's text and length as a side effect while the text is processed
doc = nlp('Hello, world!')

Doc text: Hello, world! 

Doc length: 4


In [6]:
# Add the component:
# Find the animal names in doc from a given set of names, add them to document entities

# Load a fresh model, so the pipeline no longer contains custom_component
nlp = spacy.load('en_core_web_sm')

# Given animal names
animals = ['Golden Retriever', 'cat', 'turtle', 'Rattus norvegicus']

# Initialize matcher
matcher = PhraseMatcher(nlp.vocab)

# Create the animal patterns and add them to the matcher.
# The matcher compares token texts, so the patterns only need to be tokenized:
# nlp.make_doc skips the tagger, parser and ner that nlp.pipe would run on
# every term. A list (rather than a one-shot generator) can also be inspected
# or reused after it has been passed to matcher.add.
animals_pattern = [nlp.make_doc(animal) for animal in animals]
matcher.add('ANIMALS_PATTERN', None, *animals_pattern)

In [7]:
# Define the custom component
def animals_component(doc):
    """Label every phrase found by ``matcher`` in ``doc`` as an ANIMAL entity.

    Entities already present on ``doc`` are kept untouched. A matched phrase
    is added only when none of its tokens already belongs to an entity,
    because assigning overlapping spans to ``doc.ents`` raises a ValueError.
    When two matches overlap each other, the longer one is kept (the earlier
    one on a tie).

    Args:
        doc: The Doc object passed along the pipeline.

    Returns:
        The same Doc, with the non-conflicting ANIMAL spans added to its
        entities.
    """
    # Find the matches and create a labelled span for each one
    matches = matcher(doc)
    animal_spans = [Span(doc, start_index, end_index, label='ANIMAL')
                    for _, start_index, end_index in matches]

    entities = list(doc.ents)
    # Token positions that are already part of some entity
    taken = {index for entity in entities
             for index in range(entity.start, entity.end)}

    # Visit the longest matches first (earliest first on ties) so that a
    # shorter match nested inside a longer one is the one that gets dropped
    for span in sorted(animal_spans, key=lambda s: (s.start - s.end, s.start)):
        positions = range(span.start, span.end)
        if taken.isdisjoint(positions):
            entities.append(span)
            taken.update(positions)

    doc.ents = tuple(entities)

    return doc

In [8]:
# Add the defined component after 'ner' in pipeline.
# Remove any copy left over from an earlier run of this cell first: adding a
# component whose name is already in the pipeline raises an error, and
# re-adding guarantees the current definition is the one registered
if 'animals_component' in nlp.pipe_names:
    nlp.remove_pipe('animals_component')

nlp.add_pipe(animals_component, after='ner')

nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x25c35225c70>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x25c330550a0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x25c330553a0>),
 ('animals_component', <function __main__.animals_component(doc)>)]

In [9]:
# Process a sample sentence with the pipeline that now ends with animals_component
doc = nlp('Rachel has a cat and a Golden Retriever')

# Print every entity in the doc as a (text, label) pair
print([(entity.text, entity.label_) for entity in doc.ents])

[('Rachel', 'PERSON'), ('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]
