In [None]:
import re

In [None]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
# this regex is to grab any thing that starts with Paul and the any word following it till the wordbreak
pattern = r"Paul [A-Z]\w+"

In [None]:
#  we apply the pattern to the text
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

Reconstruct Spans 
(we are goin to make our custom pipe with these functionality)

In [None]:
import spacy
from spacy.tokens import Span

In [None]:
nlp = spacy.blank('en')
doc = nlp(text)

original_ents = list(doc.ents)
mwt_ents = []

for match in re.finditer(pattern, doc.text):
    start, end= match.span()
    span = doc.char_span(start,end)
    if(span):
        mwt_ents.append((span.start,span.end,span.text))
for ent in mwt_ents:
    start,end,name = ent
    #  we now going to create a span object in spacy
    #  that we can inject in our doc.ents list
    per_ent = Span(doc, start, end, label="PERSON")
    original_ents.append(per_ent)
doc.ents = original_ents
#  now you can see that we have our entities in the label
for ent in doc.ents:
    print(ent.text, ent.label_)




In [None]:
print(mwt_ents)

now we can use this to create a custom component(pipe)
that does all this 

In [None]:
from spacy.language import Language

@Language.component('prince_ner')
def prince_ner(doc):
    pattern = r"Paul [A-Z]\w+"
    original_ents = list(doc.ents)
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end= match.span()
        span = doc.char_span(start,end)
        if(span):
            mwt_ents.append((span.start,span.end,span.text))
    for ent in mwt_ents:
        start,end,name = ent
        per_ent = Span(doc, start, end, label="PERSON")
        original_ents.append(per_ent)
    doc.ents = original_ents
    return doc


In [None]:
#  lets create another blank model and add our our custom pipe to it
nlp2 = spacy.blank('en')
nlp2.add_pipe('prince_ner')

So now your model has a custom pipe and you can perform the same action but this time with your doc object

In [None]:
doc = nlp2(text)
doc.ents
#  you will see now that the output is the same

lets look at an error that will happen if we apply this to the spacy small model

In [39]:
from spacy.language import Language
from spacy.util import filter_spans
@Language.component('cinema_ner')
def cinema_ner(doc):
    pattern = r"Hollywood"
    original_ents = list(doc.ents)
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end= match.span()
        span = doc.char_span(start,end)
        if(span):
            mwt_ents.append((span.start,span.end,span.text))
    for ent in mwt_ents:
        start,end,name = ent
        per_ent = Span(doc, start, end, label="CINEMA")
        original_ents.append(per_ent)
    #  this now will look at all spans and if there are spans that overlap it gives priority to the long one
    filtered = filter_spans(original_ents)
    doc.ents = filtered
    return doc


In [40]:
nlp3 = spacy.load('en_core_web_sm')
nlp3.add_pipe('cinema_ner')

<function __main__.cinema_ner(doc)>

In [41]:
#  if you run this code below you will get the error
doc3 = nlp3(text)
doc.ents

(Paul Newman, Paul Hollywood)

This error tells us that one of our tokens from the finditer() overlapped with one that our “ner” component found. This is a problem that can be rectified with spaCy’s filter_spans. This gives primacy to longer spans. Notice how we have allowed the Paul Hollywood entity to be a PERSON, rather than CINEMA. This is because Hollywood is shorter than Paul Hollywood.