<a href="https://colab.research.google.com/github/soujanya-vattikolla/NLP-with-spaCy/blob/main/MultiWordTokenEntities_and_RegExinspaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

In [None]:
# Sample sentence used throughout the notebook: two "Paul <Surname>" mentions
# plus one bare "Paul" that the two-word regex should NOT pick up.
text = (
    "Paul Newman was an American actor, "
    "but Paul Hollywood is a British TV Host. "
    "The name Paul is quite common."
)

In [None]:
# Two-word name: the literal "Paul", a single space, then a word that starts
# with an uppercase ASCII letter followed by one or more word characters.
pattern = r"Paul [A-Z]\w+"

In [None]:
# Show each raw regex hit (character span + matched text) found in the sentence.
for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [None]:
import spacy
from spacy.tokens import Span

In [None]:
# Start from a blank English pipeline and process the sample sentence.
nlp = spacy.blank("en")
doc = nlp(text)
print("Before doc.ents:", doc.ents)

# Convert every regex hit (character offsets) into a token-level PERSON span.
person_spans = []
for match in re.finditer(pattern, doc.text):
    span = doc.char_span(*match.span())
    print("span values:", span)
    if span is None:
        # char_span can return None (per the spaCy docs, when the character
        # offsets do not line up with token boundaries); skip such hits.
        continue
    person_spans.append(Span(doc, span.start, span.end, label="PERSON"))

# Keep whatever entities were already on the doc and add the new ones.
doc.ents = list(doc.ents) + person_spans
print("After doc.ents:", doc.ents)
for ent in doc.ents:
    print("text:{} and label:{} ".format(ent.text, ent.label_))

Before doc.ents: ()
span values: Paul Newman
span values: Paul Hollywood
After doc.ents: (Paul Newman, Paul Hollywood)
text:Paul Newman and label:PERSON 
text:Paul Hollywood and label:PERSON 


In [None]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Tag every "Paul <Capitalised word>" regex hit in ``doc`` as a PERSON entity.

    Args:
        doc: The spaCy ``Doc`` handed to this component by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` set to its previous entities plus
        the regex-derived PERSON spans, after overlaps have been resolved.
    """
    # Imported here because the notebook's own `filter_spans` import lives in a
    # later cell and may not have run yet when this component is first called.
    from spacy.util import filter_spans

    pattern = r"Paul [A-Z]\w+"
    candidate_ents = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
        # Kept as a teaching aid: shows what each regex hit maps to
        # (None means the hit could not be mapped to a token span).
        print("span values:", span)
        if span is not None:
            candidate_ents.append(Span(doc, span.start, span.end, label="PERSON"))
    # spaCy refuses overlapping entity spans when assigning doc.ents, so an
    # upstream NER that already tagged (part of) a hit would make the plain
    # assignment fail. filter_spans keeps the (first) longest span of each overlap.
    doc.ents = filter_spans(candidate_ents)
    return doc

In [None]:
# Create a blank English pipeline and add the custom "paul_ner" component to it.
# The last expression's return value is what the cell displays.
nlp_ner = spacy.blank("en")
nlp_ner.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [None]:
# Run the sample sentence through the pipeline; paul_ner sets doc.ents during the call.
doc_ner = nlp_ner(text)
print (doc_ner.ents)

span values: Paul Newman
span values: Paul Hollywood
(Paul Newman, Paul Hollywood)


### Using filter_spans

In [None]:
from spacy.language import Language
from spacy.util import filter_spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Propose a CINEMA entity for every "Hollywood" mention, then resolve overlaps.

    Args:
        doc: The spaCy ``Doc`` handed to this component by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` replaced by the overlap-free result
        of ``filter_spans`` over the existing entities plus the CINEMA candidates.
    """
    candidates = list(doc.ents)
    for hit in re.finditer(r"Hollywood", doc.text):
        token_span = doc.char_span(*hit.span())
        if token_span is None:
            # Hit could not be mapped to a token span; nothing to add.
            continue
        candidates.append(Span(doc, token_span.start, token_span.end, label="CINEMA"))
    # filter_spans removes overlapping spans, preferring the (first) longest
    # one, so a CINEMA candidate inside a longer existing entity is dropped.
    doc.ents = filter_spans(candidates)
    return doc

In [None]:
# Load the pretrained "en_core_web_sm" pipeline (the model package must be
# installed) and add the custom "cinema_ner" component to it.
# The last expression's return value is what the cell displays.
nlp_filter = spacy.load("en_core_web_sm")
nlp_filter.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [None]:
# Process the sample sentence and list each surviving entity with its label.
doc_filter = nlp_filter(text)
for entity in doc_filter.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
