In [1]:
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import requests
import re
import numpy as np

# NOTE(review): the pretrained pipeline loaded here is never used under the
# name `nlp` -- a later cell rebinds `nlp` to spacy.blank('en') before any text
# is processed, and the same model is loaded again as `nlp3` further down.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Regex walkthrough: the match has to begin with the literal first name
# "Ryan" and a space, followed by a capitalised second word -- one uppercase
# letter ([A-Z]) and then the remainder of that word (\w+).

text1 = 'Ryan Bobby is much cooler than Ryan Gosling'
pattern1 = r"Ryan [A-Z]\w+"

# findall returns just the matched strings, in order of appearance.
matches = re.compile(pattern1).findall(text1)
print(matches)



['Ryan Bobby', 'Ryan Gosling']


In [3]:
# finditer yields re.Match objects instead of plain strings, so each hit also
# reports the character span it occupies inside text1.
matches = re.finditer(pattern1, text1)
for i in matches:
    print(i)

<re.Match object; span=(0, 10), match='Ryan Bobby'>
<re.Match object; span=(31, 43), match='Ryan Gosling'>


In [4]:
from spacy.tokens import Span

In [5]:
# Blank English pipeline: the text is tokenised but nothing predicts entities.
nlp = spacy.blank('en')
doc = nlp(text1)

# char_span takes CHARACTER offsets and only returns a Span when they line up
# with token boundaries. Offsets (0, 2) cover just "Ry", which cuts through the
# token "Ryan", so the call yields None.
print(doc.char_span(0, 2))

None


In [6]:
# Prototype of the custom entity step, run inline before it is wrapped into a
# pipeline component.
nlp = spacy.blank('en')
doc = nlp(text1)
original_ents = list(doc.ents)

# Multi-word tokens: every regex hit is converted from character offsets to a
# token-aligned span. char_span gives None when the offsets do not fall on
# token boundaries, and those hits are dropped.
candidate_spans = (
    doc.char_span(*match.span()) for match in re.finditer(pattern1, doc.text)
)
mwt_ents = [
    # From here on the positions are token indices, not character offsets.
    (span.start, span.end, span.text)
    for span in candidate_spans
    if span is not None
]

print(mwt_ents)
print('############')

# Inject the matches into the document's entities, labelled as PERSON.
original_ents.extend(
    Span(doc, start, end, label="PERSON") for start, end, _ in mwt_ents
)
doc.ents = original_ents

for ent in doc.ents:
    print(ent.text, ent.label_)

[(0, 2, 'Ryan Bobby'), (6, 8, 'Ryan Gosling')]
############
Ryan Bobby PERSON
Ryan Gosling PERSON


In [7]:
from spacy.language import Language
from spacy.util import filter_spans

pattern1 = r"Ryan [A-Z]\w+"
# Compiled once, when this cell runs, so the component keeps matching
# "Ryan <Capitalised word>" even if the notebook-level `pattern1` is rebound
# to something else afterwards.
RYAN_NAME_RE = re.compile(pattern1)


@Language.component('paul_ner')
def paul_ner(doc):
    """Label every "Ryan <Capitalised word>" occurrence in ``doc`` as PERSON.

    The regex is run over ``doc.text``; each hit is converted from character
    offsets to a token-aligned span and merged with the entities already set
    on the document.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` updated in place.
    """
    person_ents = []
    for match in RYAN_NAME_RE.finditer(doc.text):
        # char_span returns None when the character offsets do not line up
        # with token boundaries; such hits cannot become entities.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            person_ents.append(span)

    # doc.ents rejects overlapping spans, so resolve clashes with entities set
    # by earlier components first (the longest span wins; on a tie the span
    # that was already present is kept).
    doc.ents = filter_spans(list(doc.ents) + person_ents)
    return doc


In [8]:
# Blank pipeline that gets the regex-based 'paul_ner' component added to it.
nlp2 = spacy.blank('en')
# Components are added by their registered name. As the last expression of the
# cell, the return value (the component function) is echoed as the output.
nlp2.add_pipe('paul_ner')

<function __main__.paul_ner(doc)>

In [9]:
# Run the custom pipeline on the sample sentence and show the entities that
# ended up on the document.
doc2 = nlp2('Ryan Bobby is much cooler than Ryan Gosling')
print(doc2.ents)

(Ryan Bobby, Ryan Gosling)


In [18]:
from spacy.language import Language
from spacy.util import filter_spans

# Kept under its own name instead of rebinding the notebook-wide `pattern1`,
# so cells that still rely on the "Ryan ..." pattern are unaffected when this
# cell runs.
GOSLING_RE = re.compile(r"Gosling")


@Language.component('gosling_ner')
def gosling_ner(doc):
    """Label token-aligned occurrences of "Gosling" in ``doc`` as CINEMA.

    The new spans are merged with the entities already present on the
    document. Where spans overlap, the longer one is kept, so a "Gosling" hit
    that lies inside a longer existing entity is discarded.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` updated in place.
    """
    cinema_ents = []
    for match in GOSLING_RE.finditer(doc.text):
        # char_span returns None when the character offsets do not line up
        # with token boundaries; such hits cannot become entities.
        span = doc.char_span(*match.span(), label="CINEMA")
        if span is not None:
            cinema_ents.append(span)

    # Give priority to the longer spans: filter_spans removes overlaps and
    # duplicates, preferring the (first) longest span.
    doc.ents = filter_spans(list(doc.ents) + cinema_ents)
    return doc

In [19]:
# Pretrained English pipeline with the custom CINEMA matcher added to it, so
# gosling_ner can merge its spans with whatever is already in doc.ents.
nlp3 = spacy.load('en_core_web_sm')
# As the last expression of the cell, the returned component function is
# echoed as the output.
nlp3.add_pipe('gosling_ner')

<function __main__.gosling_ner(doc)>

In [17]:
# NOTE(review): this cell's execution count (17) is lower than that of the
# cells that define and register gosling_ner (18 and 19), so the saved output
# may predate them -- restart the kernel and run all cells to confirm.
# No CINEMA label appears in the saved output; presumably the "Gosling" hit
# overlaps the longer "Ryan Gosling" PERSON entity and is dropped by
# filter_spans inside gosling_ner -- verify after re-running.
doc3 = nlp3('Ryan Bobby is much cooler than Ryan Gosling')
for i in doc3.ents:
    print(i.text, i.label_)

Ryan Bobby PERSON
Ryan Gosling PERSON
