In [2]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [3]:
doc=nlp(u'i may want to visit tokyo in the next May, and see  the famous tokyo tower')

In [4]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [5]:
show_ents(doc)

tokyo - GPE - Countries, cities, states
the next May - DATE - Absolute or relative dates or periods
  - NORP - Nationalities or religious or political groups
tokyo - GPE - Countries, cities, states


In [6]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [7]:
doc=nlp(u"Tesla have bought $500 worth shares of Paytm company")
show_ents(doc)

500 - MONEY - Monetary values, including unit
Paytm - ORG - Companies, agencies, institutions, etc.


In [8]:
from spacy.tokens import Span
ORG=doc.vocab.strings[u'ORG']
new_ent=Span(doc,8,9,label=ORG)
doc.ents=list(doc.ents)+[new_ent]

In [9]:
show_ents(doc)

500 - MONEY - Monetary values, including unit
Paytm - ORG - Companies, agencies, institutions, etc.


In [16]:
doc2=nlp(u'readme-10 is a cheapest and high end smartphone. '
         u'readme 10 is also most sold smartphone')

In [17]:
show_ents(doc2)

readme-10 - PERSON - People, including fictional
10 - CARDINAL - Numerals that do not fall under another type


In [18]:
from spacy.matcher import PhraseMatcher
matcher=PhraseMatcher(nlp.vocab)

In [19]:
phrase_list=['readme-10','readme 10']
pattern_list=[nlp(pattern) for pattern in phrase_list]
matcher.add('newProduct',None,*pattern_list)
matches=matcher(doc2)
matches

[(4452177204818730156, 0, 1), (4452177204818730156, 9, 11)]

In [20]:
from spacy.tokens import Span

PROD = doc2.vocab.strings[u'PRODUCT']
print(matches)
new_ents=[Span(doc2,match[1],match[2],label=PROD) for match in matches]
doc2.ents=list(doc2.ents)+new_ents
show_ents(doc2)

[(4452177204818730156, 0, 1), (4452177204818730156, 9, 11)]
readme-10 - PRODUCT - Objects, vehicles, foods, etc. (not services)
readme 10 - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [25]:
from spacy import displacy
colors = {'PRODUCT': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'ORG': 'radial-gradient(yellow, green)'}
options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}
displacy.render(doc2,style='ent',jupyter=True,options=options)

## adding segmentation rule

In [34]:
# SPACY'S DEFAULT BEHAVIOR
nlp=spacy.load('en_core_web_sm')
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [35]:
## setting custom boundary  and adding to the existing boundary rule
def set_custom_boundary(doc):
    for token in doc[:-1]:
        if token.text==';':
            doc[token.i+1].is_sent_start=True
    return doc
nlp.add_pipe(set_custom_boundary, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundary', 'parser', 'ner']

In [36]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)


"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [38]:
nlp = spacy.load('en_core_web_sm')  # reset to the original

mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)

for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [39]:
# CHANGING THE RULES
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens


sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [40]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n']
['third', 'sentence', '.']
