In [None]:
import spacy

# Load the English pipeline once; the cells below all reuse this `nlp` object.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [None]:
def show_ents(doc):
  """Print one line per entity in ``doc.ents``: text, label and label description.

  The description comes from ``spacy.explain`` and is printed as ``None`` when
  no description is returned. If ``doc.ents`` is empty, a fallback message is
  printed instead.
  """
  if not doc.ents:
    print('no named entities found.')
    return
  for entity in doc.ents:
    description = spacy.explain(entity.label_)
    print(f'{entity.text} - {entity.label_} - {description}')

In [None]:
# "May" and "Washington" each occur twice in this sentence, in different roles.
sentence = 'May I go to Washington, DC next May to see the Washington Monument?'
doc = nlp(sentence)
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [None]:
doc = nlp('Can I please have 500 dollars of Microsoft stock?')

# One row per entity: text, token start/end, character start/end, label.
for ent in doc.ents:
  fields = (ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)
  print(*fields)

500 dollars 4 6 18 29 MONEY
Microsoft 7 8 33 42 ORG


Adding a named entity to a span

In [None]:
# Baseline: list the entities the pipeline finds before any are added by hand.
headline = 'Tesla to build a U.K. factory for $6 million'
doc = nlp(headline)
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
from spacy.tokens import Span

# Look up the value the vocabulary's string store holds for the 'ORG' label.
ORG = doc.vocab.strings['ORG']

# Token range 0..1 (end exclusive) is the first token of the document.
new_ent = Span(doc, 0, 1, label=ORG)

# Keep the entities already on the doc and append the new span to them.
doc.ents = [*doc.ents, new_ent]

In [None]:
# List the entities of `doc` again, now that a span was appended to doc.ents.
show_ents(doc)


Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


Adding named entities to all matching spans

In [None]:
# The text is split over two adjacent literals that Python joins with nothing
# in between, so the first literal must end with a space. Without it the text
# reads "cleanerif" and the first "vacuum cleaner" is no longer a separate
# two-word phrase for the matcher to find.
doc = nlp(u'our company plans to introduce a new vacuum cleaner '
          u'if succesful, the vacuum cleaner will be our first product')
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [None]:
from spacy.matcher import PhraseMatcher
# Create the matcher from the vocabulary of the loaded `nlp` pipeline.
matcher = PhraseMatcher(nlp.vocab)

In [None]:
phrase_list = ['vacuum cleaner','vacuum-cleaner']
# The patterns only need to be tokenized, so build them with make_doc()
# rather than running every pipeline component on each phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [None]:
# Register the patterns under one key using the current signature,
# add(key, list_of_pattern_docs). The add(key, None, *docs) form is the
# spaCy v2 calling convention.
matcher.add('newproduct', phrase_patterns)
# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)
matches

[(2689272359382549672, 12, 14)]

In [None]:
from spacy.tokens import Span

PROD = doc.vocab.strings['PRODUCT']

# Each match is (match_id, start, end); label every matched token range PRODUCT.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

# Extend the entities already on the doc with the newly created spans.
doc.ents = [*doc.ents, *new_ents]

In [None]:
# List the entities of `doc` again, now that the matched spans were added to doc.ents.
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


Counting Entities

In [None]:
# A sentence with two money amounts, one in digits and one spelled out.
price_text = 'Originally priced at $29.50, the sweater was marked down to five dollars.'
doc = nlp(price_text)
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [None]:
# Count the entities labelled MONEY without building an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

Noun chunks

In [None]:
doc = nlp('Autonomous cars shift insurance liability toward manufacturers.')

# Per chunk: chunk text, its root token, the root's dependency label and the root's head.
for chunk in doc.noun_chunks:
  root = chunk.root
  print('-'.join([chunk.text, root.text, root.dep_, root.head.text]))

Autonomous cars-cars-nsubj-shift
insurance liability-liability-dobj-shift
manufacturers-manufacturers-pobj-toward


In [None]:
# doc.noun_chunks is a generator, so len() raises TypeError; it has to be
# converted to a list first. The error is caught and printed so that this
# demonstration does not stop a top-to-bottom run of the notebook.
try:
  len(doc.noun_chunks)
except TypeError as err:
  print(f'TypeError: {err}')

TypeError: object of type '_cython_3_1_1.generator' has no len()

In [None]:
# Count the noun chunks by consuming the generator one item at a time.
sum(1 for _chunk in doc.noun_chunks)

3

Visualizing NER

In [None]:
from spacy import displacy

In [None]:
# The two literals are joined with nothing in between, so the first one ends
# with a space; otherwise the text reads "million.by" with no gap after the
# period that closes the first sentence.
doc = nlp(u'over the last quarter Apple sold nearly 20 thousand ipods for  a profit of $6 million. '
          u'by contrast,sony sold only 7 thousand Walkman music players')
displacy.render(doc,style='ent',jupyter=True)

In [None]:
# Render sentence by sentence: each sentence's text is re-processed into its own doc.
for sentence in doc.sents:
  sentence_doc = nlp(sentence.text)
  displacy.render(sentence_doc, style='ent', jupyter=True)

In [None]:
# The two literals are joined with nothing in between, so the first one ends
# with a space; otherwise the text reads "million.By" with no gap between
# the two sentences.
doc2 = nlp(u'over the last quarter Apple sold nearly 20 thousand ipods for  a profit of $6 million. '
           u'By contrast,my kids sold a lot of lemonade')
displacy.render(doc2,style='ent',jupyter=True)

In [None]:
# Render each sentence separately; a sentence without entities is printed as plain text.
for sent in doc2.sents:
  docx = nlp(sent.text)
  if not docx.ents:
    print(docx.text)
    continue
  displacy.render(docx, style='ent', jupyter=True)

By contrast,my kids sold a lot of lemonade


In [None]:
# Only show these entity types. Labels are written in upper case, matching
# the way the pipeline reports them (e.g. 'ORG').
options = {'ents': ['PRODUCT', 'ORG']}
displacy.render(doc, style='ent', jupyter=True, options=options)

In [None]:
# Custom CSS backgrounds per label. Labels are written in upper case, matching
# the way the pipeline reports them (e.g. 'ORG'), in both the colour map and
# the filter list.
colors = {
  'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
  'PRODUCT': 'radial-gradient(yellow,green)',
}
options = {'ents': ['PRODUCT', 'ORG'], 'colors': colors}
displacy.render(doc, style='ent', jupyter=True, options=options)