In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' _ '+ent.label_+' _ '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [5]:
doc = nlp(u' May I go to Washingtion, DC next May to see the Washington Monument?')
show_ents(doc)

Washingtion _ GPE _ Countries, cities, states
DC _ GPE _ Countries, cities, states
next May _ DATE _ Absolute or relative dates or periods
the Washington Monument _ ORG _ Companies, agencies, institutions, etc.


In [6]:
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.start_char, ent.label_)

Washingtion 5 6 13 13 GPE
DC 7 8 26 26 GPE
next May 8 10 29 29 DATE
the Washington Monument 12 15 45 45 ORG


In [7]:
doc = nlp(u' Tesla to build a U.K. Factory for $6 million')
show_ents(doc)

U.K. Factory _ ORG _ Companies, agencies, institutions, etc.
$6 million _ MONEY _ Monetary values, including unit


In [8]:
# Adding a Named Entity to span
from spacy.tokens import Span

# Get the hash value of the ORG entity label 
ORG = doc.vocab.strings[u'ORG']

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing DOc object
doc.ents = list(doc.ents) + [new_ent]

In [9]:
show_ents(doc)

  _ ORG _ Companies, agencies, institutions, etc.
U.K. Factory _ ORG _ Companies, agencies, institutions, etc.
$6 million _ MONEY _ Monetary values, including unit


In [10]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner.'
          u'If successful, the vacuum cleaner will be our first product.')

In [11]:
show_ents(doc)

first _ ORDINAL _ "first", "second", etc.


In [12]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher 
matcher = PhraseMatcher(nlp.vocab)

In [13]:
# Create the desired Pharse patterns:
phrase_list = ['vacuum cleaner','vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [14]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

In [15]:
# Apply the matcher to our Doc oblect
matches = matcher(doc)

In [16]:
# See what matches occur
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [17]:
# Here we create Spans from each match, and create named entities from them:
from spacy.tokens import Span

In [18]:
PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc,match[1],match[2],label=PROD) for match in matchers]
doc.ents = list(doc.ents) + new_ents

NameError: name 'matchers' is not defined

In [19]:
show_ents(doc)

first _ ORDINAL _ "first", "second", etc.


In [20]:
doc = nlp(u'Originally priced at $29.50,\n the sweater was marked down to five dollars.')

In [21]:
show_ents(doc)

29.50 _ MONEY _ Monetary values, including unit
five dollars _ MONEY _ Monetary values, including unit


In [22]:
len([ent for ent in doc.ents if ent.label_== 'MONEY'])

2

# Problem wiith line Brake

In [23]:
# Quick function to remove ents formed on whitespace:
from spacy.language import Language
@Language.component("remove_whitespace_entities")
def remove_whitespace_entities(doc):
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc

In [24]:
# Insert this into the pipeline AFTER the ner components
nlp.add_pipe('remove_whitespace_entities', after='ner')

<function __main__.remove_whitespace_entities(doc)>

In [45]:
show_ents(doc)

29.50 _ MONEY _ Monetary values, including unit
five dollars _ MONEY _ Monetary values, including unit


# Visualizing Name Entity

In [46]:
# Perform standerd import 
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

In [48]:
doc = nlp(u'Over the last quarter Apply sold nearly 20 thousand ipods for a profile of $6 million.'
          u'By contract, sony sold only 7 thousand Walkman music players.')
displacy.render(doc, style='ent', jupyter= True)

In [49]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)