In [1]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
#Weite a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
            
        else:
              print('No named entities found.')

In [3]:
# Process the text
doc = nlp("May I go to Washington, DC next May to see the Washington Monuments?")
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Washington Monuments - GPE - Countries, cities, states
No named entities found.


In [4]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

for ent in doc.ents:
    print(ent.text,ent.start, ent.end, ent.start_char,ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [5]:
print("Tokens:")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tokens:
Can AUX aux
I PRON nsubj
please INTJ intj
borrow VERB ROOT
500 NUM nummod
dollars NOUN dobj
from ADP prep
you PRON pobj
to PART aux
buy VERB advcl
some DET det
Microsoft PROPN compound
stock NOUN dobj
? PUNCT punct


In [6]:
doc = nlp(u'Tesla to build a U.K factory for $6 million')
show_ents(doc)

U.K - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit
No named entities found.


## Right now, Spacy does not recognize "Tesla" as a company

In [8]:
from spacy.tokens import Span

#Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']

# Create a Span for the new entity
new_ent = Span(doc,0,1, label=ORG)

#Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

ValueError: [E1010] Unable to set entity information for token 0 which is included in more than one span in entities, blocked, missing or outside.

## In the code above, the arguments passed to Span() are:


In [9]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit
No named entities found.


# Adding a name Entity to All matching Spans
 what if we want to tag all accourance of "Tesla" ? In this section we show how to use the 
 PhaseMatcher to identify a series of spans in the Doc:

In [10]:
doc = nlp(u' Our comapny plans to introduce a new vaccum cleaner.'
          u'If successful, rthe vaccum cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.
No named entities found.


In [11]:
# Import phraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [12]:
# Create the desired phrase patterns:
phrase_list = ['vaccum cleaner' , 'vaccum-cleaner']
phrase_patterns = [nlp(txt) for txt in phrase_list]

In [13]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches

[(2689272359382549672, 8, 10), (2689272359382549672, 15, 17)]

In [14]:
# here we create spans from each match, and create named entities from them:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents) + new_ents

In [15]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.
No named entities found.


# Counting Entities

In [16]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit
No named entities found.


In [17]:
len([ent for ent in doc.ents if ent.label =='MONEY'])

0

In [18]:
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit
No named entities found.


Noun Chunks

Noun Chunks components:

'.text'           The original noun chunk text
'.root.text'      The original text of the word connecting the noun chunk to the rest of the parse.
'.root.dep_'      Dependency relation connecting the root to its head.
'/rrot.head.text' The text of the root token's head.

In [19]:
doc = nlp(u"Autonomus cars shift insurance liability towards manufactures.")

for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' _ '+chunk.root.dep_+' _ '+chunk.root.head.text)

Autonomus cars - cars _ nsubj _ shift
insurance liability - liability _ dobj _ shift
manufactures - manufactures _ pobj _ towards


Doc.noun_chunks is a generator function:

Previouslywe mentioned that doc objects do not retain a list of sentences, but they through the Doc.sents generator.
it's the same with Doc.noun_chunks - lists can be created if needed

In [21]:
len(list(doc.noun_chunks))

3