In [3]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every nlp(...) call below runs text through it.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Parse a sample sentence into a Doc.
# NOTE(review): this `doc` is reassigned in a later cell before anything reads it.
doc = nlp(u"The quick brown fox jumped over the lazy dog's back")

## NER
Named Entity Recognition (NER) seeks to locate and classify entity mentions in unstructured text into pre-defined categories.

In [13]:
def show_entities(doc):
    """Print every named entity attached to ``doc``, one per line.

    Each line has the form ``<text> - <label> - <description>``, where the
    description is whatever ``spacy.explain`` returns for the label, passed
    through ``str()``. When ``doc.ents`` is empty, a single
    ``No entities found`` line is printed instead.
    """
    entities = doc.ents
    if not entities:
        print('No entities found')
        return
    for entity in entities:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [14]:
# Short sample sentence containing a person name and a number.
doc = nlp(u"Jim bought 300 shares ")

In [15]:
# List the entities the pipeline attached to the current doc.
show_entities(doc)

Jim - PERSON - People, including fictional
300 - CARDINAL - Numerals that do not fall under another type


#### Adding custom named entities


In [25]:
# Sample sentence whose first token ("Tesla") is labelled by hand in the cells below.
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [26]:
# Show what the pipeline recognises on its own, before any manual additions.
show_entities(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [27]:
from spacy.tokens import Span

In [28]:
# Look up the ID that the doc's vocab string table holds for the label name "ORG";
# the ID is used as the `label` argument when building a Span.
ORG = doc.vocab.strings[u"ORG"]

In [29]:
# Display the looked-up label ID.
ORG

383

In [30]:
# Build a span over token range 0..1 of doc (start index 0, end index 1) and label it with the ORG ID.
new_entity = Span(doc,0,1,label=ORG)

In [31]:
# Keep the entities already on the doc and add the hand-built span to them.
entities = list(doc.ents)
entities.append(new_entity)
doc.ents = entities

In [32]:
# Re-list the entities to confirm the manually added span is now included.
show_entities(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


### Multiple Named Entities

In [33]:
# Two sentences mentioning the product in both spellings ("vacuum cleaner" and
# "vacuum-cleaner"). Adjacent string literals are joined with nothing in between,
# so the first literal ends with a space to keep the two sentences separated.
doc = nlp(u"Our company created a brand new vacuum cleaner. "
         u"This new vacuum-cleaner is the best in show.")

In [34]:
# Check what the pipeline finds on its own in the product text.
show_entities(doc)

No entities found


In [35]:
from spacy.matcher import PhraseMatcher

In [36]:
# Create a PhraseMatcher that shares the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)

In [37]:
# The two spellings of the product name to search for: spaced and hyphenated.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [39]:
# Run each phrase through the pipeline so the matcher receives Doc objects as patterns.
phrase_matchers = list(map(nlp, phrase_list))

In [41]:
# Register the phrase Docs under the match key 'newproduct'.
# NOTE(review): the second positional argument (None) is presumably the on_match callback slot,
# and this (key, callback, *patterns) call form looks like the spaCy v2 signature — confirm
# against the installed spaCy version before upgrading.
matcher.add('newproduct',None,*phrase_matchers)

In [42]:
# Run the matcher over doc; the second and third elements of each result are used
# below as the start and end token indices of a Span.
found_matches = matcher(doc)

In [43]:
# Display the raw match tuples.
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [53]:
# Look up the ID that the doc's vocab string table holds for the label name "PRODUCT".
PROD = doc.vocab.strings[u"PRODUCT"]

In [54]:
# Display the looked-up label ID.
PROD

386

In [55]:
# Turn each (match_id, start, end) hit into a Span over doc labelled with the PRODUCT ID.
new_ents = [
    Span(doc, start, end, label=PROD)
    for match_id, start, end in found_matches
]

In [58]:
# Display the spans built from the matches.
new_ents

[vacuum cleaner, vacuum-cleaner]

In [59]:
# Keep the entities already on the doc and add all the matched product spans.
combined_ents = list(doc.ents)
combined_ents.extend(new_ents)
doc.ents = combined_ents

In [60]:
# Re-list the entities to confirm the PRODUCT spans were attached.
show_entities(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [66]:
# Sample sentence with two dollar amounts, used below to count MONEY entities.
# NOTE(review): "marken" looks like a typo for "marked"; left unchanged here so the recorded output stays valid.
doc = nlp(u"Originally I paid $29 for this car toy, but now it is marken down to $13")

In [67]:
# Count how many entities on the doc carry the MONEY label.
sum(1 for entity in doc.ents if entity.label_ == "MONEY")

2

## Displacy on NER

In [68]:
from spacy import displacy

In [69]:
# Render the current doc with displacy's 'ent' style, inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [73]:
# Two-sentence sample, used below for whole-doc and per-sentence rendering.
# Adjacent string literals are joined with nothing in between, so the first
# literal ends with a space to keep the sentences separated ("million. By").
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. "
         u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [74]:
# Render the entities of the whole two-sentence doc in one visualisation.
displacy.render(doc, style='ent', jupyter=True)

In [75]:
# Render entities one sentence at a time: each sentence's text is re-parsed
# into its own Doc, which is then passed to displacy separately.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style='ent', jupyter=True)