In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline; every example below runs its text through this object.
nlp = spacy.load('en_core_web_sm')

In [8]:
def show_ents(doc):
    """Print one line per entity in ``doc.ents``.

    Each line has the form ``<text> - <label> - <label description>``, where
    the description comes from ``spacy.explain``. When ``doc.ents`` is empty,
    the single line 'No entities found' is printed instead.
    """
    if not doc.ents:
        print('No entities found')
        return
    for entity in doc.ents:
        # str() mirrors the original's conversion of whatever spacy.explain returns.
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [18]:
# Sample paragraph (a review abstract on indoor air pollution) used as the first NER input.
text = u"""Since the early 1970s, the health effects of indoor air pollution have been investigated with increasing intensity. Consequently, a large body of literature is now available on diverse aspects of indoor air pollution: sources, concentrations, health effects, engineering, and policy. This review begins with a review of the principal pollutants found in indoor environments and their sources. Subsequently, exposure to indoor air pollutants and health effects are considered, with an emphasis on those indoor air quality problems of greatest concern at present: passive exposure to tobacco smoke, nitrogen dioxide from gas-fueled cooking stoves, formaldehyde exposure, radon daughter exposure, and the diverse health problems encountered by workers in newer sealed office buildings. The review concludes by briefly addressing assessment of indoor air quality, control technology, research needs, and clinical implications."""

In [19]:
# Run the pipeline over the paragraph; the result exposes the detected entities as doc.ents.
doc = nlp(text)

In [20]:
# Report the entities found in the paragraph.
show_ents(doc)

the early 1970s - DATE - Absolute or relative dates or periods


In [23]:
# A short greeting with nothing entity-like in it.
doc = nlp(u'Hi, how are you?')

In [24]:
# With an empty doc.ents, show_ents prints its fallback message.
show_ents(doc)

No entities found


In [25]:
# "May" appears twice in this sentence: once as a modal verb, once as a month.
doc = nlp(u"May I go to Delhi, India next May to see Qutub Minar?")

In [26]:
# Show which spans were tagged. In the recorded run only "next May" is a DATE,
# and "Qutub Minar" comes back as PERSON (a model mislabel).
show_ents(doc)

Delhi - GPE - Countries, cities, states
India - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Qutub Minar - PERSON - People, including fictional


In [29]:
# Sentence mixing a monetary amount with a company name.
doc = nlp(u"Can I please have 10000 dollars of Tata stock?")

In [30]:
# Print the entities for the stock-purchase sentence.
show_ents(doc)

10000 dollars - MONEY - Monetary values, including unit
Tata - ORG - Companies, agencies, institutions, etc.


In [31]:
# Chemistry example.
# NOTE(review): "has can be" looks like a typo in the input; left as-is because
# the recorded output depends on this exact string.
doc = nlp(u"Toluene is a chemical and has can be represented by C7H8. It has cyclic structure.")

In [32]:
# In the recorded run "Toluene" is tagged ORG, i.e. the model mislabels the chemical name.
show_ents(doc)

Toluene - ORG - Companies, agencies, institutions, etc.


In [50]:
# "Tesla" occurs twice in this text.
# NOTE(review): "factor" is presumably meant to be "factory" — confirm before editing the input.
doc = nlp(u"Tesla to build a P.O.K. factor for $100 million. Is Tesla good?")

In [51]:
# In the recorded run neither "Tesla" mention is tagged as an entity.
show_ents(doc)

P.O.K. - ORG - Companies, agencies, institutions, etc.
$100 million - MONEY - Monetary values, including unit


In [52]:
# Adding a custom entity manually

In [54]:
from spacy.tokens import Span

In [55]:
# Look up the integer ID that the vocab's string store holds for the "ORG" label.
ORG = doc.vocab.strings[u"ORG"]

In [56]:
# Display the looked-up label ID.
ORG

383

In [57]:
# Span from token 0 up to (not including) token 1, i.e. the first "Tesla", labelled ORG.
new_ent = Span(doc, 0, 1, label=ORG)

In [58]:
# Keep the entities already on the doc and append the hand-made span.
# NOTE(review): re-running this cell appends the same span again — presumably
# spaCy rejects the duplicate; confirm.
doc.ents = list(doc.ents) + [new_ent]

In [59]:
# "Tesla" should now be listed alongside the model's own entities.
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
P.O.K. - ORG - Companies, agencies, institutions, etc.
$100 million - MONEY - Monetary values, including unit


In [60]:
# Adding multiple phrases as named entities

In [61]:
# Two sentences joined by implicit string concatenation. The trailing space
# after the first sentence keeps the text from running together as "cleaner.This".
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is best in business")

In [64]:
# Check what the model finds on its own before any phrases are added.
show_ents(doc)

No entities found


In [65]:
# Adding "vacuum cleaner" and "vacuum-cleaner" as entities

In [66]:
from spacy.matcher import PhraseMatcher

In [67]:
# PhraseMatcher built on the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)

In [68]:
# Both spellings of the product name that should be recognised.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [69]:
# One processed document per phrase; these are the patterns handed to the matcher.
phrase_patterns = list(map(nlp, phrase_list))

In [70]:
# Register the pattern docs under the match key 'VacuumCleaner'.
matcher.add('VacuumCleaner', phrase_patterns)

In [71]:
# Run the matcher over the vacuum-cleaner doc.
found_matches = matcher(doc)

In [72]:
# Each match is a (match_id, start, end) tuple; start/end are used as span bounds further down.
found_matches

[(6066037630898584017, 6, 8), (6066037630898584017, 11, 14)]

In [73]:
from spacy.tokens import Span

In [74]:
# Integer ID that the vocab's string store holds for the "PRODUCT" label.
PRODUCT = doc.vocab.strings[u"PRODUCT"]

In [76]:
# Build one PRODUCT span per match, then assign doc.ents a single time
# instead of rebuilding the entity list on every loop iteration.
# The match ID is not needed here, only the span bounds.
product_spans = [
    Span(doc, start, end, label=PRODUCT)
    for _match_id, start, end in found_matches
]
doc.ents = list(doc.ents) + product_spans
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [77]:
# Sentence containing two dollar amounts.
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by $10.")

In [78]:
# List all entities in the price sentence.
show_ents(doc)

29.95 - MONEY - Monetary values, including unit
10 - MONEY - Monetary values, including unit


In [79]:
# Keep only the entities whose label is MONEY.
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10]