In [40]:
import spacy

In [41]:
# Load the 'en_core_web_sm' pipeline once; the cells below all reuse `nlp`.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [42]:
def show_ents(doc):
    """Print every named entity of ``doc`` with its label and description.

    Each entity is written as ``<text> - <label> - <explanation>`` followed
    by an empty line, where the explanation comes from ``spacy.explain``.
    Nothing is printed when ``doc.ents`` is empty.
    """
    if not doc.ents:
        return
    for entity in doc.ents:
        description = spacy.explain(entity.label_)
        # The trailing "\n" plus print's own newline leaves a blank line
        # between consecutive entities.
        print(f"{entity.text} - {entity.label_} - {description}\n")

In [43]:
# A greeting with no named entities: the recorded run printed nothing here.
doc1 = nlp("Hi how are you?")
show_ents(doc1)

In [44]:
# Biography sentence; the text is split across literals only for readability
# (adjacent literals are joined, so the string is unchanged).
doc2 = nlp(
    "Barack Hussein Obama II is an American politician who served as "
    "the 44th president of the United States from 2009 to 2017. "
)
show_ents(doc2)

Barack Hussein - PERSON - People, including fictional

American - NORP - Nationalities or religious or political groups

44th - ORDINAL - "first", "second", etc.

the United States - GPE - Countries, cities, states

2009 to 2017 - DATE - Absolute or relative dates or periods



In [45]:
# Longer company description; split across literals only for readability
# (adjacent literals are joined, so the string is unchanged).
doc3 = nlp(
    "Apple Inc. is an American multinational technology company that "
    "specializes in consumer electronics, software and online services "
    "headquartered in Cupertino, California, United States. "
    "Apple is the largest technology company by revenue "
    "(totaling US$365.8 billion in 2021) and as of June 2022, "
    "is the world's biggest company by market capitalization, "
    "the fourth-largest personal computer vendor by unit sales "
    "and second-largest mobile phone manufacturer. "
)
show_ents(doc3)

Apple Inc. - ORG - Companies, agencies, institutions, etc.

American - NORP - Nationalities or religious or political groups

Cupertino - GPE - Countries, cities, states

California - GPE - Countries, cities, states

United States - GPE - Countries, cities, states

Apple - ORG - Companies, agencies, institutions, etc.

US$365.8 billion - MONEY - Monetary values, including unit

2021 - DATE - Absolute or relative dates or periods

June 2022 - DATE - Absolute or relative dates or periods

fourth - ORDINAL - "first", "second", etc.

second - ORDINAL - "first", "second", etc.



In [46]:
# Headline in which the recorded run tagged "U.K." and "$6 million" but not
# "Tesla"; later cells add that entity by hand.
doc4 = nlp("Tesla to build a U.K. factory for $6 million")
show_ents(doc4)

U.K. - GPE - Countries, cities, states

$6 million - MONEY - Monetary values, including unit



In [47]:
# spaCy does not recognize some specific words in the texts above (e.g. "Obama", "Tesla"), so we should be able to add our own entities.
from spacy.tokens import Span

In [48]:
# Integer ID that the vocab's string store holds for the label "ORG";
# leaving the name as the last expression displays it.
ORG = doc4.vocab.strings["ORG"]
ORG

383

In [49]:
# Span covering tokens [0, 1) of doc4 (the first token), labelled with the
# ORG ID looked up from the vocab.
new_ent = Span(doc4, 0, 1, label=ORG)

In [50]:
# Add the hand-made span to the document's entities.
# Entities may not overlap, so any existing entity sharing a token with
# `new_ent` is dropped first. This keeps the cell safe to re-run and also
# covers pipelines that already tag the same token themselves.
doc4.ents = [
    ent for ent in doc4.ents
    if ent.end <= new_ent.start or ent.start >= new_ent.end
] + [new_ent]

In [51]:
# Print doc4's entities again, now including the manually added span.
show_ents(doc4)

Tesla - ORG - Companies, agencies, institutions, etc.

U.K. - GPE - Countries, cities, states

$6 million - MONEY - Monetary values, including unit



In [52]:
# Adding multiple named entities at once.
# Adjacent string literals are joined exactly as written, so the first piece
# ends with a space; without it the text would read "cleaner.This".
doc = nlp("Our company created a brand new vacuum cleaner. "
          "This new vacuum-cleaner is the best in show.")

In [53]:
# Baseline check before any custom entities are added (the recorded run
# printed nothing for this call).
show_ents(doc)

In [54]:
from spacy.matcher import PhraseMatcher

# Phrase matcher built on the vocabulary of the loaded pipeline.
matcher = PhraseMatcher(nlp.vocab)

In [55]:
# Surface forms that should be tagged as the new product.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
# The matcher is created with its default settings and compares token text,
# so the patterns only need tokenizing. `nlp.make_doc` skips the remaining
# pipeline components that a full `nlp(text)` call would run.
phrease_patterns = [nlp.make_doc(text) for text in phrase_list]

In [56]:
# Register all patterns under the 'newproduct' key.
# The patterns go in as one list: the older `add(key, None, *docs)` call
# form is rejected by spaCy v3, while the list form is accepted by v3 and
# by late v2 releases.
matcher.add('newproduct', phrease_patterns)

In [57]:
# Run the phrase matcher over the document. Each result is a 3-tuple whose
# second and third items are used later as the start and end token indices
# of a Span.
found_matches = matcher(doc)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [58]:
from spacy.tokens import Span

In [59]:
# ID that the vocab's string store holds for the "PRODUCT" label.
PROD = doc.vocab.strings["PRODUCT"]

In [60]:
# One PRODUCT-labelled span per match, using each match's start/end indices.
new_ents = [
    Span(doc, start, end, label=PROD)
    for _match_id, start, end in found_matches
]

In [61]:
# Add the matched product spans to the document's entities.
# Entities may not overlap, so existing entities that share a token with any
# new span are dropped first. This keeps the cell safe to re-run and also
# covers pipelines that already tag one of these tokens themselves.
new_token_ids = {i for span in new_ents for i in range(span.start, span.end)}
kept_ents = [
    ent for ent in doc.ents
    if new_token_ids.isdisjoint(range(ent.start, ent.end))
]
doc.ents = kept_ents + new_ents

In [62]:
# Print the entities again, now that the PRODUCT spans have been added.
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)

vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)



In [63]:
# Count the entities in the document that carry the PRODUCT label.
sum(1 for ent in doc.ents if ent.label_ == "PRODUCT")

2

In [68]:
# Visualizing named entity recognition
from spacy import displacy
# Sample text for the displaCy demos; split across literals only for
# readability (adjacent literals are joined, so the string is unchanged).
tesla = nlp(
    "Tesla, Inc. is an American multinational automotive and clean energy "
    "company headquartered in Austin, Texas with a market capitalization "
    "of more than US$840 billion."
)

In [69]:
# Draw the document with displaCy's entity ('ent') style inside the notebook.
displacy.render(tesla, style='ent', jupyter=True)

In [92]:
# displaCy options: the entity labels of interest and a CSS background
# for each of them.
options = {
    'ents': ['GPE', 'ORG'],
    'colors': {
        'GPE': 'radial-gradient(pink, purple)',
        'ORG': 'linear-gradient(90deg, #832C8A, #E23B6B)',
    },
}

In [93]:
# Same entity rendering as before, this time passing the custom `options`
# dict (entity-label list and per-label colors).
displacy.render(tesla, style='ent', jupyter=True, options=options)