In [1]:
import spacy

In [2]:
# Load the `en_core_web_sm` pipeline once; every doc below is built by calling nlp(text).
nlp = spacy.load("en_core_web_sm")

In [12]:
def show_ents(doc):
    """Print every named entity of ``doc`` as ``text - label - explanation``.

    The explanation is ``spacy.explain(label)`` converted with ``str``.
    When ``doc.ents`` is empty, a single line "no entity found" is printed
    instead. Nothing is returned.
    """
    if not doc.ents:
        print("no entity found")
        return
    for entity in doc.ents:
        description = str(spacy.explain(entity.label_))
        print(" - ".join((entity.text, entity.label_, description)))

In [71]:
# A sentence with no names, dates or amounts, used as the "no entities" baseline.
doc1 = nlp("hi how are you")

In [72]:
# Exercises the empty branch of show_ents: the recorded output is "no entity found".
show_ents(doc1)

no entity found


In [17]:
# NOTE(review): the backslash continues the string literal itself, so the leading
# spaces of the second line become part of the text passed to nlp (they sit
# between "Washington" and "Monument"). Consider adjacent string literals inside
# the parentheses instead, then re-run to refresh the recorded entities.
doc2= nlp(" May i go to Washington, DC next May to see the Washington \
          Monument")

In [18]:
# Recorded output: two DATE and two GPE entities. The final "Washington" is
# tagged GPE on its own; "Monument" is not part of any entity.
show_ents(doc2)

May - DATE - Absolute or relative dates or periods
Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
Washington - GPE - Countries, cities, states


In [19]:
# Example containing a monetary amount and a company name.
doc3 = nlp("Can i please have 500 dollar of Microsoft stock")

In [21]:
# Recorded output: "500 dollar" as MONEY and "Microsoft" as ORG.
show_ents(doc3)

500 dollar - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [52]:
# "Tesla" is token 0 of this sentence; the cells below add it as an entity by hand.
doc4 = nlp("Tesla to build a U.K. factory for $6 million")

In [53]:
# Recorded output lists U.K. (GPE) and $6 million (MONEY) but nothing for "Tesla".
show_ents(doc4)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
"""Here spacy is not recognising 'Tesla' as a named entity, 
so we can tell spaCy explicitly that 'Tesla' should be labelled with the entity type 'ORG'.
Adding entities by hand like this is very useful when building a custom
dataset."""

In [54]:
from spacy .tokens import Span

In [55]:
# Look up the integer ID that the vocab's string store holds for the label "ORG".
# Use doc4 (the document being edited): no variable named `doc` is defined anywhere
# in this notebook, so the previous `doc.vocab` only worked via leftover kernel state.
ORG = doc4.vocab.strings["ORG"]

In [56]:
# Display the looked-up label ID (recorded value: 383).
ORG

383

In [57]:
# Span over tokens [0, 1) of doc4 -- i.e. the single token "Tesla" -- labelled ORG.
new_ent = Span(doc4,0, 1, label=ORG)

In [60]:
# Keep the entities the pipeline already found and append the hand-made one.
doc4.ents = [*doc4.ents, new_ent]

In [61]:
# Recorded output now includes "Tesla" as ORG alongside the two original entities.
show_ents(doc4)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
"""Adding multiple named entities in your document"""

In [83]:
# Two sentences built from adjacent string literals. The first literal ends with a
# space so the sentences are not fused into "cleaner.This" when concatenated.
# Note: this rebinds doc4, replacing the Tesla document used above.
doc4 = nlp(
    "our company created a brand new vaccum cleaner. "
    "This new vaccum-cleaner is best in the show."
)



In [84]:
# Recorded output is "no entity found": the pipeline tags nothing in this text.
show_ents(doc4)

no entity found


In [85]:
from spacy.matcher import PhraseMatcher

In [86]:
# Phrase matcher built on the pipeline's vocab; patterns are registered below.
matcher = PhraseMatcher(nlp.vocab)

In [94]:
# Surface forms to tag; both spellings appear verbatim in the doc4 text.
phrase_list = ["vaccum cleaner", "vaccum-cleaner"]

In [95]:
# Run each phrase through the pipeline to obtain the pattern objects the matcher takes.
phrase_patterns = list(map(nlp, phrase_list))

In [96]:
# Register all patterns under the key 'newproduct'; the None is passed in the second
# positional slot (presumably the optional on_match callback -- TODO confirm).
# NOTE(review): newer spaCy releases document the list form
# matcher.add('newproduct', phrase_patterns) -- confirm against the installed version.
matcher.add('newproduct', None, *phrase_patterns)

In [97]:
# Run the matcher over doc4. Each result is a 3-tuple; the code below uses
# elements 1 and 2 as the start and end token offsets of a Span.
found_matches= matcher(doc4)

In [98]:
# Recorded output shows two matches sharing one ID, at token ranges (6, 8) and (11, 14).
print(found_matches)

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]


In [99]:
from spacy.tokens import Span

In [101]:
# Look up the ID that the vocab's string store holds for the label "PRODUCT".
# Use doc4 (the document being edited): no variable named `doc` is defined anywhere
# in this notebook, so the previous `doc.vocab` only worked via leftover kernel state.
PROD = doc4.vocab.strings["PRODUCT"]

In [104]:
# One PRODUCT span per match; the first tuple element (the match ID) is not needed.
new_ents = [
    Span(doc4, start, end, label=PROD)
    for _match_id, start, end in found_matches
]


In [105]:
# Keep whatever entities doc4 already has and append the PRODUCT spans.
doc4.ents = [*doc4.ents, *new_ents]

In [106]:
# Recorded output: both matched phrases now appear as PRODUCT entities.
show_ents(doc4)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [107]:
"""Counting named entities"""

'Counting named entities'

In [108]:
# Sentence with two monetary amounts, one numeric and one spelled out.
doc5 = nlp("Originally priced at $29.50, the sweater was marked down to five dollars.")

In [117]:
# Count the entities labelled MONEY without building an intermediate list.
sum(1 for entity in doc5.ents if entity.label_ == "MONEY")

2

In [119]:
"""Visualisation"""

'Visualisation'

In [122]:
from spacy import displacy

In [120]:
# Split the long sentence with adjacent string literals instead of a backslash
# inside the literal: the backslash form kept the next line's indentation as a
# run of spaces inside the text between "profit" and "of".
doc6 = nlp(
    "Over the last quarter Apple sold nearly 20000 iPods for a profit "
    "of $6 million dollar"
)

In [125]:
# Render doc6 with the "ent" style (entity highlighting) inline in the notebook.
displacy.render(doc6, style="ent", jupyter=True)