In [1]:
#named entity recognition: spacy has an ner pipeline component that identifies token spans fitting a predetermined set of named entities these are available as the ents propertty

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
#write a function to displacy basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
           print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
        else:
            print('no named entities found.')

In [4]:
doc = nlp(u'May I go to Washington,Dc next may to see the Washington Monument?')

doc

May I go to Washington,Dc next may to see the Washington Monument?

In [5]:
show_ents(doc)

Washington,Dc - GPE - Countries, cities, states
the Washington Monument - ORG - Companies, agencies, institutions, etc.
no named entities found.


In [6]:
doc = nlp(u'can i please borrow 500 dollars from you to buy some Microsoft stock?')

for ent in doc.ents:
    print(ent.text,ent.start,ent.end,ent.start_char,ent.end_char,ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [7]:
#NER tags:tagg are accessible through tthe .label_property of an entity.

In [8]:
doc = nlp(u'can i please borrow 500 dollars from you to buy some Microsoft stock?')

for token in doc:
    print(token.text,end='|')

can|i|please|borrow|500|dollars|from|you|to|buy|some|Microsoft|stock|?|

In [9]:
#adding a named entity as span: normally we would have spacy build and add entity..

In [10]:
doc=nlp(u'tesla to build a U.K.factory for $6 million')

show_ents(doc)

$6 million - MONEY - Monetary values, including unit
no named entities found.


In [11]:
from spacy.tokens import Span
#get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']

#create a span for the new entity
new_ent = Span(doc, 0 , 1,label=ORG)

#add the entity to the existing doc object
doc.ents = list(doc.ents) + [new_ent]

In [12]:
doc

tesla to build a U.K.factory for $6 million

In [13]:
show_ents(doc)

tesla - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit
no named entities found.


In [14]:
#adding named entities to all matching spans:-what if we want to tag all occurence of tesla? in this section how to use the phrasematcher to identity a series of span in the doc

In [15]:
doc=nlp(u'our company plans to introduce a new vaccum cleaner,'
        u'if succesful,the vaccum-cleaner will be our first product.')
show_ents(doc)

first - ORDINAL - "first", "second", etc.
no named entities found.


In [16]:
#import phrasematcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [17]:
#create the desired phrase patterns:
phrase_list = ['vaccum cleaner','vaccum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [18]:
#apply the patterns to our matcher object:
matcher.add('newproduct',None,*phrase_patterns)

#apply the matcher to our matcher objects:
matches = matcher(doc)

#see what matches occur:
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 17)]

In [19]:
#here we create spans from each match,and create named entities from them:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

new_ents = [Span(doc,match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents) + new_ents

In [20]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.
no named entities found.


In [21]:
#counting entities:while spacy may not have a built-in tool for counting entitiess,we can pass a conditional statement into a list comprehensive.

In [22]:
doc=nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit
no named entities found.


In [23]:
len([ent for ent in doc.ents if ent.label_ =='MONEY'])

2

In [24]:
#problem with line breaks:-there's a know issue with spa y v2.0.12 where some linebreaks are interpreted as GPE entities.

In [28]:
spacy.__version__

'3.8.2'

In [27]:
doc=nlp(u'originally priced at $29.50,\n the sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit
no named entities found.


In [33]:
#noun chunks:
#text=the originalnoun chunk text.
#root text=the original text connected to noun chunks to the rest of the phrase
#root.dep=dependency relation connecting the root to its head.
#root.head.text=the text of the root token's head.

In [32]:
doc=nlp(u'Autonomous cars shift insurance liability towards manufactures.')
for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufactures - manufactures - pobj - towards


In [34]:
len(doc.noun_chunk)*it is not use in noun chunks

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'noun_chunk'

In [36]:
len(list(doc.noun_chunks))

3