### Definition
##### Named entity recognition is a process where the named entity gets identified and linked to its class. 

### Difference between PoS Tagging and NER

##### In POS tagging we focus on the part of speech of any word in any sentence.
##### whereas in NER we focus more on the recognition of different names of the object, person, place, time etc.

In [5]:
import pandas as pd

In [2]:
import tensorflow

In [7]:
#pip install spacy

In [106]:
### Name Entity Recognition
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

In [8]:
import spacy

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [21]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print('name/noun:',ent.text+' '+ 'start character:',str(ent.start_char)+' '+'end character:',str(ent.end_char)+' '+'entity',ent.label_)
    else:
        print('No name entity found')

In [22]:
doc1=nlp('Apple is looking at buying U.K. start up for $1 billion')

In [23]:
show_ents(doc1)

name/noun: Apple start character: 0 end character: 5 entity ORG
name/noun: U.K. start character: 27 end character: 31 entity GPE
name/noun: $1 billion start character: 45 end character: 55 entity MONEY


In [24]:
doc2=nlp('Pakistan is my country. Islamabad is its Capital. I  will go to Lahore after two days')
show_ents(doc2)

name/noun: Pakistan start character: 0 end character: 8 entity GPE
name/noun: Islamabad start character: 24 end character: 33 entity GPE
name/noun: Lahore start character: 64 end character: 70 entity GPE
name/noun: two days start character: 77 end character: 85 entity DATE


In [84]:
from spacy import displacy

In [85]:
displacy.render(doc2,style='ent',jupyter=True)

In [25]:
### user defined NER

In [31]:
from spacy.tokens import Span

In [27]:
doc=nlp('Tasla to build U.K factory $6 billion.')
show_ents(doc)

name/noun: U.K start character: 15 end character: 18 entity GPE
name/noun: $6 billion start character: 27 end character: 37 entity MONEY


In [32]:
# get hash value of ORG string
ORG = doc.vocab.strings[u'ORG']
# create a span for new entity
new_ent = Span(doc,0,1,label=ORG)
# add the entity to existing doc object
doc.ents = list(doc.ents) + [new_ent]

In [33]:
show_ents(doc)

name/noun: Tasla start character: 0 end character: 5 entity ORG
name/noun: U.K start character: 15 end character: 18 entity GPE
name/noun: $6 billion start character: 27 end character: 37 entity MONEY


In [38]:
doc=nlp('We are selling Vacume cleaner.')
# get hash value of ORG string
PRODT = doc.vocab.strings[u'PRODUCT']
# create a span for new entity
new_ent = Span(doc,0,1,label=PRODT)
# add the entity to existing doc object
doc.ents = list(doc.ents) + [new_ent]

In [39]:
doc3=nlp('We are selling Vacume cleaner.')
show_ents(doc3)

name/noun: Vacume start character: 15 end character: 21 entity GPE


In [40]:
### phrase matcher in nlp

In [41]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [78]:
docc = nlp(u"our company plans to introduce a new vaccum cleaner."
           u"if successful, the vaccum-cleaner will be our first product")

In [79]:
show_ents(docc)

name/noun: first start character: 98 end character: 103 entity ORDINAL


In [80]:
phrase_list = ['vaccum cleaner','vaccum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [81]:
matcher.add('newproduct',None,*phrase_patterns)
matches=matcher(docc)

In [82]:
matches

[(2689272359382549672, 12, 15)]

In [83]:
product = docc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1],match[2],label=product) for match in matches]

IndexError: [E035] Error creating span with start 12 and end 15 for Doc of length 6.

In [86]:
### Name Entity Recognition using NLTK

In [95]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [96]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [97]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [100]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [102]:
displacy.render(,style='ent',jupyter=True)

ValueError: [E096] Invalid object passed to displaCy: Can only visualize `Doc` or Span objects, or dicts if set to `manual=True`.

In [104]:
nltk.chunk.conllstr2tree(sent, chunk_types=['NP']).draw()

AttributeError: 'list' object has no attribute 'split'