# Named Entity Recognition (NER)

In [0]:
# Motivating examples: questions that can only be answered once the named
# entity they mention is recognised. The trailing comment on each line names
# the kind of entity the question contains.
'Where is Eiffel Tower ? ' # Building
'Who is Alan Turing ? ' # Person

## Corpus

In [0]:
import nltk
from nltk.corpus import brown  # the import itself does not touch the corpus files

# Fetch the Brown corpus data, then print a separator so the downloader's
# log lines are set apart from this cell's own output.
nltk.download('brown')
print('---')

# Restrict the corpus to the "news" category; every sentence is a list of
# word tokens. Show the first one as a quick sanity check.
brown_sents = brown.sents(categories='news')
print(brown_sents[0])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
---
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


## POS Tags

In [0]:
# Fetch the perceptron POS-tagger model. NLTK 3.9+ looks the model up under
# the "_eng" resource name, while older releases use the unsuffixed one.
# Requesting both keeps this cell working on either; a package that is
# already present is reported as up-to-date and skipped.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
print('---')

# Tag the first news sentence. The result is a list of (token, tag) pairs
# that the NE chunker further down takes as its input.
tagged_sentence = nltk.pos_tag(brown_sents[0])
print(tagged_sentence)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
---
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NNP'), ('recent', 'JJ'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'DT'), ('evidence', 'NN'), ("''", "''"), ('that', 'IN'), ('any', 'DT'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


# NLTK

In [0]:
# Fetch the named-entity chunker model and the word list downloaded with it.
# NLTK 3.9+ loads the chunker from the "_tab" resource, older releases from
# the unsuffixed one, so request both to stay runnable on either.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
print('---')

# Chunk the POS-tagged sentence into a tree of named entities.
ner_sentence = nltk.ne_chunk(tagged_sentence)

# Top-level children are either plain (token, tag) tuples or labelled
# subtrees such as (ORGANIZATION ...) for the recognised entities.
for ne in ner_sentence:
  print(ne)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
---
('The', 'DT')
(ORGANIZATION Fulton/NNP County/NNP Grand/NNP Jury/NNP)
('said', 'VBD')
('Friday', 'NNP')
('an', 'DT')
('investigation', 'NN')
('of', 'IN')
("Atlanta's", 'NNP')
('recent', 'JJ')
('primary', 'JJ')
('election', 'NN')
('produced', 'VBD')
('``', '``')
('no', 'DT')
('evidence', 'NN')
("''", "''")
('that', 'IN')
('any', 'DT')
('irregularities', 'NNS')
('took', 'VBD')
('place', 'NN')
('.', '.')


In [0]:
# Whitespace tokenisation is enough here: the punctuation in the text is
# already separated from the words by spaces.
sentence = 'Can machines think ?'.split()

# Tag first, then chunk: the NE chunker works on (token, tag) pairs.
pt = nltk.pos_tag(sentence)
ne = nltk.ne_chunk(pt)

# Print every top-level node of the resulting tree, one per line.
for n in ne:
  print(n)

('Can', 'MD')
('machines', 'NNS')
('think', 'VB')
('?', '.')


In [0]:
# Same question as before, now followed by an attribution that names a person.
# Splitting on whitespace works because punctuation is space-separated.
sentence = 'Can machines think ? said by Alan Turing .'.split()

# Tag first, then chunk: the NE chunker works on (token, tag) pairs.
pt = nltk.pos_tag(sentence)
ne = nltk.ne_chunk(pt)

# Print every top-level node; a recognised entity appears as a labelled
# subtree instead of a plain (token, tag) tuple.
for n in ne:
  print(n)

('Can', 'MD')
('machines', 'NNS')
('think', 'VB')
('?', '.')
('said', 'VBD')
('by', 'IN')
(PERSON Alan/NNP Turing/NNP)
('.', '.')


In [0]:
# Same sentence, but every token is lower-cased before tagging to show how
# much the result depends on capitalisation.
sentence = list(map(str.lower, 'Can machines think ? said by Alan Turing .'.split()))

# Tag first, then chunk: the NE chunker works on (token, tag) pairs.
pt = nltk.pos_tag(sentence)
ne = nltk.ne_chunk(pt)

# Print every top-level node. Compare with the capitalised run: the name
# tokens are tagged NN rather than NNP and no PERSON subtree is produced.
for n in ne:
  print(n)

('can', 'MD')
('machines', 'NNS')
('think', 'VB')
('?', '.')
('said', 'VBD')
('by', 'IN')
('alan', 'NN')
('turing', 'NN')
('.', '.')


# spaCy

https://spacy.io/

In [0]:
!pip install spacy



In [0]:
import spacy

# Load the small English pipeline by its full package name. The bare 'en'
# shortcut only resolves on spaCy 2.x (shortcut links were removed in 3.0),
# whereas the package name loads on both. The model package itself must be
# installed, e.g. via `python -m spacy download en_core_web_sm`.
spacy_nlp = spacy.load('en_core_web_sm')

In [0]:
# Text to analyse: the same sentence used in the NLTK cells, with its
# original capitalisation.
sentence = 'Can machines think ? said by Alan Turing .'

# Run the loaded spaCy pipeline on the raw string; unlike the NLTK cells,
# no manual tokenising is done beforehand. The next cell reads `document`.
document = spacy_nlp(sentence)

In [0]:
# Echo the input so the listings below can be read against it.
print(sentence)

# One line per token of the parsed document, with its part-of-speech tag.
print('\n=== POS ===')
for tok in document:
  print(f'Token: {tok}, POS: {tok.pos_}')

# One line per recognised entity. An entity may cover several tokens
# (e.g. a full name), so this iterates entities rather than single tokens.
print('\n=== NER ===')
for ent in document.ents:
  print(f'Token: {ent}, NE: {ent.label_}')

Can machines think ? said by Alan Turing .

=== POS ===
Token: Can, POS: VERB
Token: machines, POS: NOUN
Token: think, POS: VERB
Token: ?, POS: PUNCT
Token: said, POS: VERB
Token: by, POS: ADP
Token: Alan, POS: PROPN
Token: Turing, POS: PROPN
Token: ., POS: PUNCT

=== NER ===
Token: Alan Turing, NE: PERSON


In [0]:
# The Brown sentence is stored as a token list; join it back into one string
# because the spaCy pipeline is called on text here.
sentence = ' '.join(brown_sents[0])
doc = spacy_nlp(sentence)
print(sentence, '\n')

# Report each entity found in the sentence together with its label.
# `doc` is kept for the visualisation cells that follow.
for ent in doc.ents:
  print(f'Token: {ent}, NE: {ent.label_}')

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place . 

Token: The Fulton County Grand Jury, NE: ORG
Token: Friday, NE: DATE
Token: Atlanta, NE: GPE


## Useful function for visualization

In [0]:
# displaCy is spaCy's visualiser module.
from spacy import displacy

# Render the most recently parsed `doc` (the Brown news sentence at this
# point). No `style` is passed, so displaCy uses its default view.
displacy.render(doc, jupyter=True)

In [0]:
# Same document as the previous cell, this time explicitly asking for the
# 'ent' style instead of the default one.
displacy.render(doc, jupyter=True, style='ent')

In [0]:
# Second motivating question from the top of the notebook. Here the question
# mark is attached to the last word and tokenising is left to spaCy.
sentence = 'Where is Eiffel Tower?'

# Note: this rebinds `doc`, so later displaCy calls show this sentence.
doc = spacy_nlp(sentence)

print(sentence)
print()

# Report each entity with its label, using the same "Token: ..., NE: ..."
# line format as the other spaCy cells (the colon after "Token" was missing).
for ent in doc.ents:
  print(f'Token: {ent}, NE: {ent.label_}')

Where is Eiffel Tower?

Token Eiffel Tower, NE: FAC


In [0]:
# Visualise the entities of the sentence parsed in the previous cell
# (`doc` was rebound there), using the 'ent' style.
displacy.render(doc, jupyter=True, style='ent')