In [None]:
import nltk
import pandas as pd

In [None]:
# Resources the cells below rely on: sentence/word tokenizer models, the POS
# tagger, the named-entity chunker with its word list, and the tag help texts.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'tagsets',
)

# nltk.download returns a bool per package. Keep every status instead of only
# showing the last one, so a failed fetch of an earlier package is visible here
# rather than surfacing later as a LookupError in another cell.
download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}
all(download_status.values())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [None]:
# Example sentence used by every later cell. The leading space is part of the
# original value and is kept as-is.
text = (
    " Sundar Pichai, the CEO of Google Inc. "
    "is walking in the streets of California."
)


In [None]:
# Break the raw text into word and punctuation tokens; the bare name on the
# last line makes the notebook display the resulting list.
words = nltk.tokenize.word_tokenize(text, language='english')
words

['Sundar',
 'Pichai',
 ',',
 'the',
 'CEO',
 'of',
 'Google',
 'Inc.',
 'is',
 'walking',
 'in',
 'the',
 'streets',
 'of',
 'California',
 '.']

In [None]:
# Part-of-speech tagging: pair every token with its tag, giving a list of
# (token, tag) tuples that the chunker consumes later.
pos_tags = nltk.tag.pos_tag(words, tagset=None)
pos_tags

[('Sundar', 'NNP'),
 ('Pichai', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('CEO', 'NNP'),
 ('of', 'IN'),
 ('Google', 'NNP'),
 ('Inc.', 'NNP'),
 ('is', 'VBZ'),
 ('walking', 'VBG'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('streets', 'NNS'),
 ('of', 'IN'),
 ('California', 'NNP'),
 ('.', '.')]

In [None]:
# Print NLTK's built-in description and example words for one tag
# (needs the 'tagsets' resource downloaded above).
nltk.help.upenn_tagset(tagpattern='NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [None]:
# binary=True only decides "named entity or not": every detected entity is
# wrapped in a subtree labelled NE, all other tokens stay as (word, tag) tuples.
chunks = nltk.chunk.ne_chunk(pos_tags, binary=True)
for node in chunks:
    print(node)

(NE Sundar/NNP Pichai/NNP)
(',', ',')
('the', 'DT')
('CEO', 'NNP')
('of', 'IN')
(NE Google/NNP Inc./NNP)
('is', 'VBZ')
('walking', 'VBG')
('in', 'IN')
('the', 'DT')
('streets', 'NNS')
('of', 'IN')
(NE California/NNP)
('.', '.')


In [None]:

# Collect the named-entity subtrees of the binary chunk tree into a table.
entities = []
labels = []
for chunk in chunks:
    # Entity subtrees expose .label(); ordinary tokens are plain (word, tag)
    # tuples and do not.
    if hasattr(chunk, 'label'):
        print(chunk)
        # Each leaf is a (word, tag) pair; rebuild the entity's surface text.
        entities.append(' '.join(leaf[0] for leaf in chunk))
        labels.append(chunk.label())

# dict.fromkeys drops duplicate (entity, label) pairs while keeping first-seen
# order; a set would reshuffle the rows from one kernel run to the next.
entities_labels = list(dict.fromkeys(zip(entities, labels)))
# Naming the columns in the constructor also works when no entity was found;
# assigning .columns afterwards raises ValueError on an empty frame.
entities_df = pd.DataFrame(entities_labels, columns=["Entities", "Labels"])
entities_df

(NE Sundar/NNP Pichai/NNP)
(NE Google/NNP Inc./NNP)
(NE California/NNP)


Unnamed: 0,Entities,Labels
0,Google Inc.,NE
1,California,NE
2,Sundar Pichai,NE


In [None]:
# by using Sentence
# Same extraction, but run sentence by sentence with binary=False so each
# entity subtree carries a typed label rather than the generic NE.
entities = []
labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    # tokenize -> POS-tag -> NE-chunk, one sentence at a time
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    for chunk in nltk.ne_chunk(tagged_tokens, binary=False):
        # Entity subtrees expose .label(); ordinary tokens are (word, tag) tuples.
        if hasattr(chunk, 'label'):
            entities.append(' '.join(leaf[0] for leaf in chunk))
            labels.append(chunk.label())

# dict.fromkeys drops duplicate (entity, label) pairs while keeping first-seen
# order; a set would reshuffle the rows from one kernel run to the next.
entities_labels = list(dict.fromkeys(zip(entities, labels)))

# Naming the columns in the constructor also works when no entity was found;
# assigning .columns afterwards raises ValueError on an empty frame.
entities_df = pd.DataFrame(entities_labels, columns=["Entities", "Labels"])
entities_df

Unnamed: 0,Entities,Labels
0,Pichai,ORGANIZATION
1,Google Inc.,ORGANIZATION
2,California,GPE
3,Sundar,PERSON
4,CEO,ORGANIZATION
