In [1]:
import nltk
import pandas as pd
from nltk import pos_tag
from nltk import ne_chunk
from nltk.tokenize import word_tokenize


In [2]:
text = """
    In 1995 he joined the Computer Science Department at Carnegie Mellon University (CMU) as a research computer scientist. In 1998 he became an assistant professor and co-director of the Robot Learning Laboratory at CMU. As a faculty member at CMU, he co-founded the Master's Program in Automated Learning and Discovery, which later would become a Ph.D. program in the broad area of machine learning and scientific discovery. In 2001 Thrun spent a sabbatical year at Stanford University. He returned to CMU to an endowed professorship, the Finmeccanica Associate Professor of Computer Science and Robotics.
"""
words = word_tokenize(text)
pt_words = pos_tag(words)

In [3]:
chunks = ne_chunk(pt_words, binary=True)


for chunk in chunks:
    print(chunk)

('In', 'IN')
('1995', 'CD')
('he', 'PRP')
('joined', 'VBD')
('the', 'DT')
(NE Computer/NNP Science/NNP Department/NNP)
('at', 'IN')
(NE Carnegie/NNP Mellon/NNP University/NNP)
('(', '(')
(NE CMU/NNP)
(')', ')')
('as', 'IN')
('a', 'DT')
('research', 'NN')
('computer', 'NN')
('scientist', 'NN')
('.', '.')
('In', 'IN')
('1998', 'CD')
('he', 'PRP')
('became', 'VBD')
('an', 'DT')
('assistant', 'NN')
('professor', 'NN')
('and', 'CC')
('co-director', 'NN')
('of', 'IN')
('the', 'DT')
(NE Robot/NNP Learning/NNP Laboratory/NNP)
('at', 'IN')
(NE CMU/NNP)
('.', '.')
('As', 'IN')
('a', 'DT')
('faculty', 'NN')
('member', 'NN')
('at', 'IN')
(NE CMU/NNP)
(',', ',')
('he', 'PRP')
('co-founded', 'VBD')
('the', 'DT')
(NE Master/NNP)
("'s", 'POS')
('Program', 'NNP')
('in', 'IN')
(NE Automated/NNP Learning/NNP)
('and', 'CC')
(NE Discovery/NNP)
(',', ',')
('which', 'WDT')
('later', 'RB')
('would', 'MD')
('become', 'VB')
('a', 'DT')
('Ph.D.', 'NNP')
('program', 'NN')
('in', 'IN')
('the', 'DT')
('broad', 'JJ'

In [4]:
entities = []
labels = []

for chunk in chunks:
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,CMU,NE
1,Carnegie Mellon University,NE
2,Computer Science,NE
3,Discovery,NE
4,Finmeccanica Associate Professor,NE
5,Robot Learning Laboratory,NE
6,Stanford University,NE
7,Master,NE
8,Automated Learning,NE
9,Computer Science Department,NE


In [5]:
entities = []
labels = []

sentence = nltk.sent_tokenize(text)

for sent in sentence:
    for chunk in ne_chunk(pos_tag(word_tokenize(sent)), binary=False):
        if hasattr(chunk, "label"):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())

entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Carnegie Mellon University,ORGANIZATION
1,Computer Science,ORGANIZATION
2,Robotics,PERSON
3,Robot,ORGANIZATION
4,Stanford University,ORGANIZATION
5,Finmeccanica Associate,ORGANIZATION
6,Discovery,PERSON
7,Automated,GPE
8,Computer Science Department,ORGANIZATION
9,Master,ORGANIZATION


In [6]:
import spacy
from spacy import displacy

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
doc = nlp(text)
doc.ents
entities = []
labels = []

position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)

df = pd.DataFrame({'Entities': entities, 'Labels': labels, 'Position Start': position_start, 'Position End': position_end})
df

Unnamed: 0,Entities,Labels,Position Start,Position End
0,(1995),DATE,8,12
1,"(the, Computer, Science, Department)",ORG,23,54
2,"(Carnegie, Mellon, University)",ORG,58,84
3,(1998),DATE,128,132
4,"(the, Robot, Learning, Laboratory)",ORG,185,214
5,(CMU),ORG,218,221
6,(CMU),ORG,246,249
7,"(the, Master, 's, Program)",ORG,265,285
8,"(Automated, Learning)",GPE,289,307
9,(Discovery),PRODUCT,312,321


In [9]:
spacy.explain('GPE')

'Countries, cities, states'