In [6]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

In [7]:
text = '''
Hello, everyone! This is the LONGEST TEXT EVER! I was inspired by the various other "longest texts ever" 
on the internet, and I wanted to make my own. So here it is! This is going to be a WORLD RECORD! This is 
actually my third attempt at doing this. The first time, I didn't save it. The second time, the Neocities 
editor crashed. Now I'm writing this in Notepad, then copying it into the Neocities editor instead of typing 
it directly in the Neocities editor to avoid crashing. It sucks that my past two attempts are gone now. 
Those actually got pretty long. Not the longest, but still pretty long. I hope this one won't get lost somehow. 
Anyways, let's talk about WAFFLES! I like waffles. 
'''

In [8]:
#tokenize to words
words = word_tokenize(text)

In [9]:
#pos tagging
pos_tags = pos_tag(words)
pos_tags

[('Hello', 'NNP'),
 (',', ','),
 ('everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('LONGEST', 'NNP'),
 ('TEXT', 'NNP'),
 ('EVER', 'NNP'),
 ('!', '.'),
 ('I', 'PRP'),
 ('was', 'VBD'),
 ('inspired', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('various', 'JJ'),
 ('other', 'JJ'),
 ('``', '``'),
 ('longest', 'JJS'),
 ('texts', 'NN'),
 ('ever', 'RB'),
 ("''", "''"),
 ('on', 'IN'),
 ('the', 'DT'),
 ('internet', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('wanted', 'VBD'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('my', 'PRP$'),
 ('own', 'JJ'),
 ('.', '.'),
 ('So', 'RB'),
 ('here', 'RB'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('WORLD', 'NNP'),
 ('RECORD', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('actually', 'RB'),
 ('my', 'PRP$'),
 ('third', 'JJ'),
 ('attempt', 'NN'),
 ('at', 'IN'),
 ('doing', 'VBG'),
 ('this', 'DT'),
 ('.', '.'),
 ('The', 'DT'),

In [10]:
#checks every chunk if there's a Named Entity
chunks = ne_chunk(pos_tags, binary=False)
for chunk in chunks:
    print(chunk)

(GPE Hello/NNP)
(',', ',')
('everyone', 'NN')
('!', '.')
('This', 'DT')
('is', 'VBZ')
('the', 'DT')
(ORGANIZATION LONGEST/NNP)
('TEXT', 'NNP')
('EVER', 'NNP')
('!', '.')
('I', 'PRP')
('was', 'VBD')
('inspired', 'VBN')
('by', 'IN')
('the', 'DT')
('various', 'JJ')
('other', 'JJ')
('``', '``')
('longest', 'JJS')
('texts', 'NN')
('ever', 'RB')
("''", "''")
('on', 'IN')
('the', 'DT')
('internet', 'NN')
(',', ',')
('and', 'CC')
('I', 'PRP')
('wanted', 'VBD')
('to', 'TO')
('make', 'VB')
('my', 'PRP$')
('own', 'JJ')
('.', '.')
('So', 'RB')
('here', 'RB')
('it', 'PRP')
('is', 'VBZ')
('!', '.')
('This', 'DT')
('is', 'VBZ')
('going', 'VBG')
('to', 'TO')
('be', 'VB')
('a', 'DT')
(ORGANIZATION WORLD/NNP)
('RECORD', 'NN')
('!', '.')
('This', 'DT')
('is', 'VBZ')
('actually', 'RB')
('my', 'PRP$')
('third', 'JJ')
('attempt', 'NN')
('at', 'IN')
('doing', 'VBG')
('this', 'DT')
('.', '.')
('The', 'DT')
('first', 'JJ')
('time', 'NN')
(',', ',')
('I', 'PRP')
('did', 'VBD')
("n't", 'RB')
('save', 'VB')
('it'

In [11]:
#creating table with list of entities
entities = []
labels = []
for chunk in chunks:
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,LONGEST,ORGANIZATION
1,Notepad,GPE
2,WAFFLES,ORGANIZATION
3,Neocities,ORGANIZATION
4,WORLD,ORGANIZATION
5,Hello,GPE


In [12]:
s_entities = []
s_labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    for chunk in ne_chunk(pos_tag(word_tokenize(sent)), binary=False):
        if hasattr(chunk, 'label'):
            s_entities.append(' '.join(c[0] for c in chunk))
            s_labels.append(chunk.label())

s_entities_labels = list(set(zip(s_entities, s_labels)))
s_entities_df = pd.DataFrame(s_entities_labels)
s_entities_df.columns = ["Entities", "Labels"]
s_entities_df

Unnamed: 0,Entities,Labels
0,LONGEST,ORGANIZATION
1,Notepad,GPE
2,WAFFLES,ORGANIZATION
3,Neocities,ORGANIZATION
4,WORLD,ORGANIZATION
5,Hello,GPE


In [13]:
# using SpaCy
import spacy
from spacy import displacy

In [14]:
# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

In [17]:
doc = nlp(text)

m_entities = []
m_labels = []
position_start = []
position_end = []
exp = []

for ent in doc.ents:
    m_entities.append(ent)
    m_labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    exp.append(spacy.explain(ent.label_))
    
df = pd.DataFrame({"Entities" : m_entities, "Labels" : m_labels, "Position_Start" : position_start, "Position_End" : position_end, "Exp." : exp})

df

Unnamed: 0,Entities,Labels,Position_Start,Position_End,Exp.
0,(third),ORDINAL,225,230,"""first"", ""second"", etc."
1,(first),ORDINAL,258,263,"""first"", ""second"", etc."
2,(second),ORDINAL,292,298,"""first"", ""second"", etc."
3,(Notepad),GPE,360,367,"Countries, cities, states"
4,(two),CARDINAL,507,510,Numerals that do not fall under another type
