In [1]:
#import necessary packages
#Spacy's named entity recognition has been trained on the OntoNotes 5 Corpus
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import utils

In [2]:
# Load the en_core_web_sm pipeline; `nlp` is the callable reused by the cells below
nlp = en_core_web_sm.load()


In [3]:
# Quick demonstration: run the pipeline on one sentence and list each entity with its label
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'), ('Google a record $5.1 billion', 'ORG'), ('Wednesday', 'DATE')]


In [4]:
# Token-level entity annotation read from Token.ent_iob_.
# This attribute uses the IOB scheme (not BILUO: no L/U codes appear here):
# B --> token Begins an entity
# I --> token is Inside an entity
# O --> token is Outside any entity
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'I', 'ORG'), (record, 'I', 'ORG'), ($, 'I', 'ORG'), (5.1, 'I', 'ORG'), (billion, 'I', 'ORG'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [5]:
# Extract named entities from a news article.
# utils.url_to_string is a project helper; presumably it returns the page text for the URL — verify in utils.
dawn_bb = utils.url_to_string('https://www.dawn.com/news/1689940/a-stable-tenure')
article = nlp(dawn_bb)
# Recorded run: 185 entities spread over 12 distinct labels.
# The text comes from a URL, so the counts may differ on a later run.
len(article.ents)

185

In [6]:
# Tally how many entities carry each label
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'PERSON': 32,
         'DATE': 62,
         'ORG': 30,
         'FAC': 6,
         'LAW': 4,
         'NORP': 8,
         'GPE': 19,
         'CARDINAL': 20,
         'TIME': 1,
         'ORDINAL': 1,
         'MONEY': 1,
         'PRODUCT': 1})

In [7]:
# Three most frequent entity mentions.
# These are whole entity spans (a span may cover several tokens), not individual tokens.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('2022', 9), ('Pakistan', 7), ('May 16, 2022', 3)]

In [8]:
# Pick one sentence to analyze in detail (fixed index 20, not a random draw)
sentences = list(article.sents)
print(sentences[20])

Only four democratically elected leaders have ever come close to that mark in our 75-year history.


In [9]:
# Highlight the named entities of the chosen sentence inline in the notebook
displacy.render(nlp(str(sentences[20])), style='ent', jupyter=True)

In [10]:
# Draw the dependency parse of the same sentence; 'distance' is forwarded as a displacy option
displacy.render(
    nlp(str(sentences[20])),
    jupyter=True,
    options={'distance': 120},
    style='dep',
)

In [11]:
# Surface form, part-of-speech tag and lemma for every token of the sentence
# that is neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[20]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('democratically', 'ADV', 'democratically'),
 ('elected', 'VERB', 'elect'),
 ('leaders', 'NOUN', 'leader'),
 ('come', 'VERB', 'come'),
 ('close', 'ADV', 'close'),
 ('mark', 'NOUN', 'mark'),
 ('75', 'NUM', '75'),
 ('year', 'NOUN', 'year'),
 ('history', 'NOUN', 'history')]

In [12]:
# Map each entity's text in the re-parsed sentence to its label
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Only four': 'CARDINAL', '75-year': 'DATE'}

In [13]:
# Per-token IOB code and entity type for the sentence at index 20, read from the article-level parse
print([(tok, tok.ent_iob_, tok.ent_type_) for tok in sentences[20]])

[(Only, 'B', 'CARDINAL'), (four, 'I', 'CARDINAL'), (democratically, 'O', ''), (elected, 'O', ''), (leaders, 'O', ''), (have, 'O', ''), (ever, 'O', ''), (come, 'O', ''), (close, 'O', ''), (to, 'O', ''), (that, 'O', ''), (mark, 'O', ''), (in, 'O', ''), (our, 'O', ''), (75, 'B', 'DATE'), (-, 'I', 'DATE'), (year, 'I', 'DATE'), (history, 'O', ''), (., 'O', '')]


In [14]:
# Visualize the entities of the entire article
displacy.render(article, style='ent', jupyter=True)