In [5]:
import spacy
from pprint import pprint
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English pipeline once; every later cell reuses this `nlp` object.
nlp = en_core_web_sm.load()

In [6]:
# Run the pipeline on one news sentence, then list each detected entity
# span together with its label.
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(entity.text, entity.label_) for entity in doc.ents])


[('European', 'NORP'),
 ('Google', 'ORG'),
 ('a record $5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [7]:
# Token-level view of the same doc: each token with its IOB tag
# (`ent_iob_`) and its entity type (`ent_type_`).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'B', 'MONEY'),
 (record, 'I', 'MONEY'),
 ($, 'I', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [8]:
# Extract named entities from a news article fetched from the web
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    str
        Text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page
    # as if it were the article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    # Collapse each run of newlines/tabs into one space.
    return re.sub(r'[\n\t]+', ' ', soup.get_text())
# Fetch the article text; `ny_bb` holds the page as one plain string.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
# Run the spaCy pipeline over the whole article; later cells reuse `article`.
article = nlp(ny_bb)
# Number of entity spans found in the article.
len(article.ents)

187

In [9]:
# Tally how many entities of each label type the article contains.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'PERSON': 90,
         'GPE': 18,
         'ORG': 36,
         'DATE': 31,
         'FAC': 2,
         'CARDINAL': 6,
         'NORP': 2,
         'ORDINAL': 1,
         'PRODUCT': 1})

In [10]:
# The three most frequently mentioned entity strings in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Strzok', 31), ('F.B.I.', 18), ('Trump', 12)]

In [11]:
# Materialise the sentence spans so they can be indexed, then show one of them.
sentences = list(article.sents)
print(sentences[21])

“A lengthy investigation and multiple rounds of congressional testimony failed to produce a shred of evidence that Special Agent Strzok’s personal views ever affected his work.


In [12]:
# Re-parse sentence 21 on its own text and render it with the entity visualiser.
displacy.render(
    nlp(str(sentences[21])),
    style='ent',
    jupyter=True,
)

  "__main__", mod_spec)


In [13]:
# Re-parse sentence 21 on its own text and render its dependency parse.
# NOTE(review): 'distance' presumably controls the spacing between words - confirm.
displacy.render(
    nlp(str(sentences[21])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [14]:
# Surface form, part-of-speech tag and lemma for each token of sentence 21,
# skipping stop words and punctuation. A single filtered comprehension is
# enough; no intermediate list of tokens is needed.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[21]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('lengthy', 'ADJ', 'lengthy'),
 ('investigation', 'NOUN', 'investigation'),
 ('multiple', 'ADJ', 'multiple'),
 ('rounds', 'NOUN', 'round'),
 ('congressional', 'ADJ', 'congressional'),
 ('testimony', 'NOUN', 'testimony'),
 ('failed', 'VERB', 'fail'),
 ('produce', 'VERB', 'produce'),
 ('shred', 'NOUN', 'shred'),
 ('evidence', 'NOUN', 'evidence'),
 ('Special', 'PROPN', 'Special'),
 ('Agent', 'PROPN', 'Agent'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('personal', 'ADJ', 'personal'),
 ('views', 'NOUN', 'view'),
 ('affected', 'VERB', 'affect'),
 ('work', 'NOUN', 'work')]

In [15]:
# Map each entity's text to its label for sentence 21, re-parsed on its own.
{str(entity): entity.label_ for entity in nlp(str(sentences[21])).ents}

{}

In [16]:
# IOB tag and entity type of every token in sentence 21, read from the
# already-parsed article rather than from a fresh parse.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[21]])

[(“, 'O', ''), (A, 'O', ''), (lengthy, 'O', ''), (investigation, 'O', ''), (and, 'O', ''), (multiple, 'O', ''), (rounds, 'O', ''), (of, 'O', ''), (congressional, 'O', ''), (testimony, 'O', ''), (failed, 'O', ''), (to, 'O', ''), (produce, 'O', ''), (a, 'O', ''), (shred, 'O', ''), (of, 'O', ''), (evidence, 'O', ''), (that, 'O', ''), (Special, 'O', ''), (Agent, 'O', ''), (Strzok, 'O', ''), (’s, 'O', ''), (personal, 'O', ''), (views, 'O', ''), (ever, 'O', ''), (affected, 'O', ''), (his, 'O', ''), (work, 'O', ''), (., 'O', '')]


In [18]:
# Render the whole article with the entity visualiser.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)