In [None]:
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

In [1]:
import spacy
from spacy import displacy
from collections import Counter
# One-time setup, run from a shell: python -m spacy download en
# NOTE(review): recent spaCy releases no longer accept the `en` shortcut; there
# the command is `python -m spacy download en_core_web_sm` - confirm against
# the installed spaCy version.
import en_core_web_sm
# Load the pipeline once; every later cell reuses this `nlp` object.
nlp = en_core_web_sm.load()

In [2]:
# Run the pipeline over one news sentence, then list every detected entity
# span alongside its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market and ordered the '
    'company to alter its practices'
)
print([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [3]:
# Token-level view of the same sentence: each token with its IOB tag and its
# entity type (empty string when the token is outside any entity).
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'O', ''), (record, 'O', ''), ($, 'B', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [5]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    ``<script>``, ``<style>`` and ``<aside>`` elements are removed before the
    text is extracted, and every run of newlines/tabs is replaced by a single
    space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error so an error page is never analysed as if
    # it were the article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the New York Times article, run the pipeline over its text, and show
# how many entities were found.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html'
    '?hp&action=click&pgtype=Homepage&clickSource=story-heading'
    '&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

172

In [6]:
# Tally the article's entities by label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 82,
         'GPE': 16,
         'CARDINAL': 5,
         'ORG': 39,
         'DATE': 23,
         'NORP': 2,
         'ORDINAL': 1,
         'FAC': 1,
         'PRODUCT': 3})

In [7]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

In [8]:
# Materialise the sentence spans so they can be indexed, then show the one
# used as the running example in the following cells.
sentences = list(article.sents)
print(sentences[25])

Strzok’s text exchanges with Ms. Page demonstrated animosity toward Mr. Trump.


In [9]:
# Highlight the entities of the example sentence inline in the notebook.
displacy.render(
    nlp(str(sentences[25])),
    style='ent',
    jupyter=True,
)

In [10]:
# Draw the dependency parse of the example sentence inline in the notebook.
displacy.render(
    nlp(str(sentences[25])),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [11]:
# Verbatim text, part-of-speech tag and lemma for every token of the example
# sentence that is neither a stop word nor punctuation. A single filtered
# comprehension is enough; no intermediate list of tokens is needed.
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[25]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('Strzok', 'PROPN', 'Strzok'),
 ('’s', 'PROPN', '’s'),
 ('text', 'NOUN', 'text'),
 ('exchanges', 'NOUN', 'exchange'),
 ('Ms.', 'PROPN', 'Ms.'),
 ('Page', 'PROPN', 'Page'),
 ('demonstrated', 'VERB', 'demonstrate'),
 ('animosity', 'NOUN', 'animosity'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Trump', 'PROPN', 'Trump')]

In [12]:
# Map each entity string in the example sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[25])).ents}

{'Strzok’s': 'ORG', 'Page': 'PERSON', 'Trump': 'PERSON'}

In [13]:
# Highlight every entity across the whole article inline in the notebook.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)