In [38]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
# Load spaCy's small English pipeline; later cells call it as `nlp`.
nlp = spacy.load('en_core_web_sm')
# One-time setup, left disabled: opens NLTK's resource downloader.
# NOTE(review): the NLTK tokenizer/tagger used below presumably need their
# data packages installed first — confirm on a fresh environment.
#nltk.download()

In [39]:
# Sample sentence fed to the tokenization and tagging cells that follow.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [40]:
# Word-tokenize the sentence, then attach a part-of-speech tag to each word.
def preprocess(sent):
    """Return NLTK part-of-speech tags for the words of ``sent``.

    The text is split with ``nltk.word_tokenize`` and the resulting tokens
    are passed to ``nltk.pos_tag``; whatever ``pos_tag`` returns is handed
    back unchanged.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [41]:
# Display what preprocess() returns for the sample sentence.
preprocess(ex)

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [42]:
# Run the spaCy pipeline on the sample sentence and list, for every token,
# its text together with its pos_ and tag_ attributes.
doc = nlp(ex)
[(tok.text, tok.pos_, tok.tag_) for tok in doc]

[('European', 'ADJ', 'JJ'),
 ('authorities', 'NOUN', 'NNS'),
 ('fined', 'VERB', 'VBD'),
 ('Google', 'PROPN', 'NNP'),
 ('a', 'DET', 'DT'),
 ('record', 'NOUN', 'NN'),
 ('$', 'SYM', '$'),
 ('5.1', 'NUM', 'CD'),
 ('billion', 'NUM', 'CD'),
 ('on', 'ADP', 'IN'),
 ('Wednesday', 'PROPN', 'NNP'),
 ('for', 'ADP', 'IN'),
 ('abusing', 'VERB', 'VBG'),
 ('its', 'ADJ', 'PRP$'),
 ('power', 'NOUN', 'NN'),
 ('in', 'ADP', 'IN'),
 ('the', 'DET', 'DT'),
 ('mobile', 'ADJ', 'JJ'),
 ('phone', 'NOUN', 'NN'),
 ('market', 'NOUN', 'NN'),
 ('and', 'CCONJ', 'CC'),
 ('ordered', 'VERB', 'VBD'),
 ('the', 'DET', 'DT'),
 ('company', 'NOUN', 'NN'),
 ('to', 'PART', 'TO'),
 ('alter', 'VERB', 'VB'),
 ('its', 'ADJ', 'PRP$'),
 ('practices', 'NOUN', 'NNS')]

In [11]:
import spacy
# Load the small English pipeline. This rebinds `nlp` to the same model name
# that is already loaded at the top of the notebook.
# NOTE(review): the original comment listed "tokenizer, tagger, parser, NER
# and word vectors" as the model's contents — confirm that list (word vectors
# in particular) against the model's documentation.
nlp = spacy.load('en_core_web_sm')

In [13]:
from spacy.tokenizer import Tokenizer
# Stand-alone Tokenizer built from the pipeline's vocabulary only.
# NOTE(review): no prefix/suffix/infix rules are passed, so this presumably
# splits on whitespace only — confirm against the spaCy Tokenizer docs.
tokenizer = Tokenizer(nlp.vocab)


In [45]:
# Tokenize the sample sentence with the stand-alone tokenizer.
toto=tokenizer(ex)

In [50]:
# Keep only the tokens whose is_alpha flag is set.
[tok for tok in toto if tok.is_alpha]

[European,
 authorities,
 fined,
 Google,
 a,
 record,
 billion,
 on,
 Wednesday,
 for,
 abusing,
 its,
 power,
 in,
 the,
 mobile,
 phone,
 market,
 and,
 ordered,
 the,
 company,
 to,
 alter,
 its,
 practices]

In [51]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Defaults to 10.

    Returns:
        The text of the parsed page with ``script``, ``style`` and ``aside``
        elements removed and every run of newlines/tabs replaced by a single
        space.

    Raises:
        requests.exceptions.HTTPError: If the response has an error status.
        requests.exceptions.RequestException: For other request failures,
            including a timeout.
    """
    # Without an explicit timeout the request may wait indefinitely.
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than parse an error page as if it were the article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose content should not appear in the extracted text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the nytimes.com article, run the spaCy pipeline on its text and show
# how many entities the pipeline found.
ARTICLE_URL = 'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news'
ny_bb = url_to_string(ARTICLE_URL)
article = nlp(ny_bb)
len(article.ents)

187

In [53]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the pipeline through the model package (this rebinds `nlp`), then
# tally how often each entity label occurs in the article.
nlp = en_core_web_sm.load()
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 5,
         'DATE': 29,
         'EVENT': 1,
         'GPE': 35,
         'LOC': 1,
         'NORP': 5,
         'ORDINAL': 1,
         'ORG': 25,
         'PERSON': 84,
         'WORK_OF_ART': 1})

In [55]:
# The three most frequent entity texts in the article (these are entity
# mentions taken from article.ents, not individual tokens).

items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

In [56]:
# Pick one sentence (fixed index 20, not a random draw) to look at closely.

sentences = list(article.sents)
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [57]:
# Re-parse the chosen sentence and hand it to displaCy's entity visualizer
# (style='ent'); jupyter=True asks displaCy to render for the notebook.

sentence_doc = nlp(str(sentences[20]))
displacy.render(sentence_doc, style='ent', jupyter=True)