In [1]:
import spacy
from spacy import displacy
from collections import Counter

import en_core_web_sm
# Load the pipeline shipped in the en_core_web_sm package; later cells call
# `nlp` on raw text to obtain parsed documents.
nlp = en_core_web_sm.load()

In [2]:
from bs4 import BeautifulSoup
import requests
import re

In [3]:
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    The page is fetched with ``requests``, parsed by BeautifulSoup using the
    ``html5lib`` parser, stripped of ``<script>``, ``<style>`` and ``<aside>``
    elements, and every run of newlines/tabs in the remaining text is
    replaced by one space.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 10.

    Returns
    -------
    str
        Text of the page with newline/tab runs collapsed to spaces.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails, the request times out, or the server
        answers with a 4xx/5xx status.
    """
    # Without an explicit timeout requests may block forever on a dead server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of analysing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text should not end up in the extracted string.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [6]:
# Fetch the BBC news page, run its text through the pipeline and show how
# many named entities were detected.
ARTICLE_URL = 'https://www.bbc.com/news/world-52748894'
covid_news = url_to_string(ARTICLE_URL)
article = nlp(covid_news)
len(article.ents)

132

In [7]:
# Tally how often each entity label occurs among the article's entities.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'DATE': 18,
         'ORG': 24,
         'LOC': 5,
         'CARDINAL': 22,
         'PERSON': 23,
         'PRODUCT': 4,
         'TIME': 2,
         'MONEY': 1,
         'GPE': 24,
         'NORP': 7,
         'ORDINAL': 2})

The article contains 132 entities, which fall under 11 unique labels.

In [8]:
# Count the text of every entity and show the three most common ones.
items = [entity.text for entity in article.ents]
entity_counts = Counter(items)
entity_counts.most_common(3)

[('US', 7), ('WHO', 5), ('Wednesday', 4)]

The three most frequent entities are US (7 occurrences), WHO (5) and Wednesday (4).

In [9]:
# Materialise the article's sentences in a list so one can be picked by index.
sentences = list(article.sents)
print(sentences[70])

Greek officials say they hope to start their tourism season next month despite the global pandemic


In [10]:
# Re-parse sentence 70 as a stand-alone text and highlight its entities.
sentence_doc = nlp(sentences[70].text)
displacy.render(sentence_doc, style='ent', jupyter=True)

In [11]:
# Re-parse sentence 70 as a stand-alone text and draw its dependency parse;
# `distance` is forwarded to displaCy as a rendering option.
sentence_doc = nlp(sentences[70].text)
render_options = {'distance': 120}
displacy.render(sentence_doc, style='dep', jupyter=True, options=render_options)

In [12]:
# (text, part-of-speech, lemma) for every token of the re-parsed sentence
# that is neither a stop word nor tagged as punctuation.
sentence_doc = nlp(sentences[70].text)
[
    (token.orth_, token.pos_, token.lemma_)
    for token in sentence_doc
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Greek', 'ADJ', 'greek'),
 ('officials', 'NOUN', 'official'),
 ('hope', 'VERB', 'hope'),
 ('start', 'VERB', 'start'),
 ('tourism', 'NOUN', 'tourism'),
 ('season', 'NOUN', 'season'),
 ('month', 'NOUN', 'month'),
 ('despite', 'SCONJ', 'despite'),
 ('global', 'ADJ', 'global'),
 ('pandemic', 'NOUN', 'pandemic')]

In [13]:
# Map each entity's text to its label for the re-parsed sentence.
sentence_doc = nlp(sentences[70].text)
{str(entity): entity.label_ for entity in sentence_doc.ents}

{'Greek': 'NORP', 'next month': 'DATE'}

In [14]:
# Per-token IOB tag and entity type, read from the sentence as it was parsed
# inside the full article (no re-parse in this cell).
token_tags = [(token, token.ent_iob_, token.ent_type_) for token in sentences[70]]
print(token_tags)

[(Greek, 'B', 'NORP'), (officials, 'O', ''), (say, 'O', ''), (they, 'O', ''), (hope, 'O', ''), (to, 'O', ''), (start, 'O', ''), (their, 'B', 'DATE'), (tourism, 'I', 'DATE'), (season, 'I', 'DATE'), (next, 'I', 'DATE'), (month, 'I', 'DATE'), (despite, 'O', ''), (the, 'O', ''), (global, 'O', ''), (pandemic, 'O', '')]


In [15]:
# Highlight every named entity across the whole article.
displacy.render(article, style='ent', jupyter=True)