In [0]:
from datetime import date
# Capture the current local date once and render it as MM-DD-YY;
# the resulting string `d` is substituted into the CNN URL in a later cell.
today = date.today()
d = format(today, "%m-%d-%y")

In [0]:
# The live-updates URL embeds the date string `d` in its path.
CNN_LIVE_URL_TEMPLATE = "https://www.cnn.com/world/live-news/coronavirus-pandemic-{}/index.html"
cnn_url = CNN_LIVE_URL_TEMPLATE.format(d)

In [0]:
# Show the generated URL so the date substitution can be checked by eye.
print(cnn_url)

https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html


In [0]:
from bs4 import BeautifulSoup
import requests

In [0]:
# Download the CNN live-updates page as text.
# requests applies no timeout by default, so one is passed explicitly to keep
# the cell from hanging indefinitely on an unresponsive server.
# raise_for_status() turns an HTTP error (for example a 404 when no page exists
# for the generated date) into an exception instead of silently parsing an error page.
cnn_response = requests.get(cnn_url, timeout=30)
cnn_response.raise_for_status()
html = cnn_response.text

In [0]:
# Parse the downloaded page. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and warns),
# so results could differ between environments. "html.parser" matches the
# parser configured for CNN in the `formats` list further down.
soup = BeautifulSoup(html, "html.parser")
# Sanity check: print the page's <title> element.
print(soup.title)

<title data-rh="true">Coronavirus pandemic: Live updates - CNN</title>


In [0]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; later cells call `nlp(text)` and
# read `.ents` from the result to get (text, label) entity pairs.
nlp = spacy.load("en_core_web_sm")

In [0]:
# Print the text of every <h2> element on the page, followed by one indented
# line per named entity that the spaCy pipeline finds in that text.
for heading in soup.find_all("h2"):
    headline_text = heading.text
    print("Headline : {}".format(headline_text))
    parsed_headline = nlp(headline_text)
    for entity in parsed_headline.ents:
        print("\tText : {}, Entity : {}".format(entity.text, entity.label_))

Headline : What you need to know
Headline : NY Federal Reserve scaling back emergency intervention into financial markets
	Text : NY Federal Reserve, Entity : ORG
Headline : Treasury says 80 million people will receive stimulus payments this week
	Text : Treasury, Entity : ORG
	Text : 80 million, Entity : CARDINAL
	Text : this week, Entity : DATE
Headline : ESPN asks commentators to take a 15% pay cut because of coronavirus
	Text : ESPN, Entity : ORG
	Text : 15%, Entity : PERCENT
Headline : West Coast states make pact to work together on reopening based on health outcomes
	Text : West Coast, Entity : LOC
Headline : US stocks end mixed
	Text : US, Entity : GPE
Headline : New Jersey law school grads can temporarily practice law without taking the bar, court rules
	Text : New Jersey, Entity : GPE
Headline : More than 17% of the New York police force is out sick
	Text : More than 17%, Entity : PERCENT
	Text : New York, Entity : GPE
Headline : More than 130,000 unemployment claims filed in 

In [0]:
# Two further sources crawled alongside the CNN page.
nbc_url = "https://www.nbcnews.com/health/coronavirus"
cnbc_rss_url = "https://www.cnbc.com/id/10000108/device/rss/rss.html"

In [0]:
# Parallel lists: position i in each list describes the same source.
urls = [cnn_url, nbc_url, cnbc_rss_url]            # address to download
formats = ["html.parser", "html.parser", "xml"]    # BeautifulSoup parser for that source
tags = ["h2", "h2", "description"]                 # tag whose text is collected
website = ["CNN", "NBC", "CNBC"]                   # label stored with each record

In [0]:
# Crawl every configured source and collect one record per headline-like tag.
#
# `urls`, `formats`, `tags` and `website` are parallel lists, so they are walked
# together with zip() instead of a hand-maintained index counter. zip() stops at
# the shortest list, so the lengths are checked first to avoid silently dropping
# a source.
assert len(urls) == len(formats) == len(tags) == len(website), \
    "urls/formats/tags/website must all have the same length"

MIN_HEADLINE_WORDS = 4        # keep only text with MORE than this many space-separated words
REQUEST_TIMEOUT_SECONDS = 30  # requests has no default timeout; avoid hanging forever

# Despite its name this is a list of dicts (one per kept headline); the name is
# kept because later cells refer to it.
news_dict = []
for url, parser, tag, site in zip(urls, formats, tags, website):
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors rather than parsing an error page as if it were news.
    response.raise_for_status()
    # A separate name is used so the `soup` built in the earlier cell is not overwritten.
    page_soup = BeautifulSoup(response.content, parser)

    for element in page_soup.find_all(tag):
        text = element.text
        # Skip very short strings (same threshold as before: more than 4 words are kept).
        if len(text.split(" ")) <= MIN_HEADLINE_WORDS:
            continue

        # (entity text, entity label) pairs found by the spaCy pipeline.
        entities = [(ent.text, ent.label_) for ent in nlp(text).ents]

        news_dict.append({'website': site, 'url': url, 'headline': text, 'entities': entities})


In [0]:
# Preview only the first few records; displaying the whole list floods the
# notebook output with every crawled headline.
news_dict[:5]

[{'entities': [],
  'headline': 'What you need to know',
  'url': 'https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html',
  'website': 'CNN'},
 {'entities': [('Kansas', 'GPE'), ('4', 'CARDINAL')],
  'headline': 'Kansas religious gathering tied to 4 coronavirus deaths, state officials say',
  'url': 'https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html',
  'website': 'CNN'},
 {'entities': [('Today', 'DATE'), ('daily', 'DATE'), ('US', 'GPE')],
  'headline': 'Today is the peak for daily deaths in the US, coronavirus model says',
  'url': 'https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html',
  'website': 'CNN'},
 {'entities': [],
  'headline': 'Pompeo: "There’ll be a time for recriminations" over the novel coronavirus',
  'url': 'https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html',
  'website': 'CNN'},
 {'entities': [('Cuomo', 'ORG'), ('180', 'CARDINAL')],
  'headline': 'Cuomo says federal g

In [0]:
import pandas as pd
# Build a DataFrame with one row per collected record; the columns come from
# the record keys ('website', 'url', 'headline', 'entities').
new_df = pd.DataFrame(news_dict)

In [0]:
# Raise the per-column display width so long headlines and URLs are shown in full.
# The fully qualified option name is used: the abbreviated 'max_colwidth' only
# works through pattern matching and would break if it ever became ambiguous.
pd.set_option('display.max_colwidth', 800)
# Show the first 20 rows.
new_df.head(20)

Unnamed: 0,website,url,headline,entities
0,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,What you need to know,[]
1,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Kansas religious gathering tied to 4 coronavirus deaths, state officials say","[(Kansas, GPE), (4, CARDINAL)]"
2,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Today is the peak for daily deaths in the US, coronavirus model says","[(Today, DATE), (daily, DATE), (US, GPE)]"
3,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Pompeo: ""There’ll be a time for recriminations"" over the novel coronavirus",[]
4,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Cuomo says federal government would need to ""do a 180"" to reopen states","[(Cuomo, ORG), (180, CARDINAL)]"
5,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,Pentagon awards $415 million contract to reuse N95 masks,"[(Pentagon, ORG), ($415 million, MONEY), (N95, ORG)]"
6,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Colorado governor says meat packing plant in his state will be closed for""as long as it takes""","[(Colorado, GPE)]"
7,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Nearly 1,000 prisoners in Washington state may be released early","[(Nearly 1,000, CARDINAL), (Washington, GPE)]"
8,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Connecticut governor: Trump administration won't ""try and mandate a one size fits all"" on reopening states","[(Connecticut, GPE), (Trump, LOC), (one, CARDINAL)]"
9,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"US to receive 750,000 coronavirus tests from South Korea","[(US, GPE), (750,000, CARDINAL), (South Korea, GPE)]"


In [0]:
# Spread each row's entity list across numbered columns (0, 1, 2, ...) and
# place those columns next to the descriptive columns of the same row.
headline_columns = new_df[['website', 'url', 'headline']]
entity_columns = new_df['entities'].apply(pd.Series)
news_final_df = pd.concat([headline_columns, entity_columns], axis=1)

In [0]:
# Preview the first five rows of the widened frame.
news_final_df.head(5)

Unnamed: 0,website,url,headline,0,1,2,3,4,5,6
0,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,What you need to know,,,,,,,
1,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Kansas religious gathering tied to 4 coronavirus deaths, state officials say","(Kansas, GPE)","(4, CARDINAL)",,,,,
2,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Today is the peak for daily deaths in the US, coronavirus model says","(Today, DATE)","(daily, DATE)","(US, GPE)",,,,
3,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Pompeo: ""There’ll be a time for recriminations"" over the novel coronavirus",,,,,,,
4,CNN,https://www.cnn.com/world/live-news/coronavirus-pandemic-04-13-20/index.html,"Cuomo says federal government would need to ""do a 180"" to reopen states","(Cuomo, ORG)","(180, CARDINAL)",,,,,


In [0]:
# Write the frame to a CSV file at a path relative to the current working directory.
# NOTE(review): this saves `new_df` (entities kept as one list column), not the
# widened `news_final_df` built above — confirm that is intended.
# NOTE(review): the file name reads "scapping"; presumably "scraping" was meant.
# It is left unchanged here because other code may depend on this exact path.
new_df.to_csv("web_scapping.csv")