In [None]:
from datetime import date
# Date stamp for today in MM-DD-YY form; it is substituted into the CNN
# live-news URL below.
today = date.today()
d = format(today, '%m-%d-%y')
print(d)

# Sample of a fully expanded URL:
# https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-01-14-21/index.html
cnn_url = (
    'https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-{}/index.html'
    .format(d)
)
print(cnn_url)

01-18-21
https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-01-18-21/index.html


In [None]:
from bs4 import BeautifulSoup
import requests

# Fetch today's CNN live-news page. The timeout keeps the cell from hanging
# indefinitely if the server never answers.
response = requests.get(cnn_url, timeout=30)
if not response.ok:
    # The dated URL may not exist for a given day; report the failure so an
    # error page is not mistaken for real content further down.
    print("Warning: {} returned HTTP {}".format(cnn_url, response.status_code))
html = response.text

# Name the parser explicitly: when none is given, BeautifulSoup picks whichever
# parser happens to be installed, so the parsed tree can differ between machines.
soup = BeautifulSoup(html, 'html.parser')
print(soup.title)

<title>Error</title>


In [None]:
# spaCy is used for analysing context, countries and topics involved in the
# news (the later cells read the named entities from `nlp(...).ents`).
import spacy
# Load the 'en_core_web_sm' pipeline once; `nlp` is reused by the later cells.
# NOTE(review): the model package presumably has to be installed separately —
# confirm the environment provides it.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Print the text of every <h2> element on the page, followed by each named
# entity spaCy finds in that text.
for heading in soup.find_all('h2'):
    headline = heading.text
    print("Headline: {}".format(headline))
    doc = nlp(headline)
    for entity in doc.ents:
        print("\tText: {}, Entity: {}".format(entity.text, entity.label_))


Headline: It could be you, or it could be us, but there's no page here.


Doing this for each URL individually would be time-consuming.\
Here is a more compact approach that crawls multiple sites in a single loop.

In [None]:
# URLs of the additional sources to crawl: the NBC coronavirus section and
# the CNBC RSS feed.
nbc_url, cnbc_rss_url = (
    'https://www.nbcnews.com/health/coronavirus',
    'https://www.cnbc.com/id/10000108/device/rss/rss.html',
)

In [None]:
# Parallel lists describing each source: position i in every list refers to
# the same site.
website = ['CNN', 'NBC', 'CNBC']                  # display name per site
news_urls = [cnn_url, nbc_url, cnbc_rss_url]      # page / feed to download
parsers = ['html.parser', 'html.parser', 'xml']   # BeautifulSoup parser per site
tags = ['h2', 'h2', 'description']                # tag whose text is extracted per site

In [None]:
# Crawl every configured site and print each headline with its named entities.
# zip() walks the parallel lists in lockstep, replacing the hand-maintained
# index counter that had to be incremented at the end of every iteration.
for url, parser, tag in zip(news_urls, parsers, tags):
    # Get the response content from each site; the timeout stops a stalled
    # server from blocking the notebook indefinitely.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Without this check the boilerplate text of an error page is printed
        # as if it were a headline.
        print("Skipping {}: HTTP {}".format(url, response.status_code))
        continue
    soup = BeautifulSoup(response.text, parser)

    # Crawl the relevant content and print it.
    for link in soup.find_all(tag):
        text = link.text
        # Only keep text with more than four space-separated words
        # (presumably to drop short non-headline fragments — confirm intent).
        if len(text.split(" ")) > 4:
            print("Headline: {}".format(text))
            for ent in nlp(text).ents:
                print("\tText: {}, Entity: {}".format(ent.text, ent.label_))

Headline: It could be you, or it could be us, but there's no page here.
Headline: Nursing homes make big push to change minds of workers who refused vaccine
Headline: Fauci: Weeks, not months before new vaccines are submitted for approval
	Text: Weeks, Entity: DATE
	Text: months, Entity: DATE
Headline: Black Americans are getting vaccinated at lower rates than white Americans
	Text: Americans, Entity: NORP
	Text: Americans, Entity: NORP
Headline: Chaos in a fake vaccine line: Inside NYC's bumpy distribution rollout
	Text: NYC, Entity: ORG
Headline: Los Angeles becomes first county to hit 1 million Covid-19 cases
	Text: Los Angeles, Entity: GPE
	Text: first, Entity: ORDINAL
	Text: 1 million, Entity: CARDINAL
Headline: Biden to deploy FEMA, National Guard as part of national vaccination plan
	Text: FEMA, Entity: ORG
	Text: National Guard, Entity: ORG
Headline: Needle in a haystack: Despite efforts to boost rollout, vaccination rates fail to meet demand
Headline: Health officials recommen

Crawl the sites and store the results in a DataFrame.\
Columns: website, url, headline, entities

In [None]:
# Crawl every configured site and collect one record per headline so the
# results can be loaded into a DataFrame afterwards.
# NOTE: despite its name, news_dict is a list of dicts (one per headline).
news_dict = []
# zip() keeps the per-site settings aligned without a manual index counter.
for site, url, parser, tag in zip(website, news_urls, parsers, tags):
    # The timeout stops a stalled server from blocking the notebook indefinitely.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Skip failed requests so an error page's boilerplate text is not
        # stored as a headline row.
        print("Skipping {}: HTTP {}".format(url, response.status_code))
        continue
    soup = BeautifulSoup(response.content, parser)

    for link in soup.find_all(tag):
        headline = link.text
        # Only keep text with more than four space-separated words
        # (presumably to drop short non-headline fragments — confirm intent).
        if len(headline.split(" ")) > 4:
            # (entity text, entity label) pairs found by spaCy in the headline.
            entities = [(ent.text, ent.label_) for ent in nlp(headline).ents]
            news_dict.append({
                'website': site,
                'url': url,
                'headline': headline,
                'entities': entities,
            })

In [None]:
# Preview only the first few records; echoing the whole list floods the
# notebook output with every scraped headline.
news_dict[:5]

[{'entities': [],
  'headline': "It could be you, or it could be us, but there's no page here.",
  'url': 'https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-01-18-21/index.html',
  'website': 'CNN'},
 {'entities': [],
  'headline': 'Nursing homes make big push to change minds of workers who refused vaccine',
  'url': 'https://www.nbcnews.com/health/coronavirus',
  'website': 'NBC'},
 {'entities': [('Weeks', 'DATE'), ('months', 'DATE')],
  'headline': 'Fauci: Weeks, not months before new vaccines are submitted for approval',
  'url': 'https://www.nbcnews.com/health/coronavirus',
  'website': 'NBC'},
 {'entities': [('Americans', 'NORP'), ('Americans', 'NORP')],
  'headline': 'Black Americans are getting vaccinated at lower rates than white Americans',
  'url': 'https://www.nbcnews.com/health/coronavirus',
  'website': 'NBC'},
 {'entities': [('NYC', 'ORG')],
  'headline': "Chaos in a fake vaccine line: Inside NYC's bumpy distribution rollout",
  'url': 'https://

In [None]:
import pandas as pd
# Build the DataFrame in one step from the list of per-headline records.
news_df = pd.DataFrame(news_dict)
# Widen the column display so long headlines and URLs are not cut off.
# The fully qualified option name is used because abbreviated names rely on
# pattern matching and can break if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 800)

news_df.head()

Unnamed: 0,website,url,headline,entities
0,CNN,https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-01-18-21/index.html,"It could be you, or it could be us, but there's no page here.",[]
1,NBC,https://www.nbcnews.com/health/coronavirus,Nursing homes make big push to change minds of workers who refused vaccine,[]
2,NBC,https://www.nbcnews.com/health/coronavirus,"Fauci: Weeks, not months before new vaccines are submitted for approval","[(Weeks, DATE), (months, DATE)]"
3,NBC,https://www.nbcnews.com/health/coronavirus,Black Americans are getting vaccinated at lower rates than white Americans,"[(Americans, NORP), (Americans, NORP)]"
4,NBC,https://www.nbcnews.com/health/coronavirus,Chaos in a fake vaccine line: Inside NYC's bumpy distribution rollout,"[(NYC, ORG)]"


Splitting the list-valued `entities` column into separate columns, one per entity

In [None]:
# Place the descriptive columns side by side with the expanded entity lists:
# applying pd.Series to each list yields one numbered column per list position.
pd.concat(
    [
        news_df[['website', 'url', 'headline']],
        news_df['entities'].apply(pd.Series),
    ],
    axis=1,
)

Unnamed: 0,website,url,headline,0,1,2,3,4,5
0,CNN,https://edition.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-01-18-21/index.html,"It could be you, or it could be us, but there's no page here.",,,,,,
1,NBC,https://www.nbcnews.com/health/coronavirus,Nursing homes make big push to change minds of workers who refused vaccine,,,,,,
2,NBC,https://www.nbcnews.com/health/coronavirus,"Fauci: Weeks, not months before new vaccines are submitted for approval","(Weeks, DATE)","(months, DATE)",,,,
3,NBC,https://www.nbcnews.com/health/coronavirus,Black Americans are getting vaccinated at lower rates than white Americans,"(Americans, NORP)","(Americans, NORP)",,,,
4,NBC,https://www.nbcnews.com/health/coronavirus,Chaos in a fake vaccine line: Inside NYC's bumpy distribution rollout,"(NYC, ORG)",,,,,
...,...,...,...,...,...,...,...,...,...
77,CNBC,https://www.cnbc.com/id/10000108/device/rss/rss.html,"The former Food and Drug Administration commissioner said the advice is particularly important for the elderly, who are at higher risk of death from Covid.\n","(Food and Drug Administration, ORG)","(Covid, PRODUCT)",,,,
78,CNBC,https://www.cnbc.com/id/10000108/device/rss/rss.html,Delta lost a record $12.39 billion in 2020 but is setting its sights on a recovery in 2021 even though it expects more difficult months ahead.,"(Delta, ORG)","(a record $12.39 billion, MONEY)","(2020, DATE)","(2021, DATE)","(months ahead, DATE)",
79,CNBC,https://www.cnbc.com/id/10000108/device/rss/rss.html,"Dow and S&P 500 futures rose but Nasdaq futures fell Thursday, a day after a mixed session left all three benchmarks less than 1% away from record highs.","(Nasdaq, ORG)","(Thursday, DATE)","(a day, DATE)","(three, CARDINAL)","(less than 1%, PERCENT)",
80,CNBC,https://www.cnbc.com/id/10000108/device/rss/rss.html,Preliminary findings in Public Health England's SIREN study found antibodies from past Covid infection provide 83% protection against reinfection.\n,"(Public Health England's, ORG)","(SIREN, ORG)","(Covid, NORP)","(83%, PERCENT)",,


In [None]:
# Save the scraped headlines and their entities to web_scraping.csv. The path
# is relative, so the file is resolved against the current working directory.
news_df.to_csv("web_scraping.csv")