In [9]:
import random
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from fake_useragent import UserAgent

# Function to get proxies so we can rotate through them
def scrape_proxies():
    """Fetch the proxy list published on sslproxies.org.

    Downloads the page with a random User-Agent and reads the table whose
    id is ``proxylisttable``. The first two cells of every row are joined
    as ``"<cell 0>:<cell 1>"`` (presumably IP address and port -- confirm
    against the live page layout).

    Returns:
        list[str]: one entry per usable table row. Empty when the table
        is not present in the downloaded page.

    Raises:
        requests.RequestException: if the page cannot be downloaded
            (including a timeout).
    """
    headers = {'User-Agent': UserAgent().random}
    # A timeout keeps a stalled connection from hanging the whole scrape.
    resp = requests.get(
        'https://www.sslproxies.org/',
        headers=headers,
        timeout=10,
    )
    soup = BeautifulSoup(resp.content, 'html.parser')

    proxies_table = soup.find(id='proxylisttable')
    if proxies_table is None or proxies_table.tbody is None:
        # The table is missing (layout change, block page, error page):
        # report "no proxies" instead of failing with AttributeError.
        return []

    proxies = []
    for row in proxies_table.tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        # get_text() always returns a str, whereas .string is None for a
        # cell with nested markup and would break the concatenation.
        host = cells[0].get_text(strip=True)
        port = cells[1].get_text(strip=True)
        if host and port:
            proxies.append(host + ':' + port)
    return proxies


# Function to scrape the html into beautiful soup
def scrape_html(url, max_n_tries, proxies):
    """Download *url* through a randomly chosen proxy and parse the HTML.

    Every attempt picks a fresh random proxy and a fresh random
    User-Agent. An attempt fails when the request raises a ``requests``
    exception (dead proxy, timeout, ...) or the response status is not
    200; a failed attempt is followed by a 1-2 second pause.

    Args:
        url: address of the page to download.
        max_n_tries: maximum number of attempts.
        proxies: sequence of ``"host:port"`` strings to choose from. When
            it is empty the request is sent without a proxy.

    Returns:
        A ``BeautifulSoup`` object built with ``'html.parser'`` from the
        first response with status 200, or ``None`` if every attempt
        failed.
    """
    # Build the User-Agent source once; `.random` is read per attempt.
    user_agent = UserAgent()

    num_tries = 0
    while num_tries < max_n_tries:
        if proxies:
            proxy_url = "http://{}".format(random.choice(proxies))
            # requests selects the proxy by the scheme of the target URL,
            # so an "https" entry is required for https:// pages to be
            # routed through the proxy at all.
            current_proxy = {"http": proxy_url, "https": proxy_url}
        else:
            current_proxy = None

        headers = {'User-Agent': user_agent.random}
        try:
            resp = requests.get(
                url,
                headers=headers,
                proxies=current_proxy,
                timeout=10,
            )
        except requests.RequestException:
            # Unreachable proxy or timeout: count it as a failed attempt
            # and move on to another proxy instead of aborting.
            resp = None

        if resp is not None and resp.status_code == 200:
            return BeautifulSoup(resp.content, 'html.parser')

        num_tries += 1
        time.sleep(random.randint(1, 2))

    # All attempts used up; callers must handle the missing page.
    return None


# Get all the href tags from the business homepage and add them to a list
# A timeout keeps a stalled connection from hanging the notebook.
resp = requests.get('https://www.cnn.com/business', timeout=10)

# Use the HTTP-level encoding only when the Content-Type header carries an
# explicit charset; an encoding declared inside the HTML takes precedence.
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding

# Name the parser explicitly (the same one the helper functions use) so the
# result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=encoding)

# Every <a> tag that has an href attribute contributes its target.
links = [link['href'] for link in soup.find_all('a', href=True)]

# Not all hrefs are actual article links.
# Article paths contain the publication date (e.g. /2020/04/15/business/...),
# so we keep only hrefs that contain a digits/digits/digits sequence.
# The pattern is compiled once instead of on every loop iteration.
regexp = re.compile(r'(\d+/\d+/\d+)')

article_links = []

for link in links:
    if regexp.search(link):
        # urljoin resolves site-relative hrefs against the CNN root and
        # leaves hrefs that are already absolute untouched, so an absolute
        # link no longer gets the host glued in front of it a second time.
        article_links.append(urljoin('https://www.cnn.com', link))

# There are duplicate links so we'll remove those.
# dict.fromkeys keeps the first occurrence of each link in page order,
# which makes the scrape order reproducible (a set has arbitrary order).
article_links = list(dict.fromkeys(article_links))

# There are also video links so we'll remove those too
article_links = [each for each in article_links if '/videos/' not in each]

# Scrape each link and add to lists
# `links` is reset here and refilled with the article URLs, one per row.
links, titles, body = [], [], []

# Download the proxy list once for the whole run instead of once per
# article; every article request still picks a random proxy from it.
proxies = scrape_proxies()

for url in article_links:
    links.append(url)

    soup = scrape_html(url, 5, proxies)

    if soup is None:
        # Every attempt failed: record the row as missing rather than
        # crashing on the first attribute access. scrape_html has already
        # paused after each failed attempt, so no extra sleep is needed.
        titles.append(np.nan)
        body.append(np.nan)
        continue

    # Pages without an <h1> give soup.h1 == None, hence AttributeError.
    try:
        titles.append(soup.h1.text)
    except AttributeError:
        titles.append(np.nan)

    # The article text is taken from the div.zn-body__paragraph elements
    # and stored as one space-joined string (empty if there are none).
    paragraphs = soup.find_all('div', {'class': 'zn-body__paragraph'})
    body.append(' '.join(each.text for each in paragraphs))

    # Short random pause between articles.
    time.sleep(random.randint(1, 2))

# Assemble the scraped columns into a DataFrame, one row per article
df = pd.DataFrame(data=dict(url=links, title=titles, body=body))

# Stamp every row with the moment this cell ran. datetime.today() yields a
# naive local timestamp, so the column holds date and time, not just a date.
df = df.assign(date=pd.to_datetime(datetime.today()))

# Preview the first rows
df.head()

Unnamed: 0,url,title,body,date
0,https://www.cnn.com/2020/02/19/business/mcdona...,A new use for McDonald's used cooking oil: 3D ...,Simpson is director of the school's Environmen...,2020-04-16 07:29:14.124708
1,https://www.cnn.com/2020/04/09/tech/instacart-...,People are luring Instacart shoppers with big ...,"But an hour later, Arambula checked her earnin...",2020-04-16 07:29:14.124708
2,https://www.cnn.com/2020/03/16/tech/xbox-serie...,Microsoft reveals details about its next-gener...,The Series X will be capable of running 4K gra...,2020-04-16 07:29:14.124708
3,https://www.cnn.com/2020/04/15/business/oil-pr...,US faces an 'unprecedented' decline in oil pro...,US crude futures dropped as low as $19.20 a ba...,2020-04-16 07:29:14.124708
4,https://www.cnn.com/2020/04/08/business/restau...,They used to sell food to top chefs. Now you'r...,Fruit and vegetable wholesalers that sold fres...,2020-04-16 07:29:14.124708
