**Requirements for Web Scraping**

In [1]:
# Import all modules that are required
import warnings
from datetime import date

import numpy as np    # python library used to work with arrays
import pandas as pd   # converts dictionary or Numpy array to a Pandas data frame

import requests
from bs4 import BeautifulSoup
from IPython.display import HTML


*Silencing warnings before making HTTPS requests with 'requests'*

In [2]:
# Turn off the warnings urllib3 emits for the connections `requests` opens.
requests.packages.urllib3.disable_warnings()

# Suppress every remaining Python warning for the rest of the session.
# NOTE(review): this is a blanket filter — it also hides deprecation and
# parser warnings from the other libraries used below; consider narrowing it
# to a specific category once the notebook is stable.
warnings.filterwarnings("ignore")

*Start scraping by sending an HTTP request*

In [3]:
# Page to scrape: the finance section of cnbc.com.
snapshot_url = 'https://www.cnbc.com/finance/'

In [6]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel indefinitely (requests waits forever by default).
snapshot = requests.get(snapshot_url, timeout=10)
# Stop here on a 4xx/5xx answer instead of parsing an error page as if it
# were the finance listing.
snapshot.raise_for_status()
snapshot

<Response [200]>

In [7]:
# Check what kind of object requests.get() handed back.
type(snapshot)

requests.models.Response

In [9]:
# Reuse the response already held in `snapshot` rather than downloading the
# page a second time: a repeated request is slower and may return different
# content than the response inspected in the previous cells.
raw_html = snapshot.text
# Preview only the first 500 characters; the full document is very long.
print(raw_html[:500])

<!DOCTYPE html><html lang="en" prefix="og=https://ogp.me/ns#" itemType="https://schema.org/WebPage"><head><script src="//fm.cnbc.com/applications/cnbc.com/resources/newrelic/agent.js" defer=""></script><link rel="preload" as="script" href="https://sb.scorecardresearch.com/beacon.js"/><title itemProp="name">Finance News</title><meta name="viewport" content="initial-scale=1.0, width=device-width"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta property="AssetType" content="franchise"/


**Parse the HTML with BeautifulSoup**

In [10]:
# Parse the downloaded markup using the 'html.parser' backend.
soup = BeautifulSoup(raw_html, 'html.parser')

In [11]:
# Sanity check: show the first <title> element of the parsed document.
soup.select_one('title')

<title itemprop="name">Finance News</title>

In [12]:
# Display the parsed document.
# NOTE(review): this echoes the entire page into the cell output; consider
# showing only a slice (e.g. soup.prettify()[:1000]) to keep the notebook small.
soup

<!DOCTYPE html>
<html itemtype="https://schema.org/WebPage" lang="en" prefix="og=https://ogp.me/ns#"><head><script defer="" src="//fm.cnbc.com/applications/cnbc.com/resources/newrelic/agent.js"></script><link as="script" href="https://sb.scorecardresearch.com/beacon.js" rel="preload"/><title itemprop="name">Finance News</title><meta content="initial-scale=1.0, width=device-width" name="viewport"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="franchise" property="AssetType"/><meta content="10000664" property="pageNodeId"/><meta content="Latest investing news and finance headlines straight from Wall Street." itemprop="description" name="description"/><link href="https://www.cnbc.com/finance/" itemprop="url" rel="canonical"/><link href="/favicon.ico" rel="icon" type="image/png"/><meta content="website" property="og:type"/><meta content="Finance" property="og:title"/><meta content="Latest investing news and finance headlines straight from Wall Street." property="og:d

**Extracting data**

*Get all News Titles*

In [14]:
# Collect every element carrying the 'Card-title' CSS class and count the matches.
card_titles = soup.select('.Card-title')
len(card_titles)

34

In [15]:
# Render the markup of the first title card as HTML in the cell output.
# Indexing [0] raises IndexError if the selector matched nothing.
HTML(card_titles[0].prettify())

In [16]:
# Headline of every card: the text of the first <div> inside each
# '.Card-title' match, kept in the order the matches were found.
titles = [title_card.select_one('div').text for title_card in card_titles]

In [17]:
# Inspect the extracted headlines.
titles

['Dow futures rise more than 300 points after Wall Street wraps up historically strong month',
 'Virus surge is leading to a double-dip recession and dollar crash: Stephen Roach',
 "Tesla's stock will be added to the S&P 500 in a single step.",
 'Stocks making the biggest moves after hours: Zoom Video, Sunnova Energy, BioNTech & more',
 'Stocks making the biggest moves midday: Moderna, IHS Markit, Nikola, Carnival',
 "Winklevoss twins say bitcoin will be the decade's best performing asset",
 'Stocks making the biggest moves premarket: Moderna, IHS Markit, Slack & more',
 "Huge November gains may make the usual year-end 'Santa Claus rally' less likely",
 'Dow falls more than 200 points, but still posts biggest monthly gain since 1987',
 'Market bull predicts holiday season surge will boost stocks another 10%',
 "Congress stalled on stimulus talks as millions of Americans face 'benefits cliff'",
 'Stocks making the biggest moves midday: Amazon, Tesla, AstraZeneca & more',
 'S&P 500 close

*Get all News Dates*

In [18]:
# Collect every element carrying the 'Card-time' CSS class and count the matches.
card_times = soup.select('.Card-time')
len(card_times)

34

In [19]:
# Show the markup of the first time element as a plain string (not rendered).
str(card_times[0].prettify())

'<span class="Card-time">\n an hour ago\n</span>'

In [20]:
# Same lookup expressed with find_all: only <span> tags whose class is
# 'Card-time'. This rebinds `card_times`.
card_times = soup.find_all('span', class_='Card-time')

In [21]:
# Text of every time element, kept in the order the matches were found.
times = [stamp.text for stamp in card_times]
times

['an hour ago',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Mon, Nov 30th 2020',
 'Sun, Nov 29th 2020',
 'Sat, Nov 28th 2020',
 'Fri, Nov 27th 2020',
 'Fri, Nov 27th 2020',
 'Fri, Nov 27th 2020',
 'Wed, Nov 25th 2020',
 'Wed, Nov 25th 2020',
 'Wed, Nov 25th 2020',
 'Wed, Nov 25th 2020',
 'Wed, Nov 25th 2020',
 'Tue, Nov 24th 2020',
 'Wed, Nov 25th 2020',
 'Tue, Nov 24th 2020',
 'Tue, Nov 24th 2020',
 'Tue, Nov 24th 2020',
 'Tue, Nov 24th 2020',
 'Tue, Nov 24th 2020',
 'Mon, Nov 23rd 2020',
 'Tue, Nov 24th 2020',
 'Tue, Nov 24th 2020',
 'Mon, Nov 23rd 2020',
 'Mon, Nov 23rd 2020',
 'Mon, Nov 23rd 2020',
 'Mon, Nov 23rd 2020',
 'Mon, Nov 23rd 2020']

In [23]:
# Time of publication, tagged with today's date.
# Presumably the date is appended so that relative stamps such as
# "an hour ago" can still be placed in time later — confirm intent.
# The date is read once, so every row carries the same value even if the
# cell happens to run across midnight.
scrape_date = str(date.today())
times = [stamp.text + ", today: " + scrape_date for stamp in card_times]
times

['an hour ago, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Mon, Nov 30th 2020, today: 2020-12-01',
 'Sun, Nov 29th 2020, today: 2020-12-01',
 'Sat, Nov 28th 2020, today: 2020-12-01',
 'Fri, Nov 27th 2020, today: 2020-12-01',
 'Fri, Nov 27th 2020, today: 2020-12-01',
 'Fri, Nov 27th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Tue, Nov 24th 2020, today: 2020-12-01',
 'Wed, Nov 25th 2020, today: 2020-12-01',
 'Tue, Nov 24th 2020, today: 2020-12-01',
 'Tue, Nov 24th 2020, today: 2020-12-01',
 'Tue, Nov 24th 2020, today: 2020-12-01',

*Get all News Classes*

In [24]:
# Collect every element carrying the 'Card-eyebrow' CSS class and count the matches.
# In the recorded run this finds fewer elements than there are titles
# (29 vs 34), i.e. some cards have no eyebrow label.
card_classes = soup.select('.Card-eyebrow')
len(card_classes)

29

In [25]:
# Show the markup of the first eyebrow element as a plain string (not rendered).
str(card_classes[0].prettify())

'<a class="Card-eyebrow" href="https://www.cnbc.com/markets/">\n <div>\n  Markets\n </div>\n</a>'

In [26]:
# Category label of every eyebrow element: the text of its first <div>,
# kept in the order the matches were found.
classes = [eyebrow.select_one('div').text for eyebrow in card_classes]

In [27]:
# Cards without a '.Card-eyebrow' label get the placeholder class 'Top News'.
# NOTE(review): this assumes the unlabeled cards are the first ones on the
# page — confirm against the markup.
# The list is rebuilt from `card_classes` (not from the previous `classes`),
# so re-running this cell does not prepend the placeholders a second time,
# and the number of placeholders is derived from the scraped counts instead
# of being hard-coded.
n_unlabeled = len(card_titles) - len(card_classes)
classes = ['Top News'] * n_unlabeled + [
  eyebrow.select_one('div').text for eyebrow in card_classes
]

In [28]:
# Inspect the class labels after the 'Top News' placeholders were prepended.
classes

['Top News',
 'Top News',
 'Top News',
 'Top News',
 'Top News',
 'Markets',
 'Market Insider',
 'Trader Talk',
 'Markets',
 'Trading Nation',
 'Personal Finance',
 'Market Insider',
 'Markets',
 'Market Insider',
 'Federal Reserve',
 'Market Insider',
 'Personal Finance',
 'Finance',
 'Market Insider',
 'Trading Nation',
 'Markets',
 'Market Insider',
 'Finance',
 'Market Insider',
 'Bitcoin',
 'Market Insider',
 'Trading Nation',
 'Markets',
 'Personal Finance',
 'Cryptocurrency',
 'Market Insider',
 'Market Insider',
 'Cryptocurrency',
 'Tech']

In [29]:
# The DataFrame cell below indexes titles, times and classes by the same
# position, so this length should equal the number of titles.
len(classes)

34

**Create a Pandas DataFrame**

In [31]:
# The three lists are aligned by position: row i combines titles[i],
# times[i] and classes[i]. Fail fast on a length mismatch instead of hitting
# an IndexError part-way through or silently dropping trailing entries.
if not (len(titles) == len(times) == len(classes)):
  raise ValueError(
    "titles, times and classes must have the same length, got "
    + str(len(titles)) + ", " + str(len(times)) + ", " + str(len(classes))
  )

# One record per news card; the key order fixes the column order.
matrix = [
  {'Title': title, 'Date': published, 'Class': label}
  for title, published, label in zip(titles, times, classes)
]

df = pd.DataFrame(matrix)
df

Unnamed: 0,Title,Date,Class
0,Dow futures rise more than 300 points after Wa...,"an hour ago, today: 2020-12-01",Top News
1,Virus surge is leading to a double-dip recessi...,"Mon, Nov 30th 2020, today: 2020-12-01",Top News
2,Tesla's stock will be added to the S&P 500 in ...,"Mon, Nov 30th 2020, today: 2020-12-01",Top News
3,Stocks making the biggest moves after hours: Z...,"Mon, Nov 30th 2020, today: 2020-12-01",Top News
4,Stocks making the biggest moves midday: Modern...,"Mon, Nov 30th 2020, today: 2020-12-01",Top News
5,Winklevoss twins say bitcoin will be the decad...,"Mon, Nov 30th 2020, today: 2020-12-01",Markets
6,Stocks making the biggest moves premarket: Mod...,"Mon, Nov 30th 2020, today: 2020-12-01",Market Insider
7,Huge November gains may make the usual year-en...,"Mon, Nov 30th 2020, today: 2020-12-01",Trader Talk
8,"Dow falls more than 200 points, but still post...","Mon, Nov 30th 2020, today: 2020-12-01",Markets
9,Market bull predicts holiday season surge will...,"Sun, Nov 29th 2020, today: 2020-12-01",Trading Nation
