In [1]:
# HCS Workshop 2, Web Scraping

In [None]:
# Author: Skyler Wu (based on Will Cooper)

In [4]:
# import requests package and set up page
# https://dataquestio.github.io/web-scraping-pages/simple.html

import requests

page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html")
# print status code (200 = "success". 404 = error, etc.)
print(page.status_code)
# prints the actual HTML that's in the page. the exact same thing as the inspect element stuff.
print(page.content)

200
b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [5]:
# import BeautifulSoup and make a "BeautifulSoup object"
# sudo apt-get install python-bs4

# How to import:
from bs4 import BeautifulSoup 

# standard = store all page content as variable called "soup"
# 'html.parser' because we know that it's written in html. also xml stuff. html.parser covers most stuff.
soup = BeautifulSoup(page.content, 'html.parser')

# makes the html code look a lot easier to read.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [11]:
# list soup children, title, etc.
# children of the soup. list command converts stuff to a list.
list(soup.children)
list(soup.title) # gives exactly what is in between the title tags.

# note tags can be nested!
print(soup.title.parent.name)

# returns the paragraph tag.
print(soup.p)

head
<p>Here is some simple content for this page.</p>


In [13]:
# more printing

# listing the children makes more comma separation.
html = list(soup.children)[2]
list(html.children)

body = list(html.children)[3] #4th element of the children list
list(body.children) # lists body's children.
p = list(body.children)[1] 
list(p.children) #how to narrow it down to some text that you want to find.

['Here is some simple content for this page.']

In [24]:
# find function - returns very first thing it finds with the tag that you indicates
# or if use find-all, returns all instances of the tag you're looking for.

soup.find('p').get_text() # to get rid of the tags on first instance found,
soup.findAll('p') # finds ALL paragraph tags. returns as list. CANNOT use get_text()!

soup.find('head').get_text() # still pretty much returns the text that was in the head tags.



bs4.element.ResultSet

In [32]:
# Example: weather

# URL to a weather website. plugged in longitude and latitude.
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")

# set soup variable with BeautifulSoup class.
soup = BeautifulSoup(page.content, 'html.parser')

# set "id" to a string that we want. "id" is one of the arguments that you can put into a tag.
# check inspect element to find specifics.
seven_day = soup.find(id = 'seven-day-forecast')

# you have to inspect element to find "tombstone-container"
# not intuitive.
# "class" itself is a keyword in python, so we have to use "class_"
forecast_items = seven_day.find_all(class_ = 'tombstone-container')
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  NOW: Multiple
  <br/>
  hazards in effect
 </p>
 <p>
  <img alt="" class="forecast-icon" src="newimages/medium/nfew.png" title=""/>
 </p>
 <p class="short-desc">
  Click HERE for Details
 </p>
</div>


In [40]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
#temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
#print(temp)

NOW: Multiplehazards in effect
Click HERE for Details


In [None]:
# more sources?
# https://beautiful-soup-4.readthedocs.io/en/Latest/
# https://www.dataquest.io/blog/web-scraping-tutorial-python/


In [52]:
# 'https://markets.businessinsider.com/stocks'
page = requests.get('https://markets.businessinsider.com/stocks')
soup = BeautifulSoup(page.content, 'html.parser')

# can hover lines of HTML code in inspect element until I find the element that I want.
# look for the relevant classnames. should have a special ID.

stocks = soup.find(id = 'shares_topflop_StockPricesSharesTopFlop')
asdf = stocks.find('a') # would return the first row of the table. based on inspect element.

#in html, you have a "table" class, and then "tr" means "table row"
prices = stocks.find_all(class_='row-hover')
# ^ this still looks ugly.

names = []
for i in range(len(prices)):
        names.append(prices[i].find('a').get_text())

print(names)
# for price in prices: # actually prints all of the stocks!
        # print(price.get_text())

10


In [55]:
# regular expressions, searching by text
# https://docs.python.org/3/library/re.html

import re

bruh = stocks.find_all('a', text = re.compile("Apple"))
print(bruh)

[]


In [9]:
# https://stackoverflow.com/questions/47928608/how-to-use-beautifulsoup-to-parse-google-search-results-in-python
# quick example of using BeautifulSoup to Google for you

import urllib
from bs4 import BeautifulSoup
import requests
import webbrowser

# create text variable.
text = "hello-world"
text = urllib.parse.quote_plus(text)

#concatenating strings together because that's the format of a google search query.
url = 'https://google.com/search?q=' + text

# requests library gets our url.
response = requests.get(url)

# this will open another tab in our browser that has the output in it.
with open('output.html', 'wb') as f:
    f.write(response.content)
webbrowser.open('output.html')

# use beautiful soup to parse the html stuff that Google returned in plain text form.
soup = BeautifulSoup(response.text, 'html.parser')

for g in soup.find_all(class_="Bneaweajwlf"):
    print(g.get_text())
    print('---')



In [None]:
# Project Time:
# find any website you want to do some scraping.
# write a python script to nicely print out interesting information on the page.
# That's it!

# requirements are kind of light!