# Web scraping tutorial (Python/Beautiful Soup)

## Tutorial

#### Load necessary libraries

In [64]:
# standard library
import re
from urllib.parse import urljoin

# pip install requests
import requests

# pip install beautifulsoup4
from bs4 import BeautifulSoup as bs 

# pip install pandas
import pandas as pd

#### Load first page

In [5]:
# load webpage content
# the timeout (in seconds) keeps the cell from hanging if the server never answers
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)

# fail right away on a 4xx/5xx reply instead of parsing an error page
r.raise_for_status()

# convert contents of webpage to a beautiful soup object
# name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns about it), so results can differ between machines
soup = bs(r.content, 'html.parser')

# print out the html
# use .prettify() for proper indentation
print(soup.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

#### Start web scraping

In [10]:
# .find() stops at the first element that matches
# here: the first <h2> tag on the page
first_header = soup.find(name='h2')
print(first_header)

<h2>About me</h2>


In [11]:
# .find_all() collects every element that matches
# here: all <h2> tags on the page
headers = soup.find_all(name='h2')
print(headers)

[<h2>About me</h2>, <h2>Social Media</h2>, <h2>Photos</h2>, <h2> Table </h2>, <h2>Mystery Message Challenge!</h2>, <h2>Footnotes</h2>]


In [12]:
# a list of tag names matches any of the listed tags
# here: every <h1> and <h2> tag on the page
header_tags = ['h1', 'h2']
find_headers = soup.find_all(header_tags)
print(find_headers)

[<h1>Welcome to my page!</h1>, <h2>About me</h2>, <h2>Social Media</h2>, <h2>Photos</h2>, <h2> Table </h2>, <h2>Mystery Message Challenge!</h2>, <h2>Footnotes</h2>]


In [15]:
# .find() also accepts a list of tag names
# it returns the single element that appears first in the webpage:
#   - an h1 tag comes first -> only that h1 is returned
#   - an h2 tag comes first -> only that h2 is returned
# the order of the names inside the list makes no difference
header_tags = ['h1', 'h2']
find_headers = soup.find(header_tags)
print(find_headers)

<h1>Welcome to my page!</h1>


In [20]:
# find/find_all can filter on tag attributes through the attrs dictionary
# here: every <p> tag whose id is 'footer'
footer_attrs = {'id': 'footer'}
paragraph = soup.find_all('p', attrs=footer_attrs)
print(paragraph)

[<p id="footer">1. This was actually a minivan that I named Debora. Maybe not my dream car, but I loved her nonetheless.</p>]


In [23]:
# find/find_all calls can be chained: search inside an earlier result
# step 1: the <body> element of the page
body = soup.find(name='body')
print(body.prettify())

# step 2: the first <div> located inside that body
div = body.find(name='div')
print(div.prettify())

<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </a>
 </p>
 <p>
  I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.
 </p>
 <h3>
  Hobbies
 </h3>
 <p>
  Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skat

In [26]:
# use find/find_all to search for specific strings
# (the re module is imported once, in the import cell at the top of the notebook)

# find all anchor tags whose text contains "File", using a regex
# note: the pattern is case-sensitive, so lowercase "file" would not match
file_string = soup.find_all('a', string = re.compile('File'))
print(file_string)

[<a href="challenge/file_1.html">File 1</a>, <a href="challenge/file_2.html">File 2</a>, <a href="challenge/file_3.html">File 3</a>, <a href="challenge/file_4.html">File 4</a>, <a href="challenge/file_5.html">File 5</a>, <a href="challenge/file_6.html">File 6</a>, <a href="challenge/file_7.html">File 7</a>, <a href="challenge/file_8.html">File 8</a>, <a href="challenge/file_9.html">File 9</a>, <a href="challenge/file_10.html">File 10</a>]


#### Select (CSS selector)

In [31]:
# .select() takes a CSS selector and returns a list of matches
# the selector 'p' matches every paragraph tag
paragraph_selector = 'p'
paragraph_content = soup.select(paragraph_selector)
print(paragraph_content)

[<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>, <p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>, <p>I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.</p>, <p>Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are <i>Settlers of Catan</i> and <i>Othe

In [33]:
# 'ul li' is a descendant selector:
# it matches every <li> tag nested inside an unordered list
list_item_selector = 'ul li'
list_item = soup.select(list_item_selector)
print(list_item)

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>, <li>Middle name is Ronald</li>, <li>Never had been on a plane until college</li>, <li>Dunkin Donuts coffee is better than Starbucks</li>, <li>A favorite book series of mine is <i>Ender's Game</i></li>, <li>Current video game of choice is <i>Rocket League</i></li>, <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>, <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>, <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>, <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>, <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>, <li><a href="challenge/file_

#### Get different properties of HTML

In [36]:
# .string is meant for a tag that holds a single piece of text
# demo: print the <h1> markup first, then only its text
header = soup.find(name='h1')
print(header.prettify())
print(header.string)

<h1>
 Welcome to my page!
</h1>

Welcome to my page!


In [44]:
# .get_text() is meant for an element that contains several nested tags
# demo: print the markup of the first <ul> (the fun facts list),
# then the text of everything inside it
fun_facts = soup.find(name='ul')
print(fun_facts.prettify())
print(fun_facts.get_text())

<ul class="fun-facts">
 <li>
  Owned my dream car in high school
  <a href="#footer">
   <sup>
    1
   </sup>
  </a>
 </li>
 <li>
  Middle name is Ronald
 </li>
 <li>
  Never had been on a plane until college
 </li>
 <li>
  Dunkin Donuts coffee is better than Starbucks
 </li>
 <li>
  A favorite book series of mine is
  <i>
   Ender's Game
  </i>
 </li>
 <li>
  Current video game of choice is
  <i>
   Rocket League
  </i>
 </li>
 <li>
  The band that I've seen the most times live is the
  <i>
   Zac Brown Band
  </i>
 </li>
</ul>


Owned my dream car in high school 1
Middle name is Ronald
Never had been on a plane until college
Dunkin Donuts coffee is better than Starbucks
A favorite book series of mine is Ender's Game
Current video game of choice is Rocket League
The band that I've seen the most times live is the Zac Brown Band



In [48]:
# attributes of a tag are read with dictionary-style indexing
# here: the hypertext reference (href) of the first anchor tag
link = soup.find(name='a')
link['href']

'https://www.youtube.com/kgmit'

In [50]:
# read the id attribute of an element
# select() returns a list, so take its first element before indexing by attribute
footer = soup.select('p#footer')
first_footer = footer[0]
first_footer['id']

'footer'

#### Code navigation

In [53]:
# path syntax: reach into the tree by writing tag names as attributes

# get the body
# (only the last expression of a cell is displayed, so this line shows nothing)
soup.body

# get the first div inside the body
# attribute access returns only the first matching tag, not all of them;
# use soup.body.find_all('div') when every div is needed
soup.body.div

<div class="row">
<div class="column">
<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>
</div>
<div class="column">
<img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>
</div>
<div class="column">
<img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>
</div>
</div>

In [54]:
 # vocabulary used when navigating the tree: parent, sibling, child

# take the first div inside the body element ...
first_div = soup.body.find('div')

# ... and list every sibling that comes after it
first_div.find_next_siblings()

[<div></div>,
 <h2> Table </h2>,
 <br/>,
 <table class="hockey-stats">
 <thead>
 <tr>
 <th class="season" data-sort="">S</th>
 <th class="team" data-sort="team">Team</th>
 <th class="league" data-sort="league">League</th>
 <th class="regular gp" data-sort="gp">GP</th>
 <th class="regular g" data-sort="g">G</th>
 <th class="regular a" data-sort="a">A</th>
 <th class="regular tp" data-sort="tp">TP</th>
 <th class="regular pim" data-sort="pim">PIM</th>
 <th class="regular pm" data-sort="pm">+/-</th>
 <th class="separator"> </th>
 <th class="postseason">POST</th>
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>
 <th class="postseason g" data-sort="playoffs-g">G</th>
 <th class="postseason a" data-sort="playoffs-a">A</th>
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>
 </tr>
 </thead>
 <tbody>
 <tr class="team-continent-NA">
 <td class="season sorte

## Practice

#### Load the webpage

In [55]:
# load webpage content
# the timeout (in seconds) keeps the cell from hanging if the server never answers
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)

# fail right away on a 4xx/5xx reply instead of parsing an error page
r.raise_for_status()

# convert contents of webpage to a beautiful soup object
# name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns about it), so results can differ between machines
webpage = bs(r.content, 'html.parser')

# print out the html
# use .prettify() for proper indentation
print(webpage.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

#### 3 ways to grab all social links from webpage

In [61]:
# CSS selector: every anchor tag nested in the unordered list with the 'socials' class
social_selector = 'ul.socials a'
links = webpage.select(social_selector)

# keep only the href attribute of each anchor
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [63]:
# same result with find/find_all instead of a CSS selector

# step 1: locate the unordered list that carries the 'socials' class
socials_attrs = {'class': 'socials'}
ulist = webpage.find('ul', attrs=socials_attrs)

# step 2: collect every anchor tag inside that list
links = ulist.find_all(name='a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [62]:
# CSS selector on the individual list items:
# grab the anchor tag inside every <li> that carries the 'social' class
social_item_selector = 'li.social a'
links = webpage.select(social_item_selector)
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

#### Scrape the MIT hockey stats table



In [70]:
# select the table html
# select_one returns the first matching element directly, so no [0] indexing is needed
table = webpage.select_one('table.hockey-stats')

columns = table.find('thead').find_all('th') # get all columns in table header

# get names of columns
# get_text() always returns a plain str (also for an empty header cell),
# so the data frame labels are ordinary strings rather than bs4 string objects
column_names = [th.get_text() for th in columns]

table_rows = table.find('tbody').find_all('tr') # get all table rows

rows = [] # one list of cell strings per table row

for tr in table_rows:

    cells = tr.find_all('td') # find all table data in each table row

    # take the text of every cell and strip surrounding white space
    rows.append([cell.get_text().strip() for cell in cells])

df = pd.DataFrame(rows, columns = column_names) # create data frame
df.head() # show data frame

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


#### Grab all fun facts containing the word "is"

In [81]:
# grab unordered list of fun facts
facts = webpage.select('ul.fun-facts li')

# \b marks a word boundary, so the pattern matches the whole word "is" only
# and not the letters "is" inside words such as "This" or "list"
word_is = re.compile(r'\bis\b')

# for every list item, look for the first piece of text containing the word
# (find returns None when a list item has no such text)
is_matches = [fact.find(string = word_is) for fact in facts]

# only keep sentences with 'is' in them
# use .find_parent('li') to climb back to the whole list item, which completes
# sentences that are cut off by nested tags such as <i>
# use .get_text() to remove tags and keep the sentences
facts_with_is = [match.find_parent('li').get_text() for match in is_matches if match is not None]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

#### Download an image

In [82]:
# get webpage directory to download image
url = 'https://keithgalli.github.io/web-scraping/'

# load webpage content
# the timeout (in seconds) keeps the cell from hanging if the server never answers
r = requests.get(url + 'webpage.html', timeout=10)
r.raise_for_status() # stop on a 4xx/5xx reply instead of parsing an error page

# convert webpage to beautiful soup content
# the parser is named explicitly so the result does not depend on what is installed
webpage = bs(r.content, 'html.parser')

# grab all images on the webpage
images = webpage.select('div.row div.column img')

# get url of first image
image_url = images[0]['src']

# urljoin resolves the src against the page directory and also copes with
# './'-prefixed or absolute src values, which plain string concatenation would break
full_url = urljoin(url, image_url)

# download the image bytes
# check the status first so an error page is never saved as a .jpg
img_response = requests.get(full_url, timeout=10)
img_response.raise_for_status()
img_data = img_response.content

# write the image to a local file (binary mode)
with open('image_name.jpg', 'wb') as handler:
    handler.write(img_data)