## Load Necessary Libraries

In [3]:
import re

import requests
from bs4 import BeautifulSoup as bs

## Load Our First Page

In [10]:
# Load page content. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page.
res = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
res.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the parse tree the
# same no matter which optional parsers happen to be installed.
soup = bs(res.content, 'html.parser')

# print html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Using Beautiful Soup to Scrape

In [20]:
# find the first instance of an h2 tag (the tag argument can also be a list)
first_header = soup.find('h2')

# find all h2 tag instances (the tag argument can also be a list)
headers = soup.find_all('h2')

In [21]:
# filter on an attribute value (keyword form of attrs={'id': 'paragraph-id'})
paragraph = soup.find_all('p', id='paragraph-id')
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
# you can nest find and find_all calls: each call below is made on the tag
# returned by the previous one (soup -> body -> div -> h1)
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [30]:
# We can search for specific text inside a tag by passing a compiled regex as
# `string` (`re` is imported in the imports cell at the top of the notebook).
# Only a cell's last expression is echoed, so this result is just stored.
paragraphs = soup.find_all('p', string=re.compile('Some'))

# Match "Header" or "header". A character class says that more directly than
# the capturing group (H|h), and the raw string keeps the pattern safe if a
# backslash is ever added to it.
headers = soup.find_all('h2', string=re.compile(r'[Hh]eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select

In [36]:
# CSS selector reference: https://www.w3schools.com/cssref/css_selectors.php
# print the parsed page again so the selectors below can be checked against it
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [32]:
# select all paragraphs within a div
# (the space in 'div p' is the CSS descendant combinator)
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [34]:
# select the p element directly after each h2
# ('+' is the adjacent-sibling combinator; 'h2 ~ p' would instead match every
# later p sibling of an h2, not only the one that immediately follows it)
content = soup.select('h2 + p')
content

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
# select bold text within the paragraph whose id is "paragraph-id"
content = soup.select('p#paragraph-id b')
content

[<b>Some bold text</b>]

In [37]:
# direct <p> children of <body> only ('>' is the child combinator)
paragraphs = soup.select('body > p')

# select() also works on an individual tag: look for <i> inside each paragraph.
# The loop variable is deliberately not `paragraph`, because an earlier cell
# already bound that name to a find_all() result list.
for p_tag in paragraphs:
    print(p_tag.select('i'))

[<i>Some italicized text</i>]
[]


## Get different properties of HTML

In [40]:
# .string reads the text of the first h2 on the page
header = soup.h2
print(header.string)

# use .get_text when multiple child elements: show the div's markup first,
# then the text gathered from everything nested inside it
div = soup.div
print(div.prettify(), div.get_text(), sep='\n')

A Header
<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [42]:
# get property from an element: index a tag like a dict to read an attribute
link = soup.find('a')
# Bound to a name rather than left as a bare expression: only the last
# expression of a cell is echoed, so a bare link['href'] here shows nothing.
href = link['href']

paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']

'paragraph-id'

## Code Navigation

In [46]:
# path syntax: walk down the tree by tag name (soup -> body -> first div),
# then list every sibling that comes after that div
# know terms: Parent, sibling, child
soup.body.div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercises

Go to https://keithgalli.github.io/web-scraping/webpage.html

In [50]:
# Load the exercise page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page.
res = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
res.raise_for_status()

# NOTE: this rebinds `soup`; every cell below works on the exercise page.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = bs(res.content, 'html.parser')
print(soup.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

### Task 1: Grab all social links from the webpage (do this in 3 different ways)

In [95]:
# way 1 - select: every <a> nested inside <ul class="socials">
links = soup.select('ul.socials a')
refs = [anchor['href'] for anchor in links]
refs

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [101]:
# way 2 - find / find_all: locate the <ul class="socials"> container first,
# then read the href of each <a> inside it
ul = soup.find('ul', class_='socials')
links = [anchor['href'] for anchor in ul.find_all('a')]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [106]:
# way 3 - select again, this time going through the <li class="social"> items
links = [anchor['href'] for anchor in soup.select('li.social a')]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [107]:
# select() returns a list of matches (note the opening "[" in the output), so
# `table` is a list here even though the name is singular
table = soup.select('table.hockey-stats')
table

[<table class="hockey-stats">
 <thead>
 <tr>
 <th class="season" data-sort="">S</th>
 <th class="team" data-sort="team">Team</th>
 <th class="league" data-sort="league">League</th>
 <th class="regular gp" data-sort="gp">GP</th>
 <th class="regular g" data-sort="g">G</th>
 <th class="regular a" data-sort="a">A</th>
 <th class="regular tp" data-sort="tp">TP</th>
 <th class="regular pim" data-sort="pim">PIM</th>
 <th class="regular pm" data-sort="pm">+/-</th>
 <th class="separator"> </th>
 <th class="postseason">POST</th>
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>
 <th class="postseason g" data-sort="playoffs-g">G</th>
 <th class="postseason a" data-sort="playoffs-a">A</th>
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>
 </tr>
 </thead>
 <tbody>
 <tr class="team-continent-NA">
 <td class="season sorted">
                   2014-15
          