# Web Scraping Tutorial

#### BeautifulSoup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
# Imports

import re

import requests
from bs4 import BeautifulSoup as bs

##### Load page

In [2]:
# Load the web data.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx instead of parsing an error page.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

# Convert to a BeautifulSoup object.
# The parser is named explicitly: without it bs4 picks whichever parser happens
# to be installed (and emits GuessedAtParserWarning), so the parsed tree can
# differ from one machine to another.
soup = bs(r.content, 'html.parser')

# Print the html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



##### Using BeautifulSoup to scrape

+ find and find_all

In [3]:
# find() returns only the first matching tag
first_header = soup.find(name="h2")
print(first_header)

# find_all() returns every matching tag, in document order
headers = soup.find_all(name="h2")
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


+ Passing in a list of elements to look for

In [4]:
# A list of tag names matches any of them
heading_tags = ["h1", "h2"]

first_header = soup.find(heading_tags)
print(first_header)

headers = soup.find_all(heading_tags)
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


+ Pass in attributes to the find/find_all functions

In [5]:
# Keep only the <p> tags whose id attribute equals "paragraph-id"
paragraph = soup.find_all(
    "p",
    attrs={"id": "paragraph-id"},
)
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


+ Nest find/find_all calls

In [6]:
# find() searches only inside the tag it is called on, so the calls can be
# chained to drill down: document -> <body> -> <div> -> <h1>.
body = soup.find('body')
print(f'Body:\n{body}')

div = body.find('div')
print(f'\nDiv:\n{div}')

header = div.find('h1')
print(f'\nHeader:\n{header}')

Body:
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

Div:
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

Header:
<h1>HTML Webpage</h1>


+ Search for specific text in your find/find_all calls

In [7]:
# string= accepts a compiled regular expression and filters on the tag's text
string_search = soup.find_all('p', string=re.compile('Some'))
print(string_search)

headers = soup.find_all('h2', string=re.compile('(H|h)eader'))  # (H|h) means that it can be either H or h
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


+ Select elements with CSS selectors

In [10]:
# "div p" is the descendant combinator: every <p> nested anywhere inside a <div>
content = soup.select("div p")
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]


In [11]:
# "h2 ~ p" is the subsequent-sibling combinator: every <p> that comes after an
# <h2> under the same parent (not only the one immediately following it)
paragraphs = soup.select("h2 ~ p")
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [12]:
# <b> tags nested inside the <p> whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
print(bold_text)

[<b>Some bold text</b>]


In [16]:
# "body > p" is the child combinator: only <p> tags that are direct children of <body>
paragraphs = soup.select("body > p")
print(paragraphs)

# select() also works on a single tag: list the <i> tags inside each paragraph
# (an empty list means that paragraph contains none)
for prg in paragraphs:
    print(prg.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [18]:
# Attribute selector: any element whose align attribute equals "middle"
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

##### Get different properties of the HTML

In [24]:
# .string is the text held directly by the tag
header = soup.find('h2')
print(header.string)

# prettify() already returns a str, so it can be interpolated directly
div = soup.find('div')
print(f'\nDiv:\n{div.prettify()}')

# get_text() joins all the text nested under the tag.
# The <div> found above is reused instead of searching the document again;
# div_text stays bound to that same tag.
div_text = div
print(f'\nText of the div: {div_text.get_text()}')

A Header

Div:
<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


Text of the div: 
HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [28]:
# Get a specific property for an element
link=soup.find('a')
print(link['href'])

paragraphs=soup.select('p#paragraph-id')
print(paragraphs[0]['id'])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


##### Code navigation

In [30]:
# Path syntax: walk down the tree by tag name using attribute access
print(
    soup.body.div.h1.string
)

HTML Webpage


In [32]:
# Know the terms: parent, sibling and child.
# Here: the elements that follow the first <div> of <body> at the same level.
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]