#### Load in the necessary libraries 

In [1]:
import re

import requests
from bs4 import BeautifulSoup as bs


#### Load the webpage 

In [2]:
# Fetch the example page. The timeout keeps the cell from hanging indefinitely
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of silently parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert the raw response body into a BeautifulSoup object
soup = bs(r.content, "html.parser")

# Print the parsed HTML, indented for readability
print(soup.prettify())


<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



#### Start using Beautiful Soup to scrape

#### find and find_all

In [3]:
# find() hands back a single tag: the first <h2> in the document
first_header = soup.find(name="h2")

# find_all() collects every <h2>; the list is this cell's displayed result
headers = soup.find_all(name="h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

#### Pass in a list of elements to look for

In [4]:
# A list of tag names matches any tag in the list; results follow document
# order, so the order of names inside the list does not affect the result.
first_header = soup.find(name=["h2", "h1"])
headers = soup.find_all(name=["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

#### Passing attributes in find/find_all

In [5]:
# Restrict the search to <p> tags whose id attribute is "paragraph-id"
id_filter = {"id": "paragraph-id"}
paragraph = soup.find_all("p", attrs=id_filter)
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

#### Nesting find/find_all calls

In [6]:
# Every tag returned by find() can itself be searched, so calls can be chained
# to narrow the scope step by step: <body> -> first <div> -> its <h1>.
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header


#### Search for specific strings with find/find_all

In [7]:
# Match tags by their text content using a compiled regular expression.
# `string=` is the current name of this filter; the older `text=` spelling is
# deprecated in recent Beautiful Soup releases.
paragraphs = soup.find_all("p", string=re.compile("Some"))

# The pattern accepts both "Header" and "header". Only this final expression
# is rendered as the cell's output.
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers



[<h2>A Header</h2>, <h2>Another header</h2>]

#### CSS Selector

In [8]:
# Show only the <body> subtree as a reference for the selector examples below
body_markup = soup.body.prettify()
print(body_markup)

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [9]:
# Descendant combinator: every <a> nested anywhere inside a <p>
link_selector = "p a"
content = soup.select(link_selector)
content

[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]

In [10]:
# General sibling combinator: <p> tags that come after an <h2> with the same parent
sibling_selector = "h2 ~ p"
paragraph = soup.select(sibling_selector)
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# ID selector plus descendant: <b> tags inside the <p> whose id is "paragraph-id"
bold_selector = "p#paragraph-id b "
bold_text = soup.select(bold_selector)
bold_text

[<b>Some bold text</b>]

In [12]:
# Child combinator: only <p> tags that are direct children of <body>
paragraphs = soup.select("body > p")
print(paragraphs)

# select() can also be called on an individual tag to search inside it only
for paragraph in paragraphs:
    italics = paragraph.select("i")
    print(italics)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


#### Grab elements with a specific property

In [13]:
# Attribute selector: any tag whose align attribute is "middle"
middle_aligned = soup.select("[align=middle]")
middle_aligned

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

#### Get different properties of the HTML

In [14]:
# .string returns the text held by a tag, without the surrounding markup.
# The value is only stored here; the print() calls below produce this cell's output.
header = soup.find("h2")
header_text = header.string

# The first <div> contains several child tags
div = soup.find("div")
div_markup = div.prettify()
print(div_markup)

# get_text() gathers the text of all nested tags; use it when a tag has
# multiple child elements
div_text = div.get_text()
print(div_text)

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



#### Get a specific property from an element

In [15]:
# Tag attributes are read with dictionary-style indexing
link = soup.find("a")
link_target = link["href"]  # stored only; the cell renders just its final expression

# select() returns a list, so take the first match before reading its id
paragraphs = soup.select("p#paragraph-id")
first_match = paragraphs[0]
first_match["id"]

'paragraph-id'

#### Code navigation

In [19]:
# Path syntax: attribute access steps down to the first tag with that name,
# so this reads the text of the first <h2> under <body>
first_h2 = soup.body.h2
first_h2.string

'A Header'

In [18]:
# Print the <body> subtree again as a reference for the navigation examples
body_html = soup.body.prettify()
print(body_html)

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>

