# This notebook will cover all the basics of web scraping.
To get started, we will import the required libraries:

In [2]:
import requests
from bs4 import BeautifulSoup as bs

### Now, let's load our first page:

In [5]:
#Load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

#convert to a beautifulsoup object
soup = bs(r.content)

#print out the HTML
print(soup.prettify()) #we could just do print(soup), but prettify formates it nicely.

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Our first methods, find/find_all:

In [8]:
first_header = soup.find('h2') #we pass into find the tag we want, only returns the first match
#a better one is:
headers = soup.find_all('h2') #creates a list of all h2 elements
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [12]:
#we can pass in a list of elements to look for:
headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [14]:
#we can pass in attributes too,
paragraph = soup.find_all("p")
#what if you wanted a paragraph with a specific paragraph id?
paragraph = soup.find_all("p", attrs ={"id":"paragraph id"}) #we can pass in a second parameter, called attrs, and it takes a dictionary to match attributes.
paragraph

[]

In [15]:
#you can nest find and find_all
body = soup.find('body') #gets the body element
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [17]:
div = body.find('div') #so in the body we created, we can use the find/find_all function.
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [19]:
#we can nest further,
header = div.find_all('h1')
header

[<h1>HTML Webpage</h1>]

In [21]:
#we can also search for specific things in our find/find_all calls.
string_search = soup.find_all('p', string="Some bold text") #only works with the EXACT match, unless you use regex.
string_search

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [27]:
#importing regex to improve string search
import re

string_search = soup.find_all('p', string=re.compile('Some')) #all strings that contain "some"

header_string = soup.find_all('h2', string=re.compile("(H|h)eader")) #all header text that contains the word "header" (case sensitive, unless you use regex, which we did here)
header_string

[<h2>A Header</h2>, <h2>Another header</h2>]