# Tutorial Video:
https://www.youtube.com/watch?v=GjKQ6V_ViQE

# Libraries required to work

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import pandas
import numpy

In [2]:
# Pages scraped in this notebook: a minimal HTML sample and a fuller demo page.
URL_SAMPLE = "https://keithgalli.github.io/web-scraping/example.html"
URL_WEB = "https://keithgalli.github.io/web-scraping/webpage.html"

## Loading the page: "URL_SAMPLE"

In [3]:
# Load the page content. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an error page.
response = requests.get(URL_SAMPLE, timeout=10)
response.raise_for_status()

# Convert the raw bytes into a soup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed
# (and so bs4 does not emit GuessedAtParserWarning).
soup = bs(response.content, "html.parser")

# prettify() returns a multi-line string, so print() is used to render the newlines.
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### find and find_all
- find: returns the first tag that matches (or `None` if nothing matches)
- find_all: returns a list of all tags that match (empty if nothing matches)

#### `find` returns only the first match it can find

In [4]:
# find() yields a single tag: the first <h2> in document order.
soup.find("h2")

<h2>A Header</h2>

#### Applying multiple tags
The order of the names in the list doesn't matter: `find` returns whichever of the listed tags appears first in the document.

In [5]:
# Both calls print the <h1>: it appears before any <h2> in the document,
# so the order of the names in the list makes no difference.
print(soup.find(["h1", "h2"]))
print(soup.find(["h2", "h1"]))

<h1>HTML Webpage</h1>
<h1>HTML Webpage</h1>


#### `find_all` returns all the matching tags it can find

In [6]:
# find_all() collects every <h2> into a list-like result, in document order.
soup.find_all("h2")

[<h2>A Header</h2>, <h2>Another header</h2>]

#### Applying multiple tags
The order of the names in the list doesn't matter: matches are always returned in document order.

In [7]:
# Both calls print the same matches in the same (document) order,
# regardless of how the tag names are ordered in the list.
print(soup.find_all(["h1", "h2"]))
print(soup.find_all(["h2", "h1"]))

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


#### Passing attributes to find/find_all

In [8]:
# Every <p> on the page, with no attribute filter applied.
soup.find_all("p")

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [9]:
# attrs narrows the match to <p> tags whose id attribute is "paragraph-id".
soup.find_all("p", attrs={"id": "paragraph-id"})

[<p id="paragraph-id"><b>Some bold text</b></p>]

#### Nesting find/find_all

In [10]:
# Returns a one-element list holding the whole <body> subtree.
soup.find_all("body")

[<body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>]

Looking from **body** > **div** > **h1**

In [11]:
# Each find() searches inside the previous result: body -> div -> h1.
# NOTE(review): if any step finds nothing, find() presumably returns None and the
# next call in the chain would fail — confirm before reusing this on other pages.
soup.find("body").find("div").find("h1")

<h1>HTML Webpage</h1>

#### Search specific strings in find/find_all with the help of regex

In [13]:
import re

In [14]:
# Match <p> tags whose string content contains "Some".
# `string=` is the current name of this filter; `text=` is the deprecated alias.
soup.find_all("p", string=re.compile(r"Some"))

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

The two headers differ in capitalization ("Header" vs. "header"), so the pattern accepts either case for the first letter.

In [16]:
# Match <h2> tags containing "Header" or "header".
# `string=` replaces the deprecated `text=` keyword; the character class [Hh]
# matches exactly what the alternation (H|h) did, without a capture group.
soup.find_all("h2", string=re.compile(r"[Hh]eader"))

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (works similarly to CSS selectors)
https://www.w3schools.com/cssref/css_selectors.asp