# Scraping web pages

### Reference video: https://www.youtube.com/watch?v=GjKQ6V_ViQE

In [1]:
# Loading libraries
import re

import requests
from bs4 import BeautifulSoup as bs

In [3]:
# Load our first page; the timeout keeps the cell from hanging indefinitely and
# raise_for_status() stops here on an HTTP error instead of parsing an error page
req = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
req.raise_for_status()

# convert to a beautiful soup; naming the parser avoids bs4's "no parser was
# explicitly specified" warning and keeps the parse tree the same on every machine
soup = bs(req.content, 'html.parser')

# find() returns only the first matching tag
first_head = soup.find('h2')

In [4]:
# display the first <h2> tag found in the previous cell
first_head

<h2>A Header</h2>

In [5]:
# find_all() returns every matching tag as a list, not just the first one
headers = soup.find_all('h2')

In [6]:
# display the list of <h2> tags collected above
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [8]:
# a list of tag names matches any of them; the result here is the <h1>,
# even though 'h2' is listed first
first_head = soup.find(['h2', 'h1'])
first_head

<h1>HTML Webpage</h1>

In [9]:
# same list of names with find_all(): every <h1> and <h2> is returned
# NOTE: despite its name, first_head now holds a list of tags, not a single tag
first_head = soup.find_all(['h2', 'h1'])
first_head

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [11]:
# filter on an attribute value: keyword arguments to find_all() are treated
# as attribute filters, so id=... selects tags by their id attribute
paragraph = soup.find_all('p', id='paragraph-id')
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
# you can nest find and find_all calls:
# each call is made on the tag returned by the previous one
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [16]:
# prettify() returns the parsed document as one indented string; as a bare
# expression the notebook shows its repr, so newlines appear as \n below
soup.prettify()

'<html>\n <head>\n  <title>\n   HTML Example\n  </title>\n </head>\n <body>\n  <div align="middle">\n   <h1>\n    HTML Webpage\n   </h1>\n   <p>\n    Link to more interesting example:\n    <a href="https://keithgalli.github.io/web-scraping/webpage.html">\n     keithgalli.github.io/web-scraping/webpage.html\n    </a>\n   </p>\n  </div>\n  <h2>\n   A Header\n  </h2>\n  <p>\n   <i>\n    Some italicized text\n   </i>\n  </p>\n  <h2>\n   Another header\n  </h2>\n  <p id="paragraph-id">\n   <b>\n    Some bold text\n   </b>\n  </p>\n </body>\n</html>\n'

In [19]:
# we can search by text content in find / find_all:
# string= accepts a compiled regex that is matched against the tag's text
soup.find_all('p', string=re.compile('Some'))

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# a character class matches either capitalisation of "header" without
# needing a capturing group; the raw string keeps the pattern escape-safe
soup.find_all('h2', string=re.compile(r'[Hh]eader'))

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [24]:
# select() takes a CSS selector and returns a list; 'p' matches every <p> tag
content = soup.select('p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [25]:
# 'div p' (descendant combinator): <p> tags located inside a <div>
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [26]:
# 'h2 ~ p' (general sibling combinator): <p> tags that come after an <h2>
# and share its parent
content = soup.select('h2 ~ p')
content

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
# 'p#paragraph-id b': <b> tags inside the <p> whose id is "paragraph-id"
content = soup.select('p#paragraph-id b')
content

[<b>Some bold text</b>]

In [32]:
# 'body > p' (child combinator): only <p> tags that are direct children of <body>
paragraphs = soup.select('body > p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [33]:
# select() can also be called on each returned tag to search inside it
for paragraph in paragraphs:
    italics = paragraph.select('i')
    print(italics)

[<i>Some italicized text</i>]
[]


In [34]:
# attribute selector: any tag whose align attribute equals "middle"
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties from HTML

In [36]:
# use .string for a tag that wraps a single piece of text
header = soup.find('h2')
# one print call, one item per line: the tag itself, then its text
print(header, header.string, sep='\n')

<h2>A Header</h2>
A Header


In [39]:
# if the tag contains several nested elements, use get_text()
div = soup.find('div')
print(div.prettify())
# pieces of text are joined with a single space; strip is left at its
# default (False), so the whitespace around each piece is kept
print(div.get_text(separator=" "))

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


 HTML Webpage 
 Link to more interesting example:  keithgalli.github.io/web-scraping/webpage.html 



### Get a specific property from an element

In [40]:
# to get a link: a tag's attributes are stored in its .attrs dictionary
link = soup.find('a')
link.attrs['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [45]:
# to get id, class, ...: index the tag like a dictionary (shown here for id)
paragraphs = soup.select('p#paragraph-id')
# one item per line: the result list, its first tag, that tag's id
print(paragraphs, paragraphs[0], paragraphs[0]['id'], sep='\n')

[<p id="paragraph-id"><b>Some bold text</b></p>]
<p id="paragraph-id"><b>Some bold text</b></p>
paragraph-id


In [46]:
# path syntax: each attribute access returns the first tag with that name
# found inside the previous one; .string then reads the <h1> text
soup.body.div.h1.string

'HTML Webpage'

In [49]:
# know the terms: Parent, sibling, child
# start from the first <div> found inside <body>
soup.body.find('div')

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [50]:
# every sibling tag that comes after that <div>
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercises

In [51]:
# fetch the exercise page; fail fast on a timeout or an HTTP error status
req = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
req.raise_for_status()

# name the parser explicitly so parsing is reproducible and warning-free
soup = bs(req.content, 'html.parser')

In [53]:
# soup.prettify()

In [93]:
# grab all of the social links from the webpage
# class_ is compared against each individual CSS class of a tag, so the plain
# string 'social' selects every <li> carrying that class; a regex such as
# 'social *' would also match any class that merely contains "social"
social_links = soup.find_all('li', class_='social')
social_links

[<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>,
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>,
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>,
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>]