# Loading necessary libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs

# Common use cases of Beautiful Soup

## Loading web page

In [46]:
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')

## Create Beautiful Soup object

In [122]:
soup = bs(r.content)
# print(soup.prettify())

## Web Scraping using Beautiful Soup

In [14]:
first_header = soup.find('h2')
first_header

<h2>A Header</h2>

In [15]:
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [16]:
first_header = soup.find(['h2','h1'])
first_header

<h1>HTML Webpage</h1>

In [17]:
headers = soup.find_all(['h2','h1'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [18]:
paragraph = soup.find_all('p',attrs={'id':'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

## Finding specific strings using the regex module

In [22]:
import re

In [23]:
paragraphs = soup.find_all('p',string=re.compile('Some'))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [24]:
headers = soup.find_all('h2',string=re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

## Using select method

In [26]:
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [27]:
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [29]:
paragraphs = soup.select('body > p')
print(paragraphs)
for paragraph in paragraphs:
  print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [30]:
align_middle = soup.select('[align=middle]')
align_middle

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Retrieve different properties of the HTML

In [33]:
header = soup.find('h2')
print(header)
print(header.string)

<h2>A Header</h2>
A Header


In [35]:
div = soup.find('div')
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [39]:
link = soup.find('a')
print(link)
print(link['href'])

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>
https://keithgalli.github.io/web-scraping/webpage.html


In [45]:
paragraphs = soup.select('p#paragraph-id')
print(paragraphs)
print(paragraphs[0]['id'])

[<p id="paragraph-id"><b>Some bold text</b></p>]
paragraph-id


# Exercises

## Loading web page

In [47]:
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')

## Create Beautiful Soup object

In [117]:
webpage = bs(r.content)
# print(webpage.prettify())

## Grab all social links on webpage in 3 different ways

In [56]:
links_list = webpage.select('ul.socials a')
links = [link['href'] for link in links_list]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [61]:
links_string = webpage.find('ul',attrs={'class':'socials'})
links_list = links_string.find_all('a')
links = [link['href'] for link in links_list]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [62]:
links_list = webpage.select('li.social a')
links = [link['href'] for link in links_list]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scrape an HTML table into a Pandas Dataframe

### Import Pandas library

In [64]:
import pandas as pd

### Retrieve the table

In [118]:
# table = webpage.select('table.hockey-stats')
# print(table[0].prettify())
table = webpage.body.find('table',attrs={'class':'hockey-stats'})
# table

In [86]:
columns_tag = table.find('thead').find_all('th')
columns = [column.string for column in columns_tag]
print(columns)

['S', 'Team', 'League', 'GP', 'G', 'A', 'TP', 'PIM', '+/-', '\xa0', 'POST', 'GP', 'G', 'A', 'TP', 'PIM', '+/-']


In [120]:
table_rows = table.find('tbody').find_all('tr')
# table_rows

In [85]:
rows = []
for tr in table_rows:
  td = tr.find_all('td')
  row = [str(tr.get_text()).strip() for tr in td]
  rows.append(row)
print(rows)

[['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', ''], ['2015-16', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '9', '1', '1', '2', '2', '', '|', '', '', '', '', '', '', ''], ['2016-17', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '12', '5', '5', '10', '8', '0', '|', '', '', '', '', '', '', ''], ['2017-18', 'Did not play', '', '', '', '', '', '', '', '|', '', '', '', '', '', '', ''], ['2018-19', 'MIT (Mass. Inst. of Tech.)', 'ACHA III', '8', '5', '10', '15', '8', '', '|', '', '', '', '', '', '', '']]


### Create Pandas DataFrame

In [87]:
df = pd.DataFrame(rows,columns=columns)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all fun facts that contain the word “is”

In [94]:
fun_facts = webpage.select('ul.fun-facts li')
fun_facts_with_is = [fun_fact for fact in fun_facts\
                     if (fun_fact := fact.find(string=re.compile('is')))\
                     is not None]
fun_facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 'A favorite book series of mine is ',
 'Current video game of choice is ',
 "The band that I've seen the most times live is the "]

## Retrieve content from url scraped from the HTML

In [121]:
# print(webpage.body.prettify())

In [116]:
files_tag = webpage.select('div.block a')
files = [file_link['href'] for file_link in files_tag]
url = 'https://keithgalli.github.io/web-scraping/'
urls = [url + link for link in files]
for i in range(10):
  page = requests.get(urls[i])
  bs_page = bs(page.content)
  secret_word_element = bs_page.find('p',attrs={'id':'secret-word'})
  secret_word = secret_word_element.string
  print(secret_word,end=' ')

Make sure to smash that like button and subscribe !!! 