### Import necessary libraries

In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Load the webpage

In [2]:
#Load the webpage content
#A timeout stops the cell from hanging forever if the server never answers
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
#Fail loudly on an HTTP error instead of silently parsing an error page
r.raise_for_status()

#Convert to a Beautiful Soup object
#Name the parser explicitly: without it bs4 picks whichever parser happens to be
#installed (and warns about it), so the parsed tree can differ between machines
soup = bs(r.content, 'html.parser')

#Print out html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### Start using Beautiful Soup to Scrape

#### find and find_all

In [3]:
#The find command returns only the first occurrence of the tag
first_header = soup.find('h2')
first_header

<h2>A Header</h2>

In [4]:
#find_all returns every occurrence of the tag as a list
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
#Can also pass in a list of tag names to look for;
#find still returns a single element - the first match in the document (the h1 here)
first_header = soup.find(['h1', 'h2'])
first_header

<h1>HTML Webpage</h1>

In [6]:
#Same thing for find_all: every h1 and h2 is returned
headers = soup.find_all(['h1', 'h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [7]:
#Can pass in attributes to the find/find_all methods
#attrs maps an attribute name to the value the tag must have
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [8]:
#Can nest find and find_all calls:
#each call searches only inside the element returned by the previous one
body = soup.find('body')
div = body.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [9]:
#Can go further: search inside the div found in the previous cell
header = div.find('h1')
header

<h1>HTML Webpage</h1>

### Locate paragraphs that contain a given string

In [10]:
#Search for specific strings using find/find_all
#A plain string must match the text exactly
paragraphs = soup.find_all('p', string='Some bold text')
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
#We generally don't want to search for a full string;
#passing a compiled regex lets us match on a substring instead
paragraphs = soup.find_all('p', string=re.compile('Some'))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Find header tags whose text contains the word 'header'. Note the different capitalizations

In [12]:
#"H|header" would mean "H" OR "header", so any h2 containing a capital H matches.
#Group the alternation so that only the first letter of the word varies.
headers = soup.find_all('h2', string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [13]:
#Print the body again so its structure is visible while trying out the selectors below
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [14]:
#Selecting all paragraph tags in a page with a CSS selector
#Similar to find_all('p')
content = soup.select('p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
#Just grab paragraphs inside divs (a space in the selector means "descendant of")
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [16]:
#Grab all paragraphs preceded by an h2
#"~" is the sibling combinator: the p must come after an h2 and share its parent
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [17]:
#Grab specific elements with ids ("#" selects by id)
#Grab bold elements inside the paragraph with id 'paragraph-id'
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [18]:
#Nested calls
paragraphs = soup.select('body > p') #paragraphs that are direct children of body
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
#select can be called on each element returned by a previous select;
#a paragraph without an <i> tag yields an empty list
for paragraph in paragraphs:
    italics = paragraph.select('i')
    print(italics)

[<i>Some italicized text</i>]
[]


In [20]:
#Grab an element with a specific property
#[name=value] is the CSS attribute selector
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

In [21]:
#Show the tag itself, then only the text it contains (no surrounding tags)
header = soup.find("h2")
print(header, header.string, sep='\n')

<h2>A Header</h2>
A Header


In [22]:
#Try the same .string lookup on the div, which holds more than one child element
div = soup.find("div")
print(div.prettify())
print('----------------------')
#Prints None for this div - see the explanation below
print(div.string)

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

----------------------
None


`div.string` returns `None` here because the div has two child elements (the `h1` and the `p`), so it is ambiguous whether `HTML Webpage` or `Link to more ...` should be returned.

In [23]:
#So when div.string returns None, use div.get_text() instead
#This happens when the element has multiple child elements;
#get_text() returns the text of all of them together
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [24]:
#Get a specific property from an element
#Attributes of a tag are read with dictionary-style indexing
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [25]:
#select returns a list, so take the first match before reading its attribute
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

### Code Navigation

In [26]:
#Path syntax: tags can be reached as attributes, one level at a time
#soup
#soup.body
#soup.body.h1
soup.body.h1.string

'HTML Webpage'

In [27]:
#Know the terms - parent, sibling and child
#The indentation of the prettified output shows which element contains which
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



The `div align="middle"` element has `body` as its parent, which means the div is a child of `body`. The `h2` is a sibling of the div: it sits at the same indentation level and also has `body` as its parent.

In [28]:
#BeautifulSoup has navigation methods such as find_parent(), find_next_sibling() and find_next_siblings()

In [29]:
#Using find_next_siblings(): returns all the siblings that come after the div
div = soup.body.find("div")
div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Examples

https://keithgalli.github.io/web-scraping/webpage.html

In [50]:
#Defining the base url; the relative links scraped from the page are appended to it later
url = "https://keithgalli.github.io/web-scraping/" 

### Load webpage

In [30]:
#Load the example page used by the exercises below
#A timeout stops the cell from hanging forever if the server never answers
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
#Fail loudly on an HTTP error instead of silently parsing an error page
r.raise_for_status()

#Name the parser explicitly so the parsed tree does not depend on which
#optional parsers happen to be installed
webpage = bs(r.content, 'html.parser')
print(webpage.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

### Grab all social links from webpage

In [31]:
#Using find: locate the <ul class="socials"> list, then collect the href of each link in it
ul = webpage.find('ul', class_='socials')
links = ul.find_all('a')
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [32]:
#Using select: a single CSS selector reaches the links inside <ul class="socials">
links = webpage.select('ul.socials a')
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [33]:
#Directly using the class of the list items ("li.social") instead of the list's class
links = webpage.select('li.social a')
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape the table included on the webpage

Good idea to work with pandas when scraping tables

In [34]:
#select returns a list; take the first table with class "hockey-stats"
table = webpage.select("table.hockey-stats")[0]
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [35]:
#Extracting table headers so that they can be column names
#Note: the header names are not unique (GP, G, A, ... appear twice) and the
#separator column comes through as a non-breaking space ('\xa0')
columns = table.find('thead').find_all('th')
column_names = [c.string for c in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [36]:
#Extracting the table rows: every <tr> inside the table body
table_rows = table.find('tbody').find_all('tr')

In [37]:
#Extracting to pandas dataframe: one list of cell texts per table row
rows = []

for tr in table_rows:
    cells = tr.find_all('td')
    #get_text() already returns a string; strip() removes the whitespace
    #that pads each value in the page source
    rows.append([cell.get_text().strip() for cell in cells])

df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Grab all the fun facts that use the word "is"

In [42]:
facts = webpage.select('ul.fun-facts li')

#\b anchors restrict the match to the standalone word "is";
#a bare "is" would also match inside words such as "this" or "list"
word_is = re.compile(r"\bis\b")
facts_with_is = [fact.find_all(string=word_is) for fact in facts]

#find_all returns an empty list (not None) when nothing matches,
#so keep only the non-empty results
facts_with_is = [fact for fact in facts_with_is if fact]

#NOTE(review): some entries end right after "is " - presumably the rest of the
#fact sits in a nested tag, so only the first text piece is matched; confirm
facts_with_is

[['Middle name is Ronald'],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]

#### Download images

In [43]:
#Print the page source to see how the images are nested before writing a selector
print(webpage)

<head>
<title>Keith Galli's Page</title>
<style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
</style>
</head>
<body>
<h1>Welcome to my page!</h1>
<img src="./images/selfie1.jpg" width="300px"/>
<h2>About me</h2>
<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>
<p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>
<p>I grew up in the great state o

In [47]:
#Images nested inside div.column elements within div.row
images = webpage.select("div.row div.column img")
#The src of the first image is a path relative to the page, not a full url
image_url = images[0]['src']
image_url

'images/italy/lake_como.jpg'

In [51]:
#Resolve the relative src against the base url.
#urljoin also copes with "./"-prefixed and already-absolute srcs,
#which plain string concatenation would turn into a wrong address
full_url = urljoin(url, image_url)

In [54]:
#Download the image bytes; stop on an HTTP error rather than
#saving the server's error page under a .jpg name
img_response = requests.get(full_url, timeout=10)
img_response.raise_for_status()
img_data = img_response.content

#'wb' because the image content is raw bytes, not text
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

In [53]:
#Check the complete image url that was downloaded
print(full_url)

https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg


### Solving the mystery challenge

In [66]:
#Getting the divs of the tables
divs = webpage.find_all('div', attrs={'class':'block'})

#Collect the href of every link inside those divs.
#A comprehension keeps its loop variables local, so the notebook-level
#`div` and `link` set by earlier cells are not overwritten
urls_to_scrape = [
    anchor['href']
    for block in divs
    for anchor in block.find_all('a')
]

urls_to_scrape

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [67]:
#Converting the urls to full urls
#urljoin leaves an already-absolute url untouched, so re-running this cell
#does not prepend the base url a second time
urls_to_scrape = [urljoin(url, mini_url) for mini_url in urls_to_scrape]
urls_to_scrape

['https://keithgalli.github.io/web-scraping/challenge/file_1.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_2.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_3.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_4.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_5.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_6.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_7.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_8.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_9.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_10.html']

In [77]:
#Define an empty string to collect the secret message
secret = ''

#Loop through the urls and then request the html.
#Loop-local names are used on purpose: reusing `url`, `r` or `webpage` here would
#overwrite the base url and the main page that earlier cells depend on
for challenge_url in urls_to_scrape:
    #Make request to the url; stop on an HTTP error instead of parsing an error page
    challenge_response = requests.get(challenge_url, timeout=10)
    challenge_response.raise_for_status()

    #Get html (parser named explicitly so the result does not depend on the environment)
    challenge_page = bs(challenge_response.content, 'html.parser')

    #Find the paragraph with id secret-word and append its text to the secret string
    secret += challenge_page.select("p#secret-word")[0].get_text()

    #Add space
    secret += ' '

#Display secret
secret

'Make sure to smash that like button and subscribe !!! '