# Libraries

In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

# Load the page

In [2]:
# Load the webpage; the timeout (seconds) keeps a stalled connection from hanging the kernel
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page
r.raise_for_status()

# Convert to a beautifulsoup object, naming the parser explicitly so the
# resulting tree does not depend on which optional parsers are installed
soup = bs(r.content, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Scraping Data

### `find` and `find_all`

In [3]:
# find() hands back only the first <h2> in the document
first_header = soup.find(name='h2')
first_header

<h2>A Header</h2>

In [4]:
# find_all() gathers every <h2> in the document
headers = soup.find_all(name='h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

# Exercise

In [5]:
# Load the exercise page; the timeout (seconds) keeps a stalled connection from hanging the kernel
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page
r.raise_for_status()

# Convert to a beautifulsoup object, naming the parser explicitly so the
# resulting tree does not depend on which optional parsers are installed.
# NOTE: this rebinds `soup`; every cell below works on the exercise page.
soup = bs(r.content, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

### Grab all the social links from the webpage

In [6]:
# Show only the <body> subtree, indented for readability
body_markup = soup.body.prettify()
print(body_markup)

<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </a>
 </p>
 <p>
  I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.
 </p>
 <h3>
  Hobbies
 </h3>
 <p>
  Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skat

#### Method 1

In [7]:
# CSS selector: every <a> nested inside <ul class="socials">
socials = soup.select('ul.socials a')
hrefs = (anchor.get('href') for anchor in socials)
for href in hrefs:
    print(href)

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


#### Method 2

In [8]:
# Locate the <ul class="socials"> element first, then walk the anchors inside it
socials = soup.find('ul', class_='socials')
for anchor in socials.find_all('a'):
    print(anchor.get('href'))

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


#### Method 3

In [9]:
# Select the anchors through their <li class="social"> parents and collect the targets
socials = soup.select('li.social a')
socials_list = []
for anchor in socials:
    socials_list.append(anchor['href'])
socials_list

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape an HTML table into a Pandas DataFrame

In [10]:
import pandas as pd
import numpy as np

# select() returns a list of matches, here the tables carrying class "hockey-stats"
table_selector = 'table.hockey-stats'
table = soup.select(table_selector)
table

[<table class="hockey-stats">
 <thead>
 <tr>
 <th class="season" data-sort="">S</th>
 <th class="team" data-sort="team">Team</th>
 <th class="league" data-sort="league">League</th>
 <th class="regular gp" data-sort="gp">GP</th>
 <th class="regular g" data-sort="g">G</th>
 <th class="regular a" data-sort="a">A</th>
 <th class="regular tp" data-sort="tp">TP</th>
 <th class="regular pim" data-sort="pim">PIM</th>
 <th class="regular pm" data-sort="pm">+/-</th>
 <th class="separator"> </th>
 <th class="postseason">POST</th>
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>
 <th class="postseason g" data-sort="playoffs-g">G</th>
 <th class="postseason a" data-sort="playoffs-a">A</th>
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>
 </tr>
 </thead>
 <tbody>
 <tr class="team-continent-NA">
 <td class="season sorted">
                   2014-15
          

#### Selecting thead (columns data)

In [11]:
# Narrow the selection to the header section of the stats table
thead_selector = 'table.hockey-stats thead'
thead = soup.select(thead_selector)
thead

[<thead>
 <tr>
 <th class="season" data-sort="">S</th>
 <th class="team" data-sort="team">Team</th>
 <th class="league" data-sort="league">League</th>
 <th class="regular gp" data-sort="gp">GP</th>
 <th class="regular g" data-sort="g">G</th>
 <th class="regular a" data-sort="a">A</th>
 <th class="regular tp" data-sort="tp">TP</th>
 <th class="regular pim" data-sort="pim">PIM</th>
 <th class="regular pm" data-sort="pm">+/-</th>
 <th class="separator"> </th>
 <th class="postseason">POST</th>
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>
 <th class="postseason g" data-sort="playoffs-g">G</th>
 <th class="postseason a" data-sort="playoffs-a">A</th>
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>
 </tr>
 </thead>]

In [12]:
# Header cells (<th>) of the stats table; their text becomes the column labels
header_cells = soup.select('table.hockey-stats th')

# get_text() yields plain str objects (the same call used for the body cells),
# so the labels do not keep a reference back into the parse tree
columns = [cell.get_text() for cell in header_cells]
columns_length = len(columns)
print(columns_length)
columns

17


['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

#### Selecting tbody (rows data)

In [13]:
# Every data cell (<td>) in the table body, flattened in document order
td_selector = 'table.hockey-stats tbody td'
tbody = soup.select(td_selector)
tbody

[<td class="season sorted">
                   2014-15
               </td>,
 <td class="team">
 <i><img src="images/flag.png"/></i>
 <span class="txt-blue">
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
 </span>
 </td>,
 <td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>,
 <td class="regular gp">17</td>,
 <td class="regular g">3</td>,
 <td class="regular a">9</td>,
 <td class="regular tp">12</td>,
 <td class="regular pim">20</td>,
 <td class="regular pm"></td>,
 <td class="separator"> | </td>,
 <td class="postseason">
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>
 </td>,
 <td class="postseason gp">
 </td>,
 <td class="postseason g">
 </td>,
 <td class="postseason a">
 </td>,
 <td class="postseason tp">
 </td>,
 <td class="postseason pim">
 </td>,
 <td class="postseason pm">
 </td>,
 <td class="season sorted">

In [14]:
# Reduce each <td> to its visible text with surrounding whitespace removed
rows = []
for cell in tbody:
    cell_text = str(cell.get_text())
    rows.append(cell_text.strip())
rows_length = len(rows)
print(rows_length)
rows

85


['2014-15',
 'MIT (Mass. Inst. of Tech.)',
 'ACHA II',
 '17',
 '3',
 '9',
 '12',
 '20',
 '',
 '|',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '2015-16',
 'MIT (Mass. Inst. of Tech.)',
 'ACHA II',
 '9',
 '1',
 '1',
 '2',
 '2',
 '',
 '|',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '2016-17',
 'MIT (Mass. Inst. of Tech.)',
 'ACHA II',
 '12',
 '5',
 '5',
 '10',
 '8',
 '0',
 '|',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '2017-18',
 'Did not play',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '|',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '2018-19',
 'MIT (Mass. Inst. of Tech.)',
 'ACHA III',
 '8',
 '5',
 '10',
 '15',
 '8',
 '',
 '|',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [15]:
# Fold the flat list of cell texts into one row per table row.
# -1 lets NumPy derive the row count from the column count, avoiding the
# float division + int() round-trip; reshape raises ValueError if the
# number of cells is not a multiple of columns_length.
rows = np.array(rows).reshape(-1, columns_length)
rows

array([['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3',
        '9', '12', '20', '', '|', '', '', '', '', '', '', ''],
       ['2015-16', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '9', '1',
        '1', '2', '2', '', '|', '', '', '', '', '', '', ''],
       ['2016-17', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '12', '5',
        '5', '10', '8', '0', '|', '', '', '', '', '', '', ''],
       ['2017-18', 'Did not play', '', '', '', '', '', '', '', '|', '',
        '', '', '', '', '', ''],
       ['2018-19', 'MIT (Mass. Inst. of Tech.)', 'ACHA III', '8', '5',
        '10', '15', '8', '', '|', '', '', '', '', '', '', '']],
      dtype='<U26')

#### Creating DataFrame

In [16]:
# Build the frame from the 2-D array of cell texts, labelled with the scraped header texts
df = pd.DataFrame(rows, columns=columns)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Grab all fun facts that use the word 'is'

In [17]:
import re

# Match "is" as a whole word; a bare 'is' pattern would also hit
# substrings such as "this" or "list"
is_word = re.compile(r'\bis\b')

fact_items = soup.select('ul.fun-facts li')
# find() gives the first text node matching the pattern, or None for items without one
matches = [item.find(string=is_word) for item in fact_items]
# Report the full text of the tag that directly contains each matching text node
facts = [match.find_parent().get_text() for match in matches if match]
facts


['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Download an Image

In [18]:
# All <img> tags nested inside <div class="row"> containers
image_selector = 'div.row img'
images = soup.select(image_selector)
images

[<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>,
 <img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>,
 <img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>]

#### Images Names

In [19]:
# Derive a file-name stem from each alt text: drop spaces, turn commas into underscores
alt_to_stem = str.maketrans(',', '_', ' ')
images_name = [str(image['alt']).translate(alt_to_stem) for image in images]
images_name

['LakeComo', 'Pontevecchio_Florence', 'Riomaggiore_CinquedeTerre']

#### Images Path

In [20]:
# Relative paths exactly as written in each <img src="...">
images_path = [image['src'] for image in images]

# Base URL of the site; also reused by later cells
url = "https://keithgalli.github.io/web-scraping/"

# urljoin resolves each path against the base the way a browser would, so
# forms like "./images/x.jpg" or "/abs/x.jpg" are handled correctly, which
# plain string concatenation does not do
images_url = [urljoin(url, image_path) for image_path in images_path]
images_url

['https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg',
 'https://keithgalli.github.io/web-scraping/images/italy/pontevecchio.jpg',
 'https://keithgalli.github.io/web-scraping/images/italy/riomaggiore.jpg']

In [21]:
# Download each image and save it in the working directory under its cleaned-up name
for image_name, image_url in zip(images_name, images_url):
    response = requests.get(image_url, timeout=10)
    # Stop on a 4xx/5xx response rather than saving an error page with a .jpg extension
    response.raise_for_status()
    with open(f'{image_name}.jpg', 'wb') as handler:
        handler.write(response.content)

### Solve the mystery challenge!

#### If you scrape the links below, grabbing the `<p>` tag with `id="secret-word"`, you'll discover a secret message :)

In [22]:
# Anchors of the challenge files: <a> inside <li> inside <div class="block">
challenge_selector = 'div.block li a'
mistery = soup.select(challenge_selector)
mistery

[<a href="challenge/file_1.html">File 1</a>,
 <a href="challenge/file_2.html">File 2</a>,
 <a href="challenge/file_3.html">File 3</a>,
 <a href="challenge/file_4.html">File 4</a>,
 <a href="challenge/file_5.html">File 5</a>,
 <a href="challenge/file_6.html">File 6</a>,
 <a href="challenge/file_7.html">File 7</a>,
 <a href="challenge/file_8.html">File 8</a>,
 <a href="challenge/file_9.html">File 9</a>,
 <a href="challenge/file_10.html">File 10</a>]

In [23]:
# Relative paths exactly as written in each challenge link's href
mistery_paths = [mistery_path['href'] for mistery_path in mistery]

# Resolve against the site base URL (`url`, defined in the image-path cell);
# urljoin copes with "./" and absolute paths, unlike string concatenation
mistery_paths_urls = [urljoin(url, mistery_path) for mistery_path in mistery_paths]
mistery_paths_urls

['https://keithgalli.github.io/web-scraping/challenge/file_1.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_2.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_3.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_4.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_5.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_6.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_7.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_8.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_9.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_10.html']

In [24]:
# Visit every challenge page and print its secret word; the words form one sentence
for mistery_path in mistery_paths_urls:
    mistery_response = requests.get(mistery_path, timeout=10)
    # Fail loudly on a 4xx/5xx response instead of parsing an error page
    mistery_response.raise_for_status()
    # Explicit parser keeps the result independent of which optional parsers are installed
    mistery_soup = bs(mistery_response.content, 'html.parser')
    # select_one returns the single element carrying id="secret-word"
    print(mistery_soup.select_one('#secret-word').string, end=' ')


Make sure to smash that like button and subscribe !!! 