# Web Scraping

## BeautifulSoup & Requests

### Site used for scraping: https://www.scrapethissite.com/pages/forms/

![image.png](attachment:image.png) 

In [1]:
#Import the libraries: BeautifulSoup parses the HTML, requests downloads the page

from bs4 import BeautifulSoup
import requests

In [4]:
# Address of the page this notebook pulls its data from

url: str = "https://www.scrapethissite.com/pages/forms/"

In [6]:
# requests.get() sends an HTTP GET request to the url and returns a Response object.
# Common status codes: 200 = OK (success), 204 = success but no content to return,
# 400 = bad request (the server could not process it), 404 = page not found.
# The timeout stops the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=10)
# Fail right here on a 4xx/5xx answer instead of silently parsing an error page later.
page.raise_for_status()

In [8]:
# page.text is the raw HTML returned by the request; the second argument names the parser.
# 'html.parser' is the parser that ships with Python. The looser name 'html' lets
# BeautifulSoup pick whichever HTML parser happens to be installed, so the parsed
# result could differ from one machine to another.
soup = BeautifulSoup(page.text, 'html.parser')

In [9]:
# Turn the parsed document back into a string of markup and print it as-is
page_markup = str(soup)
print(page_markup)

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robot

In [10]:
# prettify() returns the same markup re-indented, one tag per line, so the nesting is easier to read

indented_markup = soup.prettify()
print(indented_markup)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta con

## Extracting information from our WebPage

### `find` and `find_all`

In [12]:
#Rewriting again for practice

from bs4 import BeautifulSoup
import requests

In [13]:
# Same practice page as before
url: str = "https://www.scrapethissite.com/pages/forms/"

In [15]:
# Download the page. The timeout keeps the cell from hanging if the server never answers,
# and raise_for_status() raises immediately on a 4xx/5xx response instead of letting an
# error page be parsed further down.
page = requests.get(url, timeout=10)
page.raise_for_status()

In [16]:
# Parse the downloaded HTML. 'html.parser' names Python's built-in parser explicitly;
# the looser 'html' would let BeautifulSoup choose whichever HTML parser is installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [17]:
# Print the parsed document's markup to confirm the page was fetched and parsed
page_markup = str(soup)
print(page_markup)

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robot

In [18]:
# Suppose we are looking for a <div> tag.
# find() stops at the first match, so only the first <div> on the page comes back.
soup.find(name='div')

<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

In [19]:
# find_all() keeps going and collects every <div> tag on the page into a list
soup.find_all(name='div')

[<div class="container">
 <div class="col-md-12">
 <ul class="nav nav-tabs">
 <li id="nav-homepage">
 <a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>
 </li>
 <li id="nav-sandbox">
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>
 </li>
 <li id="nav-lessons">
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>
 </li>
 <li id="nav-faq">
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>
 </li>
 <li class="pull-right" id="nav-login">
 <a class="nav-link" href="/login/">
        

In [20]:
# A search can be narrowed by attribute as well as by tag name.
# 'div' is the tag; 'col-md-12' is the value of its class="..." attribute, copied from the HTML output above.
col_divs = soup.find_all('div', class_='col-md-12')
col_divs

[<div class="col-md-12">
 <ul class="nav nav-tabs">
 <li id="nav-homepage">
 <a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>
 </li>
 <li id="nav-sandbox">
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>
 </li>
 <li id="nav-lessons">
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>
 </li>
 <li id="nav-faq">
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>
 </li>
 <li class="pull-right" id="nav-login">
 <a class="nav-link" href="/login/">
                                 

In [22]:
# To pull only the heading paragraph: open the page in the browser, inspect the paragraph,
# and copy its class name ('lead') into the search below.
soup.find_all(name='p', class_='lead')

[<p class="lead">
                             Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                             Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                         </p>]

In [29]:
# find_all() gives back a list of tags, and the list itself has no text, so use find() to get one tag.
# strip() removes the whitespace at the start and end of the text.

lead_tag = soup.find('p', class_='lead')
lead_tag.get_text().strip()

'Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.\n                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.'

In [30]:
# Next step: pulling the team names out of the table.

# Start with the <th> tags, which hold the column headers.

soup.find_all(name='th')

[<th>
                             Team Name
                         </th>,
 <th>
                             Year
                         </th>,
 <th>
                             Wins
                         </th>,
 <th>
                             Losses
                         </th>,
 <th>
                             OT Losses
                         </th>,
 <th>
                             Win %
                         </th>,
 <th>
                             Goals For (GF)
                         </th>,
 <th>
                             Goals Against (GA)
                         </th>,
 <th>
                             + / -
                         </th>]

In [31]:
# The <td> tags hold the table's data cells (name, year, wins, ...), listed row after row

soup.find_all(name='td')

[<td class="name">
                             Boston Bruins
                         </td>,
 <td class="year">
                             1990
                         </td>,
 <td class="wins">
                             44
                         </td>,
 <td class="losses">
                             24
                         </td>,
 <td class="ot-losses">
 </td>,
 <td class="pct text-success">
                             0.55
                         </td>,
 <td class="gf">
                             299
                         </td>,
 <td class="ga">
                             264
                         </td>,
 <td class="diff text-success">
                             35
                         </td>,
 <td class="name">
                             Buffalo Sabres
                         </td>,
 <td class="year">
                             1990
                         </td>,
 <td class="wins">
                             31
                         </td>,
 

In [32]:
# Get the text of the first column header.
# find() returns only the first <th>; strip() trims the whitespace around its text.
first_header = soup.find('th')
first_header.get_text().strip()

'Team Name'