# Web Scraping

### Web page structure

In [3]:
import requests

In [6]:
# Download the example page.
# - timeout: without one, requests can wait indefinitely on a stalled connection
#   (10 seconds is an arbitrary but generous limit for a small static page).
# - raise_for_status(): stop here on a 4xx/5xx answer instead of letting the
#   following cells parse an error page as if it were the expected document.
response = requests.get('http://dataquestio.github.io/web-scraping-pages/simple.html', timeout=10)
response.raise_for_status()
content = response.content  # raw response body (bytes)
print(content)

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


### Retrieving elements from a web page

In [7]:
# Use the BeautifulSoup library to parse the Web page with Python

from bs4 import BeautifulSoup

In [8]:
# Build the parse tree from the downloaded bytes with the "html.parser" backend
parser = BeautifulSoup(markup=content, features="html.parser")

In [10]:
# Look up the first <body> tag in the parsed document
body = parser.find("body")

In [11]:
# Look up the first <p> tag nested inside the body
p = body.find("p")

In [12]:
# Show the text contained in the paragraph
print(p.get_text())

Here is some simple content for this page.


In [16]:
# Same idea for the page title: find <head>, then the <title> inside it
head = parser.find("head")
title_text = head.find("title").get_text()

In [17]:
# Printing a tag shows its markup, including the nested <title>
print(head)

<head>
<title>A simple example page</title>
</head>


In [18]:
# Show the title string extracted from <head><title>
print(title_text)

A simple example page


### Using the find_all method

In [19]:
# find_all returns every <body> tag in the document as a list-like result.
# Note: `body` now holds that list, not the single tag assigned earlier.
body = parser.find_all(name="body")
body

[<body>
 <p>Here is some simple content for this page.</p>
 </body>]

In [20]:
# Collect every <p> tag inside the first (and only) body match
p = body[0].find_all(name="p")
p

[<p>Here is some simple content for this page.</p>]

In [21]:
# Show the text of the first paragraph in the result list
print(p[0].get_text())

Here is some simple content for this page.


In [22]:
# Repeat the find_all approach for the <head> tag
head = parser.find_all(name="head")
head

[<head>
 <title>A simple example page</title>
 </head>]

In [23]:
# All <title> tags under the first <head> match
t = head[0].find_all(name="title")
t

[<title>A simple example page</title>]

In [24]:
# Text of the first <title> tag
title_text = t[0].get_text()
title_text

'A simple example page'

### Element IDs

In [25]:
# Download the page whose paragraphs carry id attributes.
# The timeout (10 s, arbitrary) prevents an indefinite hang on a stalled
# connection; raise_for_status() aborts on a 4xx/5xx answer so an error page
# is never parsed as if it were the expected document.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_ids.html", timeout=10)
response.raise_for_status()

In [26]:
# Keep the raw response body (bytes); the bare name echoes it as the cell output
content = response.content
content

b'<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <div>\n            <p id="first">\n                First paragraph.\n            </p>\n        </div>\n        <p id="second">\n            <b>\n                Second paragraph.\n            </b>\n        </p>\n    </body>\n</html>'

In [27]:
# Re-create the parser for the newly downloaded page and display the parsed tree
parser = BeautifulSoup(markup=content, features="html.parser")
parser

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p id="first">
                First paragraph.
            </p>
</div>
<p id="second">
<b>
                Second paragraph.
            </b>
</p>
</body>
</html>

In [28]:
# Filter on the id attribute so each search only matches the paragraph with that ID
first_paragraph = parser.find_all("p", attrs={"id": "first"})[0]
print(first_paragraph.get_text())
second_paragraph = parser.find_all("p", attrs={"id": "second"})[0]
print(second_paragraph.get_text())


                First paragraph.
            


                Second paragraph.
            



### Element Classes

In [29]:
# Download the page whose paragraphs carry class attributes.
# The timeout (10 s, arbitrary) prevents an indefinite hang on a stalled
# connection; raise_for_status() aborts on a 4xx/5xx answer so an error page
# is never parsed as if it were the expected document.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html", timeout=10)
response.raise_for_status()

In [30]:
# Keep the raw response body (bytes); the bare name echoes it as the cell output
content = response.content
content

b'<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <div>\n            <p class="inner-text">\n                First paragraph.\n            </p>\n            <p class="inner-text">\n                Second paragraph.\n            </p>\n        </div>\n        <p class="outer-text">\n            <b>\n                First outer paragraph.\n            </b>\n        </p>\n        <p class="outer-text">\n            <b>\n                Second outer paragraph.\n            </b>\n        </p>\n    </body>\n</html>'

In [31]:
# Re-create the parser for the classes page and display the parsed tree
parser = BeautifulSoup(markup=content, features="html.parser")
parser

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [32]:
# First <p> whose class is "inner-text"
first_inner_paragraph = parser.find_all("p", attrs={"class": "inner-text"})[0]
print(first_inner_paragraph.get_text())


                First paragraph.
            


In [33]:
# Second <p> whose class is "inner-text"
second_inner_paragraph = parser.find_all("p", attrs={"class": "inner-text"})[1]
print(second_inner_paragraph.get_text())


                Second paragraph.
            


In [34]:
# First <p> whose class is "outer-text"
first_outer_paragraph = parser.find_all("p", attrs={"class": "outer-text"})[0]
print(first_outer_paragraph.get_text())



                First outer paragraph.
            



### Using CSS selectors

In [35]:
# Download the page whose paragraphs carry both classes and IDs.
# The timeout (10 s, arbitrary) prevents an indefinite hang on a stalled
# connection; raise_for_status() aborts on a 4xx/5xx answer so an error page
# is never parsed as if it were the expected document.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html", timeout=10)
response.raise_for_status()

In [36]:
# Keep the raw response body (bytes); the bare name echoes it as the cell output
content = response.content
content

b'<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <div>\n            <p class="inner-text first-item" id="first">\n                First paragraph.\n            </p>\n            <p class="inner-text">\n                Second paragraph.\n            </p>\n        </div>\n        <p class="outer-text first-item" id="second">\n            <b>\n                First outer paragraph.\n            </b>\n        </p>\n        <p class="outer-text">\n            <b>\n                Second outer paragraph.\n            </b>\n        </p>\n    </body>\n</html>'

In [37]:
# Re-create the parser for the ids-and-classes page and display the parsed tree
parser = BeautifulSoup(markup=content, features="html.parser")
parser

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [38]:
# Select all of the elements that have the first-item class.
# The leading "." makes this a CSS class selector; select() returns the matches as a list.

first_items = parser.select(".first-item")

In [39]:
# Show the text of the first element matched by the .first-item selector
print(first_items[0].get_text())


                First paragraph.
            


In [40]:
# Text of the first element carrying the outer-text class
first_outer_text = parser.select(".outer-text")[0].get_text()
first_outer_text

'\n\n                First outer paragraph.\n            \n'

In [41]:
# "#" selects by ID: text of the element whose id is "second"
second_text = parser.select("#second")[0].get_text()
second_text

'\n\n                First outer paragraph.\n            \n'

### Using nested CSS selectors

In [43]:
# Download the Super Bowl box score page.
# The timeout (10 s, arbitrary) prevents an indefinite hang on a stalled
# connection; raise_for_status() aborts on a 4xx/5xx answer so an error page
# is never parsed as if it were the expected document.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html", timeout=10)
response.raise_for_status()

In [44]:
# Keep the raw response body (bytes); the bare name echoes it as the cell output
content = response.content
content

b'<!DOCTYPE html>\n<html>\n    <head lang="en">\n        <meta charset="UTF-8">\n        <title>2014 Superbowl Team Stats</title>\n    </head>\n    <body>\n\n        <table class="stats_table nav_table" id="team_stats">\n            <tbody>\n                <tr id="teams">\n                    <th></th>\n                    <th>SEA</th>\n                    <th>NWE</th>\n                </tr>\n                <tr id="first-downs">\n                    <td>First downs</td>\n                    <td>20</td>\n                    <td>25</td>\n                </tr>\n                <tr id="total-yards">\n                    <td>Total yards</td>\n                    <td>396</td>\n                    <td>377</td>\n                </tr>\n                <tr id="turnovers">\n                    <td>Turnovers</td>\n                    <td>1</td>\n                    <td>2</td>\n                </tr>\n                <tr id="penalties">\n                    <td>Penalties-yards</td>\n              

In [45]:
# Re-create the parser for the box score page and display the parsed tree
parser = BeautifulSoup(markup=content, features="html.parser")
parser

<!DOCTYPE html>

<html>
<head lang="en">
<meta charset="utf-8"/>
<title>2014 Superbowl Team Stats</title>
</head>
<body>
<table class="stats_table nav_table" id="team_stats">
<tbody>
<tr id="teams">
<th></th>
<th>SEA</th>
<th>NWE</th>
</tr>
<tr id="first-downs">
<td>First downs</td>
<td>20</td>
<td>25</td>
</tr>
<tr id="total-yards">
<td>Total yards</td>
<td>396</td>
<td>377</td>
</tr>
<tr id="turnovers">
<td>Turnovers</td>
<td>1</td>
<td>2</td>
</tr>
<tr id="penalties">
<td>Penalties-yards</td>
<td>7-70</td>
<td>5-36</td>
</tr>
<tr id="total-plays">
<td>Total Plays</td>
<td>53</td>
<td>72</td>
</tr>
<tr id="time-of-possession">
<td>Time of Possession</td>
<td>26:14</td>
<td>33:46</td>
</tr>
</tbody>
</table>
</body>
</html>

In [46]:
# Number of turnovers committed by the Seahawks, step by step:
# 1. the row with id "turnovers"
turnovers = parser.select("#turnovers")[0]
# 2. its cells are [label, SEA, NWE] (column order set by the #teams header row),
#    so index 1 is the Seahawks cell
seahawks_turnovers = turnovers.select("td")[1]
# 3. the cell text is the count (kept as a string)
seahawks_turnovers_count = seahawks_turnovers.get_text()
print(seahawks_turnovers_count)

1


In [47]:
# Nested (descendant) CSS selector: "#total-plays td" matches every <td> inside
# the row with id "total-plays" in a single query, replacing two chained
# select() calls. The cells are [label, SEA, NWE], so index 2 is the Patriots value.
patriots_total_plays_count = parser.select("#total-plays td")[2].text
patriots_total_plays_count

'72'

In [48]:
# Nested (descendant) CSS selector: "#total-yards td" matches every <td> inside
# the row with id "total-yards" in a single query, replacing two chained
# select() calls. The cells are [label, SEA, NWE], so index 1 is the Seahawks value.
seahawks_total_yards_count = parser.select("#total-yards td")[1].text
seahawks_total_yards_count

'396'