# BeautifulSoup To Work With Webpages

In [1]:
import requests
import webbrowser

import re

from bs4 import BeautifulSoup

from pprint import pprint

In [2]:
# Open the target page in the system's default browser for visual reference.
# This is a side effect only; the True shown below is the call's return value.
webbrowser.open("https://pythonscraping.com/pages/page3.html")

True

In [3]:
# Download the page once; the `html` string produced here is what gets parsed.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook here on a 4xx/5xx response
# instead of silently parsing an error page.
resp = requests.get('https://pythonscraping.com/pages/page3.html', timeout=10)
resp.raise_for_status()
html = resp.text

In [4]:
# Raw, unparsed HTML as returned by the server (a plain string at this point).
html

'<html>\n<head>\n<style>\nimg{\n\twidth:75px;\n}\ntable{\n\twidth:50%;\n}\ntd{\n\tmargin:10px;\n\tpadding:10px;\n}\n.wrapper{\n\twidth:800px;\n}\n.excitingNote{\n\tfont-style:italic;\n\tfont-weight:bold;\n}\n</style>\n</head>\n<body>\n<div id="wrapper">\n<img src="../img/gifts/logo.jpg" style="float:left;">\n<h1>Totally Normal Gifts</h1>\n<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is\nhand-curated by well-paid, free-range Tibetan monks.<p>\nWe haven\'t figured out how to make online shopping carts yet, but you can send us a check to:<br>\n123 Main St.<br>\nAbuja, Nigeria\n</br>We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</div>\n<table id="giftList">\n<tr><th>\nItem Title\n</th><th>\nDescription\n</th><th>\nCost\n</th><th>\nImage\n</th></tr>\n\n<tr id="gift1" class="gift"><td>\nVegetable Basket\n</td><td>\nThis vegetable basket is the perfec

In [5]:
# Build the parse tree from the downloaded markup using the lxml HTML parser,
# then display the parsed document.
soup = BeautifulSoup(markup=html, features='lxml')
soup

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [6]:
# prettify() re-serialises the tree with tags on their own lines and nested
# indentation, which makes the document structure easier to read.
print(soup.prettify())

<html>
 <head>
  <style>
   img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
  </style>
 </head>
 <body>
  <div id="wrapper">
   <img src="../img/gifts/logo.jpg" style="float:left;"/>
   <h1>
    Totally Normal Gifts
   </h1>
   <div id="content">
    Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.
    <p>
     We haven't figured out how to make online shopping carts yet, but you can send us a check to:
     <br/>
     123 Main St.
     <br/>
     Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.
    </p>
   </div>
   <table id="giftList">
    <tr>
     <th>
      Item Title
     </th>
     <th>
      Description
     </th>
     <th>
      Cost
     </th>
     <th>
      Image
     </th>
   

In [7]:
# Attribute-style access on the soup returns the <style> tag of the document.
soup.style

<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>

#### First row or header of the table

In [8]:
# soup.tr yields a single <tr>: the first one, which here is the header row
# made of <th> cells.
soup.tr

<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

In [9]:
# The first <img> in the document is the site logo, not a product image.
soup.img

<img src="../img/gifts/logo.jpg" style="float:left;"/>

#### Finding all rows within the page

In [10]:
# find_all collects every <tr> on the page (header row included), not just
# the first one that dot-access would return.
rows = soup.find_all('tr')
rows

[<tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img

In [11]:
# 6 rows = 1 header row + 5 gift rows.
len(rows)

6

#### Indexing rows

In [12]:
# Index 0 is the header row (the same row that soup.tr returns).
rows[0]

<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

In [13]:
# Index 1 is the first gift row (id="gift1").
rows[1]

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>

#### Finding all table cells with `find_all`

In [14]:
# Collect every <td> data cell on the page. The header uses <th> cells, so it
# contributes nothing to this list.
cells = soup.find_all('td')
cells

[<td>
 Vegetable Basket
 </td>,
 <td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td>,
 <td>
 $15.00
 </td>,
 <td>
 <img src="../img/gifts/img1.jpg"/>
 </td>,
 <td>
 Russian Nesting Dolls
 </td>,
 <td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td>,
 <td>
 $10,000.52
 </td>,
 <td>
 <img src="../img/gifts/img2.jpg"/>
 </td>,
 <td>
 Fish Painting
 </td>,
 <td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td>,
 <td>
 $10,005.00
 </td>,
 <td>
 <img src="../img/gifts/img3.jpg"/>
 </td>,
 <td>
 Dead Parrot
 </td>,
 <td>
 This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
 </t

In [15]:
# 20 cells = 5 gift rows x 4 columns (title, description, cost, image).
len(cells)

20

#### Access each table cell individually

In [16]:
# Print the full markup (tags included) of every data cell, one per print call.
for cell_markup in map(str, cells):
    print(cell_markup)

<td>
Vegetable Basket
</td>
<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>
<td>
$15.00
</td>
<td>
<img src="../img/gifts/img1.jpg"/>
</td>
<td>
Russian Nesting Dolls
</td>
<td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td>
<td>
$10,000.52
</td>
<td>
<img src="../img/gifts/img2.jpg"/>
</td>
<td>
Fish Painting
</td>
<td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td>
<td>
$10,005.00
</td>
<td>
<img src="../img/gifts/img3.jpg"/>
</td>
<td>
Dead Parrot
</td>
<td>
This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
</td>
<td>
$0.50
</td>
<td>
<img src="../img/gifts/img4.jpg

#### Use the `.text` property to select only the text content

In [17]:
# Print only the text inside each cell. Tags such as <span> and <img> are
# dropped, which is why the image-only cells appear as blank lines.
for data_cell in cells:
    print(data_cell.get_text())


Vegetable Basket


This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!


$15.00





Russian Nesting Dolls


Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!


$10,000.52





Fish Painting


If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!


$10,005.00





Dead Parrot


This is an ex-parrot! Or maybe he's only resting?


$0.50





Mystery Box


If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!


$1.50






#### Iterate over all rows and all cells in each row

In [18]:
# Walk the table row by row and print each data cell's text with the
# surrounding whitespace removed. The header row has no <td> cells, so it
# prints nothing.
for row in soup.find_all('tr'):
    for data_cell in row.find_all('td'):
        cell_text = data_cell.text
        print(cell_text.strip())

Vegetable Basket
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!
$15.00

Russian Nesting Dolls
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!
$10,000.52

Fish Painting
If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!
$10,005.00

Dead Parrot
This is an ex-parrot! Or maybe he's only resting?
$0.50

Mystery Box
If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!
$1.50



#### Let's scrape other content

In [19]:
# Keep a handle on the page's <h1> heading so the following cells can inspect it.
h1_tag = soup.h1
h1_tag

<h1>Totally Normal Gifts</h1>

In [20]:
# Elements pulled out of the soup by tag name are bs4 Tag objects.
type(soup.h1)

bs4.element.Tag

#### Attributes

In [21]:
# .attrs holds the tag's HTML attributes as a dict; this <h1> has none, hence {}.
h1_tag.attrs

{}

#### All image tags within the web page

In [22]:
# Select image tags by their exact tag name. A prefix regex such as "^im"
# would also pick up any other tag whose name merely starts with "im"
# (for example <image>), which is broader than "all image tags".
img_tags = soup.find_all('img')

img_tags

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [23]:
# Show the attribute dict of every image. The loop variable is deliberately
# not named `img_tag`, so this loop cannot leave a stale value behind under
# the name that is used for a single selected image elsewhere in the notebook.
for tag in img_tags:
    print(tag.attrs)

{'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
{'src': '../img/gifts/img1.jpg'}
{'src': '../img/gifts/img2.jpg'}
{'src': '../img/gifts/img3.jpg'}
{'src': '../img/gifts/img4.jpg'}
{'src': '../img/gifts/img6.jpg'}


In [24]:
# Pick the first image (the logo) to demonstrate attribute access on one tag.
img_tag = img_tags[0]
img_tag

<img src="../img/gifts/logo.jpg" style="float:left;"/>

#### Source attribute

In [25]:
# A tag supports dict-style lookup of a single attribute value.
img_tag['src']

'../img/gifts/logo.jpg'

#### Style attribute

In [26]:
# Of the images listed above, only the logo carries a style attribute; the
# product images have just src.
img_tag['style']

'float:left;'

In [27]:
# soup.td is the first data cell on the page: the "Vegetable Basket" title cell.
td_tag = soup.td 
td_tag

<td>
Vegetable Basket
</td>

In [28]:
# .string gives the cell's text, including the newlines that surround it in the markup.
td_tag.string

'\nVegetable Basket\n'

In [29]:
# For this cell, .text shows the same characters as .string.
td_tag.text

'\nVegetable Basket\n'

In [30]:
# The value returned by .string is a bs4 NavigableString.
type(td_tag.string)

bs4.element.NavigableString

#### We can see that this is a `NavigableString`

#### Comments

In [31]:
markup = "<b><!--Hey, this is an html comment!--></b>"

# Name the parser explicitly (the same one used for the gift page) so the
# result does not depend on which parsers happen to be installed and no
# GuessedAtParserWarning is emitted. A separate variable keeps this tiny demo
# document from replacing the gift-page `soup` in this cell.
comment_soup = BeautifulSoup(markup, 'lxml')

# The only child of <b> is an HTML comment, so .string is a Comment object.
comment = comment_soup.b.string
type(comment)

bs4.element.Comment

#### Attributes with multiple values (multi-valued attributes)

In [32]:
markup = """
<p class="highlight important">This is something important</p>
"""
# Pass the parser explicitly (the same one used for the gift page) so parsing
# is consistent across environments and no GuessedAtParserWarning is emitted.
# NOTE: this rebinds `soup`; from here on it refers to this one-paragraph
# snippet, which the cells below inspect through `soup.p`.
soup = BeautifulSoup(markup, 'lxml')

In [33]:
# `soup` was rebound to the one-paragraph snippet, so .p is that snippet's <p> tag.
soup.p 

<p class="highlight important">This is something important</p>

In [34]:
# class can hold several values, so it comes back as a list of names rather
# than the single string "highlight important".
soup.p['class']

['highlight', 'important']

In [35]:
# get_attribute_list returns the attribute's values as a list; for class it
# matches the result of soup.p['class'] above.
soup.p.get_attribute_list('class')

['highlight', 'important']