In [13]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [21]:
from bs4 import BeautifulSoup
info = BeautifulSoup(html_doc, 'html.parser')
print(info.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [12]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [17]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#print(soup.prettify())

In [22]:
info.title

<title>The Dormouse's story</title>

In [23]:
info.title.name

'title'

In [24]:
info.title.string

"The Dormouse's story"

In [25]:
info.title.parent.name

'head'

In [26]:
info.p

<p class="title"><b>The Dormouse's story</b></p>

In [27]:
info.p['class']

['title']

In [28]:
info.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [29]:
info.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [30]:
info.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [31]:
#One common task is extracting all the URLs found within a page’s <a> tags:
for link in info.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [32]:
#Another common task is extracting all the text from a page:
print(infobe.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [34]:
from bs4 import BeautifulSoup

with open("https://www.pdfdrive.com/") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

soup = BeautifulSoup("<html>a web page</html>", 'html.parser')

OSError: [Errno 22] Invalid argument: 'https://www.pdfdrive.com/'

In [36]:
#A Tag object corresponds to an XML or HTML tag in the original document:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)

bs4.element.Tag

In [37]:
#Every tag has a name, accessible as .name:
tag.name

'b'

In [38]:
#If you change a tag’s name, the change will be reflected in any HTML markup generated by Beautiful Soup:
tag.name = "blockquote"
tag

<blockquote class="boldest">Extremely bold</blockquote>

In [39]:
#A tag may have any number of attributes. The tag <b id="boldest"> has an attribute “id” whose value is “boldest”. You can access a tag’s attributes by treating the tag like a dictionary:

tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b
tag['id']

'boldest'

In [40]:
#You can access that dictionary directly as .attrs:
tag.attrs

{'id': 'boldest'}

In [41]:
#You can add, remove, and modify a tag’s attributes. Again, this is done by treating the tag as a dictionary:


In [42]:
tag['id'] = 'verybold'
tag['another-attribute'] = 1
tag

<b another-attribute="1" id="verybold">bold</b>

In [43]:
del tag['id']
del tag['another-attribute']
tag

<b>bold</b>

In [44]:
tag['id']

KeyError: 'id'

In [46]:
tag.get('id')

In [47]:
#Multi-valued attributes
#HTML 4 defines a few attributes that can have multiple values. HTML 5 removes a couple of them, but defines a few more. The most common multi-valued attribute is class (that is, a tag can have more than one CSS class). Others include rel, rev, accept-charset, headers, and accesskey. Beautiful Soup presents the value(s) of a multi-valued attribute as a list:

css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
css_soup.p['class']

['body']

In [48]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

In [49]:
#If an attribute looks like it has more than one value, but it’s not a multi-valued attribute as defined by any version of the HTML standard, Beautiful Soup will leave the attribute alone:

id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [50]:
#When you turn a tag back into a string, multiple attribute values are consolidated:

rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
rel_soup.a['rel']

['index']

In [51]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [52]:
#You can disable this by passing multi_valued_attributes=None as a keyword argument into the BeautifulSoup constructor:

no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser', multi_valued_attributes=None)
no_list_soup.p['class']

'body strikeout'

In [53]:
#You can use get_attribute_list to get a value that’s always a list, whether or not it’s a multi-valued atribute:

id_soup.p.get_attribute_list('id')

['my id']

In [54]:
#If you parse a document as XML, there are no multi-valued attributes:

xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [55]:
#You probably won’t need to do this, but if you do, use the defaults as a guide. They implement the rules described in the HTML specification:

from bs4.builder import builder_registry
builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES

{'*': ['class', 'accesskey', 'dropzone'],
 'a': ['rel', 'rev'],
 'link': ['rel', 'rev'],
 'td': ['headers'],
 'th': ['headers'],
 'form': ['accept-charset'],
 'object': ['archive'],
 'area': ['rel'],
 'icon': ['sizes'],
 'iframe': ['sandbox'],
 'output': ['for']}

In [56]:
#NavigableString
#A string corresponds to a bit of text within a tag. Beautiful Soup uses the NavigableString class to contain these bits of text:

soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag.string

'Extremely bold'

In [57]:
type(tag.string)

bs4.element.NavigableString

In [58]:
#A NavigableString is just like a Python Unicode string, except that it also supports some of the features described in Navigating the tree and Searching the tree. You can convert a NavigableString to a Unicode string with unicode() (in Python 2) or str (in Python 3):

unicode_string = str(tag.string)
unicode_string

'Extremely bold'

In [59]:
type(unicode_string)

str

In [60]:
#You can’t edit a string in place, but you can replace one string with another, using replace_with():

tag.string.replace_with("No longer bold")
tag

<b class="boldest">No longer bold</b>

NavigableString supports most of the features described in Navigating the tree and Searching the tree, but not all of them. In particular, since a string can’t contain anything (the way a tag may contain a string or another tag), strings don’t support the .contents or .string attributes, or the find() method.

If you want to use a NavigableString outside of Beautiful Soup, you should call unicode() on it to turn it into a normal Python Unicode string. If you don’t, your string will carry around a reference to the entire Beautiful Soup parse tree, even when you’re done using Beautiful Soup. This is a big waste of memory.

In [61]:
#BeautifulSoup
#The BeautifulSoup object represents the parsed document as a whole. For most purposes, you can treat it as a Tag object. This means it supports most of the methods described in Navigating the tree and Searching the tree.

#You can also pass a BeautifulSoup object into one of the methods defined in Modifying the tree, just as you would a Tag. This lets you do things like combine two parsed documents:

doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
doc.find(text="INSERT FOOTER HERE").replace_with(footer)
# 'INSERT FOOTER HERE'
print(doc)

<?xml version="1.0" encoding="utf-8"?>
<document><content/><footer>Here's the footer</footer></document>


In [62]:
#Since the BeautifulSoup object doesn’t correspond to an actual HTML or XML tag, it has no name and no attributes. But sometimes it’s useful to look at its .name, so it’s been given the special .name “[document]”:

soup.name

'[document]'

In [63]:
#Comments and other special strings
#Tag, NavigableString, and BeautifulSoup cover almost everything you’ll see in an HTML or XML file, but there are a few leftover bits. The main one you’ll probably encounter is the comment:

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
type(comment)

bs4.element.Comment

In [64]:
#The Comment object is just a special type of NavigableString:

comment

'Hey, buddy. Want to buy a used parser?'

In [65]:
#But when it appears as part of an HTML document, a Comment is displayed with special formatting:

print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [66]:
#Beautiful Soup also defines classes called Stylesheet, Script, and TemplateString, for embedded CSS stylesheets (any strings found inside a <style> tag), embedded Javascript (any strings found in a <script> tag), and HTML templates (any strings inside a <template> tag). These classes work exactly the same way as NavigableString; their only purpose is to make it easier to pick out the main body of the page, by ignoring strings that represent something else. (These classes are new in Beautiful Soup 4.9.0, and the html5lib parser doesn’t use them.)

#Beautiful Soup defines classes for anything else that might show up in an XML document: CData, ProcessingInstruction, Declaration, and Doctype. Like Comment, these classes are subclasses of NavigableString that add something extra to the string. Here’s an example that replaces the comment with a CDATA block:

from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)

print(soup.b.prettify())

<b>
 <![CDATA[A CDATA block]]>
</b>


In [67]:
#Navigating the tree
#Here’s the “Three sisters” HTML document again:

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [68]:
#Tags may contain strings and other tags. These elements are the tag’s children. Beautiful Soup provides a lot of different attributes for navigating and iterating over a tag’s children.

#Note that Beautiful Soup strings don’t support any of these attributes, because a string can’t have children.

Navigating using tag names
The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the <head> tag, just say soup.head:

In [69]:
soup.head

<head><title>The Dormouse's story</title></head>

In [70]:
soup.title

<title>The Dormouse's story</title>

In [71]:
#You can do use this trick again and again to zoom in on a certain part of the parse tree. This code gets the first <b> tag beneath the <body> tag:

soup.body.b

<b>The Dormouse's story</b>

In [72]:
#Using a tag name as an attribute will give you only the first tag by that name:

soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [73]:
#If you need to get all the <a> tags, or anything more complicated than the first tag with a certain name, you’ll need to use one of the methods described in Searching the tree, such as find_all():

soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]