In [2]:
# Sample HTML document used by the cells below.
# Note that the markup has no closing </body> or </html> tags.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
from bs4 import BeautifulSoup
# Parse the sample markup with the 'html.parser' backend; `soup` is reused by later cells.
soup = BeautifulSoup(html_doc, 'html.parser')
# prettify() renders the parsed tree with one node per line, indented by depth.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [17]:
# Tags are reachable as attributes of the soup (soup.title, soup.p, soup.a give
# the first tag of that name). Each line shows several views separated by " | ".
print(f"{soup.title} | {soup.title.name} | {soup.title.string}")
print(f"{soup.p} | {soup.p['class']}")
print(f"{soup.a} | {soup.find_all('a')}")

<title>The Dormouse's story</title> | title | The Dormouse's story
<p class="title"><b>The Dormouse's story</b></p> | ['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> | [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>


In [5]:
# find() returns the single matching tag (here the <a> whose id is "link3"), not a list.
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [6]:
# A tag's attributes are looked up by name like dictionary keys; .get("href") returns the link target.
soup.find(id="link3").get("href")

'http://example.com/tillie'

In [7]:
# get_text() returns all the text in the document as one string, with the markup removed.
soup.get_text()

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

Multi-value attributes

In [8]:
# Name the parser explicitly (the same 'html.parser' used for `soup`): without it
# Beautiful Soup guesses a parser, warns, and may pick a different one per machine.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
# "class" may hold several values, so it comes back as a list.
css_soup.p['class']

['body', 'strikeout']

In [9]:
# multi_valued_attributes=None turns off the list handling, so "class" stays one string.
# The parser is pinned to 'html.parser' (as for `soup`) instead of the generic 'html'
# feature, which selects whichever HTML parser happens to be installed.
no_list_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser', multi_valued_attributes=None)
no_list_soup.p['class']


'body strikeout'

### NavigableString

In [10]:
# .string gives the text held directly inside the tag.
tag = soup.title
tag.string

"The Dormouse's story"

In [11]:
# The text is a bs4 NavigableString object.
type(tag.string)

bs4.element.NavigableString

In [16]:
# The "xml" feature parses the markup as XML rather than HTML.
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
# find(string=...) searches the text nodes and returns the first one equal to the
# given string, so the result is a NavigableString rather than a tag.
type(doc.find(string="INSERT FOOTER HERE"))

bs4.element.NavigableString

### Going down
.contents, .children

In [19]:
# Keep a reference to the <head> tag; the next cells inspect its children.
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [36]:
# .contents is a list of the tag's children; <head> holds only the <title> tag.
head_tag.contents

[<title>The Dormouse's story</title>]

In [37]:
# The only child of <title> is its text string.
head_tag.title.contents

["The Dormouse's story"]

.descendants: unlike .contents, which lists only a tag's direct children, .descendants iterates over every nested node, including the children of children.

In [25]:
# Visits <title> first and then the string inside it.
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


### .strings and stripped_strings

In [28]:
# .strings yields every text node in the document, including the ones that are
# only a newline; repr() makes that whitespace visible.
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [33]:
# .stripped_strings drops the whitespace-only strings and trims leading and
# trailing whitespace from the rest; newlines inside a string are kept.
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


### Going up

In [38]:
# .parent is the tag that directly contains this one: <head> for <title>.
title_tag = soup.title
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [39]:
link = soup.a
# .parents walks upward from the tag through every enclosing element and ends
# with the BeautifulSoup object itself, whose name is "[document]". It never
# yields None, so no None check is needed before reading .name.
for parent in link.parents:
    print(parent.name)

p
body
html
[document]


### Going sideways

In [40]:
# .next_sibling is the next node under the same parent. Here that is the text
# between the first two links, not the next <a> tag.
link = soup.a 
link.next_sibling

',\n'

In [42]:
# .next_siblings iterates over every following node under the same parent,
# text strings and tags alike.
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


### Going back and forth

In [43]:
# Combine a tag name with an attribute filter to pick out the third link.
last_a_tag = soup.find('a', id='link3')
last_a_tag

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [45]:
# The sibling after the last link is the remaining text of the paragraph.
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

In [46]:
# .next_element differs from .next_sibling: here it is the link's own text
# ('Tillie'), not the text that follows the link.
last_a_tag.next_element

'Tillie'

## Searching the tree

In [55]:
# The two main search methods are .find() and .find_all(); the filters below
# are demonstrated with .find_all().

# A string filter matches tags with exactly that name.
print(f"all b tags: {soup.find_all('b')}")

# A compiled regular expression is matched against tag names;
# '^b' selects every name that starts with "b" (here <body> and <b>).
import re
for tag in soup.find_all(re.compile('^b')):
    print(f"tag name start with b: {tag.name}")

# A list matches any tag whose name is one of its items.
soup.find_all(['a', 'b'])

all b tags: [<b>The Dormouse's story</b>]
tag name start with b: body
tag name start with b: b


[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [59]:
# pass the True value
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


### pass function

In [60]:
def has_class_but_no_id(tag):
    """Tell whether ``tag`` defines a ``class`` attribute but no ``id`` attribute.

    Written to be passed to ``find_all()`` as a filter function.
    """
    # Without a class attribute the tag can never qualify.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

# A function can serve as the filter: the result holds the tags for which it
# returns True. Here that is the three <p> tags, which have a class but no id.
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [64]:
# Same call as in the previous cell; the result is unchanged.
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

## find_all()

In [66]:
# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# name: only consider tags with the given name
soup.find_all("title")


[<title>The Dormouse's story</title>]

In [68]:
# Keyword arguments filter on attribute values, e.g. id='link2'.
print(soup.find_all(id='link2'))

# An attribute name such as "data-foo" is not a valid Python keyword argument,
# so it is passed through the attrs dictionary instead.
# The parser is named explicitly so Beautiful Soup does not have to guess one.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
data_soup.find_all(attrs={"data-foo": "value"})

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


[<div data-foo="value">foo!</div>]

In [69]:
# "class" is a reserved word in Python, so the keyword argument is spelled class_.
# Searching for "body" matches the tag even though its class attribute also
# lists "strikeout".
# The parser is named explicitly so Beautiful Soup does not have to guess one.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all('p', class_="body")

[<p class="body strikeout"></p>]

In [71]:
# The string argument searches text nodes instead of tags. It accepts an exact
# string or a compiled regular expression, and the results are strings.
print(soup.find_all(string='Elsie'))
print(soup.find_all(string=re.compile("Dormouse")))

['Elsie']
["The Dormouse's story", "The Dormouse's story"]


In [72]:
# limit=2 returns only the first two of the three matching links.
soup.find_all('a', limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [73]:
# recursive=False restricts the search to direct children of <html>.
# <title> sits one level deeper, inside <head>, so the result is empty.
soup.html.find_all("title", recursive=False)

[]

In [75]:
# Calling the soup object directly is shorthand for find_all():
# soup("a") gives the same result as soup.find_all("a").
res1 = soup.find_all("a")
res2 = soup("a")
res1 == res2

True

## Output

In [76]:
# Without print(), the cell displays the repr of the returned string,
# so line breaks show up as "\n" escapes.
soup.prettify()

'<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    Elsie\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ;\nand they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n </body>\n</html>'

In [77]:
# get_text() returns only the document's text, with all markup removed.
soup.get_text()

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"