### Navigating the tree

In [2]:
from bs4 import BeautifulSoup

In [3]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">
they were nice little sisters, and they lived at the bottom of a well.
</p>
"""

In [4]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [11]:
# Going Down the tree
# Finding the first <p> tag
first_p = soup.find('p')
print(first_p)
first_p_tag = soup.p # Accessing the first <p> tag directly
print(first_p_tag)
body_tag = soup.body  # Accessing the <body> tag
print(body_tag)
again_p_tag = body_tag.p  # Accessing the first <p> tag within <body>
print(again_p_tag)
all_anchors = soup.find_all('a')  # Finding all <a> tags, it will return a list
print(all_anchors)

<p class="title"><b>The Dormouse's story</b></p>
<p class="title"><b>The Dormouse's story</b></p>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">
they were nice little sisters, and they lived at the bottom of a well.
</p>
</body>
<p class="title"><b>The Dormouse's story</b></p>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [None]:
# .contains() method and .children attribute
head_tag = soup.head
print(head_tag)
print(head_tag.contents)

title_tag = head_tag.contents[0]
print(title_tag)

print(title_tag.contents)
# A string does not have .contents, because it can't contain anything

# Instead of getting them as a list, you can iterate over a tag's children using the .children generator:

for child in title_tag.children:
    print(child)

for child in head_tag.descendants:
    print(child)

len(list(head_tag.children))

len(list(soup.descendants))

# If a tag's only child is another tag, and that tag has a .string, then the parent tag is considered to have the same .string as its child:
print(head_tag.string)
# If a tag contains more than one thing, then it's not clear what .string should refer to, so .string is defined to be None:


<head><title>The Dormouse's story</title></head>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
["The Dormouse's story"]
The Dormouse's story
<title>The Dormouse's story</title>
The Dormouse's story
The Dormouse's story


In [20]:
# .strings and .stripped_strings
for string in soup.strings:
    print(repr(string))

print("-----")
for string in soup.stripped_strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'\nthey were nice little sisters, and they lived at the bottom of a well.\n'
'\n'
-----
"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'they were nice little sisters, and they lived at the bottom of a well.'


In [None]:
# Going Up the tree
title_tag = soup.title
title_tag
print(title_tag)
print(title_tag.string.parent)
print(title_tag.parent)

<title>The Dormouse's story</title>
<head><title>The Dormouse's story</title></head>


In [7]:
for parent in title_tag.parents:
    print(parent.name)

print("-----")
for parent in title_tag.self_and_parents:
    print(parent.name)

head
html
[document]
-----
title
head
html
[document]
