In [2]:
from bs4 import BeautifulSoup

In [3]:
# Sample HTML document that the later cells parse and navigate.
# Note: the literal ends after the last <p>; it contains no closing </body> or </html>.
html_doc = """

<html><head><title>The Dormouse's story</title></head>
<body>



<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

In [4]:
# Pass the markup in as a string and name the parser explicitly.
soup = BeautifulSoup(html_doc, features='lxml')

In [5]:
# A Tag object corresponds to a tag in the HTML or XML document.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b  # attribute access by tag name returns the <b> tag
tag

<b class="boldest">Extremely bold</b>

In [6]:
# A Tag object exposes several attributes; the two most important are its name and its attributes.
tag.name

'b'

In [7]:
# Tag attributes are read with the same syntax as a Python dict;
# .attrs exposes the whole attribute mapping at once.
print(tag['class'], tag.attrs, sep='\n')

['boldest']
{'class': ['boldest']}


In [8]:
# NavigableString ("a string that can be navigated") is the text contained inside a tag.
print(tag.string)

Extremely bold


In [16]:
# The string inside a tag cannot be edited in place, but it can be swapped out with replace_with().
tag.string.replace_with("No longer bold")
# The enclosing soup reflects the replacement as well.
soup

<html><body><b class="boldest">No longer bold</b></body></html>

In [17]:
# A BeautifulSoup object represents the entire content of a document.
soup.name

'[document]'

In [22]:
# A Comment is a special kind of NavigableString that holds an HTML comment.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, features='lxml')
comment = soup.b.string
print(comment)
type(comment)

Hey, buddy. Want to buy a used parser?


bs4.element.Comment

In [49]:
# Re-parse the sample document so the following cells navigate its full tree.
soup = BeautifulSoup(html_doc, features='lxml')
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [50]:
# Child nodes can be reached through either .contents or .children.
# The .contents attribute returns all of a tag's direct children as a list.
soup.head.contents

[<title>The Dormouse's story</title>]

In [52]:
# <head> has only the single <title> child; now look at the children of <body>.
soup.body.contents # the list holds many "\n" entries because whitespace between tags is kept as text nodes

['\n',
 <p class="title"><b>The Dormouse's story</b></p>,
 '\n',
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
 and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 '\n',
 <p class="story">...</p>,
 '\n']

In [53]:
# .children walks the same direct children as .contents, but it hands back
# an iterator (a list_iterator in this bs4 version) rather than a list.
soup.body.children

<list_iterator at 0x201de1ea9b0>

In [54]:
# Print each direct child of <body>; the "\n" text nodes show up as blank lines.
for child in soup.body.children:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




In [57]:
# Use the .descendants attribute to reach children, grandchildren, and so on.
# .descendants is consumed like .children: iterate over it with a for ... in loop.
for child in soup.head.descendants:
    print(child)
# <title> is a child of <head>, and the string inside <title> is a child of <title>;
# both <title> and its string are descendants of <head>.

<title>The Dormouse's story</title>
The Dormouse's story


In [9]:
# The .parent attribute gives a node's immediate parent tag.
title_tag = soup.title
print(title_tag)
title_tag.parent  # NOTE: not the cell's last expression, so its value is not echoed
title_tag.parent.name

<title>The Dormouse's story</title>


'head'

In [66]:
# .parents iterates over every ancestor of a node, from its direct parent
# up to the BeautifulSoup object itself (whose name is "[document]").
link = soup.a  # the first <a> element
for parent in link.parents:
    # Every ancestor yielded here is a real node, so no None check is needed.
    print(parent.name)

p
body
html
[document]


In [5]:
# Sibling navigation: .next_sibling is the node that follows on the same level of
# the tree, and .previous_sibling is the node that precedes it.
a_tag = soup.find('a', id='link1')
a_tag.next_sibling  # evaluated but not echoed: only the cell's last expression is displayed
a_tag.previous_sibling

'Once upon a time there were three little sisters; and their names were\n'

In [6]:
# Moving backward and forward: .next_element / .previous_element step through all
# nodes, regardless of their level in the tree.
print(a_tag)
a_tag.next_element

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


'Elsie'

In [7]:
# The node immediately before this <a> tag; per the output, it is the text that opens the paragraph.
a_tag.previous_element

'Once upon a time there were three little sisters; and their names were\n'

In [8]:
# Stepping can also be done in bulk: .next_elements iterates over all following nodes.
# Nodes whose name is None (presumably the text nodes) are skipped, so only tag names print.
for elem in a_tag.next_elements:
    if elem.name is not None:
        print(elem.name)

a
a
p


In [10]:
# Node content: .string returns the text held by the tag.
print(title_tag)
title_tag.string

<title>The Dormouse's story</title>


"The Dormouse's story"

In [13]:
head_tag = soup.head
print(head_tag.contents)
# <head> has a single child, <title>; as the output shows, .string on <head> gives that child's text.
head_tag.string

[<title>The Dormouse's story</title>]


"The Dormouse's story"