In [1]:
%pip install bs4 lxml

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Import Beautiful Soup under the short alias used throughout the notebook
from bs4 import BeautifulSoup as bs

# Sample HTML document that the lookups in the following cells run against
html_doc = """
<html>
<body>
<h1>Welcome</h1>
<p class="title"><b>The Chainsaw Man's anime</b></p>
<a href="http://example.com/Pochita" class="character" id="link1">Pochita</a>
<a href="http://example.com/Power" class="character" id="link2">Power</a>
<a href="http://example.com/Makima" id="link3">Makima</a>
<p class="anime">They are partners in the story</p>
</body>
</html>
"""

# Parse the document with Beautiful Soup, using lxml as the underlying parser
soup = bs(html_doc, features="lxml")

In [2]:
# Collect every <a> tag in the document; print the list, then each tag
elements = soup.find_all("a")
print(elements)
for anchor in elements:
    print(anchor)

# Search by text instead of by tag: find every string equal to "Pochita"
elements = soup.find_all(string="Pochita")
first_match = elements[0]
print(first_match)

# A string found this way exposes its enclosing tag through .parent
parent_tag = first_match.parent
print(parent_tag)
# Read attribute values off the parent tag
# (class is printed as a list because it can hold several values)
print(parent_tag['href'])
print(parent_tag['class'])


[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>, <a href="http://example.com/Makima" id="link3">Makima</a>]
<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>
<a class="character" href="http://example.com/Power" id="link2">Power</a>
<a href="http://example.com/Makima" id="link3">Makima</a>
Pochita
<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>
http://example.com/Pochita
['character']


In [8]:
# Find every <a> tag whose class is "character"
# (class_ has a trailing underscore because `class` is a Python keyword)
elements = soup.find_all(name="a", class_="character")
print(elements)

[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>]


In [9]:
# Find the first <a> tag whose id is "link2"
# (find() returns a single tag rather than a list)
element = soup.find(name="a", id="link2")
print(element)

<a class="character" href="http://example.com/Power" id="link2">Power</a>


In [11]:
# Find every <a> tag whose href is exactly the Pochita URL
# (find_all() returns a list, even when only one tag matches)
target_href = "http://example.com/Pochita"
elements = soup.find_all("a", href=target_href)
print(elements)

# Print each match on its own line
for element in elements:
    print(element)

[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>]
<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>


In [18]:
# Find every <p> tag
elements = soup.find_all(name="p")
print(elements)
# Narrow the search to <p> tags whose class is "title"
elements = soup.find_all(name="p", class_="title")
print(elements)


[<p class="title"><b>The Chainsaw Man's anime</b></p>, <p class="anime">They are partners in the story</p>]
[<p class="title"><b>The Chainsaw Man's anime</b></p>]


In [23]:
# Find the <a> tag whose href is the Power URL
power_url = "http://example.com/Power"
elements = soup.find_all("a", href=power_url)
print(elements)

# Find the <a> tag whose text content is "Power", then read its href
elements = soup.find_all("a", string="Power")
print(elements)
first_link = elements[0]
print(first_link['href'])


[<a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Power" id="link2">Power</a>]
http://example.com/Power


In [25]:
# Filter on several attributes at once: pass a dict of required values;
# a tag has to satisfy every entry to be returned
required_attrs = {'class': 'character', 'id': 'link1'}
elements = soup.find_all("a", attrs=required_attrs)
print(elements)

[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>]


### 使用 CSS select

In [30]:
# Import Beautiful Soup under the alias `bs`
from bs4 import BeautifulSoup as bs

# Sample HTML document for the CSS-selector examples
# (same markup as the html_doc defined in the first code cell)
html_doc = """
<html>
<body>
<h1>Welcome</h1>
<p class="title"><b>The Chainsaw Man's anime</b></p>
<a href="http://example.com/Pochita" class="character" id="link1">Pochita</a>
<a href="http://example.com/Power" class="character" id="link2">Power</a>
<a href="http://example.com/Makima" id="link3">Makima</a>
<p class="anime">They are partners in the story</p>
</body>
</html>
"""


In [36]:
# Parse the document with Beautiful Soup, using lxml as the underlying parser
soup = bs(html_doc, features="lxml")

# CSS selectors to demonstrate; each is run through select() and the
# resulting list of matches is printed, in this order
basic_selectors = [
    "a",                                     # every <a> tag
    "a.character",                           # <a> tags whose class is "character"
    ".character",                            # any tag whose class is "character"
    "a#link2",                               # the <a> tag whose id is "link2"
    "a[href='http://example.com/Pochita']",  # the <a> tag with exactly this href
]
for selector in basic_selectors:
    elements = soup.select(selector)
    print(elements)

[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>, <a href="http://example.com/Makima" id="link3">Makima</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>]


In [46]:
# Partial-match attribute selectors; each is run through select() and the
# resulting list of matches is printed, in this order
partial_match_selectors = [
    "a[href*='Po']",    # *=  href contains "Po"
    "a[class$='ter']",  # $=  class ends with "ter"
    "[id*='link']",     # *=  id contains "link" (any tag)
    "a[href^='http']",  # ^=  href starts with "http"
]
for selector in partial_match_selectors:
    elements = soup.select(selector)
    print(elements)

[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>, <a href="http://example.com/Makima" id="link3">Makima</a>]
[<a class="character" href="http://example.com/Pochita" id="link1">Pochita</a>, <a class="character" href="http://example.com/Power" id="link2">Power</a>, <a href="http://example.com/Makima" id="link3">Makima</a>]
