In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# 認識HTML

In [None]:
# Raw HTML source code: a small sample document that is parsed below
html_doc = """
<html><head><title>Hello World</title></head>
<body><h2>Test Header</h2>
<p>This is a test.</p>
<a id="link1" href="/my_link1">Link 1</a>
<a id="link2" href="/my_link2">Link 2</a>
<p>Hello, <b class="boldtext">Bold Text</b></p>
</body></html>
"""

# Site for previewing how raw HTML like this renders: https://www.w3schools.com/html/tryit.asp?filename=tryhtml_default

In [None]:
# Parse the sample HTML string into a BeautifulSoup tree using the 'html.parser' backend
soup = BeautifulSoup(html_doc,'html.parser')

In [None]:
# Bare last expression: the notebook renders the parsed document as the cell output
soup

In [None]:
# Print the HTML source after prettify() has re-formatted it
print(soup.prettify())

## 搜尋節點

In [None]:
# Walk down the tree by attribute access: the <head> tag, the <title> nested
# inside it, and finally that title's text. One value is printed per line.
print(soup.head, soup.head.title, soup.head.title.text, sep="\n")

In [None]:
# The page title as an HTML tag (the <title> element itself, not just its text)
title_tag = soup.title
print(title_tag)

In [None]:
# The text content of the page title
print(title_tag.text)

In [None]:
# In HTML, the 'a' tag denotes a hyperlink; collect every <a> in the document
a_tags = soup.find_all('a')
for tag in a_tags:
  # Print the hyperlink's visible text
  print(tag.text)

In [None]:
# In HTML, the 'href' attribute holds the hyperlink's URL
for tag in a_tags:
  # Print the hyperlink's URL
  print(tag.get('href'))

In [None]:
# Search for all hyperlinks and bold text in a single call
# In HTML, 'a' denotes a hyperlink
# In HTML, 'b' denotes bold text
tags = soup.find_all(["a", "b"])
print(tags)

In [None]:
# Limit the number of results to 2; only the first 2 matches are kept
tag_2 = soup.find_all(["a", "b"], limit=2)
print(tag_2)

In [None]:
# Fetch only the first node that matches the condition
a_tag = soup.find("a")
print(a_tag)

In [None]:
# By default find_all() searches all descendant nodes recursively, so every
# tag name below is looked up through the whole tree starting at the root.
print(soup.find_all("head"), soup.find_all("title"), sep="\n")
print("============================================")
print(soup.find_all("body"), soup.find_all("h2"), soup.find_all("a"), sep="\n")

In [None]:
# Same default (recursive) search, this time started from the <html> element
# instead of the soup root. Each result list is printed on its own line.
print(*[soup.html.find_all(name) for name in ("head", "title")], sep="\n")
print("============================================")
print(*[soup.html.find_all(name) for name in ("body", "h2", "a")], sep="\n")

In [None]:
# Turn recursion off: only the direct children of the node are searched.
# In the sample document the only top-level tag under the soup root is <html>,
# so none of these names are direct children of it.
print(
    soup.find_all("head", recursive=False),
    soup.find_all("title", recursive=False),
    sep="\n",
)
print("============================================")
print(
    soup.find_all("body", recursive=False),
    soup.find_all("h2", recursive=False),
    soup.find_all("a", recursive=False),
    sep="\n",
)

In [None]:
# Non-recursive search started from <html>: only its direct children are
# searched. In the sample document those are <head> and <body>; <title>, <h2>
# and <a> sit one level deeper.
print(*[soup.html.find_all(name, recursive=False) for name in ("head", "title")], sep="\n")
print("============================================")
print(*[soup.html.find_all(name, recursive=False) for name in ("body", "h2", "a")], sep="\n")

## 以HTML屬性搜尋

In [None]:
# Search by the id attribute (first node whose id is 'link2')
link2_tag = soup.find(id='link2')
print(link2_tag)

In [None]:
# Search for <a> nodes whose id attribute is 'link2' (find_all returns a list of matches)
a_link2_tag = soup.find_all("a", id="link2")
print(a_link2_tag)

## 以 CSS class 搜尋

In [None]:
# Search for <b> nodes whose CSS class is 'boldtext'
# (the keyword is spelled class_ because `class` is a reserved word in Python)
b_tag = soup.find_all("b", class_="boldtext")
print(b_tag)

# 實作1:Yahoo首頁頭條新聞

In [None]:
# Download the Yahoo Taiwan home page.
# - timeout: requests waits indefinitely by default, so a stalled connection
#   would hang this cell; give up after 10 seconds instead.
# - raise_for_status(): fail here on an HTTP 4xx/5xx response rather than
#   letting the following cells silently parse an error page.
r = requests.get('https://tw.yahoo.com/', timeout=10)
r.raise_for_status()

In [None]:
# Preview only the beginning of the raw response body: displaying the whole
# home page would flood the cell output and bloat the saved notebook.
r.text[:1000]

In [None]:
# Parse the downloaded page with the 'html.parser' backend.
# NOTE: this rebinds `soup`, replacing the tree built from the sample html_doc;
# re-run the earlier parsing cell before re-running the sample-document cells.
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Bare last expression: the notebook renders the whole parsed page as the cell
# output (this can be very long for a real web page)
soup

In [None]:
# Print the HTML source after prettify() has re-formatted it
print(soup.prettify())

In [None]:
# Use the CSS class to pick out the headline news links: every <a> tag whose
# class is 'story-title'.
# NOTE(review): the class name depends on the site's markup at the time of
# writing; confirm it still matches if this returns an empty list.
stories = soup.find_all('a', class_='story-title')

In [None]:
# Print the headline text and link target of every collected story link.
for s in stories:
    # News headline
    print("標題：" + s.text)
    # News URL. get() returns None when the <a> has no href attribute, and
    # concatenating None to a str raises TypeError, so fall back to ''.
    print("網址：" + (s.get('href') or ''))