# Ch03_BeautifulSoup
Beautiful Soup是一個Python包，功能包括解析HTML、XML文件、修復含有未閉合標籤等錯誤的文件（此種文件常被稱為tag soup）。<br>
這個擴充包為待解析的頁面建立一棵樹，以便提取其中的資料，這在網路資料採集時非常有用。

# BeautifulSoup的物件

In [4]:
from bs4 import BeautifulSoup 

# Demo: parse a single-tag HTML snippet and inspect the resulting Tag object.
html_str = "<div id='msg' class='body strikeout a bc d e'>Hello World!</div>"
soup = BeautifulSoup(html_str, "lxml")  # three parsers are commonly used: "lxml", "html5lib", "html.parser"; the official recommendation is the faster "lxml"
tag = soup.div       # get the <div> tag object
print(type(tag))     # Tag type
print(tag.name)      # tag name
print(tag["id"])     # a single tag attribute
print(tag["class"])  # list of values of a multi-valued attribute
print(tag.attrs)     # dictionary of all attribute values of the tag

<class 'bs4.element.Tag'>
div
msg
['body', 'strikeout', 'a', 'bc', 'd', 'e']
{'id': 'msg', 'class': ['body', 'strikeout', 'a', 'bc', 'd', 'e']}


# string物件

In [2]:
from bs4 import BeautifulSoup 

# Demo: read the text content of a tag that has no child tags.
html_str = "<div id='msg' class='body strikeout'>Hello World!</div>"
soup = BeautifulSoup(html_str, "lxml") # parse into a tree structure
tag = soup.div      # locate the <div> tag
print(tag.string)        # print the string inside the tag
print(type(tag.string))  # NavigableString type

Hello World!
<class 'bs4.element.NavigableString'>


# text屬性 & get_text()函數
標籤內容有子標籤時，string屬性會回傳None而無法取得標籤內容，需要改用text屬性或get_text()函數：

In [4]:
from bs4 import BeautifulSoup 

# Demo: compare .string, .text and get_text() on a tag that contains a child tag.
# NOTE(review): the second <p> in the literal is presumably meant to be </p>; left unchanged.
html_str = "<div id='msg'>Hello World! <p> Final Test <p></div>"
soup = BeautifulSoup(html_str, "lxml")
tag = soup.div
print(tag.string)        # when the tag has child tags, the string attribute cannot retrieve the content (prints None)
print(tag.text)          # text attribute
print(type(tag.text)) 
print(tag.get_text())    # get_text() function
print(tag.get_text("-")) # get_text() also accepts a separator string
print(tag.get_text("-", strip=True)) # strip whitespace from each piece of text

None
Hello World!  Final Test 
<class 'str'>
Hello World!  Final Test 
Hello World! - Final Test 
Hello World!-Final Test


# comment物件
透過Comment物件可以取得HTML網頁的註解文字。

In [5]:
from bs4 import BeautifulSoup 

# Demo: an HTML comment inside a tag is exposed through .string as a Comment object.
html_str = "<p><!-- 註解文字 --></p>"
soup = BeautifulSoup(html_str, "lxml")
comment = soup.p.string   # the only child of <p> is the comment node
print(comment)
print(type(comment))   # Comment type

 註解文字 
<class 'bs4.element.Comment'>


In [1]:
#Ch3_2_1
from bs4 import BeautifulSoup 

# Build a BeautifulSoup object directly from an HTML string.
html_str = "<p>Hello World!</p>"
soup = BeautifulSoup(html_str, "lxml")
print(soup)   # the recorded output shows the fragment wrapped in <html><body>

<html><body><p>Hello World!</p></body></html>


In [2]:
#Ch3_2_1a
import requests 
from bs4 import BeautifulSoup

# Fetch a live page over HTTP and build a BeautifulSoup object from the body.
# A timeout is passed because requests.get() otherwise waits without limit
# when the server never answers.
r = requests.get("http://hueyanchen.myweb.hinet.net", timeout=10)
r.encoding = "utf8"   # force r.text to be decoded as UTF-8
soup = BeautifulSoup(r.text, "lxml")
# NOTE: the HTTP status code is not checked here, so an error page
# (for example a 503 body) is parsed and printed like any other page.
print(soup)

<html><body><h2>HTTP Server Error 503</h2><p> No available server to handle this request </p></body></html>


In [3]:
#Ch3_2_1b
from bs4 import BeautifulSoup 

# Build the BeautifulSoup object from a local HTML file.
# The file is read while the constructor runs, so the handle is only
# needed for the parsing step itself.
with open("index.html", "r", encoding="utf8") as fp:
    soup = BeautifulSoup(fp, "lxml")

# Show the parsed document once the file has been closed.
print(soup)


<html>
<head>
<title>
   Example Domain
  </title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
   body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 50px;
        background-color: #fff;
        border-radius: 1em;
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        body {
            background-color: #fff;
        }
        div {
            width: auto;
            margin: 0 auto;
            border-radius: 0;
            padding: 1em;
        }
    }
  </style>
</head>
<body>
<div>
<h1>
    Example Domain
   </h1>
<p>
    This domain is established to be used for il

In [5]:
#Ch3_2_2
from bs4 import BeautifulSoup

# Parse a local file, then print it re-indented with prettify().
# The file is read while the constructor runs, so the handle is only
# needed for the parsing step itself.
with open("test.txt", "r", encoding="utf8") as fp:
    soup = BeautifulSoup(fp, "lxml")

# prettify() returns the document as an indented string.
print(soup.prettify())

<html>
 <head>
  <script src="/_myweb/hiad/new-inner.js" type="text/javascript">
  </script>
  <meta charset="utf-8"/>
  <title>
   測試的HTML5網頁
  </title>
 </head>
 <body>
  <h3>
   從網路取得資料
  </h3>
  <hr/>
  <div>
   <p>
    使用Requests套件送出HTTP請求
   </p>
  </div>
 </body>
</html>


In [6]:
#Ch3_2_2a
import requests
from bs4 import BeautifulSoup

# Download a page, parse it, and save the pretty-printed HTML to test2.txt.
# A timeout is passed because requests.get() otherwise waits without limit
# when the server never answers.
r = requests.get("http://hueyanchen.myweb.hinet.net/test.html", timeout=10)
r.encoding = "utf-8"   # force r.text to be decoded as UTF-8
soup = BeautifulSoup(r.text, "lxml")

# The with-block closes the file even when write() raises, which the
# manual open()/close() pair did not guarantee.
with open("test2.txt", "w", encoding="utf8") as fp:
    fp.write(soup.prettify())
    print("寫入檔案test2.txt...")

寫入檔案test2.txt...


In [7]:
#Ch3_2_3
from bs4 import BeautifulSoup 

# Inspect a Tag object: its type, name, single attribute, multi-valued attribute and all attributes.
html_str = "<div id='msg' class='body strikeout'>Hello World!</div>"
soup = BeautifulSoup(html_str, "lxml")
tag = soup.div
print(type(tag))     # Tag type
print(tag.name)      # tag name
print(tag["id"])     # a single tag attribute
print(tag["class"])  # list of values of a multi-valued attribute
print(tag.attrs)     # dictionary of all attribute values of the tag



<class 'bs4.element.Tag'>
div
msg
['body', 'strikeout']
{'id': 'msg', 'class': ['body', 'strikeout']}


In [8]:
#Ch3_2_3a
from bs4 import BeautifulSoup 

# Read the text content of a tag that has no child tags.
html_str = "<div id='msg' class='body strikeout'>Hello World!</div>"
soup = BeautifulSoup(html_str, "lxml")
tag = soup.div
print(tag.string)        # tag content
print(type(tag.string))  # NavigableString type

Hello World!
<class 'bs4.element.NavigableString'>


In [9]:
#Ch3_2_3b
from bs4 import BeautifulSoup 

# Compare .string, .text and get_text() on a tag that contains a child tag.
# NOTE(review): the second <p> in the literal is presumably meant to be </p>; left unchanged.
html_str = "<div id='msg'>Hello World! <p> Final Test <p></div>"
soup = BeautifulSoup(html_str, "lxml")
tag = soup.div
print(tag.string)        # string attribute (prints None here because of the child tag)
print(tag.text)          # text attribute
print(type(tag.text)) 
print(tag.get_text())    # get_text() function
print(tag.get_text("-"))               # "-" is used as the separator between text pieces
print(tag.get_text("-", strip=True))   # same, with whitespace stripped from each piece

None
Hello World!  Final Test 
<class 'str'>
Hello World!  Final Test 
Hello World! - Final Test 
Hello World!-Final Test


In [10]:
#Ch3_2_3c
from bs4 import BeautifulSoup 

# Inspect the BeautifulSoup object itself rather than one of its tags.
html_str = "<div id='msg'>Hello World!</div>"
soup = BeautifulSoup(html_str, "lxml")
tag = soup.div      # not used by the prints below in this cell
print(soup.name)    # the recorded output is "[document]"
print(type(soup))   # BeautifulSoup type



[document]
<class 'bs4.BeautifulSoup'>


In [11]:
#Ch3_2_3d
from bs4 import BeautifulSoup 

# An HTML comment inside a tag is exposed through .string as a Comment object.
html_str = "<p><!-- 註解文字 --></p>"
soup = BeautifulSoup(html_str, "lxml")
comment = soup.p.string   # the only child of <p> is the comment node
print(comment)
print(type(comment))   # Comment type



 註解文字 
<class 'bs4.element.Comment'>


In [12]:
#Ch3_3_1
from bs4 import BeautifulSoup 

# Load the sample page Example.html from disk and parse it with lxml.
with open("Example.html", "r", encoding="utf8") as fp:
    soup = BeautifulSoup(fp, "lxml")

# Print the parsed document after the file has been closed.
print(soup)

<!DOCTYPE html>
<html lang="big5">
<head>
<meta charset="utf-8"/>
<title>測試資料擷取的HTML網頁</title>
</head>
<body>
<div class="surveys" id="surveys">
<div class="survey" id="q1">
<p class="question">
<a href="http://example.com/q1">請問你的性別?</a></p>
<ul class="answer">
<li class="response">男-<span>10</span></li>
<li class="response selected">女-<span>20</span></li>
</ul>
</div>
<div class="survey" id="q2">
<p class="question">
<a href="http://example.com/q2">請問你是否喜歡偵探小說?</a></p>
<ul class="answer">
<li class="response">喜歡-<span>40</span></li>
<li class="response selected">普通-<span>20</span></li>
<li class="response">不喜歡-<span>0</span></li>
</ul>
</div>
<div class="survey" id="q3">
<p class="question">
<a href="http://example.com/q3">請問你是否會程式設計?</a></p>
<ul class="answer">
<li class="response selected">會-<span>30</span></li>
<li class="response">不會-<span>6</span></li>
</ul>
</div>
</div>
<div class="emails" id="emails">
<div class="question">電子郵件清單資訊: </div>
    abc@example.com
    <div class="