# requests 模組：讀取網站檔案

## 讀取網頁原始碼

In [None]:
import requests

url = 'http://www.ehappy.tw/demo.htm'
# Always pass a timeout: without one, requests waits indefinitely when the
# server never answers.
html = requests.get(url, timeout=10)
# Print the page source only when the HTTP status code is 200
# (requests.codes.ok).
if html.status_code == requests.codes.ok:
    print(html.text)

<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>Hello</title>
  </head>
  <body>
    <p>Hello World!</p>
  </body>
</html>


## 加上 URL 參數

In [None]:
import requests

# Query parameters are supplied as a dict and passed through `params`, so
# requests appends them to the URL as a query string.
payload = {'key1': 'value1', 'key2': 'value2'}
# The timeout keeps the call from blocking forever on an unresponsive server.
html = requests.get("http://httpbin.org/get",
                    params=payload,
                    timeout=10)
print(html.text)

{
  "args": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.23.0", 
    "X-Amzn-Trace-Id": "Root=1-6308729f-3658acd80f708273566db948"
  }, 
  "origin": "34.125.157.164", 
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}



## 發送POST請求

In [6]:
import requests

# A dict passed through `data` is sent as the form-encoded body of the POST
# request (it is not added to the URL).
payload = {'key1': 'value1', 'key2': 'value2'}
# The timeout keeps the call from blocking forever on an unresponsive server.
html = requests.post("http://httpbin.org/post",
                     data=payload,
                     timeout=10)
print(html.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "23", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.23.0", 
    "X-Amzn-Trace-Id": "Root=1-6308751e-690491a96bb475df77855ee0"
  }, 
  "json": null, 
  "origin": "34.125.157.164", 
  "url": "http://httpbin.org/post"
}



## 自訂HTTP Headers偽裝瀏覽器操作

In [None]:
import requests

url = 'https://irs.thsrc.com.tw/IMINT/'
# Custom request headers that present the client as a mobile Chrome browser.
# The User-Agent is assembled from adjacent string literals: a backslash
# continuation inside a single literal would keep the next line's leading
# indentation, injecting runs of spaces into the header value.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.84 Mobile Safari/537.36'
    ),
}
# Send the GET request with the custom headers; the timeout prevents the call
# from hanging indefinitely.
html = requests.get(url, headers=headers, timeout=10)
print(html)

<Response [200]>


## 在requests請求時加入Cookie


In [None]:
import requests

url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
# Cookie values to send with the request.
# NOTE(review): 'over18' presumably answers the site's age-confirmation
# prompt -- confirm against the site's behavior.
cookies = {'over18': '1'}
# The timeout keeps the call from blocking forever on an unresponsive server.
html = requests.get(url, cookies=cookies, timeout=10)
print(html.text)

# BeautifulSoup 模組：網頁解析

## 認識網頁的結構

In [None]:
# Markup of the sample page bsdemo1.htm, kept as a bare string literal for
# reference only: it is neither assigned to a name nor used by any code.
# It shows a title, an <h1 class="large"> heading, and a <div> holding a
# paragraph, an image and a hyperlink.
'''
<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>我是網頁標題</title>
  </head>
  <body>
    <h1 class="large">我是標題</h1>
    <div>
      <p>我是段落</p>
      <img src="https://www.w3.org/html/logo/
			downloads/HTML5_Logo_256.png" alt="我是圖片">
      <a href="http://www.e-happy.com.tw">我是超連結</a>
    </div>
      </body>
</html>
'''

## BeautifulSoup 常用的屬性

In [8]:
import requests
from bs4 import BeautifulSoup

url = 'http://ehappy.tw/bsdemo1.htm'
# The timeout keeps the call from blocking forever on an unresponsive server.
html = requests.get(url, timeout=10)
# Set the encoding explicitly so that .text is decoded as UTF-8.
html.encoding = 'UTF-8'
sp = BeautifulSoup(html.text, 'lxml')
print(sp.title)        # the <title> element itself
print(sp.title.text)   # only the text inside <title>
print(sp.h1)           # the first <h1> element
print(sp.p)            # the first <p> element

<title>我是網頁標題</title>
我是網頁標題
<h1 class="large">我是標題</h1>
<p>我是段落</p>


## 找尋指定標籤的內容：find()、find_all()

In [9]:
# Sample HTML document with two <p> elements: id "p1", and id "p2" that also
# carries class "red". It is stored in `html` as input for the parsing cells.
html = '''
<html>
  <head><meta charset="UTF-8"><title>我是網頁標題</title></head>
  <body>
      <p id="p1">我是段落一</p>
      <p id="p2" class='red'>我是段落二</p>
  </body>
</html>
'''

In [None]:
from bs4 import BeautifulSoup

# Parse the sample document held in `html` with the lxml parser.
sp = BeautifulSoup(html, 'lxml')

# find() yields only the first <p>; find_all() yields every <p>.
first_paragraph = sp.find('p')
print(first_paragraph)
every_paragraph = sp.find_all('p')
print(every_paragraph)

# The same attribute filter written two ways: as an attrs dict and as
# keyword arguments (class_ is used because `class` is a Python keyword).
wanted_attrs = {'id':'p2', 'class':'red'}
print(sp.find('p', wanted_attrs))
print(sp.find('p', class_='red', id='p2'))

<p id="p1">我是段落一</p>
[<p id="p1">我是段落一</p>, <p class="red" id="p2">我是段落二</p>]
<p class="red" id="p2">我是段落二</p>
<p class="red" id="p2">我是段落二</p>


## 利用CSS選擇器找尋內容：select()

In [10]:
from bs4 import BeautifulSoup

# Parse the sample document held in `html` with the lxml parser.
sp = BeautifulSoup(html, 'lxml')

# Run each CSS selector in turn: two tag names, an id selector and a class
# selector. select() always returns a list of matching elements.
for css_selector in ('title', 'p', '#p1', '.red'):
    print(sp.select(css_selector))

[<title>我是網頁標題</title>]
[<p id="p1">我是段落一</p>, <p class="red" id="p2">我是段落二</p>]
[<p id="p1">我是段落一</p>]
[<p class="red" id="p2">我是段落二</p>]


## *取得標籤的屬性內容*

In [11]:
# Sample HTML document containing one <img> (with a src attribute) and one
# <a> (with an href attribute). It is stored in `html` as input for the
# attribute-reading cell.
html = '''
<html>
  <head><meta charset="UTF-8"><title>我是網頁標題</title></head>
  <body>
      <img src="http://www.ehappy.tw/python.png">
      <a href="http://www.e-happy.com.tw">超連結</a>
  </body>
</html>
'''

In [12]:
from bs4 import BeautifulSoup

# Parse the sample document held in `html` with the lxml parser.
sp = BeautifulSoup(html, 'lxml')

# (tag, attribute) pairs to read from the first matching element.
targets = (('img', 'src'), ('a', 'href'))

# Read each attribute with .get() ...
for tag_name, attr_name in targets:
    print(sp.select(tag_name)[0].get(attr_name))
# ... and again with subscript access.
for tag_name, attr_name in targets:
    print(sp.select(tag_name)[0][attr_name])

http://www.ehappy.tw/python.png
http://www.e-happy.com.tw
http://www.ehappy.tw/python.png
http://www.e-happy.com.tw


## 專題：威力彩開獎號碼

In [13]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.taiwanlottery.com.tw/'
# The timeout keeps the cell from hanging on an unresponsive server.
r = requests.get(url, timeout=10)
# Stop on an HTTP error instead of trying to parse an error page.
r.raise_for_status()
sp = BeautifulSoup(r.text, 'lxml')

# Locate the block holding the 威力彩 results.
# NOTE(review): assumes the first div.contents_box02 on the page is that
# block -- confirm against the live page.
datas = sp.find('div', class_='contents_box02')
if datas is None:
    raise RuntimeError(
        'div.contents_box02 not found; the page layout may have changed')

# Draw date / issue number, the green balls, and the second-zone (red) ball.
title_tag = datas.find('span', 'font_black15')
nums = datas.find_all('div', class_='ball_tx ball_green')
num_tag = datas.find('div', class_='ball_red')
# Twelve green entries are read below: six in drawn order followed by six in
# sorted order. Fail with a clear message when any expected element is absent.
if title_tag is None or num_tag is None or len(nums) < 12:
    raise RuntimeError(
        'expected lottery elements are missing; the page layout may have changed')

title = title_tag.text
print('威力彩期數：', title)
# Unpacking into print() separates the items with single spaces and ends
# with one more, the same output as printing each item with end=' '.
print('開出順序：', *(ball.text for ball in nums[:6]), end=' ')
print('\n大小順序：', *(ball.text for ball in nums[6:12]), end=' ')
num = num_tag.text
print('\n第二區：', num)

威力彩期數： 111/8/25 第111000068期 
開出順序： 38  27  13  11  28  08  
大小順序： 08  11  13  27  28  38  
第二區： 05 


# 使用正規表達式

## 建立正規表達式物件

In [14]:
import re

# Compile the digit-run pattern into a regex object, then search the sample
# text with it; `m` holds the first match found.
m = re.compile(r'[0-9]+').search('abc123xyz')

### match()方法

In [22]:
import re

# match() only succeeds when the pattern matches at the very start of the
# string; here the leading lowercase run is matched.
m = re.compile(r'[a-z]+').match('abc123xyz')
print(m)

<re.Match object; span=(0, 3), match='abc'>


In [23]:
# Inspect the match only when one was found. `m` comes from an earlier cell;
# None is tested by identity (`is not None`), as PEP 8 recommends, rather
# than with `!=`.
if m is not None:
    print(m.group())    # matched text: abc
    print(m.start())    # index where the match starts: 0
    print(m.end())      # index just past the match: 3
    print(m.span())     # (start, end) tuple: (0, 3)

abc
0
3
(0, 3)


### search()方法

In [24]:
import re

# search() scans the string and returns the first match, or None when
# nothing matches.
m = re.search(r'[a-z]+', 'abc123xyz')
print(m)    # <re.Match object; span=(0, 3), match='abc'>
# Test for None by identity (`is not None`) rather than with `!=`.
if m is not None:
    print(m.group())  # matched text: abc
    print(m.start())  # index where the match starts: 0
    print(m.end())    # index just past the match: 3
    print(m.span())   # (start, end) tuple: (0, 3)

<re.Match object; span=(0, 3), match='abc'>
abc
0
3
(0, 3)


### findall()方法

In [25]:
import re

# Collect every non-overlapping lowercase run in the sample text. The pattern
# has no groups, so each entry is the full matched string.
m = [hit.group() for hit in re.finditer(r'[a-z]+', 'abc123xyz')]
print(m)    # ['abc', 'xyz']

['abc', 'xyz']


## 使用正規表達式取代內容

In [26]:
import re

# Replace every run of digits in the sample text with a single asterisk.
result = re.compile(r"\d+").sub("*", "Password:1234,ID:5678")
print(result)    # Password:*,ID:*

Password:*,ID:*


## 範例：正規表達式練習

In [27]:
# Sample HTML fragment used as input for the regular-expression exercise:
# it contains two mailto links, a price followed by "元", two image URLs
# (.jpg and .png) and two phone numbers.
html = """
<div class="content">
    E-Mail：<a href="mailto:mail@test.com.tw">
      mail</a><br>
    E-Mail2：<a href="mailto:mail2@test.com.tw">
      mail2</a><br>
    <ul class="price">定價：360元 </ul>
    <img src="http://test.com.tw/p1.jpg">
    <img src="http://test.com.tw/p2.png">
    電話：(04)-76543210、0937-123456
</div>
"""

In [28]:
import re

# E-mail addresses: local part, "@", then a dotted domain name.
pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+'
emails = re.findall(pattern, html)
for email in emails:
    print(email)

# Price: capture only the digits directly before "元". search() returns None
# when no price is present, so nothing is indexed blindly.
price_match = re.search(r'(\d+)元', html)
price = price_match.group(1) if price_match else None
if price is not None:
    print(price)

# Image URLs: a literal http:// or https:// scheme, the host/path characters,
# then a .jpg or .png extension. The earlier `[http://]+` and `[jpgpng]+`
# were character classes, which match any run of those letters rather than
# the literal scheme or extension.
imglist = re.findall(r'https?://[a-zA-Z0-9\-/.]+\.(?:jpg|png)', html)
for img in imglist:
    print(img)

# Phone numbers: 2-4 digits (optionally in parentheses), a hyphen, then
# 6-8 digits.
phonelist = re.findall(r'\(?\d{2,4}\)?-\d{6,8}', html)
for phone in phonelist:
    print(phone)

mail@test.com.tw
mail2@test.com.tw
360
http://test.com.tw/p1.jpg
http://test.com.tw/p2.png
(04)-76543210
0937-123456
