**0. urlparse** 解析URL


In [0]:
from urllib.parse import urlparse # 匯入urllib套件的urlparse函式

url = 'http://taqm.epa.gov.tw:80/pm25/tw/PM25A.aspx?area=1'
result = urlparse(url) # 以result回傳ParseResult物件

print(result)
print("==============================================================")

print("netloc = ",result.netloc) # 分辨網址的位置
print("path = ", result.path) # 分辨網址的路徑

ParseResult(scheme='http', netloc='taqm.epa.gov.tw:80', path='/pm25/tw/PM25A.aspx', params='', query='area=1', fragment='')
netloc =  taqm.epa.gov.tw:80
path =  /pm25/tw/PM25A.aspx


**urlparse()**，會將urlstring以tuple回傳(scheme, netloc, path, parameters, query, fragment)
可以確定使用的協議文本、網域、路徑等

**1. requests**

In [0]:
import requests

url = 'http://www.lib.pu.edu.tw/history/hours.html'
html = requests.get(url)
html.encoding = 'utf-8'

list = html.text.splitlines()                     # html.text，輸出網頁的原始碼；splitlines()，字串按照\r,\r\n,\n進行分隔
print("**********", list)

for row in list:
    print("=====")
    print(row)

**requests.get()**，可要求下載網站內容

**2. re**

In [0]:
import re                         # 用於正規表示式

#　re.match(pattern, string, flags=0)

# 從頭比對，不match re.match
print("----- match -----")
m = re.match('[a-z]+','12tem12po') # None，在起始位置尋找match的組別，
                                   #如果起始位置沒找到，會顯示None
print(m)

# 從頭找起，直到第一組吻合 re.search
print("----- search -----")
m = re.search('[a-z]+','12tem12po')
print(m) # <re.Match object; span=(2, 5), match='tem'>
print(m.span()) # 找出第一組match的第一個以及最後一個index
print(m.start()) # 找出第一組match的第一個index
print(m.end()) # 找出第一組match的最後一個index

# 從頭找起，直到所有吻合的 re.finall
print("----- findall -----")
m = re.findall('[a-z]+','12tem12po') # 找出所有match的
print(m) 

----- match -----
None
----- search -----
<_sre.SRE_Match object; span=(2, 5), match='tem'>
(2, 5)
2
5
----- findall -----
['tem', 'po']


**3. Beautiful Soup_1**

In [2]:
import requests
from bs4 import BeautifulSoup
url = 'http://www.taiwanlottery.com.tw/'
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')

print("\n---- (1) ----")
data1 = sp.select("title") # 讀取title內容
print(data1)

print("\n---- (2) ----")
data1 = sp.select("#rightdown") # "#id":指定id尋找
print(data1)

print("\n---- (3) ----")
data1 = sp.select(".contents_box04") # ".classname":指定的class去尋找
print(data1)

print("\n---- (4) ----")
data1 = sp.select("html head title") # 以tag逐層尋找
print(data1)

print("\n---- (5) ----")
data1 = sp.select("html head") # 以tag逐層尋找head
print(data1)


---- (1) ----
[<title>
	台灣彩券 taiwanlottery
</title>]

---- (2) ----
[<div id="rightdown">
<!--***************BINGO BINGO**************-->
<div class="contents_box01">
<div id="contents_logo_01"> </div><div class="contents_mine_tx01"><span class="font_black15">109/4/29 第109024197期 </span> <span class="font_red14"><a href="/Lotto/BingoBingo/history.aspx">開獎結果</a> ｜ <a href="/Lotto/BingoBingo/drawing.aspx">各期獎號查詢</a><a href="/lotto/BingoBingo/OEHLStatistic.htm"><div id="contents_logo_01-2"></div></a></span></div><div class="contents_mine_tx04">開出獎號</div><div class="ball_box01"><div class="ball_tx ball_yellow">08 </div><div class="ball_tx ball_yellow">13 </div><div class="ball_tx ball_yellow">15 </div><div class="ball_tx ball_yellow">17 </div><div class="ball_tx ball_yellow">19 </div><div class="ball_tx ball_yellow">20 </div><div class="ball_tx ball_yellow">22 </div><div class="ball_tx ball_yellow">23 </div><div class="ball_tx ball_yellow">26 </div><div class="ball_tx ball_yellow">27 </

**4. Beautiful Soup_2**

In [0]:
html_doc = """
<html>
    <head>
    <title>
        網頁標題
    </title>
    </head>

    <p class = "title"><b>
        文件標題
    </b></p>

    <p class = "story">
        Once upon a time there were three little sisters;
        and their names were
    <a href = "http://example.com/elsie"
        class = "sister" id = "link1"> Elsie</a>,
    <a href = "http://example.com/lacie"
        class = "sister" id = "link2"> Lacie</a> and
    <a href = "http://example.com/tilllie"
        class = "sister" id ="link3"> Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class = "story">...</p>
</html>
"""
from bs4 import BeautifulSoup
sp = BeautifulSoup(html_doc,'html.parser')

print('-----(1)-----') # 尋找第一個<b> tag
b = sp.find('b')
print(b)

print('-----(2)-----') # 尋找所有<a> tag
a = sp.find_all('a')
print(a)

print('-----(3)-----') # 尋找第一個<a> tag 且擁有id==link1
data1 = sp.find('a',{'id':'link1'})
print(data1.text)

print('-----(4)-----') # 尋找所有<a> tag 且 class == sister
print(sp.find_all('a',{'class':'sister'}))

print('-----(5)-----') # 尋找第一個 <a> tag 且 href = "http://example.com/lacie"
data1 = sp.find('a',{'href':'http://example.com/lacie'})
print(data1.text)

print('-----(6)-----') # 尋找 id == link3的區塊
data3 = sp.select('#link3')
print(data3[0].text)

print('-----(7)-----') # 找出所有標題以及 <a> tag
print(sp.find_all(['title','a']))

print('-----(8)----')
data1 = sp.find('a',{'id':'link1'})
print(data1.get('href'))

-----(1)-----
<b>
        文件標題
    </b>
-----(2)-----
[<a class="sister" href="http://example.com/elsie" id="link1"> Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2"> Lacie</a>, <a class="sister" href="http://example.com/tilllie" id="link3"> Tillie</a>]
-----(3)-----
 Elsie
-----(4)-----
[<a class="sister" href="http://example.com/elsie" id="link1"> Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2"> Lacie</a>, <a class="sister" href="http://example.com/tilllie" id="link3"> Tillie</a>]
-----(5)-----
 Lacie
-----(6)-----
 Tillie
-----(7)-----
[<title>
        網頁標題
    </title>, <a class="sister" href="http://example.com/elsie" id="link1"> Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2"> Lacie</a>, <a class="sister" href="http://example.com/tilllie" id="link3"> Tillie</a>]
-----(8)----
http://example.com/elsie


**5. Beautiful Soup_3**

In [0]:
import requests
from bs4 import BeautifulSoup

# url = "http://www.taiwanlottry.com.tw/"
url = "http://www.taiwanlottery.com.tw/index_new.aspx"
html = requests.get(url)
sp = BeautifulSoup(html.text, 'html.parser')

# 逐漸縮小範圍: data1 -> data2 -> data3
print("\n----------(1)----------") # return a list
data1 = sp.select("#rightdown")
print(data1)
print()

print("\m----------(2)----------") # return a string
data2 = data1[0].find('div',{'class':'contents_box02'})
print(data2)
print()

print("\m----------(3)----------") # return a list
data3 = data2.find_all('div',{'class':'ball_tx'})
print(data3)
print()

print("\n-----------------------")
print("\n----------Why?---------") # 為何要逐漸縮小範圍
data = sp.find_all('div',{'class':'ball_tx'})
print(data)


----------(1)----------
[<div id="rightdown">
<!--***************BINGO BINGO**************-->
<div class="contents_box01">
<div id="contents_logo_01"> </div><div class="contents_mine_tx01"><span class="font_black15">109/4/28 第109024118期 </span> <span class="font_red14"><a href="/Lotto/BingoBingo/history.aspx">開獎結果</a> ｜ <a href="/Lotto/BingoBingo/drawing.aspx">各期獎號查詢</a><a href="/lotto/BingoBingo/OEHLStatistic.htm"><div id="contents_logo_01-2"></div></a></span></div><div class="contents_mine_tx04">開出獎號</div><div class="ball_box01"><div class="ball_tx ball_yellow">03 </div><div class="ball_tx ball_yellow">09 </div><div class="ball_tx ball_yellow">10 </div><div class="ball_tx ball_yellow">12 </div><div class="ball_tx ball_yellow">15 </div><div class="ball_tx ball_yellow">25 </div><div class="ball_tx ball_yellow">28 </div><div class="ball_tx ball_yellow">32 </div><div class="ball_tx ball_yellow">33 </div><div class="ball_tx ball_yellow">35 </div><div class="ball_tx ball_yellow">42 </div>

**6. Beautiful Soup_**4 

In [3]:
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# input輸入當天日期
date = input("輸入兌獎日期(yyyymmdd):") 
# 系統時間
now = datetime.now() 
Y = now.year
M = now.month
D = now.day

# 將input分別儲存 y,m,d in int型態 對應年、月、日
y = int(date[0:4])
m = int(date[4:6])
d = int(date[6:8])

 # 抓取統一發票中獎號碼單網頁
url = "https://www.etax.nat.gov.tw/etw-main/web/ETW183W1/"
html = requests.get(url)
html.encoding = 'utf-8'
sp = BeautifulSoup(html.text, 'html.parser')

# 根據輸入日期抓取其他網頁連結，要判斷抓什麼時間的
y = y - 1911
if m == 3:
  if d < 26: # 如果日期小於26(公布名單當日)
    m = 11
    y -= 1
  else:
    m -= 2
elif m < 3:
  m = 11
  y -= 1
else:
  if d < 26: # 如果日期小於26(公布名單當日)
    if m % 2 == 0:
      m -= 3
    else:
      m -= 2
  else:
    if m % 2 == 0:
      m -= 3
    else:
      m -= 2
print("========== %d 年 %d ~ %d 月 統一發票中獎號碼 =========" % (y,m,(m+1)))
print("=====================================================")
# 兌獎，要配合日期判斷要抓取的網頁
links = sp.find_all(["a"]) # 先抓取所有 <a> tag，來判斷超連結
#for link in links:
#    href = link.get("href") # 讀取href屬性內容
#    if href != None and href.startswith("/etw-main/web/ETW183W2_"):
#        print(href)
if m < 10:
  m = "0"+str(m)
m = str(m)
url = "https://www.etax.nat.gov.tw/etw-main/web/ETW183W2_"+str(y)+m # 獲取要兌獎的月份網頁
html = requests.get(url)
html.encoding = "utf-8"
sp = BeautifulSoup(html.text, 'html.parser')

data = sp.find_all("td",{"class":"number"})
print("特別獎: ",data[0].text)
print("特獎: ",data[1].text)
print("頭獎: ",data[2].text)
print("增開六獎:  ",data[3].text)
print("=====================================================")

# 作一個list，去存取所有可能的兌獎號碼
list1 = []
for i in range(len(data)):
  list2 = []
  string1 = data[i].text.strip()
  if " " in string1:
    for j in range(6):
      list3 = string1.split(" ")
      for k in range(3):
        string2 = list3[k]
        list3[k] = string2[j:8]
      list1.append(list3)
  elif "、" in string1:
    list4 = string1.split("、")
    for i in range(len(list4)):
      list1[-1].append(list4[i])
  elif i == len(data)-1:
    list2.append(string1)
    list1[-1].append(list2[0])
  else:
    list2.append(string1)
    list1.append(list2)
print("=====================================================")
print("特別獎 : ",list1[0])
print("特獎 : ",list1[1])
print("頭獎 : ",list1[2])
print("二獎 : ",list1[3])
print("三獎 : ",list1[4])
print("四獎 : ",list1[5])
print("五獎 : ",list1[6])
print("六獎 : ",list1[7])
print("=====================================================")

while True:
  num = str(input("Number:(-1 for exit)"))
  if num == "-1":
    break
  
  if len(num) < 3:
    print("沒中獎")
  elif num in list1[0]:
    print("特別獎")
  elif num in list1[1]:
    print("特獎")
  elif num in list1[2]:
    print("頭獎")
  elif num in list1[3] or (len(num) > 7 and num[-7:] in list1[3]):
    print("二獎")
  elif num in list1[4] or (len(num) > 6 and num[-6:] in list1[4]):
    print("三獎")
  elif num in list1[5] or (len(num) > 5 and num[-5:] in list1[5]):
    print("四獎")
  elif num in list1[6] or (len(num) > 4 and num[-4:] in list1[6]):
    print("五獎")
  elif num in list1[7] or (len(num) > 3 and num[-3:] in list1[7]):
    print("六獎")
  else:
    print("沒中獎")

輸入兌獎日期(yyyymmdd):20200123
特別獎:   59647042 
特獎:   01260528 
頭獎:   01616970 69921388 53451508  
增開六獎:    710、585、633 
特別獎 :  ['59647042']
特獎 :  ['01260528']
頭獎 :  ['01616970', '69921388', '53451508']
二獎 :  ['1616970', '9921388', '3451508']
三獎 :  ['616970', '921388', '451508']
四獎 :  ['16970', '21388', '51508']
五獎 :  ['6970', '1388', '1508']
六獎 :  ['970', '388', '508', '710', '585', '633']
Number:(-1 for exit)263
沒中獎


KeyboardInterrupt: ignored

**7. md5**

In [0]:
import hashlib, os, requests

# 失效的網址
url = "http://opendata.epa.gov.tw/webapi/Data/ATM00766/?$orderby=SiteId%20desc&$skip=0&$top=1000&format=json"

# 讀取網頁原始碼
html = requests.get(url).text.encode('utf-8')

# 判斷網頁是否更新
new_md5 = hashlib.md5(html).hexdigest()
if os.path.exists('old_md5.txt'):
    with open('old_md5.txt','r') as f:
        old_md5 = f.read()
    with open('old_md5.txt','w') as f:
        f.write(old_md5)
else:
    with open('old_md5.txt','w') as f:
        f.write(new_md5)
    old_md5 = ""

print(new_md5)

if new_md5 != old_md5:
    print('資料已更新，分析轉換新網頁…')
    # 分析轉換新網頁
else:
    print('資料未更新，從資料庫讀取…')
    # 從資料庫讀取