In [1]:
# import必要套件
from urllib import parse
import requests as req
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
from datetime import datetime, timedelta
import re
import pandas as pd

In [2]:
# Data source: the Central Weather Bureau one-week forecast page,
#   https://www.cwb.gov.tw/V8/C/W/week.html
#
# Scraping that page directly returns no forecast data, presumably because the
# content is generated dynamically. Inspection showed that the page sends a GET
# request to a second URL whose response is HTML containing the forecast, so
# this notebook calls that second URL itself.
#
# (Checked in the Chrome DevTools console: Network > Fetch/XHR.)


def buildTimeToken(moment):
    """Return the time token placed in the ``t`` query-string parameter.

    The token is ``YYYYMMDDHH-T`` where ``T`` is the tens digit of the minute
    (0-5), e.g. 17:53 on 2022-06-28 gives ``2022062817-5``.

    The tens digit is taken with integer division. Taking the first character
    of ``str(minute)`` is wrong for minutes 0-9 (minute 5 would give "5"
    instead of "0").

    NOTE(review): the token layout is inferred from the request the site was
    observed to send; confirm against the site if the request stops working.

    Args:
        moment: a ``datetime`` whose year/month/day/hour/minute are used.

    Returns:
        The token as a string.
    """
    return (
        f"{moment.year}{moment.month:02}{moment.day:02}{moment.hour:02}"
        f"-{moment.minute // 10}"
    )


# Build the URL from the current time.
now = datetime.now()
nowFormatted = buildTimeToken(now)
print(f"放入URL的時間Query String: {nowFormatted}")
url = f"https://www.cwb.gov.tw/V8/C/W/County/MOD/wf7dayNC_NCSEI/ALL_Week.html?t={nowFormatted}"
print(url)

放入URL的時間Query String: 2022062817-5
https://www.cwb.gov.tw/V8/C/W/County/MOD/wf7dayNC_NCSEI/ALL_Week.html?t=2022062817-5


In [3]:
# Pick a random User-Agent, send the request, check the result, and build the
# soup object.


# Random User-Agent header for the request.
ua = UserAgent()
my_headers = {
    'User-Agent': ua.random
}

# Send the GET request and report the connection status.
# A timeout is set because requests waits indefinitely by default.
response = req.get(url, headers=my_headers, timeout=30)
print(f'網站狀態碼: {response.status_code}')
# Stop here on a 4xx/5xx answer instead of parsing an error page; the later
# cells index straight into the parsed result.
response.raise_for_status()
# print(f'網站編碼　: {response.encoding}')
# print(f'回覆標頭　: {response.headers}')

soup = bs(response.text, 'lxml')

網站狀態碼: 200


In [4]:
# Collect every <tbody> of the response; the list is reused by the cells below.
dataByCounty = soup.select('tbody')
countyTotal = len(dataByCounty)
print(f"dataByCounty串列長度: {countyTotal}\n")  # 22 when this was recorded

# Each entry corresponds to one county/city: show the heading <span> found in
# the header cell of every entry.
for county in dataByCounty:
    headingSpans = county.select('th[headers] span')
    print(headingSpans)

dataByCounty串列長度: 22

[<span class="heading_3">基隆市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">臺北市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">新北市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">桃園市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">新竹市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">新竹縣<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">苗栗縣<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">臺中市<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">彰化縣<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">南投縣<i aria-hidden="true" class="fa fa-plus-square"></i></span>]
[<span class="heading_3">雲林縣<i aria-hidden="true" class="fa fa-plus-square"></i></span>]

In [5]:
# Exploration: inspect the structure of each entry in dataByCounty.
for county in dataByCounty:
    dayRows = county.select('tr.day')      # daytime rows
    nightRows = county.select('tr.night')  # night-time rows
    print("=" * 20)
    cityName = county.select('th[headers] span')[0].get_text()
    print(f"day資料長度:{len(dayRows)}")
    print(f"night資料長度:{len(nightRows)}")

    # Only the first row of each kind is inspected.
    firstDayRow, firstNightRow = dayRows[0], nightRows[0]
    dayTemps = firstDayRow.select('span.tem-C')
    nightTemps = firstNightRow.select('span.tem-C')
    print(f"1個county白天溫度資料筆數: {len(dayTemps)}")
    print(f"1個county晚上溫度資料筆數: {len(nightTemps)}")
    print(cityName)

    # The first <span> inside a td[headers] cell carries the day-part caption.
    for row in (firstDayRow, firstNightRow):
        print(row.select('td[headers] span')[0].get_text())

day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
基隆市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
臺北市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
新北市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
桃園市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
新竹市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
新竹縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
苗栗縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
臺中市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
彰化縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
南投縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
雲林縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
嘉義市
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
嘉義縣
白天
晚上
day資料長度:1
night資料長度:1
1個county白天溫度資料筆數: 7
1個county晚上溫度資料筆數: 7
臺南

In [6]:
# For every county print the daytime caption and temperatures, then the
# night-time caption and temperatures.
for county in dataByCounty:
    dayRows = county.select('tr.day')      # daytime rows
    nightRows = county.select('tr.night')  # night-time rows
    dayLabel = dayRows[0].select('td[headers] span')[0].get_text()
    nightLabel = nightRows[0].select('td[headers] span')[0].get_text()
    cityName = county.select('th[headers] span')[0].get_text()
    print(cityName)

    dayTemps = dayRows[0].select('span.tem-C')
    nightTemps = nightRows[0].select('span.tem-C')

    # Caption first, then one numbered line per forecast day.
    for label, tempCells in ((dayLabel, dayTemps), (nightLabel, nightTemps)):
        print(label)
        for dayNumber, cell in enumerate(tempCells, start=1):
            print(f"day {dayNumber} {cell.get_text()}")

基隆市
白天
day 1 26 - 31
day 2 26 - 30
day 3 26 - 30
day 4 26 - 31
day 5 26 - 30
day 6 25 - 30
day 7 25 - 29
晚上
day 1 26 - 29
day 2 25 - 29
day 3 25 - 28
day 4 26 - 29
day 5 25 - 28
day 6 25 - 28
day 7 25 - 28
臺北市
白天
day 1 25 - 34
day 2 25 - 34
day 3 25 - 34
day 4 25 - 35
day 5 25 - 35
day 6 25 - 34
day 7 25 - 33
晚上
day 1 25 - 30
day 2 25 - 30
day 3 25 - 30
day 4 25 - 30
day 5 25 - 31
day 6 25 - 30
day 7 25 - 30
新北市
白天
day 1 26 - 35
day 2 26 - 34
day 3 26 - 34
day 4 26 - 35
day 5 26 - 35
day 6 26 - 33
day 7 26 - 33
晚上
day 1 26 - 31
day 2 26 - 30
day 3 26 - 30
day 4 26 - 31
day 5 26 - 31
day 6 26 - 30
day 7 26 - 30
桃園市
白天
day 1 25 - 34
day 2 25 - 33
day 3 25 - 33
day 4 25 - 34
day 5 25 - 34
day 6 25 - 32
day 7 25 - 32
晚上
day 1 25 - 30
day 2 25 - 29
day 3 24 - 29
day 4 25 - 30
day 5 25 - 30
day 6 25 - 29
day 7 25 - 29
新竹市
白天
day 1 26 - 32
day 2 26 - 31
day 3 26 - 31
day 4 26 - 31
day 5 26 - 32
day 6 26 - 31
day 7 26 - 31
晚上
day 1 26 - 30
day 2 26 - 29
day 3 26 - 30
day 4 26 - 30
day 5 26 - 3

In [None]:
# Formatting v1 - print the forecast temperatures using a hand-written range.
for county in dataByCounty:
    dayLabel = county.select('tr.day td[headers] span')[0].get_text()
    nightLabel = county.select('tr.night td[headers] span')[0].get_text()
    cityName = county.select('th[headers] span')[0].get_text()
    print(cityName)
    print(f"day #     {dayLabel:8}{nightLabel:8}")

    # Run each CSS query once per county rather than once per printed day;
    # the result does not change between iterations.
    dayTempAll = county.select('tr.day td[headers] span.tem-C')
    nightTempAll = county.select('tr.night td[headers] span.tem-C')

    # 7 is typed in by hand here: the table held 7 forecast days per county
    # when it was explored.
    for i in range(7):
        dayTemp = dayTempAll[i].get_text()
        nightTemp = nightTempAll[i].get_text()
        print(f"day {i+1}     {dayTemp:10}{nightTemp:10}")

In [1]:
# Formatting v2 - size the loop from the scraped daytime cells instead of a
# fixed count.
# NOTE(review): the recorded NameError for this cell comes from running it
# before the cell that defines dataByCounty; run that cell first.
for county in dataByCounty:
    dayLabel = county.select('tr.day td[headers] span')[0].get_text()
    nightLabel = county.select('tr.night td[headers] span')[0].get_text()
    cityName = county.select('th[headers] span')[0].get_text()
    print(cityName)
    print(f"day #     {dayLabel:8}{nightLabel:8}")

    dayCells = county.select('tr.day td[headers] span.tem-C')
    nightCells = county.select('tr.night td[headers] span.tem-C')
    # The night cell is looked up by the same position as the day cell.
    for position, dayCell in enumerate(dayCells):
        dayTemp = dayCell.get_text()
        nightTemp = nightCells[position].get_text()
        print(f"day {position+1}     {dayTemp:10}{nightTemp:10}")

NameError: name 'dataByCounty' is not defined

In [8]:
# Removing the whitespace between temperatures, step 1: print the repr() of
# every temperature text to see which character the "blank" really is.
for county in dataByCounty:
    dayLabel = county.select('tr.day td[headers] span')[0].get_text()
    nightLabel = county.select('tr.night td[headers] span')[0].get_text()
    cityName = county.select('th[headers] span')[0].get_text()
    print(cityName)
    print(f"day #     {dayLabel:8}{nightLabel:8}")

    dayCells = county.select('tr.day td[headers] span.tem-C')
    nightCells = county.select('tr.night td[headers] span.tem-C')
    for position, dayCell in enumerate(dayCells):
        dayText = dayCell.get_text()
        nightText = nightCells[position].get_text()
        # !r applies repr() before the width is applied.
        print(f"day {position+1}     {dayText!r:10}{nightText!r:10}")

基隆市
day #     白天      晚上      
day 1     '26\u2002-\u200231''26\u2002-\u200229'
day 2     '26\u2002-\u200231''26\u2002-\u200229'
day 3     '26\u2002-\u200231''25\u2002-\u200229'
day 4     '26\u2002-\u200231''25\u2002-\u200229'
day 5     '26\u2002-\u200230''25\u2002-\u200228'
day 6     '25\u2002-\u200230''25\u2002-\u200228'
day 7     '25\u2002-\u200230''25\u2002-\u200228'
臺北市
day #     白天      晚上      
day 1     '25\u2002-\u200234''25\u2002-\u200230'
day 2     '25\u2002-\u200234''25\u2002-\u200230'
day 3     '25\u2002-\u200234''25\u2002-\u200230'
day 4     '25\u2002-\u200235''25\u2002-\u200231'
day 5     '25\u2002-\u200234''25\u2002-\u200230'
day 6     '25\u2002-\u200234''25\u2002-\u200230'
day 7     '25\u2002-\u200234''25\u2002-\u200230'
新北市
day #     白天      晚上      
day 1     '26\u2002-\u200234''26\u2002-\u200230'
day 2     '26\u2002-\u200234''26\u2002-\u200230'
day 3     '26\u2002-\u200235''26\u2002-\u200231'
day 4     '26\u2002-\u200235''26\u2002-\u200231'
day 5     '26\u2002-\u200

In [None]:
# Removing the whitespace between temperatures, step 2.
# The blank could not be removed as an ordinary space; repr() showed that it
# is the EN SPACE character, \u2002, so that character is what gets stripped.
enSpace = re.compile(r'\u2002')

for county in dataByCounty:
    dayLabel = county.select('tr.day td[headers] span')[0].get_text()
    nightLabel = county.select('tr.night td[headers] span')[0].get_text()
    cityName = county.select('th[headers] span')[0].get_text()
    print(cityName)
    print(f"day #     {dayLabel:8}{nightLabel:8}")

    dayCells = county.select('tr.day td[headers] span.tem-C')
    nightCells = county.select('tr.night td[headers] span.tem-C')
    for position, dayCell in enumerate(dayCells):
        dayTemp = enSpace.sub(r'', dayCell.get_text())
        nightTemp = enSpace.sub(r'', nightCells[position].get_text())
        print(f"day {position+1}     {dayTemp:10}{nightTemp:10}")

In [None]:
# Source and structure of the date labels intended for the CSV header:
# every header cell of the top row except the ones with id County or time.
dateSource = soup.select('tr.table_top th:not(#County, #time)')
print(f"dateSource資料長度: {len(dateSource)}")
print()
print(dateSource)

# Digits, a slash, then exactly two digits - the month/day part of the cell text.
reDate = r'([\d]+\/[\d]{2})'
followingWeek = []
followingWeek.extend(
    re.search(reDate, headerCell.get_text())[0] for headerCell in dateSource
)

print(followingWeek)
    



In [None]:
# Weather-data dictionary, "dates taken from the scraped data" variant (draft).
# Only the day-part captions are read here, from the first county's rows.
firstCounty = dataByCounty[0]
dayLabel = firstCounty.select('tr.day td[headers] span')[0].get_text()
nightLabel = firstCounty.select('tr.night td[headers] span')[0].get_text()

# The dictionary-building code that followed was entirely commented out and
# duplicated the "auto-generated dates" cell below, so it is not repeated here.

In [None]:
# Weather-data dictionary, "auto-generated dates" variant: the date of each
# row is computed from `now` rather than read from the table header.
firstCounty = dataByCounty[0]
dayLabel = firstCounty.select('tr.day td[headers] span')[0].get_text()
nightLabel = firstCounty.select('tr.night td[headers] span')[0].get_text()

# One list per column; the two temperature columns are keyed by the captions
# scraped above.
weatherData = {'city_name': [], 'date': [], dayLabel: [], nightLabel: []}

print(f"weatherData字典結構：{weatherData}")

for county in dataByCounty:
    dayTempAll = county.select('tr.day td[headers] span.tem-C')
    nightTempAll = county.select('tr.night td[headers] span.tem-C')
    cityName = county.select('th[headers] span')[0].get_text()

    # One row per forecast day; dayOffset runs 1..7.
    for dayOffset in range(1, 8):
        # Strip the EN SPACE characters around the dash.
        dayTemp = re.sub(r'\u2002', r'', dayTempAll[dayOffset - 1].get_text())
        nightTemp = re.sub(r'\u2002', r'', nightTempAll[dayOffset - 1].get_text())
        # NOTE(review): the first row is dated now + 1 day; confirm that the
        # first table column really is tomorrow rather than today.
        forecastDate = (now + timedelta(days=dayOffset)).strftime('%m/%d')
        weatherData['city_name'].append(cityName)
        weatherData['date'].append(forecastDate)
        weatherData[dayLabel].append(dayTemp)
        weatherData[nightLabel].append(nightTemp)

print(weatherData)

In [6]:
# Extract the weather data into a dictionary of columns (city/county, day
# part, then one column per forecast date), ready for a DataFrame / CSV export.

dataByArea = soup.select('tbody')  # one <tbody> per county/city (22 when recorded)
rawDate = soup.select('tr.table_top th:not(#County, #time)')  # date header cells (7 when recorded)
reDate = r'([\d]+\/[\d]{2})'  # month/day part of a header cell, e.g. "06/28"
dateKeys = []
dayPartLabel = []

dayPartLabel.append(dataByArea[0].select('tr.day td[headers] span')[0].get_text())  # daytime caption
dayPartLabel.append(dataByArea[0].select('tr.night td[headers] span')[0].get_text())  # night-time caption
print(dayPartLabel)  # check the caption list

# ==== build the dictionary skeleton ====
weeklyTempData = {}
weeklyTempData['cityCounty'] = []
weeklyTempData['dayPart'] = []
# ==== build the dictionary skeleton end ====

# One column per forecast date, in header order.
for rawDay in rawDate:
    day = re.search(reDate, rawDay.get_text())[0]
    weeklyTempData[day] = []
    dateKeys.append(day)

# ==== checks ====
print(dateKeys)
print(weeklyTempData)
# ==== checks end ====

# Iterate the list selected in this cell, so the cell does not depend on
# `dataByCounty` having been created by an earlier exploration cell.
for county in dataByArea:
    cityName = county.select('th[headers] span')[0].get_text()
    dayTempAll = county.select('tr.day td[headers] span.tem-C')
    nightTempAll = county.select('tr.night td[headers] span.tem-C')

    # Each county contributes two rows: daytime first, then night-time.
    for i in range(2):
        weeklyTempData['cityCounty'].append(cityName)
        weeklyTempData['dayPart'].append(dayPartLabel[i])

    # Daytime values go into the date columns first ...
    for i in range(len(dayTempAll)):
        dayTemp = re.sub(r'\u2002', r'', dayTempAll[i].get_text())  # drop EN SPACE
        weeklyTempData[dateKeys[i]].append(dayTemp)

    # ... then the night-time values, matching the row order above.
    for i in range(len(nightTempAll)):
        nightTemp = re.sub(r'\u2002', r'', nightTempAll[i].get_text())  # drop EN SPACE
        weeklyTempData[dateKeys[i]].append(nightTemp)

# print(weeklyTempData)

['白天', '晚上']
['06/28', '06/29', '06/30', '07/01', '07/02', '07/03', '07/04']
{'cityCounty': [], 'dayPart': [], '06/28': [], '06/29': [], '06/30': [], '07/01': [], '07/02': [], '07/03': [], '07/04': []}


In [8]:
# Extract the weather data into a dictionary of columns (city/county, day
# part, then one column per forecast date) - improved version that fills day
# and night values in a single pass.

dataByArea = soup.select('tbody')  # one <tbody> per county/city (22 when recorded)
rawDate = soup.select('tr.table_top th:not(#County, #time)')  # date header cells (7 when recorded)
reDate = r'([\d]+\/[\d]{2})'  # month/day part of a header cell, e.g. "06/28"
dateKeys = []
dayPartLabel = []

dayPartLabel.append(dataByArea[0].select('tr.day td[headers] span')[0].get_text())  # daytime caption
dayPartLabel.append(dataByArea[0].select('tr.night td[headers] span')[0].get_text())  # night-time caption
print(dayPartLabel)  # check the caption list

# ==== build the dictionary skeleton ====
weeklyTempData = {}
weeklyTempData['cityCounty'] = []
weeklyTempData['dayPart'] = []
# ==== build the dictionary skeleton end ====

# One column per forecast date, in header order.
for rawDay in rawDate:
    day = re.search(reDate, rawDay.get_text())[0]
    weeklyTempData[day] = []
    dateKeys.append(day)

# ==== checks ====
print(dateKeys)
print(weeklyTempData)
# ==== checks end ====

# Iterate the list selected in this cell, so the cell does not depend on
# `dataByCounty` having been created by an earlier exploration cell.
for county in dataByArea:
    cityName = county.select('th[headers] span')[0].get_text()
    dayTempAll = county.select('tr.day td[headers] span.tem-C')
    nightTempAll = county.select('tr.night td[headers] span.tem-C')

    # Each county contributes two rows: daytime first, then night-time.
    for i in range(2):
        weeklyTempData['cityCounty'].append(cityName)
        weeklyTempData['dayPart'].append(dayPartLabel[i])

    # For every date column append the daytime value, then the night-time
    # value, matching the row order above.
    for i in range(len(dayTempAll)):
        dayTemp = re.sub(r'\u2002', r'', dayTempAll[i].get_text())  # drop EN SPACE
        nightTemp = re.sub(r'\u2002', r'', nightTempAll[i].get_text())  # drop EN SPACE
        weeklyTempData[dateKeys[i]].append(dayTemp)
        weeklyTempData[dateKeys[i]].append(nightTemp)

# print(weeklyTempData)

['白天', '晚上']
['06/27', '06/28', '06/29', '06/30', '07/01', '07/02', '07/03']
{'cityCounty': [], 'dayPart': [], '06/27': [], '06/28': [], '06/29': [], '06/30': [], '07/01': [], '07/02': [], '07/03': []}


In [None]:
# Turn the weather dictionary into a pandas DataFrame and write it to a CSV
# file (relative path). With the to_csv defaults the DataFrame index is
# written as the first column.
weeklyTempData_df = pd.DataFrame(data=weeklyTempData)
weeklyTempData_df.to_csv(path_or_buf='weeklyTempData.csv')

In [13]:
# Write the weather data to an Excel file.
from openpyxl import Workbook, load_workbook

# Create a new workbook in memory; nothing is written to disk until wb.save().
wb = Workbook()
ws = wb.active
ws.title = '一周天氣預報'

# ==== fill the sheet: one column per dictionary key ====
# Row 1 holds the key as the column title, the values follow from row 2 down.
# Cells are addressed by (row, column) number, so the sheet is not limited to
# a fixed list of column letters.
for columnNumber, (key, values) in enumerate(weeklyTempData.items(), start=1):
    ws.cell(row=1, column=columnNumber, value=key)
    for rowNumber, item in enumerate(values, start=2):
        ws.cell(row=rowNumber, column=columnNumber, value=item)
# ==== fill the sheet end ====


# Relative path: the file is written to the current working directory.
wb.save('weeklyTempExcel.xlsx')



#### 參考選擇器寫法

用標籤屬性值選擇: 
https://stackoverflow.com/questions/24984398/how-to-select-tags-by-attribute-value-with-beautiful-soup
https://stackoverflow.com/questions/71905498/getting-an-attribute-from-a-tag-with-beautiful-soup

動態抓取目前時間：
https://stackoverflow.com/questions/30071886/how-to-get-current-time-in-python-and-break-up-into-year-month-day-hour-minu

f-string formatting:
https://saralgyaan.com/posts/f-string-in-python-usage-guide/#How_to_pad_string_with_zero_in_Python_f-strings_(0-Padding)?

縣市code:
https://rpubs.com/lily41628/taiwancountyelection2018

repr()的用法:
https://ithelp.ithome.com.tw/articles/10194593

日期格式化(strftime)、計算時間(timedelta):
https://stackoverflow.com/questions/65111847/how-convert-current-date-to-a-dd-mm-yyyy
https://stackoverflow.com/questions/20573459/getting-the-date-of-7-days-ago-from-current-date-in-python

BS selector選擇『沒有』特定屬性標籤的寫法
https://stackoverflow.com/questions/69639066/is-there-a-way-to-find-tags-in-beautifulsoup-that-do-not-contain-a-specific-clas