In [17]:
import os
import urllib
import datetime
import pandas as pd
import numpy as np

In [18]:
# Notebook configuration: local paths, download period and the station-list page.
# '__file__' is a quoted string here (not the module attribute), so abspath()
# resolves it against the current working directory and BATH_PATH ends up
# being that directory.
BATH_PATH = os.path.dirname(os.path.abspath('__file__'))
ETC_PATH = os.path.join(BATH_PATH, "data")

# Download period. END is the day after 2022-03-01; the download loop passes it
# as `end=` to pd.date_range, which includes both endpoints.
START = datetime.date(2020, 6, 1)
END = datetime.date(2022, 3, 2)

# Page holding the station metadata table.
webpage = "https://e-service.cwb.gov.tw/wdps/obs/state.htm"

## CWB Meta

In [19]:
# %% Read webpage
# pd.read_html returns one DataFrame per <table> found on the page; the first
# one is used (per the original note, another table is for removed stations).
read_html = pd.read_html(webpage)
html_table = read_html[0]

# Keep station number, station name, altitude (meter), city, longitude,
# latitude, station address and data start date; parse the start date as a
# datetime and drop every row that has a missing value in any kept column.
html_df = (
    html_table
    .loc[:, ["站號", "站名", "海拔高度(m)", "城市", "經度", "緯度", "地址", "資料起始日期"]]
    .assign(**{"資料起始日期": lambda frame: pd.to_datetime(frame["資料起始日期"])})
    .dropna()
)

# %% Save as csv file
# html_df.to_csv(os.path.join(ETC_PATH, "cwb_meta.csv"))
html_df.head(3)

Unnamed: 0,站號,站名,海拔高度(m),城市,經度,緯度,地址,資料起始日期
0,466850,五分山雷達站,756.0,新北市,121.781205,25.071182,瑞芳區靜安路四段1巷1號,1988-07-01
1,466880,板橋,9.7,新北市,121.442017,24.997647,板橋區大觀路二段265巷62號,2002-01-01
2,466900,淡水,19.0,新北市,121.448906,25.164889,淡水區中正東路42巷6號,1942-10-01


## CWB Downloader 

In [20]:
def data_downloader(stn, st_name, date, altitude):
    """Download one station-day of observations from the CWB history query page.

    Parameters
    ----------
    stn : station number, placed in the ``station`` query parameter.
    st_name : station name; percent-encoded twice before being placed in
        ``stname``.
    date : value for the ``datepicker`` query parameter; it is also written
        unchanged into the ``Date`` column of the result.
    altitude : value for the ``altitude`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The second table on the result page, with flattened column names and
        a leading ``Date`` column.

    Raises
    ------
    ValueError
        If the page contains fewer than two tables (pandas itself raises
        ValueError when it finds no table at all).
    """
    # `import urllib` alone does not load the `parse` submodule; import it
    # explicitly so this function does not depend on another package having
    # imported it first.
    from urllib.parse import quote

    # The name is quoted twice, as in the original code; presumably the
    # endpoint expects a double-encoded value -- TODO confirm.
    st_name = quote(quote(st_name))
    url = f"https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station={stn}&stname={st_name}&datepicker={date}&altitude={altitude}"

    tables = pd.read_html(url)
    if len(tables) < 2:
        raise ValueError(
            f"Expected at least 2 tables for station {stn} on {date}, found {len(tables)}"
        )
    data = tables[1]

    # Each column label is a tuple (assumes a header with at least three
    # levels); keep only its third element as the flat column name.
    data.columns = [label[2] for label in data.columns]
    data.insert(loc=0, column="Date", value=date)
    return data

In [21]:
# Keep only stations whose number starts with "46" (per the original note,
# these are the stations that have radiation data) ...
is_46_station = html_df['站號'].str.contains("^46")
html_df = html_df.loc[is_46_station].reset_index(drop=True)

# ... and whose data start date is on or before START.
started_by_start = html_df['資料起始日期'] <= pd.to_datetime(START)
html_df = html_df.loc[started_by_start].reset_index(drop=True)
html_df.head(3)

Unnamed: 0,站號,站名,海拔高度(m),城市,經度,緯度,地址,資料起始日期
0,466850,五分山雷達站,756.0,新北市,121.781205,25.071182,瑞芳區靜安路四段1巷1號,1988-07-01
1,466880,板橋,9.7,新北市,121.442017,24.997647,板橋區大觀路二段265巷62號,2002-01-01
2,466900,淡水,19.0,新北市,121.448906,25.164889,淡水區中正東路42巷6號,1942-10-01


In [22]:
# For every station left in html_df, download each day from START to END
# (both included) and write one CSV per station.

# to_csv does not create missing directories, so make sure the target exists
# before any download starts instead of failing after a full station's worth
# of requests.
output_dir = os.path.join(ETC_PATH, "observation(cwb)")
os.makedirs(output_dir, exist_ok=True)

# The list of days is the same for every station, so build it once.
delta = pd.date_range(start=START, end=END).tolist()

for _idx, row in html_df.iterrows():
    daily_frames = [
        data_downloader(row["站號"], row["站名"], str(date.date()), row["海拔高度(m)"])
        for date in delta
    ]
    data = pd.concat(daily_frames).sort_values(['Date', 'ObsTime']).reset_index(drop=True)

    # Every insert targets position 1, so the resulting column order is
    # Date, City, Lon, Lat, Station, followed by the observation columns.
    data.insert(loc=1, column="Station", value=row["站號"])
    data.insert(loc=1, column="Lat", value=row["緯度"])
    data.insert(loc=1, column="Lon", value=row["經度"])
    data.insert(loc=1, column="City", value=row["城市"])

    data.to_csv(os.path.join(output_dir, f'{row["站號"]}.csv'), index=False)