In [None]:
import os
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.common import exceptions

# Bind the two Selenium exception classes handled later in the notebook
# directly as module-level names.
from selenium.common.exceptions import (
    NoSuchAttributeException,
    NoSuchElementException,
)

In [None]:
# Page that is opened and queried by this cell.
URL = "https://ahis9.aphia.gov.tw/Veter/OD/HLIndex.aspx"

# Configure Edge with a custom User-Agent header.
options = Options()
options.add_argument("user-agent=MyAgent/1.0")

driver = webdriver.Edge(options=options)
driver.get(URL)
# Fixed pause before looking up elements (presumably to let the page finish
# rendering).
time.sleep(2)

try:
    # Locate the captcha image element.
    captcha_img = driver.find_element(
        By.ID, "ctl00_ContentPlaceHolder1_imgValidateCode"
    )

    # get_attribute() returns None (it does not raise) when the attribute is
    # absent, so turn that case into the exception handled below instead of
    # letting None.split() escape as an uncaught AttributeError.
    captcha_url = captcha_img.get_attribute("src")
    if captcha_url is None:
        raise NoSuchAttributeException("驗證碼圖片缺少src屬性")

    # The captcha value is whatever follows the last "=" in the image URL.
    captcha_src = captcha_url.split("=")[-1]
    print(f"驗證碼為: {captcha_src}")

    # Type the captcha value into the text input.
    driver.find_element(
        By.ID, "ctl00_ContentPlaceHolder1_Field_ValidateCode"
    ).send_keys(captcha_src)

    # Click the query button.
    driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_btnSave").click()
except (NoSuchElementException, NoSuchAttributeException) as err:
    # Best-effort: report the failure and let the notebook continue.
    print(err)

# Fixed pause after submitting (presumably to let the result page load).
time.sleep(3)

In [None]:
# Capture the page source, then shut the browser down so the Edge/driver
# process is not leaked. No later code visible in this notebook uses the live
# session; note that re-running this cell alone requires re-creating the driver.
try:
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML with BeautifulSoup's "html.parser" backend.
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Column names of the resulting table; the flat text list below is cut into
# rows of this many fields.
columns = ["name", "license", "license_date", "vet", "tel", "address", "service"]

# Select the div.col-md-12 elements that are direct children of a
# div.col-md-12.col-xs-12 container.
div_tag = soup.select("div.col-md-12.col-xs-12 > div.col-md-12")

# Whitespace-stripped text of every matched div, in document order.
hospital_list = []
for tag in div_tag:
    hospital_list.append(tag.get_text(strip=True))

# Slice the flat list into consecutive runs, one entry per column.
grouped_hospital_list = [
    hospital_list[start:start + len(columns)]
    for start in range(0, len(hospital_list), len(columns))
]

# Build the DataFrame from the grouped rows.
df = pd.DataFrame(data=grouped_hospital_list, columns=columns)
print("DataFrame已建置完成")

# Write the raw table to CSV, creating the target directory when it is missing.
path = "../../data/raw/hospital_data.csv"
os.makedirs(os.path.dirname(path), exist_ok=True)
df.to_csv(path, index=False)
print("原始CSV檔已儲存完畢")


In [None]:
# ETL: drop the text before the full-width colon in each listed column and
# fill in blank cells.
need_revised_columns = ["license", "license_date", "vet", "tel", "address", "service"]

# Keep what follows the FIRST full-width colon. Splitting only once (n=1)
# preserves values that themselves contain "：", which an unlimited split
# followed by [-1] would truncate to the text after the last colon. Cells
# without a colon are kept whole.
for col in need_revised_columns:
    df[col] = df[col].str.split("：", n=1).str[-1].str.strip()

# Turn empty strings into NaN so one fillna covers both blank and missing cells.
df = df.replace({"": np.nan})
df = df.fillna("無此資訊")

# Save the cleaned table, creating the output directory when it is missing.
path = "../../data/processed/hospital_data_ETL.csv"
os.makedirs(os.path.dirname(path), exist_ok=True)
df.to_csv(path, index=False)
print("ETL後的CSV檔已儲存完畢")