In [1]:
import pandas as pd
import os
from datetime import date, timedelta, datetime
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
def try_parsing_date(text):
    """Parse a date string using the first matching format from a fixed list.

    The formats are tried in this order: ``%Y-%m-%d``, ``%d/%m/%Y``,
    ``%m-%d-%Y``, ``%Y/%m/%d`` and ``%Y-%m-%d %H:%M:%S``.

    Args:
        text: The date string to parse.

    Returns:
        The ``datetime`` produced by the first format that matches ``text``.

    Raises:
        ValueError: If ``text`` matches none of the supported formats.
    """
    for fmt in ('%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y', '%Y/%m/%d', "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # This format does not fit; fall through to the next candidate
            # instead of aborting on the first mismatch.
            continue
    raise ValueError(f"no supported date format matches {text!r}")

In [3]:
# City lookup (CSV file-name stem -> Chinese city name), the raw column
# headers, and an empty master table created with those headers.
city_dict = dict(
    NewTaipei="新北市",
    Taipei="臺北市",
    Taoyuan="桃園市",
    Taichung="臺中市",
    Tainan="臺南市",
    Kaohsiung="高雄市",
)

columns = [
    'AreaID',
    'AreaName',
    'fld01',
    'fld02',
    'fld03',
    'fld05',
    'fld06',
    'fld04',
    'fld08',
    'fld07',
    'fld10',
    'animal',
    'date',
    'city',
    'update_date',
]
df_main = pd.DataFrame(columns=columns)

# Read each city's CSV, tag its rows with the city's Chinese name, and merge
# everything into the master table.
# The folder is identical for every city, so it is assigned once, outside the
# loop (it is also reused later when the result is saved).
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\example_data\pet_regis"

city_frames = []
for city, city_name in city_dict.items():
    file = f"{city}.csv"
    file_path = os.path.join(folder, file)

    df = pd.read_csv(file_path)
    df["city"] = city_name

    city_frames.append(df)

# Concatenate once instead of re-copying the growing master table on every
# iteration. df_main goes first so its column order is preserved.
df_main = pd.concat([df_main, *city_frames], ignore_index=True)

# All cities are merged. Replace the raw headers with readable names.
# The assignment is positional, so df_main must have exactly these 15 columns
# in the same order as the raw header list it was created with.
columns = [
    "area_id",
    "district",
    "登記單位數",
    "regis_count",
    "removal_count",
    "轉讓數",
    "變更數",
    "絕育數",
    "絕育除戶數",
    "免絕育數",
    "免絕育除戶數",
    "animal",
    "date",
    "city",
    "update_date"
]

df_main.columns = columns

# Drop the columns that are not needed downstream.
df_main = df_main.drop(columns=["area_id", "登記單位數", "轉讓數", "變更數",
                                "絕育數", "絕育除戶數", "免絕育數", "免絕育除戶數"])

# Unify the date format: parse the mixed input formats, then render every
# value as YYYY/MM/DD text.
df_main["date"] = pd.to_datetime(df_main["date"], format='mixed')
df_main["date"] = df_main["date"].dt.strftime("%Y/%m/%d")

# Remove the postal code (the first three characters) from each district.
# The vectorised .str slice leaves missing values as NaN, whereas slicing
# inside apply() raised TypeError on any missing district.
df_main["district"] = df_main["district"].str[3:]

# Join against the MySQL `location` table to obtain loc_id for each
# (city, district) pair.
load_dotenv()

username = os.getenv("MYSQL_USERNAME")
password = os.getenv("MYSQL_PASSWORD")
target_ip = os.getenv("MYSQL_IP")
db_name = os.getenv("MYSQL_DB_NAME")

# Fail early with a readable message. An unset variable would otherwise be
# interpolated into the connection URL as the literal text "None".
required_settings = {
    "MYSQL_USERNAME": username,
    "MYSQL_PASSWORD": password,
    "MYSQL_IP": target_ip,
    "MYSQL_DB_NAME": db_name,
}
missing_settings = [key for key, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing MySQL settings in the environment/.env file: "
        + ", ".join(missing_settings))

# The port used to be read only from "MYSQL_PORTT", which looks like a typo.
# Prefer the conventional key but still honour the old spelling so existing
# .env files keep working.
port_text = os.getenv("MYSQL_PORT") or os.getenv("MYSQL_PORTT")
if not port_text:
    raise RuntimeError(
        "MySQL port is not configured: set MYSQL_PORT (or MYSQL_PORTT) "
        "in the environment/.env file")
target_port = int(port_text)

# NOTE(review): the credentials are interpolated as-is; a password containing
# URL-reserved characters (e.g. "@" or "/") would need escaping - confirm.
engine = create_engine(
    f"mysql+pymysql://{username}:{password}@{target_ip}:{target_port}/{db_name}")

sql = "SELECT * FROM location"

df_loc = pd.read_sql(sql, engine)
df_loc = df_loc[["loc_id", "city", "district"]]

# Left join: rows without a matching (city, district) keep a missing loc_id.
df_main = df_main.merge(df_loc, how="left", on=["city", "district"])

# Keep only the output columns, in their final order. Selecting them also
# discards the city/district join keys, so no separate drop is needed.
new_col = ["loc_id", "date", "animal",
           "regis_count", "removal_count", "update_date"]
df_main = df_main[new_col]

In [4]:
# Processing is finished: write the result as a CSV file into the same folder
# the city files were read from.
main_file = "pet_regis_data.csv"
main_path = os.path.join(folder, main_file)

df_main.to_csv(
    path_or_buf=main_path,
    encoding="utf-8",
    index=False,
)

In [29]:
# Show the last 30 rows of the final table.
df_main.iloc[-30:]

Unnamed: 0,loc_id,date,animal,regis_count,removal_count,update_date
676347,KSH004,2025/10/30,1,10,0,2025/10/31
676348,KSH011,2025/10/30,1,2,0,2025/10/31
676349,KSH003,2025/10/30,1,17,0,2025/10/31
676350,KSH017,2025/10/30,1,6,0,2025/10/31
676351,KSH016,2025/10/30,1,0,0,2025/10/31
676352,KSH019,2025/10/30,1,5,0,2025/10/31
676353,KSH024,2025/10/30,1,0,0,2025/10/31
676354,KSH023,2025/10/30,1,2,0,2025/10/31
676355,KSH022,2025/10/30,1,0,0,2025/10/31
676356,KSH021,2025/10/30,1,0,0,2025/10/31


In [30]:
# Write the final table to the `pet_regis` table, replacing the table if it
# already exists. The row index is not written.
df_main.to_sql(
    "pet_regis",
    engine,
    if_exists="replace",
    index=False,
)

676377