In [1]:
import requests
import pandas as pd
from io import BytesIO

# E: fetch the source file
## Download the workbook over HTTP; the URL serves the .xls file directly.
url = "https://www.ris.gov.tw/info-popudata/app/awFastDownload/file/y0s6-00000.xls/y0s6/00000/"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
res = requests.get(url=url, timeout=30)
# Fail here on an HTTP error instead of handing an error page to the Excel parser.
res.raise_for_status()

## Wrap the downloaded bytes in an in-memory buffer and parse them as Excel.
excel_data = BytesIO(res.content)
df = pd.read_excel(excel_data)

In [2]:
import pandas as pd

# T: replace the column labels with English names (assigned positionally)
df.columns = ["location", "population", "area(km^2)", "density"]

In [3]:
import pandas as pd

# T: take rows 3..28 (by position) of the table and build a list of their cleaned names
# Slice first, then copy: assigning a column on a slice of a frame triggers
# SettingWithCopyWarning, and copying only the slice avoids duplicating the whole table.
df_city_list = df.iloc[3:29].copy()

# Strip half-width spaces, full-width spaces and the "※" marker from the names.
df_city_list["location"] = df_city_list["location"].str.replace(r"[ 　※]", "", regex=True)

city_list = df_city_list["location"].tolist()

city_list

['新北市',
 '臺北市',
 '桃園市',
 '臺中市',
 '臺南市',
 '高雄市',
 '臺灣省',
 '宜蘭縣',
 '新竹縣',
 '苗栗縣',
 '彰化縣',
 '南投縣',
 '雲林縣',
 '嘉義縣',
 '屏東縣',
 '臺東縣',
 '花蓮縣',
 '澎湖縣',
 '基隆市',
 '新竹市',
 '嘉義市',
 '福建省',
 '金門縣',
 '連江縣',
 '東沙群島',
 '南沙群島']

In [None]:
import pandas as pd

# T: clean the location labels of the full table and renumber its rows
# A single character class removes half-width spaces, full-width spaces and "※".
location_clean = df["location"].str.replace(r"[ 　※]", "", regex=True)
df["location"] = location_clean
df = df.reset_index(drop=True)


Unnamed: 0,location,population,area(km^2),density
0,中華民國113年底,,,單位：人；平方公里
1,區域別,年底人口數,土地面積,人口密度
2,總計,23400220,36197.3371,646.462471
3,新北市,4047001,2052.5667,1971.678192
4,臺北市,2490869,271.7997,9164.355222
5,桃園市,2338648,1220.954,1915.426789
6,臺中市,2860601,2214.8968,1291.527894
7,臺南市,1858651,2191.6531,848.058938
8,高雄市,2731412,2952.1226,925.236642
9,臺灣省,6915487,25110.0037,275.407646


In [14]:
# T: drop every row up to and including the second "總計" row
drop_idx = df.index[df["location"] == "總計"]

# Guard the lookup: without two matches the original `drop_idx[1]` raised a bare
# IndexError (e.g. when this cell is re-run after the rows were already removed).
if len(drop_idx) < 2:
    raise ValueError(
        f'Expected at least two "總計" rows in df["location"], found {len(drop_idx)}; '
        "the sheet layout may have changed or this cell has already been run."
    )

# Keep the rows whose label is greater than the second match. Index labels are
# preserved, and .copy() returns an independent frame for later column assignments.
df = df.loc[df.index > drop_idx[1]].copy()

df

Unnamed: 0,location,population,area(km^2),density
42,新北市,4047001,2052.5667,1971.678192
43,板橋區,553538,23.1373,23924.053368
44,三重區,383355,16.317,23494.208494
45,中和區,405956,20.144,20152.700556
46,永和區,213742,5.7138,37408.029683
...,...,...,...,...
431,北竿鄉,3062,9.9,309.292929
432,莒光鄉,1518,4.7,322.978723
433,東引鄉,1555,3.8,409.210526
434,東沙群島,…,2.38,…


In [None]:
import pandas as pd

# T: for each name in city_list, record the index label of its first matching row in df
location_col = df["location"]
city_index = [
    int(df.index[location_col == name][0])  # first match only
    for name in city_list
]

In [6]:
import pandas as pd

# T: add a "city" column and fill each block of rows with the name it starts with
df["city"] = None

last_pos = len(city_index) - 1
for pos, start_label in enumerate(city_index):
    # Each block runs up to the label just before the next start;
    # the final block is open-ended (stop=None means "to the end").
    stop_label = None if pos == last_pos else city_index[pos + 1] - 1
    df.loc[start_label:stop_label, "city"] = city_list[pos]

In [7]:
import pandas as pd

# T: rename "location" to "district" and put "city" first
# Rename by name rather than assigning labels positionally: positional assignment
# mislabels the columns if this cell is run again after the reorder below.
new_order = ["city", "district", "population", "area(km^2)", "density"]
# .copy() gives an independent frame, so later column assignments do not operate
# on a selection of another frame.
df = df.rename(columns={"location": "district"})[new_order].copy()

In [8]:
import pandas as pd

# T: remove population and density (outdated figures from the end of last year)
outdated_columns = ["population", "density"]
df = df.drop(columns=outdated_columns)

In [9]:
import pandas as pd

# T: convert the area values to float and round them to two decimal places
area_col = "area(km^2)"
area_as_float = df[area_col].apply(float)
df[area_col] = area_as_float.round(2)

In [10]:
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import pandas as pd

from urllib.parse import quote_plus

# L: write the table to MySQL using connection settings from the environment / .env file
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`; raise RuntimeError if it is unset."""
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Environment variable {name} is not set (check your .env file)")
    return value


username = _require_env("MYSQL_USERNAME")
password = _require_env("MYSQL_PASSWORD")
target_ip = _require_env("MYSQL_IP")
# Prefer the correctly spelled MYSQL_PORT. The misspelled MYSQL_PORTT is still read
# so existing .env files keep working; 3306 is used if neither is set.
target_port = int(os.getenv("MYSQL_PORT") or os.getenv("MYSQL_PORTT") or 3306)
db_name = _require_env("MYSQL_DB_NAME")

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# username or password cannot corrupt the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}"
    f"@{target_ip}:{target_port}/{db_name}"
)

# if_exists="replace" drops and recreates the raw_location table on every run.
df.to_sql(name="raw_location", con=engine, if_exists="replace", index=False)

print("資料已輸入資料庫！")

資料已輸入資料庫！
