In [5]:
import os
from datetime import date, datetime, timedelta
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

from Mods import date_mod as dtm
from Mods import pandas_mod as pdm

In [6]:
# Connect to the database and load the master `airline` table.
# Connection settings come from environment variables (optionally populated
# from a .env file by load_dotenv()).
load_dotenv()

username = os.getenv("MYSQL_USERNAME")
password = os.getenv("MYSQL_PASSWORD")
target_ip = os.getenv("MYSQL_IP")
# Prefer the correctly spelled key. "MYSQL_PORTT" is the misspelled key this
# cell originally read; it is kept as a fallback so existing .env files that
# define it keep working.
port_raw = os.getenv("MYSQL_PORT") or os.getenv("MYSQL_PORTT")
db_name = os.getenv("MYSQL_DB_NAME")

# Fail early with a readable message. Without this check an unset port makes
# int(None) raise a bare TypeError, and any other unset value is silently
# formatted into the URL as the text "None".
required_settings = {
    "MYSQL_USERNAME": username,
    "MYSQL_PASSWORD": password,
    "MYSQL_IP": target_ip,
    "MYSQL_PORT": port_raw,
    "MYSQL_DB_NAME": db_name,
}
missing_settings = [key for key, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

target_port = int(port_raw)

# Percent-encode the credentials: characters such as '@', '/', ':' or '+' in a
# raw password would otherwise be parsed as URL syntax and corrupt the DSN.
# NOTE(review): this assumes the .env stores the raw (not already
# percent-encoded) username and password — confirm.
engine = create_engine(
    f"mysql+pymysql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{target_ip}:{target_port}/{db_name}"
)

sql = "SELECT * FROM airline"

df_main = pd.read_sql(sql, engine)

In [7]:
# Configure the data folder, the airlines to process and the target date.
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\Flights-Data-Crawler\Data"

corp_list = ["EVA", "CAL", "SJX", "TTW"]
columns = ["airport", "code_1", "code_2"]  # not referenced in this cell
day = dtm.get_yesterday()

# Columns copied from each airline's flight list into the master table.
route_cols = ["departure_airport", "departure_airport_code_1", "departure_airport_code_2",
              "arrival_airport", "arrival_airport_code_1", "arrival_airport_code_2"]

# Read each airline's flight list for `day` and collect its route columns.
# The frames are gathered in a list and concatenated once after the loop;
# calling pd.concat inside the loop would re-copy the growing master table on
# every iteration.
frames = [df_main]
for corp in corp_list:
    file = f"{day}_{corp}_FlightList.csv"
    exist, file_path = pdm.exist_or_not(folder, file)

    # No file for this airline on `day`: report it and move on to the next one.
    if not exist:
        print(f"查無{corp}的{day}資料，請確認資料是否存在！")
        continue

    df = pd.read_csv(file_path)

    # Keep only the departure/arrival airport columns as a new frame.
    df_airline = df[route_cols].copy()

    # Blank placeholder id for the newly read rows.
    df_airline["ARL_id"] = ""

    frames.append(df_airline)

# Append all newly read routes to the master table in a single pass.
df_main = pd.concat(frames, ignore_index=True)

In [8]:
# Remove duplicate routes, then remove any row that still contains a null
# (none are expected; if present they are discarded).
subset_col = [
    "departure_airport",
    "departure_airport_code_1",
    "departure_airport_code_2",
    "arrival_airport",
    "arrival_airport_code_1",
    "arrival_airport_code_2",
]
df_main = df_main.drop_duplicates(subset=subset_col, keep="first").dropna()

# Renumber the ARL_id column through the project helper.
df_main = pdm.reassign_id(df_main, "ARL_id", "ARL")

# Save a local CSV copy in the data folder.
main_file = "airline.csv"
main_path = os.path.join(folder, main_file)
df_main.to_csv(main_path, index=False)

# Write the table back to the database, replacing the existing `airline` table.
df_main.to_sql(name="airline", con=engine, index=False, if_exists="replace")

row_count = len(df_main)
print(f"已更新航線列表，目前資料筆數：{row_count}")

已更新航線列表，目前資料筆數：274
