In [5]:
import os
from datetime import date, datetime, timedelta
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

from Mods import date_mod as dtm
from Mods import pandas_mod as pdm

In [None]:
# Load the MySQL connection settings from a local .env file into the process
# environment, build the SQLAlchemy engine, and read the current contents of
# the flights_list table into df_main.
load_dotenv()

username = os.getenv("MYSQL_USERNAME")
password = os.getenv("MYSQL_PASSWORD")
target_ip = os.getenv("MYSQL_IP")
# The port used to be read only from "MYSQL_PORTT" (apparent typo: every other
# setting follows the MYSQL_<NAME> pattern). Prefer "MYSQL_PORT" and fall back
# to the legacy spelling so an existing .env file keeps working.
port_text = os.getenv("MYSQL_PORT") or os.getenv("MYSQL_PORTT")
db_name = os.getenv("MYSQL_DB_NAME")

# Fail early with a readable message; otherwise int(None) raises a bare
# TypeError and unset values end up in the URL as the literal text "None".
required_settings = {
    "MYSQL_USERNAME": username,
    "MYSQL_PASSWORD": password,
    "MYSQL_IP": target_ip,
    "MYSQL_PORT": port_text or None,
    "MYSQL_DB_NAME": db_name,
}
missing_settings = [
    name for name, value in required_settings.items() if value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing MySQL settings in the environment/.env file: "
        + ", ".join(missing_settings)
    )

target_port = int(port_text)

# NOTE(review): the credentials are interpolated into the URL verbatim; a
# password containing characters such as "@" or "/" would need URL-escaping
# first — confirm against the SQLAlchemy database-URL documentation.
engine = create_engine(
    f"mysql+pymysql://{username}:{password}@{target_ip}:{target_port}/{db_name}")

sql = "SELECT * FROM flights_list"

# Rows already stored in the database; later cells append new rows to this
# frame and write it back.
df_main = pd.read_sql(sql, engine)

In [8]:
# Folder that holds the crawler's per-airline CSV exports.
# NOTE(review): absolute, machine-specific path — this cell only works on a
# machine where this folder exists.
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\Flights-Data-Crawler\Data"

corp_list = ["EVA", "CAL", "SJX", "TTW"]
# Date token used in the CSV file names (presumably yesterday's date, as
# returned by dtm.get_yesterday — confirm the format against that helper).
day = dtm.get_yesterday()


def _strip_intl(names: pd.Series) -> pd.Series:
    """Remove the "Int'l" / "Intl" markers from airport names and trim whitespace."""
    # regex=False: these are literal substrings, so the result must not depend
    # on the pandas version's default for `regex`.
    return (
        names.str.replace("Int'l", "", regex=False)
        .str.replace("Intl", "", regex=False)
        .str.strip()
    )


# Cleaned frames are collected here and appended to df_main in one concat
# after the loop, instead of re-copying df_main on every iteration.
new_frames = []

for corp in corp_list:
    file = f"{day}_{corp}_FlightList.csv"
    exist, file_path = pdm.exist_or_not(folder, file)

    if not exist:
        # No export for this airline/day: report it and move on to the next.
        print(f"查無{corp}的{day}資料，請確認資料是否存在")
        continue

    df = pd.read_csv(file_path)

    # Strip the "Int'l"/"Intl" wording from both airport-name columns.
    df["departure_airport"] = _strip_intl(df["departure_airport"])
    df["arrival_airport"] = _strip_intl(df["arrival_airport"])

    df["corp"] = corp

    # Placeholder (empty string, not NaN); a later cell reassigns the ids.
    df["flight_id"] = ""
    new_frames.append(df)

    # The cleaned frame overwrites the source CSV in place.
    df.to_csv(file_path, index=False)
    print(f"已處理{corp}航班列表資料")

if new_frames:
    df_main = pd.concat([df_main, *new_frames], ignore_index=True)

print("已處理完所有資料")

已處理EVA航班列表資料
已處理CAL航班列表資料
已處理SJX航班列表資料
已處理TTW航班列表資料
已處理完所有資料


In [9]:
# Columns (and their order) that are written back to the flights_list table.
new_col = ['flight_id', 'flight_no', "corp", 'flight_type', 'departure_airport', 'departure_airport_code_1',
           'departure_airport_code_2', 'arrival_airport', 'arrival_airport_code_1', 'arrival_airport_code_2', 'link', 'query_date']

# Keep the first row per link, then drop rows that are incomplete in the
# columns that are actually persisted. A bare dropna() would also discard rows
# whose only missing value sits in a column that is never written back; since
# the table is replaced below, that would silently delete stored rows.
df_main = (
    df_main
    .drop_duplicates(subset="link", keep="first")
    .dropna(subset=new_col)
)

# Regenerate the flight_id column with the "FLT_" prefix (numbering scheme is
# defined by pdm.reassign_id — not visible from this notebook).
df_main = pdm.reassign_id(df_main, "flight_id", "FLT_")

df_main = df_main[new_col]

# if_exists="replace" overwrites the existing table with this frame, so
# df_main must already contain every row that should remain stored.
df_main.to_sql(name="flights_list", con=engine,
               index=False, if_exists="replace")
print("已儲存置資料庫")

已儲存置資料庫
