In [None]:
import locale
import os
from datetime import date, datetime, timedelta

import pandas as pd
import pytz
from Mods import date_mod as dtm
from Mods import pandas_mod as pdm


In [None]:
def trans_date_from_chinese(chinese_date):
    """Parse a Chinese-formatted date string into a ``datetime``.

    Everything from the first "(" onward is discarded, surrounding whitespace
    is stripped, and the remainder is parsed with the pattern
    "%Y年 %m月 %d日".

    Args:
        chinese_date: Date text such as "2023年 10月 05日 (...)".

    Returns:
        The parsed ``datetime`` (time part is midnight).

    Raises:
        ValueError: If the cleaned text does not match the pattern.
    """
    date_part, _, _ = chinese_date.partition("(")
    return datetime.strptime(date_part.strip(), "%Y年 %m月 %d日")


def find_tz(tz_str: str):
    """Look up a pytz zone name from a timezone abbreviation.

    Scans ``pytz.all_timezones`` in iteration order and returns the first
    zone whose abbreviation *at the current moment* equals ``tz_str``.

    Caveats:
        * Only the first match is returned, so an abbreviation that is
          shared by several zones resolves to whichever comes first.
        * The comparison uses the abbreviation in effect right now, so the
          result can change with daylight-saving transitions.

    Args:
        tz_str: Timezone abbreviation to look for (e.g. the value of
            ``datetime.tzname()``).

    Returns:
        The matching zone name, or ``None`` when no zone matches.
    """
    # Read the clock once so every zone is compared at the same instant,
    # instead of re-reading it on each iteration.
    now_utc = datetime.now(pytz.utc)
    for zone_name in pytz.all_timezones:
        local_now = now_utc.astimezone(pytz.timezone(zone_name))
        if local_now.tzname() == tz_str:
            return zone_name
    return None


def to_utc_time(tz_str, time):
    """Convert a local "YYYY-MM-DD HH:MM" string to the same format in UTC.

    Args:
        tz_str: Timezone abbreviation, resolved to a zone via ``find_tz``.
        time: Local wall-clock time formatted as "%Y-%m-%d %H:%M".

    Returns:
        The equivalent UTC time as a "%Y-%m-%d %H:%M" string.

    Raises:
        pytz.UnknownTimeZoneError: If ``tz_str`` matches no known zone.
        ValueError: If ``time`` does not match the expected format.
    """
    zone_name = find_tz(tz_str)
    if zone_name is None:
        # Fail with the offending abbreviation in the message rather than
        # handing None on to pytz.timezone.
        raise pytz.UnknownTimeZoneError(tz_str)
    zone = pytz.timezone(zone_name)

    fmt = "%Y-%m-%d %H:%M"
    naive_local = datetime.strptime(time, fmt)

    # localize() attaches the zone to the naive value before converting.
    aware_local = zone.localize(naive_local)
    return aware_local.astimezone(pytz.utc).strftime(fmt)


def get_col():
    """Return the ordered list of column names for the flights table.

    A new list is built on every call, so callers may modify the result.
    """
    flight_cols = ['flight_NO', 'flight_type', 'flight_company', 'fly_distance']
    route_cols = [
        'departure_airport_code', 'departure_city',
        'arrival_airport_code', 'arrival_city',
    ]
    departure_cols = [
        'departure_date',
        'leave_gate_estimate', 'leave_gate_actual',
        'departure_time_estimate', 'departure_time_actual',
        'departure_timezone',
    ]
    arrival_cols = [
        'arrival_date',
        'landing_time_estimate', 'landing_time_actual',
        'arrive_gate_estimate', 'arrive_gate_actual',
        'arrive_timezone',
    ]
    return [*flight_cols, *route_cols, *departure_cols, *arrival_cols, 'link']

In [None]:
# Read every airline's table and combine them into a single dataframe.
flight_corp = ["EVA", "CAL", "SJX", "TTW"]
# NOTE(review): machine-specific absolute path; consider making it configurable.
folder = r"C:\Users\add41\Documents\Data_Engineer\Project\Flights-Data-Crawler\Data"
day = dtm.get_2days_ago()

columns = get_col()
main_file = "flights_info.csv"
# `file_path` is whatever read_or_build returns alongside the frame
# (presumably the main file's path -- confirm in pandas_mod). The loop below
# uses its own `corp_path` so this value is no longer overwritten.
df_main, file_path = pdm.read_or_build(folder, main_file, columns)

# Collect the per-airline frames and concatenate once, rather than
# re-copying the growing frame on every iteration.
frames = [df_main]
for corp in flight_corp:
    file = f"{day}_{corp}_FlightsTable.csv"
    corp_path = os.path.join(folder, file)
    df = pd.read_csv(corp_path)
    frames.append(df)

df_main = pd.concat(frames, ignore_index=True)

In [None]:
# Discard rows that have no arrival date, then parse both Chinese-formatted
# date columns into datetime values.
df_main = df_main.dropna(subset='arrival_date').assign(
    departure_date=lambda frame: frame['departure_date'].apply(
        trans_date_from_chinese),
    arrival_date=lambda frame: frame['arrival_date'].apply(
        trans_date_from_chinese),
)

In [None]:
# Convert each local date + time pair into a UTC timestamp string.


def _local_to_utc(frame, date_col, time_col, tz_col):
    """Build a Series of UTC "%Y-%m-%d %H:%M" strings for one date/time/zone triple.

    Args:
        frame: Dataframe holding the three source columns.
        date_col: Column with the local date (datetime values).
        time_col: Column with the local time text ("HH:MM").
        tz_col: Column with the timezone abbreviation.

    Returns:
        A Series aligned to ``frame.index``; rows where the date, time, or
        timezone is missing yield ``None`` instead of raising.
    """
    # The date columns hold datetimes at this point, so they are rendered
    # back to "%Y-%m-%d" text before being joined with the time text.
    dates = pd.to_datetime(frame[date_col])

    utc_values = []
    for day, clock, tz_abbr in zip(dates, frame[time_col], frame[tz_col]):
        if pd.isna(day) or pd.isna(clock) or pd.isna(tz_abbr):
            utc_values.append(None)
            continue
        # Pass this row's scalar values, not the whole columns.
        utc_values.append(to_utc_time(tz_abbr, f"{day:%Y-%m-%d} {clock}"))

    return pd.Series(utc_values, index=frame.index, dtype=object)


# (output column, date column, time column, timezone column)
utc_specs = [
    # Departure
    ('departure_time_actual(UTC)', 'departure_date',
     'departure_time_actual', 'departure_timezone'),
    ('departure_time_estimate(UTC)', 'departure_date',
     'departure_time_estimate', 'departure_timezone'),
    # Landing -- the date column is 'arrival_date'; 'arrive_date' does not exist.
    ('arrive_time_actual(UTC)', 'arrival_date',
     'landing_time_actual', 'arrive_timezone'),
    ('arrive_time_estimate(UTC)', 'arrival_date',
     'landing_time_estimate', 'arrive_timezone'),
]

for utc_col, date_col, time_col, tz_col in utc_specs:
    df_main[utc_col] = _local_to_utc(df_main, date_col, time_col, tz_col)

# Preview only the first rows instead of rendering the whole frame.
df_main.head()

In [None]:
# With the UTC columns in place, drop the intermediate local date/time/zone
# columns so only the UTC timestamps remain.
# The date column is 'arrival_date'; 'arrive_date' is not a column and made
# the drop raise KeyError.
drop_col = ["departure_date", "departure_time_actual", "departure_time_estimate", "departure_timezone",
            "arrival_date", "landing_time_actual", "landing_time_estimate", "arrive_timezone"]
df_main = df_main.drop(columns=drop_col)