In [5]:
import os
import pandas as pd
import json

def parse_data(filename, stations_file='車站基本資料集.json'):
    """Load one ridership CSV and attach station names to it.

    The CSV columns ``trnOpDate``, ``staCode``, ``gateInComingCnt`` and
    ``gateOutGoingCnt`` are renamed to 乘車日, 車站代碼, 進站人數 and 出站人數.
    Each row is then matched on its station code against the
    ``stationCode`` / ``stationName`` fields of the station JSON file.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file; passed straight to ``pd.read_csv``.
    stations_file : str or path-like, optional
        Path of the UTF-8 encoded JSON file holding the station records.
        Defaults to ``'車站基本資料集.json'``, which is resolved against the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        Columns 乘車日 (converted with ``pd.to_datetime``), 車站名稱, 進站人數
        and 出站人數. The merge is an inner join, so rows whose station code
        does not appear in the station file are dropped.
    """
    ridership = pd.read_csv(filename).rename(columns={
        "trnOpDate": "乘車日",
        "staCode": "車站代碼",
        "gateInComingCnt": "進站人數",
        "gateOutGoingCnt": "出站人數",
    })

    # Only hold the file open while decoding the JSON.
    with open(stations_file, encoding='utf-8') as file:
        json_data = json.load(file)

    # Keep just the code -> name mapping; any other JSON fields are ignored.
    stations_info = pd.DataFrame(json_data, columns=['stationCode', 'stationName'])
    # Cast the lookup key to an integer type before merging.
    # NOTE(review): presumably the JSON stores codes as strings while read_csv
    # parses staCode as integers - confirm against the data files.
    stations_info = stations_info.astype({'stationCode': 'int32'})

    # pd.merge defaults to how='inner': unmatched station codes disappear.
    merged = pd.merge(ridership, stations_info,
                      left_on='車站代碼', right_on='stationCode')

    result = (
        merged
        .rename(columns={'stationName': '車站名稱'})
        .reindex(columns=['乘車日', '車站名稱', '進站人數', '出站人數'])
    )
    # Convert via str so the value is parsed as a calendar date.
    # NOTE(review): assumes trnOpDate looks like 20190423 - confirm.
    result['乘車日'] = pd.to_datetime(result['乘車日'].astype(str))
    return result

def main(csv_dir=None):
    """Combine every ridership CSV in a directory into one DataFrame.

    Files are selected when their name contains '每日各站進出站人數' and ends
    with ``.csv`` (case-insensitive). They are processed in sorted name order
    with ``parse_data`` and concatenated.

    Parameters
    ----------
    csv_dir : str or path-like, optional
        Directory holding the CSV files. Defaults to the
        '每日各站進出站人數' sub-directory of the current working directory.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames with the 乘車日 column set as the index.

    Raises
    ------
    ValueError
        If no matching CSV file is found in ``csv_dir``.
    """
    if csv_dir is None:
        csv_dir = os.path.join(os.getcwd(), '每日各站進出站人數')

    # Require the .csv suffix so stray files that merely share the dataset
    # name (archives, notes, ...) are not handed to the CSV parser.
    csv_names = sorted(
        name for name in os.listdir(csv_dir)
        if '每日各站進出站人數' in name and name.lower().endswith('.csv')
    )
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but without saying
        # which directory was searched.
        raise ValueError(
            'No CSV files containing "每日各站進出站人數" found in {!r}'.format(csv_dir)
        )

    frames = [parse_data(os.path.join(csv_dir, name)) for name in csv_names]
    return pd.concat(frames).set_index('乘車日')


# Build the combined ridership table when this code runs as the top-level
# program; main() returns it indexed by 乘車日.
if __name__ == "__main__":
    df_done1 = main()

    

In [6]:
# Keep only the rows whose 車站名稱 is 臺北, 臺中 or 高雄.
df_done2 = df_done1.query(
    '車站名稱 in ("臺北","臺中","高雄")'
)

In [7]:
# Bare expression: the notebook renders df_done2 as this cell's output.
df_done2

Unnamed: 0_level_0,車站名稱,進站人數,出站人數
乘車日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-23,臺北,55931,53907
2019-04-23,臺中,18885,18430
2019-04-23,高雄,13281,12606
2019-04-24,臺北,55970,54862
2019-04-24,臺中,18773,18509
...,...,...,...
2023-12-30,臺中,42661,43502
2023-12-30,高雄,18901,19072
2023-12-31,臺北,62141,90865
2023-12-31,臺中,41206,47244


In [8]:
# Calendar year and month of every row, taken from the datetime index and
# exposed as plain arrays so they can be used as positional group keys.
years = df_done2.index.year.values
months = df_done2.index.month.values


In [14]:
# Total entries (進站人數) and exits (出站人數) per year, month and station.
# `years` and `months` are arrays aligned with the rows of df_done2.
groupby = df_done2.groupby([years, months, '車站名稱'])
df2 = groupby[['進站人數', '出站人數']].sum()
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,進站人數,出站人數
Unnamed: 0_level_1,Unnamed: 1_level_1,車站名稱,Unnamed: 3_level_1,Unnamed: 4_level_1
2019,4,臺中,196819,196322
2019,4,臺北,499349,484128
2019,4,高雄,131501,129092
2019,5,臺中,756624,753259
2019,5,臺北,1881659,1844757
...,...,...,...,...
2023,11,臺北,1917620,1878372
2023,11,高雄,432731,434283
2023,12,臺中,861473,854234
2023,12,臺北,2033892,2015138


In [15]:
# Move the 車站名稱 index level into the columns so each station gets its own
# 進站人數 / 出站人數 column and each row is one (year, month) pair.
df2.unstack('車站名稱')

Unnamed: 0_level_0,Unnamed: 1_level_0,進站人數,進站人數,進站人數,出站人數,出站人數,出站人數
Unnamed: 0_level_1,車站名稱,臺中,臺北,高雄,臺中,臺北,高雄
2019,4,196819,499349,131501,196322,484128,129092
2019,5,756624,1881659,496946,753259,1844757,501062
2019,6,806261,1909510,516769,811109,1883199,515675
2019,7,813097,2036875,544617,741690,2033139,548130
2019,8,826856,2057971,528537,770365,2047495,528947
2019,9,811352,1861350,513354,816947,1861246,510600
2019,10,821320,2021092,532588,821740,2007138,534422
2019,11,798980,1957780,511155,799602,1935330,512241
2019,12,865393,1999044,538406,870876,2012196,542660
2020,1,853257,1894584,532721,851720,1851622,532198
