# 멀티스트림 하이브리드 딥러닝 구조(논문 기반)
- 구성: 시간 흐름(Temporal), 날씨(Weather), 역 정보(Station)을 각각 다른 스트림(Stream)으로 처리한 뒤 **결합(Concatenate)**하여 하나의 회귀값 예측 -> 다중 입력 스트림 구조

In [1]:
import os
import pandas as pd
import numpy as np
df23 = pd.read_csv('./data/train_subway23.csv', encoding='CP949')
df22 = pd.read_csv('./data/train_subway22.csv', encoding='CP949')
df21 = pd.read_csv('./data/train_subway21.csv', encoding='CP949')
df = pd.concat([df21, df22, df23], axis=0)
t = pd.read_excel('./data/환승역.xlsx', names =['Line','station_name','transfer'], header=0)
address = pd.read_csv('./data/result_address.csv', encoding='CP949')
subway_13 = pd.DataFrame({'역명':['성수E', '응암S','불암산']
             ,'주소':['서울 성동구 아차산로 100','서울 은평구 증산로 477','서울 노원구 상계로 305']})
address = pd.concat([address, subway_13], axis=0).reset_index(drop=True)
df.shape

(16369332, 15)

In [2]:
address.columns=['station_name','address']
address.station_name = address.station_name.apply(lambda x: x.split('(')[0].strip() if '(' in x else x)
address.address = address.address.apply(lambda x: x.split()[0] if '서울' not in x else x.split()[1])
addr = address['address']  
address['address'] = np.where(addr.str.contains('인천'), '인천',np.where(addr.str.contains('경기'), '경기', addr))

print(df.shape)
df.head()

(16369332, 15)


Unnamed: 0,TM,Line,station_number,station_name,Direction,STN,TA,WD,WS,RN_DAY,RN_HR1,HM,SI,ta_chi,Congestion
0,2021010100,1,150,서울역,상선,419,-9.6,291.1,3.3,0.0,0.0,-99.0,-99.0,-12.6,0.0
1,2021010101,1,150,서울역,상선,419,-9.7,284.6,2.0,0.0,0.0,-99.0,-99.0,-9.8,0.0
2,2021010105,1,150,서울역,상선,419,-9.3,124.7,2.4,0.0,0.0,-99.0,-99.0,-10.3,1.0
3,2021010106,1,150,서울역,상선,419,-9.3,126.2,1.7,0.0,0.0,-99.0,-99.0,-10.1,2.0
4,2021010107,1,150,서울역,상선,419,-9.1,145.7,1.3,0.0,0.0,-99.0,-99.0,-9.7,3.0


In [3]:
# 데이터 형변환
df['TM'] = pd.to_datetime(df['TM'], format='%Y%m%d%H')
df = df.sort_values('TM').reset_index(drop=True)
cat_columns = 'Line station_number STN station_name Direction'.split()
for col in cat_columns:
    df[col] = df[col].astype('category')

# 결측치 생성
df['WD'] = df['WD'].where(df['WD'] >= 0, np.nan)
df['WS'] = df['WS'].replace(-99.0, np.nan)
df['RN_DAY'] = df['RN_DAY'].replace(-99.0, np.nan)
df['RN_HR1'] = df['RN_HR1'].replace(-99.0, np.nan)
df['TA'] = df['TA'].replace(-99.0, np.nan)
df['ta_chi'] = df['ta_chi'].replace(-99.0, np.nan)
df['SI'] = df['SI'].replace(-99.0, np.nan)
df['HM'] = df['HM'].replace(-99.0, np.nan)
df['SI'] = df['SI'].notna().astype(int)

# 기본 파생변수
df.station_name= df.station_name.replace({'당고개':'불암산','자양(뚝섬한강공원)':'자양','신촌(지하)':'신촌'})

df = pd.merge(df, t, on=['Line','station_name'], how='left') # 환승역 개수 추가
df['transfer'] = df['transfer'].fillna(0).astype(int)
df = pd.merge(df, address, on=['station_name'], how='left') # 주소 추가

df['key'] = (df['Line'].astype(str) + '_' +df['station_name'].astype(str) + '_' +df['Direction'].astype(str))
df['year'] = df['TM'].dt.year - 2021 # 연도는 2021년을 기준으로 상대적 추세 반영
df['month'] = df['TM'].dt.month # 월
df['day'] = df['TM'].dt.day # 일
df['hour'] = df['TM'].dt.hour # 시
df['weekday'] = df['TM'].dt.weekday # 요일
df['week_of_month'] = (df['TM'].dt.day.sub(1) // 7) + 1 # 한 달 중 몇 번재 주인지
df['week_of_year'] = df['TM'].dt.isocalendar().week # 연중 몇 번재 주인지
df['day_of_year'] = df['TM'].dt.dayofyear # 연중 몇 번째 일인지
from holidayskr import year_holidays
dates_only1 = [d[0] for d in year_holidays('2021')]
dates_only2 = [d[0] for d in year_holidays('2022')]
dates_only3 = [d[0] for d in year_holidays('2023')]
dates_only3 = [d[0] for d in year_holidays('2024')]
cond1 = df['TM'].isin(dates_only1)
cond2 = df['TM'].isin(dates_only2)
cond3 = df['TM'].isin(dates_only3)
cond4 = df['TM'].isin(dates_only3)

df['is_holiday'] = (cond1 | cond2 | cond3 | cond4).astype(int) # 공휴일 여부
df['is_weekend'] = df['TM'].dt.dayofweek # 주말 여부
df['is_day_before_holiday'] = df['TM'].shift(-1).isin(dates_only1 + dates_only2 + dates_only3).astype(int)
df['is_day_after_holiday'] = df['TM'].shift(1).isin(dates_only1 + dates_only2 + dates_only3).astype(int)

df['time_period'] = np.where(df['hour'].isin([7,8,9]), '출근',
                                np.where(df['hour'].isin([17,18,19]), '퇴근',
                                np.where((df['hour']>9)&(df['hour']<17), '낮',
                                np.where((df['hour']>19)&(df['hour']<21), '저녁',
                                '밤'))))

from pandas.api.types import CategoricalDtype

# 예시 순서 정의
direction_order = ['상선', '하선', '외선', '내선']
time_period_order = ['밤', '출근', '낮', '저녁', '퇴근']

df['Direction'] = df['Direction'].astype(CategoricalDtype(categories=direction_order, ordered=True)).cat.codes
df['time_period'] = df['time_period'].astype(CategoricalDtype(categories=time_period_order, ordered=True)).cat.codes

# 주기성 변수 sin, cos
# 일중 시간
df['sin_hod'] = np.sin(df['hour'] * (2 * np.pi / 21)) # 시간
df['cos_hod'] = np.cos(df['hour'] * (2 * np.pi / 21))
# 주중 일
df['sin_dow'] = np.sin(df['weekday'] * (2 * np.pi / 7))
df['cos_dow'] = np.cos(df['weekday'] * (2 * np.pi / 7))
# 월중 일
df['sin_dom'] = np.sin(df['day'] * (2 * np.pi / 31))
df['cos_dom'] = np.cos(df['day'] * (2 * np.pi / 31))
# 월중 주
df['sin_wom'] = np.sin(df['week_of_month'] * (2 * np.pi / 5))
df['cos_wom'] = np.sin(df['week_of_month'] * (2 * np.pi / 5))
# 연중 주
df['sin_woy'] = np.sin(df['week_of_year'] * (2 * np.pi / 52))
df['cos_woy'] = np.sin(df['week_of_year'] * (2 * np.pi / 52))
# 연중 일
df['sin_doy'] = np.sin(df['day_of_year'] * (2 * np.pi / 365))
df['cos_doy'] = np.sin(df['day_of_year'] * (2 * np.pi / 365))

df = df.sort_values(['key','TM'])
df = df.drop(columns=['TM','key'])

# 선형 보간
columns_to_fill = 'WD RN_DAY RN_HR1 TA ta_chi SI HM WS'.split()
df[columns_to_fill] = df[columns_to_fill].interpolate(method='linear', limit_direction='both')
print('보간 후 남은 결측값:', df[columns_to_fill].isna().sum())

  df.station_name= df.station_name.replace({'당고개':'불암산','자양(뚝섬한강공원)':'자양','신촌(지하)':'신촌'})
  cond1 = df['TM'].isin(dates_only1)
  cond2 = df['TM'].isin(dates_only2)
  cond3 = df['TM'].isin(dates_only3)
  cond4 = df['TM'].isin(dates_only3)
  df['is_day_before_holiday'] = df['TM'].shift(-1).isin(dates_only1 + dates_only2 + dates_only3).astype(int)
  df['is_day_after_holiday'] = df['TM'].shift(1).isin(dates_only1 + dates_only2 + dates_only3).astype(int)


보간 후 남은 결측값: WD        0
RN_DAY    0
RN_HR1    0
TA        0
ta_chi    0
SI        0
HM        0
WS        0
dtype: int64


In [4]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer

# ▶ 수치형 피처 목록
ordered_cols = ['Direction', 'time_period']
cat_cols = ['Line', 'address', 'station_name']
num_cols = [
    'HM', 'RN_DAY', 'RN_HR1', 'SI', 'STN', 'TA', 'WD', 'WS',
    'cos_dom', 'cos_dow', 'cos_doy', 'cos_hod', 'cos_wom', 'cos_woy', 'day', 'day_of_year',
    'hour', 'is_day_after_holiday', 'is_day_before_holiday', 'is_holiday', 'is_weekend',
    'month', 'sin_dom', 'sin_dow', 'sin_doy', 'sin_hod', 'sin_wom', 'sin_woy',
    'ta_chi', 'transfer', 'week_of_month', 'week_of_year', 'weekday', 'year'
]

# ▶ 타깃 로그 변환
df['Congestion'] = np.log1p(df['Congestion'])

# ▶ 수치형 스케일링
scalers = {
    'ta_chi': StandardScaler(),
    'HM': QuantileTransformer(output_distribution='normal', random_state=0),
    'RN_HR1': MinMaxScaler(),
    'RN_DAY': MinMaxScaler(),
    'WS': PowerTransformer(method='yeo-johnson'),
    'TA': RobustScaler()
}

for col, scaler in scalers.items():
    if 'RN' in col:
        df[col] = np.log1p(df[col])
    df[col] = scaler.fit_transform(df[[col]])

# ▶ key 생성
df['key'] = df['Line'].astype(str) + '_' + df['station_name'].astype(str) + '_' + df['Direction'].astype(str)

# ▶ 사용 피처
features = num_cols + ordered_cols + cat_cols + ['key', 'Congestion']

# ▶ 연도별 분리
train_df = pd.concat([df[df['year'] == 0], df[df['year'] == 1]])[features]
val_df = df[df['year'] == 2][features]

# ▶ category 인코딩
for col in ['Line', 'station_name', 'address']:
    train_df[col] = train_df[col].astype('category')
    val_df[col] = val_df[col].astype('category')
    val_df[col] = val_df[col].cat.set_categories(train_df[col].cat.categories)
    train_df[col] = train_df[col].cat.codes
    val_df[col] = val_df[col].cat.codes

# ▶ station_idx 매핑
station_keys = train_df['key'].unique().tolist()
station_dict = {k: i for i, k in enumerate(station_keys)}
unknown_idx = len(station_dict)
train_df['station_idx'] = train_df['key'].map(lambda x: station_dict.get(x, unknown_idx))
val_df['station_idx'] = val_df['key'].map(lambda x: station_dict.get(x, unknown_idx))

# ▶ time_period 인코딩
le_tp = pd.concat([train_df['time_period'], val_df['time_period']]).astype('category')
train_df['tp_idx'] = le_tp.loc[train_df.index].cat.codes
val_df['tp_idx'] = le_tp.loc[val_df.index].cat.codes

# ▶ weather / temporal 분리
weather_cols = ['TA', 'WD', 'WS', 'RN_DAY', 'RN_HR1', 'HM', 'ta_chi']
num_cols = [col for col in num_cols if col not in weather_cols]

X_train_temporal = train_df[num_cols].values
X_val_temporal = val_df[num_cols].values
X_train_weather = train_df[weather_cols].values
X_val_weather = val_df[weather_cols].values
X_train_station = train_df['station_idx'].values.reshape(-1, 1)
X_val_station = val_df['station_idx'].values.reshape(-1, 1)
X_train_address = train_df['address'].values.reshape(-1, 1)
X_val_address = val_df['address'].values.reshape(-1, 1)
X_train_tp = train_df['tp_idx'].values.reshape(-1, 1)
X_val_tp = val_df['tp_idx'].values.reshape(-1, 1)

y_train = train_df['Congestion'].values
y_val = val_df['Congestion'].values

# ▶ Input 정의
temporal_input = Input(shape=(X_train_temporal.shape[1],), name='temporal')
station_input = Input(shape=(1,), name='station')
weather_input = Input(shape=(X_train_weather.shape[1],), name='weather')
address_input = Input(shape=(1,), name='address')
tp_input = Input(shape=(1,), name='time_period')

# ▶ Stream 정의
temporal_out = layers.Dense(64, activation='relu')(temporal_input)
station_emb = layers.Embedding(input_dim=unknown_idx + 1, output_dim=8)(station_input)
station_flat = layers.Flatten()(station_emb)
weather_out = layers.Dense(32, activation='relu')(weather_input)
address_emb = layers.Embedding(input_dim=train_df['address'].nunique() + 2, output_dim=4)(address_input)
address_flat = layers.Flatten()(address_emb)
tp_emb = layers.Embedding(input_dim=train_df['tp_idx'].nunique() + 2, output_dim=4)(tp_input)
tp_flat = layers.Flatten()(tp_emb)

x = layers.concatenate([temporal_out, station_flat, weather_out, address_flat, tp_flat])
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(32, activation='relu')(x)
output = layers.Dense(1)(x)

model = Model(inputs=[temporal_input, station_input, weather_input, address_input, tp_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
model.summary()

# 숫자형은 float32, 범주형은 int32로 변환
X_train_temporal = X_train_temporal.astype(np.float32)
X_val_temporal   = X_val_temporal.astype(np.float32)

X_train_weather  = X_train_weather.astype(np.float32)
X_val_weather    = X_val_weather.astype(np.float32)

X_train_station  = X_train_station.astype(np.int32)
X_val_station    = X_val_station.astype(np.int32)

X_train_address  = X_train_address.astype(np.int32)
X_val_address    = X_val_address.astype(np.int32)

X_train_tp       = X_train_tp.astype(np.int32)
X_val_tp         = X_val_tp.astype(np.int32)

y_train = y_train.astype(np.float32)
y_val   = y_val.astype(np.float32)

import numpy as np
from sklearn.metrics import mean_squared_error
import tensorflow as tf

class OrigValRMSE(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        # validation_data: ({'temporal':..., ...}, y_val_log)
        self.x_val, self.y_val_log = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # 1) 로그 스케일 예측
        y_pred_log = self.model.predict(self.x_val, verbose=0)
        # 2) expm1 역변환
        y_pred = np.expm1(y_pred_log).ravel()
        y_true = np.expm1(self.y_val_log).ravel()
        # 3) 원래 스케일 RMSE 계산
        rmse_orig = mean_squared_error(y_true, y_pred, squared=False)
        # 4) 로그 스케일 RMSE와 함께 출력
        print(f' — orig_val_RMSE: {rmse_orig:.4f}')
        if logs is not None:
            logs['orig_val_RMSE'] = rmse_orig
orig_rmse_cb = OrigValRMSE(
    validation_data=(
        {
            'temporal': X_val_temporal,
            'station':  X_val_station,
            'weather':  X_val_weather,
            'address':  X_val_address,
            'time_period': X_val_tp
        },
        y_val   # 이건 로그 변환된 y_val (log1p) 입니다
    )
)


# ▶ 학습
save_path = os.path.join('checkpoints', 'multi_hybrid_extended.keras')
model.fit(
    x={
        'temporal': X_train_temporal,
        'station':  X_train_station,
        'weather':  X_train_weather,
        'address':  X_train_address,
        'time_period': X_train_tp
    },
    y=y_train,  # 로그 변환된 y_train
    validation_data=(
        {
            'temporal': X_val_temporal,
            'station':  X_val_station,
            'weather':  X_val_weather,
            'address':  X_val_address,
            'time_period': X_val_tp
        },
        y_val  # 로그 변환된 y_val
    ),
    epochs=50,
    batch_size=1024,
    callbacks=[
        EarlyStopping(patience=5, restore_best_weights=True),
        ModelCheckpoint(filepath=save_path, monitor='val_loss', save_best_only=True),
        orig_rmse_cb
    ]
)

# ▶ 예측 후 역변환
# y_pred_log = model.predict({...})
# y_pred = np.expm1(y_pred_log)


Epoch 1/50
[1m10578/10595[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - loss: 7.3516 - root_mean_squared_error: 2.0894 — orig_val_RMSE: 23.5987
[1m10595/10595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 26ms/step - loss: 7.3415 - root_mean_squared_error: 2.0879 - val_loss: 0.9354 - val_root_mean_squared_error: 0.9672 - orig_val_RMSE: 23.5987
Epoch 2/50




[1m10583/10595[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.2810 - root_mean_squared_error: 0.5300 — orig_val_RMSE: 23.7424
[1m10595/10595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 28ms/step - loss: 0.2810 - root_mean_squared_error: 0.5300 - val_loss: 1.0274 - val_root_mean_squared_error: 1.0136 - orig_val_RMSE: 23.7424
Epoch 3/50




[1m10583/10595[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.2355 - root_mean_squared_error: 0.4853 — orig_val_RMSE: 23.1258
[1m10595/10595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 23ms/step - loss: 0.2355 - root_mean_squared_error: 0.4852 - val_loss: 0.8875 - val_root_mean_squared_error: 0.9421 - orig_val_RMSE: 23.1258
Epoch 4/50




[1m10589/10595[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - loss: 0.2021 - root_mean_squared_error: 0.4496


KeyboardInterrupt



- 일자별 15288 또는 15162 건의 데이터
- 24시간 중 지하철이 운행하지 않는 시간 제외, 21시간이 존재함
- 시간별 728 또는 722 건의 데이터 -> 날짜및시간순만으로 정렬하면 Line, station_nam, Direction이 뒤죽박죽-> 딥러닝 자체가 안 맞는 데이터인가

# test 셋 도출

In [None]:
import pandas as pd
import numpy as np
from holidayskr import year_holidays
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer
from tensorflow.keras.models import load_model

# ① 전처리 함수
def data_preprocessing(df, t, address, scalers, base_year=2021):
    df['TM'] = pd.to_datetime(df['TM'], format='%Y%m%d%H')
    df = df.sort_values('TM').reset_index(drop=True)

    cat_columns = ['Line', 'station_name', 'Direction']
    for col in cat_columns:
        df[col] = df[col].astype('category')

    df['WD'] = df['WD'].where(df['WD'] >= 0, np.nan)
    for col in ['WS', 'RN_DAY', 'RN_HR1', 'TA', 'ta_chi', 'SI', 'HM']:
        df[col] = df[col].replace(-99.0, np.nan)
    df['SI'] = df['SI'].notna().astype(int)

    df.station_name = df.station_name.replace({'당고개': '불암산', '자양(뚝섬한강공원)': '자양', '신촌(지하)': '신촌'})
    df = pd.merge(df, t, on=['Line', 'station_name'], how='left')
    df['transfer'] = df['transfer'].fillna(0).astype(int)
    df = pd.merge(df, address, on='station_name', how='left')

    df['key'] = df['Line'].astype(str) + '_' + df['station_name'].astype(str) + '_' + df['Direction'].astype(str)

    df['year'] = df['TM'].dt.year - base_year
    df['month'] = df['TM'].dt.month
    df['day'] = df['TM'].dt.day
    df['hour'] = df['TM'].dt.hour
    df['weekday'] = df['TM'].dt.weekday
    df['week_of_month'] = (df['TM'].dt.day.sub(1) // 7) + 1
    df['week_of_year'] = df['TM'].dt.isocalendar().week
    df['day_of_year'] = df['TM'].dt.dayofyear

    holidays = [d[0] for y in ['2021', '2022', '2023', '2024'] for d in year_holidays(y)]
    df['is_holiday'] = df['TM'].isin(holidays).astype(int)
    df['is_weekend'] = df['TM'].dt.dayofweek >= 5
    df['is_day_before_holiday'] = df['TM'].shift(-1).isin(holidays).astype(int)
    df['is_day_after_holiday'] = df['TM'].shift(1).isin(holidays).astype(int)

    df['time_period'] = np.select(
        [
            df['hour'].isin([7, 8, 9]),
            df['hour'].isin([17, 18, 19]),
            (df['hour'] > 9) & (df['hour'] < 17),
            (df['hour'] > 19) & (df['hour'] < 21),
        ],
        ['출근', '퇴근', '낮', '저녁'],
        default='밤'
    )

    from pandas.api.types import CategoricalDtype
    df['Direction'] = df['Direction'].astype(CategoricalDtype(categories=['상선', '하선', '외선', '내선'], ordered=True)).cat.codes
    df['time_period'] = df['time_period'].astype(CategoricalDtype(categories=['밤', '출근', '낮', '저녁', '퇴근'], ordered=True)).cat.codes

    df['sin_hod'] = np.sin(df['hour'] * 2 * np.pi / 21)
    df['cos_hod'] = np.cos(df['hour'] * 2 * np.pi / 21)
    df['sin_dow'] = np.sin(df['weekday'] * 2 * np.pi / 7)
    df['cos_dow'] = np.cos(df['weekday'] * 2 * np.pi / 7)
    df['sin_dom'] = np.sin(df['day'] * 2 * np.pi / 31)
    df['cos_dom'] = np.cos(df['day'] * 2 * np.pi / 31)
    df['sin_wom'] = np.sin(df['week_of_month'] * 2 * np.pi / 5)
    df['cos_wom'] = np.sin(df['week_of_month'] * 2 * np.pi / 5)
    df['sin_woy'] = np.sin(df['week_of_year'] * 2 * np.pi / 52)
    df['cos_woy'] = np.sin(df['week_of_year'] * 2 * np.pi / 52)
    df['sin_doy'] = np.sin(df['day_of_year'] * 2 * np.pi / 365)
    df['cos_doy'] = np.sin(df['day_of_year'] * 2 * np.pi / 365)

    df = df.sort_values(['key', 'TM']).reset_index(drop=True)
    df.drop(columns=['TM', 'key'], inplace=True)

    df[['WD', 'RN_DAY', 'RN_HR1', 'TA', 'ta_chi', 'SI', 'HM', 'WS']] = df[
        ['WD', 'RN_DAY', 'RN_HR1', 'TA', 'ta_chi', 'SI', 'HM', 'WS']
    ].interpolate(method='linear', limit_direction='both')

    for col, scaler in scalers.items():
        if 'RN' in col:
            df[col] = np.log1p(df[col])
        df[col] = scaler.transform(df[[col]])

    return df

# ② 모델 입력용 구성 함수
def prepare_model_input(test_df, station_dict):
    # 1) category encoding (Line, station_name, address)
    for col in ['Line','station_name','address']:
        test_df[col] = test_df[col].astype('category')
        # 학습 시 train_df[col].cat.categories를 여기서 hard-code로 써도 됩니다.
        test_df[col] = test_df[col].cat.codes

    # 2) station_idx, tp_idx
    test_df['station_idx'] = (
        test_df['Line'].astype(str) + '_' +
        test_df['station_name'].astype(str) + '_' +
        test_df['Direction'].astype(str)
    ).map(lambda x: station_dict.get(x, len(station_dict)))
    test_df['tp_idx'] = test_df['time_period']

    # 3) 학습 코드에서 사용한 정확한 feature 리스트
    weather_cols = ['TA','WD','WS','RN_DAY','RN_HR1','HM','ta_chi']
    temporal_cols = [
        'SI','STN',
        'cos_dom','cos_dow','cos_doy','cos_hod','cos_wom','cos_woy',
        'day','day_of_year','hour',
        'is_day_before_holiday','is_day_after_holiday','is_holiday','is_weekend',
        'month',
        'sin_dom','sin_dow','sin_doy','sin_hod','sin_wom','sin_woy',
        'transfer','week_of_month','week_of_year','weekday','year'
    ]

    return {
        'temporal': test_df[temporal_cols].values.astype(np.float32),
        'weather': test_df[weather_cols].values.astype(np.float32),
        'station': test_df['station_idx'].values.astype(np.int32).reshape(-1,1),
        'address': test_df['address'].values.astype(np.int32).reshape(-1,1),
        'time_period': test_df['tp_idx'].values.astype(np.int32).reshape(-1,1)
    }



# ③ 실행
test_raw = pd.read_csv('./test/test.csv', encoding='cp949')
test_df = data_preprocessing(test_raw.copy(), t, address, scalers)

model_input = prepare_model_input(test_df, station_dict)

In [None]:
model = load_model('./checkpoints/multi_hybrid_extended.keras')
y_pred_log = model.predict(model_input, batch_size=1024)  
y_pred = np.expm1(y_pred_log)
print('성공~~')

y_pred = np.where(y_pred<=0,0,y_pred)

y_int = np.rint(y_pred.ravel()).astype(int)
submission = pd.DataFrame({'Congestion': y_int})
submission.to_csv('./test/submission.csv', index=False)

pd.set_option('display.float_format', '{:.0f}'.format)
pd.read_csv('./test/submission.csv')['Congestion'].describe()

df = pd.read_csv('./data/train_subway23.csv', encoding='CP949')
df['Congestion'].describe()