In [1]:
import pandas as pd
# Load the training and test tables.
# The engine is set explicitly to match the one used when the files were written.
df = pd.read_parquet('data.parquet', engine='pyarrow')
test_loaded = pd.read_parquet('test.parquet', engine='pyarrow')

In [2]:
# Feature / target definitions
ordered_cols = ['Direction', 'time_period']

# Columns treated as categorical downstream; ordered_cols are included here.
cat_cols = [
    'station_number',
    'address',
    # 'station_name',
] + ordered_cols

# Columns used as plain numeric inputs.
num_cols = [
    'HM', 'RN_DAY', 'RN_HR1',
    # 'SI',
    'TA', 'WD', 'WS',
    'STN',
    'sin_dom', 'cos_dom', 'sin_dow', 'cos_dow', 'sin_hod', 'cos_hod',
    'sin_wom', 'cos_wom', 'sin_woy', 'cos_woy', 'sin_doy', 'cos_doy',
    'day', 'day_of_year', 'hour',
    'is_day_before_holiday', 'is_day_after_holiday', 'is_holiday', 'is_weekend',
    'month', 'transfer', 'week_of_month', 'week_of_year', 'weekday', 'year',
    '신설역', '신규관측소',
]

# cat_cols already ends with ordered_cols, so a plain concatenation would list
# 'Direction' and 'time_period' twice and make df[feature_cols] return
# duplicate-named columns. dict.fromkeys drops the repeats while keeping the
# first-occurrence order.
feature_cols = list(dict.fromkeys(num_cols + ordered_cols + cat_cols))
target_col = 'Congestion'

# 모델 선택

In [3]:
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Note: a later cell in this notebook defines ``evaluate_model`` again with
    the same signature; once that cell runs, it replaces this definition.

    Args:
        name: Label stored under the ``'Model'`` key of the result.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_val: Validation features passed to ``predict``.
        y_val: Validation targets the predictions are scored against.

    Returns:
        dict with keys ``'Model'``, ``'Time(s)'`` (seconds spent in fit plus
        predict), ``'RMSE'`` and ``'R2'``.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during a long training run.
    started = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    elapsed = time.perf_counter() - started

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    return {'Model': name, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}

# 1~8호선 각각 LGBM, CB 테스트

In [4]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

# 전처리·평가용
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics      import mean_squared_error, r2_score

# ── 선형 계열 회귀 모델 ──
from sklearn.linear_model import ARDRegression

# ── 트리 & 앙상블 ──
from sklearn.ensemble      import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)

# ── 신경망 & 부스팅 ──
from sklearn.neural_network import MLPRegressor
from xgboost                 import XGBRegressor
from lightgbm                import LGBMRegressor
from catboost                import CatBoostRegressor

# ------------------------------------------------------------------------------
# Variables that must already be defined before this cell runs
# df: training DataFrame (columns include 'Line', 'TM', STN, address, feature_cols, target_col)
# test_loaded: test DataFrame (same column structure)
# feature_cols: list of columns used as predictors
# target_col: name of the column to predict (string)
# cat_cols: list of categorical columns to one-hot encode (e.g. ['station_number', 'address'])
# ------------------------------------------------------------------------------

def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Note: this redefines the ``evaluate_model`` declared in an earlier cell of
    this notebook (same signature and result keys).

    Args:
        name: Label stored under the ``'Model'`` key of the result.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_val: Validation features passed to ``predict``.
        y_val: Validation targets the predictions are scored against.

    Returns:
        dict with keys ``'Model'``, ``'Time(s)'`` (seconds spent in fit plus
        predict), ``'RMSE'`` and ``'R2'``.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during a long training run.
    started = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Stop the clock before computing metrics so only fit + predict is timed.
    elapsed = time.perf_counter() - started

    return {
        'Model': name,
        'Time(s)': elapsed,
        'RMSE': np.sqrt(mean_squared_error(y_val, y_pred)),
        'R2': r2_score(y_val, y_pred)
    }

# One result dict per (line, model) pair is appended here across the whole loop.
all_results = []

for line in range(1, 9):
    # 1) Subset to the current line; training rows are sorted by 'TM' so the
    #    positional split below is a time-ordered split.
    df_line   = df[df['Line'] == line].sort_values('TM').copy()
    test_line = test_loaded[test_loaded['Line'] == line].copy()

    # Time-ordered split point (train:val = 8:2). A line with fewer than two
    # rows would leave the training slice empty, so it is skipped explicitly
    # instead of failing inside the scaler or the model.
    split_idx = int(len(df_line) * 0.8)
    if split_idx == 0:
        print(f"[Line {line}] skipped: not enough training rows ({len(df_line)})")
        continue

    # 2) Mark the categorical columns
    for col in cat_cols:
        df_line[col]   = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    # 3) Features & target (the target is cast to int)
    X      = df_line[feature_cols]
    y      = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    # 4) One-hot encoding
    X_enc      = pd.get_dummies(X,      columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    # 5) Drop duplicated columns, then align the test columns to the training
    #    columns; dummy columns missing from the test set are filled with 0.
    X_enc      = X_enc.loc[:, ~X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    # 6) Scaling: fit the scaler on the training rows only, so that the
    #    min/max of the validation tail does not leak into preprocessing.
    #    Validation/test values may therefore fall outside [0, 1].
    mm = MinMaxScaler()
    mm.fit(X_enc.iloc[:split_idx])
    X_scaled      = mm.transform(X_enc)
    # X_test_scaled is not used further in the visible part of this cell.
    X_test_scaled = mm.transform(X_test_enc)

    # 7) Apply the time-ordered split
    X_train, X_val = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_val = y.values[:split_idx], y.values[split_idx:]

    # 8) Evaluate each model on this line
    candidates = [
        ('LGBM', LGBMRegressor(n_jobs=-1, random_state=42)),
        ('CAT',  CatBoostRegressor(verbose=0, random_state=42)),
    ]
    for name, model in candidates:
        res = evaluate_model(name, model, X_train, y_train, X_val, y_val)
        res['Line'] = line
        all_results.append(res)
        print(f"[Line {line}] {name}: RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}, Time={res['Time(s)']:.1f}s")

# 9) Build the combined results DataFrame and save it
results_df = pd.DataFrame(all_results)
print("\n=== 전체 라인·모델별 실행 시간·성능 비교 ===")
print(results_df)

# Save as CSV (if needed)
os.makedirs('results', exist_ok=True)
results_df.to_csv('results/model_performance_all_lines.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2612
[LightGBM] [Info] Number of data points in the train set: 2251468, number of used features: 112
[LightGBM] [Info] Start training from score 18.002817
[Line 1] LGBM: RMSE=7.877, R2=0.852, Time=9.6s
[Line 1] CAT: RMSE=5.479, R2=0.929, Time=255.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2542
[LightGBM] [Info] Number of data points in the train set: 1596672, number of used features: 95
[LightGBM] [Info] Start training from score 28.565433
[Line 2] LGBM: RMSE=10.431, R2=0.762, Time=10.3s
[Line 2] CAT: RMSE=11.665, R2=0.703, 