In [1]:
import os
import pandas as pd
import numpy as np
test = pd.read_csv('./test/test.csv', encoding='CP949')
df23 = pd.read_csv('./data/train_subway23.csv', encoding='CP949')
df22 = pd.read_csv('./data/train_subway22.csv', encoding='CP949')
df21 = pd.read_csv('./data/train_subway21.csv', encoding='CP949')
df = pd.concat([df21, df22, df23], axis=0, ignore_index=True)
t = pd.read_excel('./data/환승역.xlsx', names =['Line','station_name','transfer'], header=0)
address = pd.read_csv('./data/result_address.csv', encoding='CP949')
subway_13 = pd.DataFrame({'역명':['성수E', '응암S','불암산']
             ,'주소':['서울 성동구 아차산로 100','서울 은평구 증산로 477','서울 노원구 상계로 305']})
address = pd.concat([address, subway_13], axis=0).reset_index(drop=True)
df.shape

(16369332, 15)

In [2]:
address.columns=['station_name','address']
address.station_name = address.station_name.apply(lambda x: x.split('(')[0].strip() if '(' in x else x)
address.address = address.address.apply(lambda x: x.split()[0] if '서울' not in x else x.split()[1])
addr = address['address']  
address['address'] = np.where(addr.str.contains('인천'), '인천',np.where(addr.str.contains('경기'), '경기', addr))

In [3]:
import os
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from holidayskr import year_holidays
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import warnings

warnings.filterwarnings('ignore')

def preprocessing(data, t, address, is_train=True, known_stations=None):
    data = data.copy()
    # 1)  → datetime
    data['TM'] = data['TM'].astype(str)
    data['TM'] = pd.to_datetime(data['TM'], format='%Y%m%d%H')

    # 2) 범주형 변환
    cat_columns = ['Line', 'station_number', 'STN', 'station_name', 'Direction']
    for col in cat_columns:
        data[col] = data[col].astype('category')

    # 3) 결측값 placeholder
    data['WD']     = data['WD'].where(data['WD'] >=   0, np.nan)
    data['WS']     = data['WS'].replace(-99.0,          np.nan)
    data['RN_DAY'] = data['RN_DAY'].replace(-99.0,      np.nan)
    data['RN_HR1'] = data['RN_HR1'].replace(-99.0,      np.nan)
    data['TA']     = data['TA'].replace(-99.0,          np.nan)
    data['ta_chi'] = data['ta_chi'].replace(-99.0,      np.nan)
    data['SI']     = data['SI'].replace(-99.0,          np.nan)
    data['HM']     = data['HM'].replace(-99.0,          np.nan)

    # 4) SI 이진 플래그
    data['SI'] = data['SI'].notna().astype(int)

    # 5) station_name 교정
    data['station_name'] = data['station_name'].astype(str).replace({
        '당고개': '불암산',
        '자양(뚝섬한강공원)': '자양',
        '신촌(지하)': '신촌'
    })
    # 6) 신설역 변수 생성
    new_station_list = {'구리', '다산', '동구릉', '별내', '암사역사공원', '장자호수공원'}
    known_stations = data['STN'].unique()
    # 컬럼명이 'station_name'이 아니라 '역명'이라면 df['역명'] 으로 바꿔 주세요.
    # 7) 신규관측소 변수
    if is_train:
        # 학습 데이터는 기준이 될 known_stations를 만들어 두고
        data['신설역'] = 0
        data['신규관측소'] = 0
        이상치_4호선_역명 = ['한대앞','중앙','고잔','초지','안산','신길온천','정왕','오이도']
        이상치_8호선_역명 = ['남위례']
        
        pattern_8 = '|'.join(이상치_8호선_역명)
        pattern_4 = '|'.join(이상치_4호선_역명)
        
        mask_8 = (data['Line'] == 8) & data['station_name'].str.contains(pattern_8) & (data['TM'] < '2021-12-18')
        mask_4 = (data['Line'] == 4) & data['station_name'].str.contains(pattern_4) & (data['TM'] <= '2022-06-13')
        print('train셋 공지사항 이상치 ',(data.shape[0] - data[~(mask_8 | mask_4)].reset_index(drop=True).shape[0])/data.shape[0],'% 제거')
        data = data[~(mask_8 | mask_4)].reset_index(drop=True)

    else:
        # test에서는 train에서 넘어온 known_stations를 이용해 플래그 처리
        data['신설역'] = data['station_name'].apply(lambda x: 0 if x in known_stations else 1)
        data['신규관측소'] = data['STN'].apply(lambda x: 0 if x in known_stations else 1)

    # 7 ) 선형 보간
    cols_to_interp = ['TA', 'WD', 'WS', 'RN_DAY', 'RN_HR1', 'ta_chi','HM']
    data[cols_to_interp] = data[cols_to_interp].interpolate(method='linear', limit_direction='both')
    data[cols_to_interp] = data[cols_to_interp].interpolate(method='linear', limit_direction='both')
    
    # 8) 외부 테이블 병합
    data = data.merge(t, on=['Line','station_name'], how='left')
    data['transfer'] = data['transfer'].fillna(0).astype(int)
    data = data.merge(address, on=['station_name'], how='left')

    # 9) 파생 변수
    data['year']         = data['TM'].dt.year - 2021
    data['month']        = data['TM'].dt.month
    data['day']          = data['TM'].dt.day
    data['hour']         = data['TM'].dt.hour
    data['weekday']      = data['TM'].dt.dayofweek
    data['week_of_month']= (data['day'] - 1) // 7 + 1
    data['week_of_year'] = data['TM'].dt.isocalendar().week.astype(int)
    data['day_of_year']  = data['TM'].dt.dayofyear

    # 10) 공휴일 플래그
    holidays = []
    for yr in [2021,2022,2023,2024]:
        holidays += [d for d,_ in year_holidays(yr)]
    data['is_holiday']            = data['TM'].dt.date.isin(holidays).astype(int)
    data['is_day_before_holiday'] = data['TM'].dt.date.shift(-1).isin(holidays).astype(int)
    data['is_day_after_holiday']  = data['TM'].dt.date.shift(1).isin(holidays).astype(int)

    # 11) 주말 플래그
    data['is_weekend'] = data['weekday'].isin([5,6]).astype(int)

    # 12) 시간대 범주
    data['time_period'] = np.where(data['hour'].isin([7,8,9]), '출근',
                             np.where(data['hour'].isin([17,18,19]), '퇴근',
                             np.where((data['hour']>9)&(data['hour']<17), '낮',
                             np.where((data['hour']>19)&(data['hour']<21), '저녁',
                             '밤'))))
    direction_order   = ['상선','하선','외선','내선']
    time_period_order = ['밤','출근','낮','저녁','퇴근']
    data['Direction']   = data['Direction'].astype(
        CategoricalDtype(categories=direction_order, ordered=True)
    ).cat.codes
    data['time_period'] = data['time_period'].astype(
        CategoricalDtype(categories=time_period_order, ordered=True)
    ).cat.codes

    # 13) 주기성 sin/cos (24h, 7d, 31d, 5w, 52w, 365d)
    data['sin_hod'] = np.sin(2*np.pi * data['hour']        / 24)
    data['cos_hod'] = np.cos(2*np.pi * data['hour']        / 24)
    data['sin_dow'] = np.sin(2*np.pi * data['weekday']     / 7)
    data['cos_dow'] = np.cos(2*np.pi * data['weekday']     / 7)
    data['sin_dom'] = np.sin(2*np.pi * data['day']         / 31)
    data['cos_dom'] = np.cos(2*np.pi * data['day']         / 31)
    data['sin_wom'] = np.sin(2*np.pi * data['week_of_month'] / 5)
    data['cos_wom'] = np.cos(2*np.pi * data['week_of_month'] / 5)
    data['sin_woy'] = np.sin(2*np.pi * data['week_of_year']  / 52)
    data['cos_woy'] = np.cos(2*np.pi * data['week_of_year']  / 52)
    data['sin_doy'] = np.sin(2*np.pi * data['day_of_year']   / 365)
    data['cos_doy'] = np.cos(2*np.pi * data['day_of_year']   / 365)

    # 14) 그룹별 보간 (key 단위)
    # cols_to_fill = ['WD','RN_DAY','RN_HR1','TA','ta_chi','SI','HM','WS']
    # data[cols_to_fill] = data.groupby('key')[cols_to_fill] \
    #                      .transform(lambda grp: grp.interpolate(method='linear', limit_direction='both'))
    # print('보간 후 남은 결측값:\n', data[cols_to_fill].isna().sum())
    # return data
    return data

# 전처리
df   = preprocessing(df,   t, address, is_train=True)
known_stations = df['STN'].unique()
test = preprocessing(test, t, address,
                               is_train=False,
                               known_stations=known_stations)

le_m = LabelEncoder()
le_d = LabelEncoder()
df['month'] = le_m.fit_transform(df['month'])
df['day'] = le_d.fit_transform(df['day'])
test['month'] = le_m.transform(test['month'])
test['day'] = le_d.transform(test['day'])

print('전처리 완료')

# feature / target 정의
ordered_cols = ['Direction', 'time_period']
cat_cols     = [
                'station_number'
                , 'address'
               # , 'station_name'
               ] + ordered_cols
num_cols = [
    'HM','RN_DAY','RN_HR1','SI','TA','WD','WS'
    ,'STN'
    ,'sin_dom','cos_dom','sin_dow','cos_dow','sin_hod','cos_hod'
    ,'sin_wom','cos_wom','sin_woy','cos_woy','sin_doy','cos_doy'
    ,'day','day_of_year','hour'
    ,'is_day_before_holiday','is_day_after_holiday','is_holiday','is_weekend'
    ,'month','transfer','week_of_month','week_of_year','weekday','year'
    ,'신설역', '신규관측소'
]
feature_cols = num_cols + ordered_cols + cat_cols
target_col   = 'Congestion'

results = []
final_results = []
print('완료')

train셋 공지사항 이상치  0.009340271185165039 % 제거
전처리 완료
완료


- 메모리 효율화

In [4]:
del df23
del df22
del df21
del t
del address

# ET

In [5]:
import os
import time
import gc
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor

def evaluate_model(name, model, line, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    elapsed = time.time() - t0

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    return {'Model': name, 'Line': line, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}

results = []
final_results = []

for line in df['Line'].unique():
    print(f"\n📘 [Line {line}] 모델 학습 중...")

    df_line = df[df['Line'] == line].copy()
    test_line = test[test['Line'] == line].copy()

    # 범주형 지정
    for col in ['STN', 'address']:
        df_line[col] = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    df_line = df_line.sort_values('TM')

    # feature, target
    X = df_line[feature_cols]
    y = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    # 인코딩
    X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    # 컬럼 정렬 및 누락된 컬럼 채움
    X_enc = X_enc.loc[:, ~X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    # 정규화 후 float32로 다운캐스트
    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_enc).astype(np.float32)
    X_test_scaled = mm.transform(X_test_enc).astype(np.float32)

    # 시간 순 분할
    split_idx = int(len(X_scaled) * 0.8)
    X_train = X_scaled[:split_idx]
    X_val = X_scaled[split_idx:]
    y_train = y.values[:split_idx]
    y_val = y.values[split_idx:]

    # 모델 정의 및 학습
    et = ExtraTreesRegressor(
        n_estimators=1500,
        max_depth=12,
        max_features=0.9,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42
    )
    res = evaluate_model('ET', et, line, X_train, y_train, X_val, y_val)
    results.append(res)
    print('✅ ExtraTrees 결과:', res)

    # 예측 및 저장
    y_pred = np.round(et.predict(X_test_scaled)).astype(int)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    temp = test_line[['hour', 'Line', 'station_number']].copy()
    temp['예측혼잡도'] = y_pred
    final_results.append(temp)

    # 모델 저장 (압축)
    os.makedirs('./models', exist_ok=True)
    joblib.dump(et, f"./models/extratrees_line{line}.pkl", compress=3)

    # 메모리 정리
    del df_line, test_line, X, y, X_test
    del X_enc, X_test_enc, X_scaled, X_test_scaled
    del X_train, X_val, y_train, y_val, et, y_pred, temp
    gc.collect()

# 결과 저장
final_df = pd.concat(final_results)
output_df = final_df[['예측혼잡도']].rename(columns={'예측혼잡도': 'Congestion'})
os.makedirs('./test', exist_ok=True)
output_df.to_csv('./test/250206-et.csv', index=False, encoding='utf-8')

# 성능 요약 출력
results_df = pd.DataFrame(results)
print("\n📊 전체 성능 요약:")
print(results_df)



📘 [Line 1] 모델 학습 중...
✅ ExtraTrees 결과: {'Model': 'ET', 'Line': 1, 'Time(s)': 7440.4131190776825, 'RMSE': 9.909858336973329, 'R2': 0.7665104761477816}

📘 [Line 2] 모델 학습 중...
✅ ExtraTrees 결과: {'Model': 'ET', 'Line': 2, 'Time(s)': 4069.8190853595734, 'RMSE': 9.054434823633885, 'R2': 0.8210492448064146}

📘 [Line 3] 모델 학습 중...



KeyboardInterrupt



In [None]:
gap = pd.read_csv('./test/250206-et.csv') # 내 데터터
gap.shape

import os
제출 = pd.read_csv('./test/minjeong.csv') # 민정언니 데이터

import numpy as np

# 마스크 생성
from sklearn.metrics import root_mean_squared_error
rmse = mean_squared_error(
    제출['Congestion'],
    gap['Congestion'],
    squared=False      # squared=False 하면 RMSE 를 직접 계산해 줌
)

print(f"RMSE: {rmse:.4f}")

검증결과 7.714

# XGB

In [6]:
import os
import time
import gc
import numpy as np
import pandas as pd
import joblib
import random
from tqdm import tqdm
from itertools import product
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

def evaluate_model(name, model, line, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    y_pred = model.predict(X_val)
    elapsed = time.time() - t0

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    return {'Model': name, 'Line': line, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}

# 랜덤서치용 하이퍼파라미터 그리드
param_grid = {
    'n_estimators': [1000, 1500, 2000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [8, 10, 12],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0, 0.3],
    'reg_lambda': [0.5, 0.8],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.5],
}

# 랜덤 조합 15개 샘플링
param_list = list(product(*param_grid.values()))
random.seed(42)
sampled_params = random.sample(param_list, 30)
# sampled_params=param_list

results = []
final_results = []

for line in df['Line'].unique():
    print(f"\n📘 [Line {line}] 모델 학습 중...")

    df_line = df[df['Line'] == line].copy()
    test_line = test[test['Line'] == line].copy()

    for col in ['STN', 'address']:
        df_line[col] = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    df_line = df_line.sort_values('TM')

    X = df_line[feature_cols]
    y = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)
    X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_enc).astype(np.float32)
    X_test_scaled = mm.transform(X_test_enc).astype(np.float32)

    split_idx = int(len(X_scaled) * 0.8)
    X_train = X_scaled[:split_idx]
    X_val = X_scaled[split_idx:]
    y_train = y.values[:split_idx]
    y_val = y.values[split_idx:]

    # ✅ 수동 랜덤서치 (교차검증 없이 val set으로 평가)
    best_model = None
    best_rmse = float('inf')

    for params in sampled_params:
        model = XGBRegressor(
            n_estimators=params[0],
            learning_rate=params[1],
            max_depth=params[2],
            subsample=params[3],
            colsample_bytree=params[4],
            reg_alpha=params[5],
            reg_lambda=params[6],
            min_child_weight=params[7],
            gamma=params[8],
            tree_method='hist',
            eval_metric='rmse',
            early_stopping_rounds=30,
            random_state=42,
            verbosity=0
        )

        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)
        y_val_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

    # 평가
    res = evaluate_model('XGB', best_model, line, X_train, y_train, X_val, y_val)
    results.append(res)
    print('✅ XGB 결과:', res)
    print('🧪 최적 파라미터:', best_model.get_params())

    # 예측
    y_pred = np.round(best_model.predict(X_test_scaled)).astype(int)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    temp = test_line[['hour', 'Line', 'station_number']].copy()
    temp['예측혼잡도'] = y_pred
    final_results.append(temp)

    os.makedirs('./models', exist_ok=True)
    joblib.dump(best_model, f"./models/xgb_line{line}.pkl", compress=3)

    del df_line, test_line, X, y, X_test
    del X_enc, X_test_enc, X_scaled, X_test_scaled
    del X_train, X_val, y_train, y_val, best_model, y_pred, temp
    gc.collect()

# 결과 저장
final_df = pd.concat(final_results)
output_df = final_df[['예측혼잡도']].rename(columns={'예측혼잡도': 'Congestion'})
os.makedirs('./test', exist_ok=True)
output_df.to_csv('./test/250206-xgb.csv', index=False, encoding='utf-8')

results_df = pd.DataFrame(results)
print("\n📊 전체 성능 요약:")
print(results_df)



📘 [Line 1] 모델 학습 중...
[0]	validation_0-rmse:20.67950
[100]	validation_0-rmse:11.89829
[200]	validation_0-rmse:7.97158
[300]	validation_0-rmse:6.07793
[400]	validation_0-rmse:5.31176
[500]	validation_0-rmse:5.04130
[600]	validation_0-rmse:4.89391
[700]	validation_0-rmse:4.76490
[800]	validation_0-rmse:4.65450
[900]	validation_0-rmse:4.55036
[1000]	validation_0-rmse:4.47896
[1100]	validation_0-rmse:4.41607
[1200]	validation_0-rmse:4.36560
[1300]	validation_0-rmse:4.32653
[1400]	validation_0-rmse:4.29663
[1500]	validation_0-rmse:4.27170
[1600]	validation_0-rmse:4.25128
[1700]	validation_0-rmse:4.23570
[1800]	validation_0-rmse:4.22040
[1900]	validation_0-rmse:4.20593
[1999]	validation_0-rmse:4.19329
[0]	validation_0-rmse:20.36684
[100]	validation_0-rmse:7.83633
[200]	validation_0-rmse:6.56234
[300]	validation_0-rmse:5.85892
[400]	validation_0-rmse:5.45772
[500]	validation_0-rmse:5.19289
[600]	validation_0-rmse:4.99103
[700]	validation_0-rmse:4.83981
[800]	validation_0-rmse:4.69203
[900]	v

In [7]:
gap = pd.read_csv('./test/250206-xgb.csv') # 내 데터터
gap.shape

import os
제출 = pd.read_csv('./test/minjeong.csv') # 민정언니 데이터

import numpy as np

# 마스크 생성
from sklearn.metrics import root_mean_squared_error
rmse = mean_squared_error(
    제출['Congestion'],
    gap['Congestion'],
    squared=False      # squared=False 하면 RMSE 를 직접 계산해 줌
)

print(f"RMSE: {rmse:.4f}")

RMSE: 4.3790


In [8]:
import joblib
import os
import pandas as pd

best_params_summary = []

for line in range(1, 9):  
    model_path = f'./models/xgb_line{line}.pkl'
    if os.path.exists(model_path):
        model = joblib.load(model_path)
        params = model.get_params()
        best_params_summary.append({
            'Line': line,
            **{k: v for k, v in params.items() if k in param_grid}
        })

# 결과 보기
df_params = pd.DataFrame(best_params_summary)
df_params

Unnamed: 0,Line,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,subsample
0,1,0.8,0.0,0.05,12,3,2000,0.3,0.8,0.9
1,2,0.9,0.5,0.05,8,1,1000,0.0,0.5,0.8
2,3,0.8,0.0,0.05,10,1,1500,0.0,0.5,0.9
3,4,0.8,0.0,0.05,12,3,2000,0.3,0.8,0.9
4,5,0.9,0.0,0.05,10,3,2000,0.3,0.5,0.9
5,6,0.8,0.0,0.05,8,1,1000,0.0,0.8,0.9
6,7,0.8,0.0,0.1,10,3,1500,0.3,0.5,0.8
7,8,0.9,0.0,0.05,10,1,2000,0.3,0.8,0.8


# XGB - 성능 좋게 나온 파라미터로 다시 돌리기

In [33]:
best_params = df_params.loc[[0,4, 7],'colsample_bytree	gamma	learning_rate	max_depth	min_child_weight	n_estimators	reg_alpha	reg_lambda	subsample'.split()]
best_params

Unnamed: 0,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,subsample
0,0.8,0.0,0.05,12,3,2000,0.3,0.8,0.9
4,0.9,0.0,0.05,10,3,2000,0.3,0.5,0.9
7,0.9,0.0,0.05,10,1,2000,0.3,0.8,0.8


In [None]:
param_grid = {
    'n_estimators': [2000, 2500, 3000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [10, 12, 14, 16],
    'subsample': [.9, 1,0],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0.3, 0.4, 0.5],
    'reg_lambda': [0.5, 0.8, 1.0],
    'min_child_weight': [1, 3],
    'gamma': [0],
}

In [5]:
import os
import time
import gc
import numpy as np
import pandas as pd
import joblib
import random
from tqdm import tqdm
from itertools import product
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

def evaluate_model(name, model, line, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    y_pred = model.predict(X_val)
    elapsed = time.time() - t0

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    return {'Model': name, 'Line': line, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}


results = []
final_results = []

for line in df['Line'].unique():
    print(f"\n📘 [Line {line}] 모델 학습 중...")

    df_line = df[df['Line'] == line].copy()
    test_line = test[test['Line'] == line].copy()

    for col in ['STN', 'address']:
        df_line[col] = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    df_line = df_line.sort_values('TM')

    X = df_line[feature_cols]
    y = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    X_enc = X_enc.loc[:, ~X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_enc).astype(np.float32)
    X_test_scaled = mm.transform(X_test_enc).astype(np.float32)

    split_idx = int(len(X_scaled) * 0.8)
    X_train = X_scaled[:split_idx]
    X_val = X_scaled[split_idx:]
    y_train = y.values[:split_idx]
    y_val = y.values[split_idx:]

    # ✅ 수동 랜덤서치 (교차검증 없이 val set으로 평가)
    best_model = None
    best_rmse = float('inf')

    
        # ✅ 단일 모델 학습 (더 이상 반복 없음)
    best_model = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.05,
        max_depth=12,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_alpha=0.3,
        reg_lambda=0.8,
        min_child_weight=3,
        gamma=0,
        tree_method='hist',
        eval_metric='rmse',
        early_stopping_rounds=30,
        random_state=42,
        verbosity=0
    )

    best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

    # 평가
    res = evaluate_model('XGB', best_model, line, X_train, y_train, X_val, y_val)
    results.append(res)
    print('✅ XGB 결과:', res)
    #print('🧪 파라미터:', best_model.get_params())

    # 예측
    y_pred = np.round(best_model.predict(X_test_scaled)).astype(int)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    temp = test_line[['hour', 'Line', 'station_number']].copy()
    temp['예측혼잡도'] = y_pred
    final_results.append(temp)

    os.makedirs('./models', exist_ok=True)
    joblib.dump(best_model, f"./models/best_parmas1_xgb_line{line}.pkl", compress=3)

    del df_line, test_line, X, y, X_test
    del X_enc, X_test_enc, X_scaled, X_test_scaled
    del X_train, X_val, y_train, y_val, best_model, y_pred, temp
    gc.collect()

# 결과 저장
final_df = pd.concat(final_results)
output_df = final_df[['예측혼잡도']].rename(columns={'예측혼잡도': 'Congestion'})
os.makedirs('./test', exist_ok=True)
output_df.to_csv('./test/250206-xgb-best1.csv', index=False, encoding='utf-8')

results_df = pd.DataFrame(results)
print("\n📊 전체 성능 요약:")
print(results_df)


📘 [Line 1] 모델 학습 중...
[0]	validation_0-rmse:20.27514
[100]	validation_0-rmse:5.05032
[200]	validation_0-rmse:4.43756
[300]	validation_0-rmse:4.18801
[400]	validation_0-rmse:4.12169
[500]	validation_0-rmse:4.08802
[600]	validation_0-rmse:4.06709
[700]	validation_0-rmse:4.05029
[800]	validation_0-rmse:4.04292
[830]	validation_0-rmse:4.04401
✅ XGB 결과: {'Model': 'XGB', 'Line': 1, 'Time(s)': 691.4284400939941, 'RMSE': 4.042833518966586, 'R2': 0.9611398723725793}

📘 [Line 2] 모델 학습 중...
[0]	validation_0-rmse:21.42276
[100]	validation_0-rmse:9.27319
[200]	validation_0-rmse:8.67868
[300]	validation_0-rmse:8.50251
[400]	validation_0-rmse:8.44496
[500]	validation_0-rmse:8.40827
[600]	validation_0-rmse:8.39077
[700]	validation_0-rmse:8.38064
[716]	validation_0-rmse:8.38215
✅ XGB 결과: {'Model': 'XGB', 'Line': 2, 'Time(s)': 471.73917841911316, 'RMSE': 8.378386346861745, 'R2': 0.8467743023797729}

📘 [Line 3] 모델 학습 중...
[0]	validation_0-rmse:19.34543
[100]	validation_0-rmse:4.71453
[200]	validation_0-

In [6]:
gap = pd.read_csv('./test/250206-xgb-best1.csv') # 내 데터터
gap.shape

import os
제출 = pd.read_csv('./test/minjeong.csv') # 민정언니 데이터

import numpy as np

# 마스크 생성
from sklearn.metrics import root_mean_squared_error
rmse = mean_squared_error(
    제출['Congestion'],
    gap['Congestion'],
    squared=False      # squared=False 하면 RMSE 를 직접 계산해 줌
)

print(f"RMSE: {rmse:.4f}")

RMSE: 4.5445


# LGBM

In [5]:
import os
import time
import gc
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor  # ✅ LightGBM으로 변경

def evaluate_model(name, model, line, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    elapsed = time.time() - t0

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    return {'Model': name, 'Line': line, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}

results = []
final_results = []

for line in df['Line'].unique():
    print(f"\n📘 [Line {line}] 모델 학습 중...")

    df_line = df[df['Line'] == line].copy()
    test_line = test[test['Line'] == line].copy()

    for col in ['STN', 'address']:
        df_line[col] = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    df_line = df_line.sort_values('TM')

    X = df_line[feature_cols]
    y = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    X_enc = X_enc.loc[:, ~X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    mm = MinMaxScaler()
    X_scaled = mm.fit_transform(X_enc).astype(np.float32)
    X_test_scaled = mm.transform(X_test_enc).astype(np.float32)

    split_idx = int(len(X_scaled) * 0.8)
    X_train = X_scaled[:split_idx]
    X_val = X_scaled[split_idx:]
    y_train = y.values[:split_idx]
    y_val = y.values[split_idx:]

    # ✅ LightGBM 모델 정의
    model = LGBMRegressor(
                            n_estimators=1500,
                            learning_rate=0.01,
                            max_depth=12,
                            subsample=0.9,
                            colsample_bytree=0.9,
                            reg_alpha=0.3,
                            reg_lambda=0.8,
                            min_child_samples=20,
                            random_state=42
                        )

    res = evaluate_model('LGBM', model, line, X_train, y_train, X_val, y_val)
    results.append(res)
    print('✅ LGBM 결과:', res)

    y_pred = np.round(model.predict(X_test_scaled)).astype(int)
    y_pred = np.where(y_pred < 0, 0, y_pred)
    temp = test_line[['hour', 'Line', 'station_number']].copy()
    temp['예측혼잡도'] = y_pred
    final_results.append(temp)

    os.makedirs('./models', exist_ok=True)
    joblib.dump(model, f"./models/lgbm_line{line}.pkl", compress=3)

    del df_line, test_line, X, y, X_test
    del X_enc, X_test_enc, X_scaled, X_test_scaled
    del X_train, X_val, y_train, y_val, model, y_pred, temp
    gc.collect()

final_df = pd.concat(final_results)
output_df = final_df[['예측혼잡도']].rename(columns={'예측혼잡도': 'Congestion'})
os.makedirs('./test', exist_ok=True)
output_df.to_csv('./test/250206-lgbm.csv', index=False, encoding='utf-8')

results_df = pd.DataFrame(results)
print("\n📊 전체 성능 요약:")
print(results_df)



📘 [Line 1] 모델 학습 중...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.302261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2518
[LightGBM] [Info] Number of data points in the train set: 2251468, number of used features: 113
[LightGBM] [Info] Start training from score 18.002817
✅ LGBM 결과: {'Model': 'LGBM', 'Line': 1, 'Time(s)': 193.3726589679718, 'RMSE': 7.496302841802039, 'R2': 0.8663937564784803}

📘 [Line 2] 모델 학습 중...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.218242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2448
[LightGBM] [Info] Number of data points in the train set: 1596672, number of used features: 96
[LightGBM] [Info] Start training from score 28.565433
✅ LGBM 결과: {'Model': 'LGBM', 'Line': 2, 'Time(s)': 137.63949418067932, 'RMSE': 9.388149707343784, 'R2': 0.8076151564277432}

📘 [Line 3] 모델 학

In [7]:
gap = pd.read_csv('./test/250206-lgbm.csv') # 내 데터터
gap.shape

import os
제출 = pd.read_csv('./test/minjeong.csv') # 민정언니 데이터

import numpy as np

# 마스크 생성
from sklearn.metrics import root_mean_squared_error
rmse = mean_squared_error(
    제출['Congestion'],
    gap['Congestion'],
    squared=False      # squared=False 하면 RMSE 를 직접 계산해 줌
)

print(f"RMSE: {rmse:.4f}")

RMSE: 7.4661
