# モデル学習

## Library Import

In [24]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold

import optuna
import lightgbm as lgb
from catboost import CatBoostRegressor

pd.set_option('display.max_columns', 200)

In [25]:
# Input/output locations, given as paths relative to the working directory.
ROOT_DIR = '../input/'
data_definition_path = ROOT_DIR + 'data_definition.xlsx'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
oof_path = '../output/oof/'
fi_path = '../output/fi/'

# Script version numbers. create_tbl_ver is embedded in the name of the
# intermediate parquet file that prepare_training_inputs() reads.
create_tbl_ver = 2
training_ver = 8

In [26]:
# Property types to process; the commented-out entries are currently disabled.
property_types = [
    'residential',
    # 'house',
    # 'other'
]

alg = 'lgb'     # NOTE: the light version of this notebook assumes lgb (cat would need a separate implementation)
# alg = 'cat'

In [27]:
# Column-name constants for the training table.
date_col = 'target_ym'
target_col = 'money_room'
year_col = 'target_year'

## データ分割

In [28]:
# --- Tokyo's 23 special wards ---
TOKYO_23 = [
    '千代田区', '中央区', '港区', '新宿区', '文京区', '台東区',
    '墨田区', '江東区', '品川区', '目黒区', '大田区', '世田谷区',
    '渋谷区', '中野区', '杉並区', '豊島区', '北区', '荒川区',
    '板橋区', '練馬区', '足立区', '葛飾区', '江戸川区'
]

# --- Ordinance-designated cities ---
SEIREI_CITIES = [
    '札幌市', '仙台市', 'さいたま市', '千葉市', '横浜市', '川崎市', '相模原市',
    '新潟市', '静岡市', '浜松市', '名古屋市',
    '京都市', '大阪市', '堺市', '神戸市',
    '岡山市', '広島市', '北九州市', '福岡市', '熊本市'
]

# --- Greater Tokyo area (prefectures) ---
CAPITAL_PREFS = ['東京都', '神奈川県', '埼玉県', '千葉県']

# --- Prefectural capitals (municipality names only, no prefecture) ---
PREF_CAPITALS = [
    '札幌市','青森市','盛岡市','仙台市','秋田市','山形市','福島市',
    '水戸市','宇都宮市','前橋市','さいたま市','千葉市','新宿区',
    '横浜市','新潟市','富山市','金沢市','福井市','甲府市','長野市',
    '岐阜市','静岡市','名古屋市','津市','大津市','京都市','大阪市',
    '神戸市','奈良市','和歌山市','鳥取市','松江市','岡山市','広島市',
    '山口市','徳島市','高松市','松山市','高知市','福岡市','佐賀市',
    '長崎市','熊本市','大分市','宮崎市','鹿児島市','那覇市'
]

## 関数

#### 読み込み

In [29]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose column names have spaces replaced by '_'.

    The input frame is left untouched.
    """
    renamed_labels = [label.replace(' ', '_') for label in df.columns]
    result = df.copy()
    result.columns = renamed_labels
    return result

In [30]:
def prepare_training_inputs(
    target_model: str,
    alg: str,
):
    """Load the intermediate training table and derive feature lists and row splits.

    Reads ``train_df_{target_model}_v{create_tbl_ver}.parquet`` from
    ``intermediate_path`` and normalises its column names.

    Args:
        target_model: Property type; 'house' and 'residential' get a
            dedicated row split, anything else gets no split.
        alg: Algorithm name ('lgb' or another value).

    Returns:
        Tuple ``(train_df, fe_cols, cat_cols_use, idx_dict)`` where
        ``fe_cols`` are all columns except the target and the split-key
        columns, ``cat_cols_use`` are the categorical feature columns, and
        ``idx_dict`` maps a split name to the row index of that split
        (``None`` when ``target_model`` has no dedicated split).
    """
    # 1) Load the table and normalise the column names.
    parquet_file = f'{intermediate_path}train_df_{target_model}_v{create_tbl_ver}.parquet'
    train_df = normalize_columns(pd.read_parquet(parquet_file))

    # 2) Feature columns: everything except the target and the split keys.
    idx_key_cols = [
        'Prefecture_name',
        'City/town/village_name',
        'zone_residential_rank',
    ]
    excluded = {target_col, *idx_key_cols}
    fe_cols = [col for col in train_df.columns.to_list() if col not in excluded]

    # 3) Categorical columns (treated as categories for lgb as well):
    #    the known candidates first, then any remaining object-dtype column.
    known_categoricals = (
        'building_category', 'land_area_kind', 'walk_distance_bin', 'building_land_chimoku',
        'land_chisei', 'land_road_cond', 'access_zone', 'fireproof_x_structure', 'structure_group',
    )
    cat_cols = [col for col in known_categoricals if col in fe_cols]
    for col in train_df[fe_cols].select_dtypes(['object']).columns.tolist():
        if col not in cat_cols:
            cat_cols.append(col)

    cat_cols_use = (
        [col for col in cat_cols if col in train_df.columns]
        if alg == 'lgb'
        else cat_cols
    )

    # 4) Row splits.
    if target_model == 'house':
        # Split by residential-zone rank; rank 0 and missing ranks are
        # grouped together with ranks 3 and 4.
        rank = train_df['zone_residential_rank']
        idx_dict = {
            'low': train_df.index[rank == 1],
            'mid': train_df.index[rank == 2],
            'high': train_df.index[rank.isin([3, 4, 0]) | rank.isna()],
        }
    elif target_model == 'residential':
        # The urban split is built from train_df only, so it does not
        # depend on any other property type.
        pref = train_df['Prefecture_name']
        city = train_df['City/town/village_name']
        in_tokyo_23 = (pref == '東京都') & city.isin(TOKYO_23)

        main_city = train_df.index[in_tokyo_23 | city.isin(['大阪市', '名古屋市'])]
        not_main = ~train_df.index.isin(main_city)

        mid_candidates = (
            # Greater Tokyo area, excluding the 23 wards
            (pref.isin(CAPITAL_PREFS) & ~in_tokyo_23)
            # ordinance-designated cities
            | city.isin(SEIREI_CITIES)
            # prefectural capitals
            | city.isin(PREF_CAPITALS)
        )
        mid_city = train_df.index[mid_candidates & not_main]

        other = train_df.index[not_main & ~train_df.index.isin(mid_city)]

        idx_dict = {
            'main_city': main_city,
            'mid_city': mid_city,
            'other': other,
        }
    else:
        idx_dict = None

    return train_df, fe_cols, cat_cols_use, idx_dict


#### パラメータチューニングの探索範囲

In [31]:
# Default hyperparameters per algorithm. The 'lgb' entry is unpacked into
# lgb.LGBMRegressor and the 'cat' entry into CatBoostRegressor
# (see run_cv_by_separate).
BASE_PARAMS_BY_MODEL = {
  'lgb': {
    'objective': 'regression',
    'n_estimators': 20000,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.0,
    'reg_lambda': 3.0,
    'min_child_samples': 50,
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1
  },
  'cat': {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'iterations': 20000,
    'learning_rate': 0.03,
    'depth': 8,
    'l2_leaf_reg': 6.0,
    'random_strength': 1.0,
    'bagging_temperature': 0.5,
    'random_seed': 42,
    'verbose': False,
  }
}

In [32]:
# Search-space configuration for the two-stage LightGBM tuning, per property type.
#   'phase1': (low, high) range for each hyperparameter.
#   'phase2': band around the Phase 1 best value; '<name>_width' is an absolute
#             half-width (integer parameters) and '<name>_rel' a relative
#             half-width (float parameters). Bands are clamped to the Phase 1
#             range by _space_lgb_phase2.
LGB_2STAGE_CONF = {
    'residential': {
        'phase1': dict(
            num_leaves=(31, 127),
            max_depth=(4, 10),
            min_child_samples=(50, 200),
            learning_rate=(0.03, 0.12),
            subsample=(0.65, 0.95),
            colsample_bytree=(0.65, 0.95),
            reg_alpha=(0.0, 1.0),
            reg_lambda=(1.0, 20.0),
        ),
        'phase2': dict(
            num_leaves_width=32,
            max_depth_width=2,
            min_child_samples_width=40,
            learning_rate_rel=0.5,
            subsample_rel=0.15,
            colsample_bytree_rel=0.15,
            reg_alpha_rel=1.0,
            reg_lambda_rel=0.8,
        )
    },

    'house': {
        'phase1': dict(
            num_leaves=(31, 127),
            max_depth=(3, 9),
            min_child_samples=(80, 400),
            learning_rate=(0.02, 0.10),
            subsample=(0.65, 0.95),
            colsample_bytree=(0.65, 0.95),
            reg_alpha=(0.0, 2.0),
            reg_lambda=(2.0, 30.0),
        ),
        'phase2': dict(
            num_leaves_width=32,
            max_depth_width=2,
            min_child_samples_width=80,
            learning_rate_rel=0.6,
            subsample_rel=0.18,
            colsample_bytree_rel=0.18,
            reg_alpha_rel=1.0,
            reg_lambda_rel=0.9,
        )
    },
    'other': {
        # Phase 1: trade speed against hit rate (based on the original proposal)
        'phase1': dict(
            num_leaves=(16, 63),
            max_depth=(2, 8),
            min_child_samples=(50, 500),
            learning_rate=(0.02, 0.10),
            subsample=(0.70, 0.95),
            colsample_bytree=(0.70, 0.95),
            reg_alpha=(0.0, 3.0),          # searched from Phase 1 on, for consistency
            reg_lambda=(3.0, 25.0),
        ),
        # Phase 2: neighbourhood of the best, but kept loose (to avoid getting stuck)
        'phase2': dict(
            num_leaves_width=24,
            max_depth_width=2,
            min_child_samples_width=150,   # a wide band matters here
            learning_rate_rel=0.9,         # very wide band (close to a full re-search)
            subsample_rel=0.25,
            colsample_bytree_rel=0.25,
            reg_alpha_rel=1.3,
            reg_lambda_rel=1.0,
        )
    }
}


In [33]:
def _space_lgb_phase1(trial, property_type):
    """Sample Phase 1 LightGBM parameters for *property_type* from *trial*.

    Ranges come from ``LGB_2STAGE_CONF[property_type]['phase1']``;
    ``subsample_freq`` and ``max_bin`` are fixed values.
    """
    bounds = LGB_2STAGE_CONF[property_type]['phase1']

    def _draw_int(name):
        low, high = bounds[name]
        return trial.suggest_int(name, low, high)

    def _draw_float(name, log=False):
        low, high = bounds[name]
        return trial.suggest_float(name, low, high, log=log)

    # The suggest calls are issued in a fixed order so that a seeded sampler
    # draws the same values for the same parameters.
    space = {}
    space['num_leaves'] = _draw_int('num_leaves')
    space['max_depth'] = _draw_int('max_depth')
    space['min_child_samples'] = _draw_int('min_child_samples')
    space['learning_rate'] = _draw_float('learning_rate')
    space['subsample'] = _draw_float('subsample')
    space['subsample_freq'] = 1
    space['colsample_bytree'] = _draw_float('colsample_bytree')
    space['reg_alpha'] = _draw_float('reg_alpha')
    space['reg_lambda'] = _draw_float('reg_lambda', log=True)
    space['max_bin'] = 255
    return space


In [34]:
def _clamp_int(x, lo, hi):
    """Truncate *x* to an int and clamp it into [lo, hi]."""
    capped = min(hi, int(x))
    floored = max(lo, capped)
    return int(floored)


def _clamp_float(x, lo, hi):
    """Convert *x* to float and clamp it into [lo, hi]."""
    capped = min(hi, float(x))
    floored = max(lo, capped)
    return float(floored)


def _band_int(center, width, lo, hi):
    """Return (center - width, center + width), each end clamped into [lo, hi]."""
    lower = _clamp_int(center - width, lo, hi)
    upper = _clamp_int(center + width, lo, hi)
    return lower, upper


def _band_float(center, rel, lo, hi):
    """Return (center * (1 - rel), center * (1 + rel)), each end clamped into [lo, hi]."""
    lower = _clamp_float(center * (1 - rel), lo, hi)
    upper = _clamp_float(center * (1 + rel), lo, hi)
    return lower, upper

In [35]:
def _space_lgb_phase2(trial, best_params, property_type):
    """Sample Phase 2 LightGBM parameters in a band around *best_params*.

    Each band is centred on the Phase 1 best value, sized by
    ``LGB_2STAGE_CONF[property_type]['phase2']`` and clamped to the Phase 1
    range. ``reg_alpha`` / ``reg_lambda`` fall back to 0.0 / 3.0 when they are
    missing from *best_params*.
    """
    outer = LGB_2STAGE_CONF[property_type]['phase1']
    band = LGB_2STAGE_CONF[property_type]['phase2']

    # All bands are computed before the first suggest call.
    int_ranges = {}
    for key in ('num_leaves', 'max_depth', 'min_child_samples'):
        int_ranges[key] = _band_int(best_params[key], band[f'{key}_width'], *outer[key])

    float_ranges = {}
    for key in ('learning_rate', 'subsample', 'colsample_bytree'):
        float_ranges[key] = _band_float(best_params[key], band[f'{key}_rel'], *outer[key])
    for key, fallback in (('reg_alpha', 0.0), ('reg_lambda', 3.0)):
        float_ranges[key] = _band_float(best_params.get(key, fallback), band[f'{key}_rel'], *outer[key])

    # Suggest calls keep a fixed order so a seeded sampler stays reproducible.
    space = {}
    space['num_leaves'] = trial.suggest_int('num_leaves', *int_ranges['num_leaves'])
    space['max_depth'] = trial.suggest_int('max_depth', *int_ranges['max_depth'])
    space['min_child_samples'] = trial.suggest_int('min_child_samples', *int_ranges['min_child_samples'])
    space['learning_rate'] = trial.suggest_float('learning_rate', *float_ranges['learning_rate'])
    space['subsample'] = trial.suggest_float('subsample', *float_ranges['subsample'])
    space['subsample_freq'] = 1
    space['colsample_bytree'] = trial.suggest_float('colsample_bytree', *float_ranges['colsample_bytree'])
    space['reg_alpha'] = trial.suggest_float('reg_alpha', *float_ranges['reg_alpha'])
    space['reg_lambda'] = trial.suggest_float('reg_lambda', *float_ranges['reg_lambda'], log=True)
    space['max_bin'] = 255
    return space


#### CV実行

In [36]:
def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between *y_true* and *y_pred*."""
    abs_errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return float(np.mean(abs_errors))

def mape(y_true, y_pred, eps=1e-9):
    """Return the mean absolute percentage error as a fraction (not x100).

    The denominator is ``max(|y_true|, eps)`` so zero targets do not divide
    by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    safe_denominator = np.maximum(np.abs(actual), eps)
    ratios = np.abs((actual - predicted) / safe_denominator)
    return float(np.mean(ratios))

In [37]:
def _safe_take_index(idx: pd.Index, pos: np.ndarray) -> pd.Index:
    """Return a new Index holding the labels of *idx* at positions *pos*."""
    labels = idx.to_numpy()
    picked = labels[pos]
    return pd.Index(picked)

In [38]:
def _force_lgbm_train_categories(X: pd.DataFrame, cat_cols_use: list[str], na_token: str = 'NA') -> pd.DataFrame:
    """Return a copy of *X* with the listed columns cast to ``category`` dtype.

    Each listed column present in *X* is converted to pandas ``string``,
    missing values are replaced by *na_token*, and the result is cast to
    ``category``. Names not present in *X* are ignored.
    """
    converted = X.copy()
    present = [col for col in cat_cols_use if col in converted.columns]
    for col in present:
        as_text = converted[col].astype('string')
        converted[col] = as_text.fillna(na_token).astype('category')
    return converted

In [39]:
def _infer_te_source_col(te_col: str) -> str:
    """Return the source column of a target-encoded column ('xxx_te' -> 'xxx').

    Names without the '_te' suffix are returned unchanged.
    """
    suffix = '_te'
    if te_col.endswith(suffix):
        return te_col[:-len(suffix)]
    return te_col


def fit_target_encoding_map(
    s_cat: pd.Series,
    y: pd.Series,
    smoothing: float = 50.0,
    min_samples_leaf: int = 1,
) -> tuple[pd.Series, float]:
    """Build a smoothed target-encoding mapping for one categorical column.

    Args:
        s_cat: Category values.
        y: Target values, aligned with *s_cat* by index.
        smoothing: Weight of the global mean; larger values pull every
            category towards the global mean (guards against overfitting).
        min_samples_leaf: Categories seen fewer times than this are encoded
            as the global mean.

    Returns:
        ``(mapping, prior)`` where ``mapping`` is a float Series indexed by
        category and ``prior`` is the global mean of *y*.
    """
    categories = s_cat.astype('object')
    target = y.astype(float)
    prior = float(target.mean())

    frame = pd.DataFrame({'cat': categories, 'y': target})
    per_category = frame.groupby('cat')['y'].agg(['mean', 'count'])
    cat_mean = per_category['mean']
    cat_count = per_category['count']

    # Weighted blend of the category mean and the global mean.
    numerator = cat_mean * cat_count + prior * smoothing
    denominator = cat_count + smoothing
    encoded = numerator / denominator

    # Categories below min_samples_leaf fall back to the prior.
    if min_samples_leaf > 1:
        too_rare = cat_count < min_samples_leaf
        encoded = encoded.mask(too_rare, prior)

    return encoded.astype(float), prior


def apply_target_encoding(
    s_cat: pd.Series,
    mapping: pd.Series,
    prior: float,
) -> pd.Series:
    """Encode *s_cat* with *mapping*; categories without a mapping get *prior*.

    Returns a float Series with the same index as *s_cat*.
    """
    encoded = s_cat.astype('object').map(mapping)
    encoded = encoded.fillna(prior)
    return encoded.astype(float)


def recompute_te_for_fold(
    train_df: pd.DataFrame,
    tr_idx: pd.Index,
    apply_idx_list: list[pd.Index],
    te_cols: list[str],
    y_tr: pd.Series,  # expected to be log(y)
    smoothing: float = 50.0,
    min_samples_leaf: int = 1,
    verbose: bool = False,
) -> dict:
    """Re-fit target encodings on one fold and write them into *train_df* in place.

    For every column in *te_cols* the mapping is fitted on the rows *tr_idx*
    and then applied to each index in *apply_idx_list*, overwriting the TE
    column for those rows only.

    Args:
        train_df: Table holding both the source columns and the TE columns;
            it is modified in place.
        tr_idx: Rows used to fit the encoding.
        apply_idx_list: Row indexes whose TE values are overwritten.
        te_cols: TE column names ('xxx_te'); the source column is 'xxx'.
        y_tr: Target Series indexed like *train_df*.
        smoothing: Smoothing weight passed to ``fit_target_encoding_map``.
        min_samples_leaf: Passed to ``fit_target_encoding_map``.
        verbose: Print a line for every column that is skipped or defaulted.

    Returns:
        Dict mapping each processed TE column to
        ``{'src_col', 'mapping', 'prior'}``. TE columns whose source column
        is missing from *train_df* are skipped and not listed.
    """
    te_meta: dict[str, dict] = {}

    # The fold target and its mean (the prior) do not depend on the column.
    y_fold = y_tr.loc[tr_idx]
    prior = float(y_fold.mean())

    for te_col in te_cols:
        src_col = _infer_te_source_col(te_col)

        # A TE column without its source column cannot be recomputed: skip it.
        if src_col not in train_df.columns:
            if verbose:
                print(f'[TE] skip: src_col missing: {te_col} -> {src_col}')
            continue

        s_tr = train_df.loc[tr_idx, src_col]
        if s_tr.isna().all():
            # No category is observed on the training fold. Writing the prior
            # keeps stale encodings (fitted on other rows, possibly including
            # the rows being scored) from leaking into this fold.
            if verbose:
                print(f'[TE] all NA on train fold, using prior: {te_col} -> {src_col}')
            for idx in apply_idx_list:
                train_df.loc[idx, te_col] = prior
            te_meta[te_col] = {
                'src_col': src_col,
                'mapping': pd.Series(dtype=float),
                'prior': prior,
            }
            continue

        mapping, _ = fit_target_encoding_map(
            s_cat=s_tr,
            y=y_fold,
            smoothing=smoothing,
            min_samples_leaf=min_samples_leaf,
        )

        for idx in apply_idx_list:
            train_df.loc[idx, te_col] = apply_target_encoding(
                train_df.loc[idx, src_col],
                mapping,
                prior,
            ).values

        te_meta[te_col] = {'src_col': src_col, 'mapping': mapping, 'prior': prior}

    return te_meta

In [40]:
def run_cv_by_separate(
    train_df: pd.DataFrame,
    base_cols: list[str],
    cat_cols: list[str],
    target_col: str,
    year_col: str,
    base_params: dict,  # {'lgb':..., 'cat':...}
    alg: str,
    idx_dict: dict[str, pd.Index] | None = None,
    n_splits: int = 5,
    te_smoothing: float = 50.0,
    te_min_samples_leaf: int = 1,
    ho_year: int = 2022,
    cv_year_max: int = 2021,
    stage_name: str = 'CV',
    print_mape: bool = True,
):
    """Run grouped K-fold CV per row split and score a hold-out year.

    For every entry of *idx_dict*, rows with ``year_col <= cv_year_max`` form
    the CV set and rows with ``year_col == ho_year`` form the hold-out (HO)
    set. Folds come from ``GroupKFold`` grouped by the 'building_id' column.
    The target is modelled in log space. Target-encoded columns ('*_te' whose
    source column exists in *train_df*) are re-fitted on each fold's training
    rows and written back into *train_df* in place.

    Args:
        train_df: Training table; its TE columns are overwritten during CV.
        base_cols: Feature columns.
        cat_cols: Categorical feature columns; names not in *base_cols* are
            ignored.
        target_col: Target column (must be positive for the log transform).
        year_col: Column holding the year used for the CV / HO split.
        base_params: Dict with model parameters under the key named by *alg*.
        alg: 'lgb' or 'cat'.
        idx_dict: Split name -> row index; ``None`` means one split 'all'.
        n_splits: Number of CV folds.
        te_smoothing: Smoothing for the per-fold target encoding.
        te_min_samples_leaf: Minimum category count for the target encoding.
        ho_year: Year of the hold-out rows.
        cv_year_max: Last year included in the CV rows.
        stage_name: Label used in the progress messages.
        print_mape: Print a summary line per split when True.

    Returns:
        Dict with 'results_by_split' (split name -> metrics, OOF / HO
        predictions in log space, stacked feature importances, used columns
        and the CV / HO indexes) and 'bias_table_ho_log_final' (an empty
        DataFrame).

    Raises:
        ValueError: If *alg* is neither 'lgb' nor 'cat'.
    """
    y = train_df[target_col].astype(float)
    y_log = np.log(y)

    if idx_dict is None:
        idx_dict = {'all': train_df.index}

    te_cols_base = [
        c for c in base_cols
        if c.endswith('_te') and (_infer_te_source_col(c) in train_df.columns)
    ]

    results: dict[str, dict] = {}

    print(f'\n[{stage_name}] start: alg={alg}, n_splits={n_splits}, n_features={len(base_cols)}, n_te_cols={len(te_cols_base)}')

    for split_key, split_idx in idx_dict.items():
        idx_cv = split_idx.intersection(train_df.index[train_df[year_col] <= cv_year_max])
        idx_ho = split_idx.intersection(train_df.index[train_df[year_col] == ho_year])

        if len(idx_cv) == 0:
            print(f'[{stage_name}] split={split_key} skip (no CV rows)')
            continue

        has_ho = len(idx_ho) > 0

        print(f'\n[{stage_name}] split={split_key} | CV={len(idx_cv)} | HO={len(idx_ho)}')

        oof_pred_log = pd.Series(np.nan, index=idx_cv, dtype=float)
        # HO predictions are averaged over the folds.
        ho_pred_log_accum = pd.Series(0.0, index=idx_ho, dtype=float) if has_ho else None
        fi_list = []

        X_cv = train_df.loc[idx_cv, base_cols]
        y_cv_log = y_log.loc[idx_cv]
        groups_cv = train_df.loc[idx_cv, 'building_id']

        gkf = GroupKFold(n_splits=n_splits)

        for fold, (tr_pos, va_pos) in enumerate(gkf.split(X_cv, y_cv_log, groups_cv), 1):
            tr_idx = _safe_take_index(idx_cv, tr_pos)
            va_idx = _safe_take_index(idx_cv, va_pos)

            print(f'[{stage_name}] split={split_key} fold={fold}/{n_splits} | tr={len(tr_idx)} va={len(va_idx)}')

            # Recompute TE from this fold's training rows. This rewrites
            # train_df in place, so the feature frames must be sliced after it.
            if te_cols_base:
                apply_list = [tr_idx, va_idx]
                if has_ho:
                    apply_list.append(idx_ho)

                recompute_te_for_fold(
                    train_df=train_df,
                    tr_idx=tr_idx,
                    apply_idx_list=apply_list,
                    te_cols=te_cols_base,
                    y_tr=y_log,
                    smoothing=te_smoothing,
                    min_samples_leaf=te_min_samples_leaf,
                    verbose=False,
                )

            if alg == 'lgb':
                model = lgb.LGBMRegressor(**base_params['lgb'])
                X_train = train_df.loc[tr_idx, base_cols].copy()
                y_train = y_log.loc[tr_idx].to_numpy()
                X_valid = train_df.loc[va_idx, base_cols].copy()
                y_valid = y_log.loc[va_idx].to_numpy()
                X_ho = train_df.loc[idx_ho, base_cols]

                # Only pass categorical columns that actually exist in X.
                cat_cols_use_in_X = [c for c in (cat_cols or []) if c in X_train.columns]

                # Cast the categorical columns to category dtype with the same
                # preprocessing for training and prediction.
                if cat_cols_use_in_X:
                    X_train = _force_lgbm_train_categories(X_train, cat_cols_use_in_X, na_token='NA')
                    X_valid = _force_lgbm_train_categories(X_valid, cat_cols_use_in_X, na_token='NA')
                    X_ho = _force_lgbm_train_categories(X_ho, cat_cols_use_in_X, na_token='NA')

                model.fit(
                    X_train,
                    y_train,
                    categorical_feature=cat_cols_use_in_X,
                    eval_set=[(X_valid, y_valid)],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
                )
                oof_pred_log.loc[va_idx] = model.predict(X_valid)

                if has_ho:
                    ho_pred_log_accum += model.predict(X_ho) / n_splits

                fi_list.append(pd.DataFrame({'feature': base_cols, 'importance': model.feature_importances_}))

            elif alg == 'cat':
                X_tr = train_df.loc[tr_idx, base_cols].copy()
                X_va = train_df.loc[va_idx, base_cols].copy()

                # Only categorical columns present in X are converted and
                # declared to CatBoost; a name missing from X would make
                # CatBoost reject cat_features.
                cat_cols_in_X = [c for c in (cat_cols or []) if c in X_tr.columns]
                for c in cat_cols_in_X:
                    X_tr[c] = X_tr[c].astype('string').fillna('NA')
                    X_va[c] = X_va[c].astype('string').fillna('NA')

                model = CatBoostRegressor(**base_params['cat'], cat_features=cat_cols_in_X)
                model.fit(
                    X_tr, y_log.loc[tr_idx],
                    eval_set=(X_va, y_log.loc[va_idx]),
                    use_best_model=True,
                    verbose=False,
                )
                oof_pred_log.loc[va_idx] = model.predict(X_va)

                if has_ho:
                    X_ho = train_df.loc[idx_ho, base_cols].copy()
                    for c in cat_cols_in_X:
                        X_ho[c] = X_ho[c].astype('string').fillna('NA')
                    ho_pred_log_accum += model.predict(X_ho) / n_splits

                fi_list.append(pd.DataFrame({'feature': base_cols, 'importance': model.get_feature_importance()}))

            else:
                raise ValueError(f'alg must be lgb or cat, got: {alg}')

        # Metrics: MAE in log space, MAPE on the original scale.
        cv_mae_log = float(np.mean(np.abs(y_cv_log.values - oof_pred_log.values)))
        cv_mape = mape(y.loc[idx_cv].values, np.exp(oof_pred_log.values))

        ho_mape = None
        if has_ho:
            ho_mape = mape(y.loc[idx_ho].values, np.exp(ho_pred_log_accum.values))

        if print_mape:
            if has_ho:
                print(f'[{stage_name}] split={split_key} DONE | log-MAE={cv_mae_log:.6f} | OOF MAPE={cv_mape:.6f} | HO MAPE={ho_mape:.6f}')
            else:
                print(f'[{stage_name}] split={split_key} DONE | log-MAE={cv_mae_log:.6f} | OOF MAPE={cv_mape:.6f} | HO MAPE=NA')

        results[split_key] = {
            'cv_mae_log': cv_mae_log,
            'cv_mape': cv_mape,
            'ho_mape': ho_mape,
            'oof_pred_log': oof_pred_log,
            'ho_pred_log': ho_pred_log_accum if has_ho else None,
            'fi': pd.concat(fi_list, ignore_index=True) if fi_list else pd.DataFrame(),
            'used_cols': base_cols,
            'idx_cv': idx_cv,
            'idx_ho': idx_ho,
        }

    print(f'\n[{stage_name}] done.')

    return {
        'results_by_split': results,
        'bias_table_ho_log_final': pd.DataFrame(),  # results are only printed for now, so an empty table is fine
    }

#### 特徴量選択

In [41]:
def reduce_features_by_fi(
    fi_df: pd.DataFrame,
    base_cols: list[str],
    fi_drop_threshold: float = 1.0,
    fi_keep_topk: int | None = None,
) -> list[str]:
    if fi_df is None or fi_df.empty:
        return base_cols

    fi_mean = fi_df.groupby('feature', as_index=True)['importance'].mean()

    if fi_keep_topk is not None:
        keep = set(fi_mean.sort_values(ascending=False).head(fi_keep_topk).index.tolist())
    else:
        keep = set(fi_mean[fi_mean > fi_drop_threshold].index.tolist())

    return [c for c in base_cols if c in keep]

#### パラメータチューニング

In [42]:
def tune_params_light_two_stage_lgb(
    train_df: pd.DataFrame,
    idx_cv: pd.Index,
    idx_ho: pd.Index,
    base_cols: list[str],
    cat_cols: list[str],
    target_col: str,
    base_params: dict,          # BASE_PARAMS_BY_MODEL
    property_type: str,         # expected to be 'residential'
    n_trials_phase1: int = 20,
    n_trials_phase2: int = 30,

    hpo_n_splits: int = 3,
    hpo_n_estimators: int = 4000,
    es_rounds: int = 30,        # early-stopping patience, tightened during HPO only

    te_smoothing: float = 50.0,
    te_min_samples_leaf: int = 1,
    seed: int = 42,
):
    """Tune LightGBM parameters with a two-stage Optuna search.

    Phase 1 searches the ranges of ``LGB_2STAGE_CONF[property_type]['phase1']``;
    Phase 2 searches a band around the Phase 1 best parameters. Each trial is
    scored by the mean per-fold MAE in log space over a ``GroupKFold`` grouped
    by the 'building_id' column. Target-encoded columns are recomputed per
    fold, which overwrites them in *train_df* in place.

    Args:
        train_df: Training table (its TE columns are modified in place).
        idx_cv: Rows used for cross-validation.
        idx_ho: Hold-out rows; may be ``None`` or empty.
        base_cols: Feature columns.
        cat_cols: Categorical feature columns.
        target_col: Target column; modelled as ``log(target)``.
        base_params: Dict whose 'lgb' entry holds the base LightGBM parameters.
        property_type: Key into ``LGB_2STAGE_CONF``.
        n_trials_phase1: Number of Phase 1 trials.
        n_trials_phase2: Number of Phase 2 trials.
        hpo_n_splits: Number of folds used while tuning.
        hpo_n_estimators: ``n_estimators`` used inside the objective only.
        es_rounds: Early-stopping rounds used while tuning.
        te_smoothing: Smoothing for the per-fold target encoding.
        te_min_samples_leaf: Minimum category count for the target encoding.
        seed: Seed of the TPE sampler (same seed for both phases).

    Returns:
        ``base_params['lgb']`` updated with the Phase 2 best parameters
        (``n_estimators`` keeps its base value, not *hpo_n_estimators*).

    Raises:
        ValueError: If *property_type* is not a key of ``LGB_2STAGE_CONF``.
    """
    if property_type not in LGB_2STAGE_CONF:
        raise ValueError(f'unknown property_type: {property_type}')

    y = train_df[target_col].astype(float)
    y_log = np.log(y)

    X_cv_full = train_df.loc[idx_cv, base_cols]
    y_cv_full = y_log.loc[idx_cv]
    groups = train_df.loc[idx_cv, 'building_id']

    gkf = GroupKFold(n_splits=hpo_n_splits)

    has_ho = idx_ho is not None and len(idx_ho) > 0

    # TE columns are only recomputed when their source column exists.
    te_cols_base = [
        c for c in base_cols
        if c.endswith('_te') and (_infer_te_source_col(c) in train_df.columns)
    ]

    def _objective(trial: optuna.Trial, space_fn, best_params_ref=None) -> float:
        """Score one trial: mean fold MAE in log space (lower is better)."""
        params = dict(base_params['lgb'])
        if best_params_ref is None:
            params.update(space_fn(trial))
        else:
            params.update(space_fn(trial, best_params_ref))

        # Tuning-time overrides; these apply inside the objective only.
        params['n_estimators'] = hpo_n_estimators
        params['verbosity'] = -1

        oof_pred_log = pd.Series(np.nan, index=idx_cv, dtype=float)
        ho_pred_log_accum = pd.Series(0.0, index=idx_ho, dtype=float) if has_ho else None

        fold_scores = []

        for fold, (tr_pos, va_pos) in enumerate(gkf.split(X_cv_full, y_cv_full, groups), 1):
            tr_idx = _safe_take_index(idx_cv, tr_pos)
            va_idx = _safe_take_index(idx_cv, va_pos)

            # Recompute TE (train / validation / hold-out rows); this rewrites
            # train_df in place, so the frames below are sliced afterwards.
            if te_cols_base:
                apply_list = [tr_idx, va_idx]
                if has_ho:
                    apply_list.append(idx_ho)
                recompute_te_for_fold(
                    train_df=train_df,
                    tr_idx=tr_idx,
                    apply_idx_list=apply_list,
                    te_cols=te_cols_base,
                    y_tr=y_log,
                    smoothing=te_smoothing,
                    min_samples_leaf=te_min_samples_leaf,
                    verbose=False,
                )

            model = lgb.LGBMRegressor(**params)
            X_train = train_df.loc[tr_idx, base_cols].copy()
            y_train = y_log.loc[tr_idx].to_numpy()
            X_valid = train_df.loc[va_idx, base_cols].copy()
            y_valid = y_log.loc[va_idx].to_numpy()
            X_ho = train_df.loc[idx_ho, base_cols]

            # Only pass categorical columns that actually exist in X.
            cat_cols_use_in_X = [c for c in (cat_cols or []) if c in X_train.columns]

            # Cast the categorical columns to category dtype with the same
            # preprocessing for training and prediction.
            if cat_cols_use_in_X:
                X_train = _force_lgbm_train_categories(X_train, cat_cols_use_in_X, na_token='NA')
                X_valid = _force_lgbm_train_categories(X_valid, cat_cols_use_in_X, na_token='NA')
                X_ho = _force_lgbm_train_categories(X_ho, cat_cols_use_in_X, na_token='NA')


            model.fit(
                X_train,
                y_train,
                categorical_feature=cat_cols_use_in_X,
                eval_set=[(X_valid, y_valid)],
                eval_metric='rmse',
                callbacks=[lgb.early_stopping(stopping_rounds=es_rounds, verbose=False)],
            )

            pred_va = model.predict(X_valid)
            oof_pred_log.loc[va_idx] = pred_va

            fold_mae = float(np.mean(np.abs(y_log.loc[va_idx].values - pred_va)))
            fold_scores.append(fold_mae)

            # Pruner check: report the running mean after every fold and
            # stop the trial if the pruner asks for it.
            trial.report(float(np.mean(fold_scores)), step=fold)
            if trial.should_prune():
                raise optuna.TrialPruned()

            if has_ho:
                ho_pred_log_accum += model.predict(X_ho) / hpo_n_splits

        score = float(np.mean(fold_scores))

        # Keep the predictions so MAPE can be printed when the best trial is updated.
        trial.set_user_attr('oof_pred_log', oof_pred_log)
        if has_ho:
            trial.set_user_attr('ho_pred_log', ho_pred_log_accum)

        return score

    def _run_phase(phase_name: str, n_trials: int, space_fn, best_params_ref=None):
        """Create a study for one phase, optimise it and return the study."""
        print(f'\n[HPO-2STAGE] {phase_name} start: trials={n_trials}, folds={hpo_n_splits}, est={hpo_n_estimators}, es={es_rounds}')

        # Phase 2 uses a more aggressive pruner (fewer startup trials).
        if phase_name == 'PHASE2':
            pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1)
        else:
            pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=1, interval_steps=1)

        sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True, group=True)
        study = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)

        best_value_seen = None

        def _callback(study: optuna.Study, trial: optuna.Trial):
            """Print OOF / HO MAPE whenever the finished trial is a new best."""
            nonlocal best_value_seen
            # Only react when the trial that just finished is the current best.
            if study.best_trial.number != trial.number:
                return

            if best_value_seen is None or study.best_value < best_value_seen:
                best_value_seen = study.best_value

                oof_pred_log = trial.user_attrs.get('oof_pred_log')
                ho_pred_log = trial.user_attrs.get('ho_pred_log')

                oof_m = mape(y.loc[idx_cv].values, np.exp(oof_pred_log.values)) if oof_pred_log is not None else None
                ho_m = mape(y.loc[idx_ho].values, np.exp(ho_pred_log.values)) if (has_ho and ho_pred_log is not None) else None

                # NOTE(review): the '{oof_m:.6f}' format would fail if oof_m were None;
                # _objective always sets 'oof_pred_log' for a finished trial - confirm.
                print(f'[HPO-2STAGE] {phase_name} NEW BEST: trial={trial.number} logMAE={study.best_value:.6f} | OOF_MAPE={oof_m:.6f} | HO_MAPE={(ho_m if ho_m is not None else float("nan")):.6f}')

        study.optimize(
            lambda t: _objective(t, space_fn, best_params_ref),
            n_trials=n_trials,
            callbacks=[_callback],
        )

        print(f'[HPO-2STAGE] {phase_name} done: best_logMAE={study.best_value:.6f}')
        print(f'[HPO-2STAGE] {phase_name} best_params={study.best_params}')
        return study

    # Phase 1: search the Phase 1 ranges.
    study1 = _run_phase(
        'PHASE1',
        n_trials_phase1,
        lambda t: _space_lgb_phase1(t, property_type),
    )
    best1 = study1.best_params

    # Phase 2: search a band around the Phase 1 best parameters.
    study2 = _run_phase(
        'PHASE2',
        n_trials_phase2,
        lambda t, bp: _space_lgb_phase2(t, bp, property_type),
        best_params_ref=best1,
    )

    # Final best (adopt the Phase 2 best), layered on top of the base parameters.
    best = study2.best_params
    merged = dict(base_params['lgb'])
    merged.update(best)
    merged['verbosity'] = -1
    return merged


#### 最終モデル学習

In [43]:
def train_final_models_by_split(
    train_df: pd.DataFrame,
    idx_dict: dict[str, pd.Index] | None,
    feature_cols_by_split: dict[str, list[str]],
    cat_cols_by_split: dict[str, list[str]],
    target_col: str,
    alg: str,
    params_by_split: dict[str, dict],
    te_smoothing: float = 50.0,
    te_min_samples_leaf: int = 1,
):
    """Train one final model per split on the given feature columns.

    When the features contain target-encoded columns ('*_te'), the encoding
    is fitted on all rows of the split and applied to those same rows before
    training, which overwrites the TE columns of *train_df* in place. The
    models are trained on ``log(target)`` without a validation set.

    Args:
        train_df: Training table (its TE columns are modified in place).
        idx_dict: Split name -> row index; ``None`` means one split 'all'.
        feature_cols_by_split: Split name -> feature columns.
        cat_cols_by_split: Split name -> categorical columns; names not in
            the split's feature columns are ignored.
        target_col: Target column.
        alg: 'lgb' or 'cat'.
        params_by_split: Split name -> model parameters.
        te_smoothing: Smoothing for the target encoding.
        te_min_samples_leaf: Minimum category count for the target encoding.

    Returns:
        Dict mapping each split name to its fitted model.

    Raises:
        ValueError: If *alg* is neither 'lgb' nor 'cat'.
    """
    if idx_dict is None:
        idx_dict = {'all': train_df.index}

    y_log = np.log(train_df[target_col].astype(float))

    models_by_split = {}

    for split_key, idx in idx_dict.items():
        cols = feature_cols_by_split[split_key]
        cat_cols = cat_cols_by_split[split_key]
        te_cols = [c for c in cols if c.endswith('_te')]

        # For the final fit, TE is fitted on and applied to the whole split.
        if te_cols:
            recompute_te_for_fold(
                train_df=train_df,
                tr_idx=idx,
                apply_idx_list=[idx],
                te_cols=te_cols,
                y_tr=y_log,
                smoothing=te_smoothing,
                min_samples_leaf=te_min_samples_leaf,
                verbose=False,
            )

        if alg == 'lgb':
            model = lgb.LGBMRegressor(**params_by_split[split_key])
            X_train = train_df.loc[idx, cols].copy()
            y_train = y_log.loc[idx].to_numpy()

            # Only pass categorical columns that actually exist in X.
            cat_cols_use_in_X = [c for c in (cat_cols or []) if c in X_train.columns]

            # Cast the categorical columns to category dtype with the same
            # preprocessing for training and prediction.
            if cat_cols_use_in_X:
                X_train = _force_lgbm_train_categories(X_train, cat_cols_use_in_X, na_token='NA')

            model.fit(
                X_train,
                y_train,
                categorical_feature=cat_cols_use_in_X,
            )

            models_by_split[split_key] = model

        elif alg == 'cat':
            X = train_df.loc[idx, cols].copy()

            # Only categorical columns present in X are converted and declared
            # to CatBoost; a name missing from X would make CatBoost reject
            # cat_features.
            cat_cols_in_X = [c for c in (cat_cols or []) if c in X.columns]
            for c in cat_cols_in_X:
                X[c] = X[c].astype('string').fillna('NA')

            model = CatBoostRegressor(**params_by_split[split_key], cat_features=cat_cols_in_X)
            model.fit(X, y_log.loc[idx], verbose=False)
            models_by_split[split_key] = model

        else:
            raise ValueError(f'alg must be lgb or cat, got: {alg}')

    return models_by_split


#### パイプライン

In [44]:
def train_with_fs_and_hpo_by_separate_light(
    train_df: pd.DataFrame,
    base_cols: list[str],
    cat_cols: list[str],
    target_col: str,
    year_col: str,
    property_type: str,
    alg: str,
    idx_dict: dict[str, pd.Index] | None,
    base_params: dict,
    n_splits_fixed: int = 5,

    # TE
    te_smoothing: float = 50.0,
    te_min_samples_leaf: int = 1,

    # FS
    fi_drop_threshold: float = 1.0,
    fi_keep_topk: int | None = None,

    # HPO (two-stage)
    n_trials_phase1: int = 20,
    n_trials_phase2: int = 30,
    hpo_n_splits: int = 3,
    hpo_n_estimators: int = 4000,
    es_rounds: int = 30,

    # time split
    ho_year: int = 2022,
    cv_year_max: int = 2021,
):
    if idx_dict is None:
        idx_dict = {'all': train_df.index}

    if alg != 'lgb':
        raise ValueError('この軽量二段階実装はまずlgb対象です（catは別途実装）')

    print('\n' + '=' * 80)
    print(f'[PIPELINE-LIGHT] START | target_model={property_type} | alg={alg}')
    print('=' * 80)

    # Step1: FIXED CV（FI用）
    print('\n[PIPELINE-LIGHT] Step1: FIXED CV (for FI only)')
    cv_fixed = run_cv_by_separate(
        train_df=train_df,
        base_cols=base_cols,
        cat_cols=cat_cols,
        target_col=target_col,
        year_col=year_col,
        base_params=base_params,
        alg=alg,
        idx_dict=idx_dict,
        n_splits=n_splits_fixed,
        te_smoothing=te_smoothing,
        te_min_samples_leaf=te_min_samples_leaf,
        ho_year=ho_year,
        cv_year_max=cv_year_max,
        stage_name='CV_FIXED',
        print_mape=False,
    )

    # Step2: FS by FI
    print('\n[PIPELINE-LIGHT] Step2: Feature selection by FI')
    reduced_cols_by_split = {}
    cat_cols_by_split = {}
    for split_key, res in cv_fixed['results_by_split'].items():
        fi_df = res.get('fi')
        reduced_cols = reduce_features_by_fi(
            fi_df=fi_df,
            base_cols=base_cols,
            fi_drop_threshold=fi_drop_threshold,
            fi_keep_topk=fi_keep_topk,
        )
        cat_cols_use = [c for c in cat_cols if c in reduced_cols]
        reduced_cols_by_split[split_key] = reduced_cols
        cat_cols_by_split[split_key] = cat_cols_use
        print(f'  [FS] split={split_key} | {len(base_cols)} -> {len(reduced_cols)}, {len(cat_cols_use)}')

    # Step3: HPO（二段階 + Pruner + es_rounds=30）
    print('\n[PIPELINE-LIGHT] Step3: HPO 2-stage (Pruner + strong early stop)')

    best_params_by_split = {}
    for split_key, split_idx in idx_dict.items():
        idx_cv = split_idx.intersection(train_df.index[train_df[year_col] <= cv_year_max])
        idx_ho = split_idx.intersection(train_df.index[train_df[year_col] == ho_year])

        cols = reduced_cols_by_split[split_key]
        cat_cols = cat_cols_by_split[split_key]
        print(f'\n[PIPELINE-LIGHT] HPO split={split_key} | cv={len(idx_cv)} ho={len(idx_ho)} | n_features={len(cols)}')

        tuned_params = tune_params_light_two_stage_lgb(
            train_df=train_df,
            idx_cv=idx_cv,
            idx_ho=idx_ho,
            base_cols=cols,
            cat_cols=cat_cols,
            target_col=target_col,
            base_params=base_params,
            property_type=property_type,
            n_trials_phase1=n_trials_phase1,
            n_trials_phase2=n_trials_phase2,
            hpo_n_splits=hpo_n_splits,
            hpo_n_estimators=hpo_n_estimators,
            es_rounds=es_rounds,
            te_smoothing=te_smoothing,
            te_min_samples_leaf=te_min_samples_leaf,
        )
        best_params_by_split[split_key] = tuned_params

    # Step4: Final train（tuned再CVなし）
    print('\n[PIPELINE-LIGHT] Step4: Final training (no tuned re-CV)')
    final_models_by_split= train_final_models_by_split(
        train_df=train_df,
        idx_dict=idx_dict,
        feature_cols_by_split=reduced_cols_by_split,
        cat_cols_by_split=cat_cols_by_split,
        target_col=target_col,
        alg=alg,
        params_by_split=best_params_by_split,
        te_smoothing=te_smoothing,
        te_min_samples_leaf=te_min_samples_leaf,
    )
    print('[PIPELINE-LIGHT] Final training done.')

    print('\n' + '=' * 80)
    print('[PIPELINE-LIGHT] DONE')
    print('=' * 80)

    return {
        'cv_fixed': cv_fixed,
        'reduced_cols_by_split': reduced_cols_by_split,
        'cat_cols_by_split': cat_cols_by_split,
        'best_params_by_split': best_params_by_split,
        'final_models_by_split': final_models_by_split,
    }

## 関数の呼び出し

In [45]:
# Load the training inputs for the first entry of `property_types`.
# NOTE(review): the training loop in the next cell calls
# prepare_training_inputs again for every property type and rebinds the same
# four names, so this cell looks like it is only needed for interactive
# inspection — confirm before removing.
train_df, fe_cols, cat_cols, idx_dict = prepare_training_inputs(
    target_model=property_types[0],
    alg=alg,
)

In [46]:
# Pipeline results per run, keyed by (target_model, alg).
all_bundles = {}

for target_model in property_types:
    print('\n' + '#' * 100)
    print(f'[RUN] target_model={target_model} | alg={alg}')
    print('#' * 100)

    # Reload the prepared inputs for this property type; this rebinds the
    # notebook-level train_df / fe_cols / cat_cols / idx_dict on every pass.
    train_df, fe_cols, cat_cols, idx_dict = prepare_training_inputs(
        target_model=target_model,
        alg=alg,
    )

    # Every optional argument below is spelled out with the same value as the
    # default in train_with_fs_and_hpo_by_separate_light's signature, which
    # keeps the run configuration visible in this cell.
    # BASE_PARAMS_BY_MODEL is defined outside this cell — presumably keyed by
    # property type; TODO confirm how the helpers index into it.
    bundle = train_with_fs_and_hpo_by_separate_light(
        train_df=train_df,
        base_cols=fe_cols,
        cat_cols=cat_cols,
        target_col=target_col,
        year_col=year_col,
        property_type=target_model,
        alg=alg,
        idx_dict=idx_dict,
        base_params=BASE_PARAMS_BY_MODEL,

        # TE
        te_smoothing=50.0,
        te_min_samples_leaf=1,

        # FS
        fi_drop_threshold=1.0,
        fi_keep_topk=None,

        # HPO(two-stage)
        n_trials_phase1=20,
        n_trials_phase2=30,
        hpo_n_splits=3,
        hpo_n_estimators=4000,
        es_rounds=30,

        # Time split: rows with year == ho_year are the hold-out set, rows with
        # year <= cv_year_max are the CV set.
        ho_year=2022,
        cv_year_max=2021,
    )

    all_bundles[(target_model, alg)] = bundle


####################################################################################################
[RUN] target_model=residential | alg=lgb
####################################################################################################

[PIPELINE-LIGHT] START | target_model=residential | alg=lgb

[PIPELINE-LIGHT] Step1: FIXED CV (for FI only)

[CV_FIXED] start: alg=lgb, n_splits=5, n_features=365, n_te_cols=2

[CV_FIXED] split=main_city | CV=31627 | HO=11070
[CV_FIXED] split=main_city fold=1/5 | tr=25301 va=6326
[CV_FIXED] split=main_city fold=2/5 | tr=25301 va=6326
[CV_FIXED] split=main_city fold=3/5 | tr=25302 va=6325
[CV_FIXED] split=main_city fold=4/5 | tr=25302 va=6325
[CV_FIXED] split=main_city fold=5/5 | tr=25302 va=6325

[CV_FIXED] split=mid_city | CV=54265 | HO=15471
[CV_FIXED] split=mid_city fold=1/5 | tr=43412 va=10853
[CV_FIXED] split=mid_city fold=2/5 | tr=43412 va=10853
[CV_FIXED] split=mid_city fold=3/5 | tr=43412 va=10853
[CV_FIXED] split=mid_city fold=4/5 | tr=

[I 2026-01-09 07:13:28,798] A new study created in memory with name: no-name-d508181c-ea42-43e1-a9c5-b946f3f91f07



[CV_FIXED] done.

[PIPELINE-LIGHT] Step2: Feature selection by FI
  [FS] split=main_city | 365 -> 258, 5
  [FS] split=mid_city | 365 -> 280, 5
  [FS] split=other | 365 -> 279, 6

[PIPELINE-LIGHT] Step3: HPO 2-stage (Pruner + strong early stop)

[PIPELINE-LIGHT] HPO split=main_city | cv=31627 ho=11070 | n_features=258

[HPO-2STAGE] PHASE1 start: trials=20, folds=3, est=4000, es=30


[I 2026-01-09 07:13:40,777] Trial 0 finished with value: 0.12353618876655109 and parameters: {'num_leaves': 67, 'max_depth': 10, 'min_child_samples': 160, 'learning_rate': 0.08387926357773329, 'subsample': 0.696805592132731, 'colsample_bytree': 0.6967983561008608, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 13.39433470675048}. Best is trial 0 with value: 0.12353618876655109.


[HPO-2STAGE] PHASE1 NEW BEST: trial=0 logMAE=0.123536 | OOF_MAPE=0.125176 | HO_MAPE=0.124499


[I 2026-01-09 07:13:54,441] Trial 1 finished with value: 0.12537333208121818 and parameters: {'num_leaves': 89, 'max_depth': 8, 'min_child_samples': 53, 'learning_rate': 0.11729188669457949, 'subsample': 0.8997327922401265, 'colsample_bytree': 0.7137017332034828, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 1.7322667470546256}. Best is trial 0 with value: 0.12353618876655109.
[I 2026-01-09 07:14:12,408] Trial 2 finished with value: 0.12143331811092095 and parameters: {'num_leaves': 60, 'max_depth': 7, 'min_child_samples': 115, 'learning_rate': 0.056210622617823766, 'subsample': 0.8335558684167138, 'colsample_bytree': 0.6918481581956125, 'reg_alpha': 0.29214464853521815, 'reg_lambda': 2.9967309097101573}. Best is trial 2 with value: 0.12143331811092095.


[HPO-2STAGE] PHASE1 NEW BEST: trial=2 logMAE=0.121433 | OOF_MAPE=0.122948 | HO_MAPE=0.123250


[I 2026-01-09 07:14:29,340] Trial 3 finished with value: 0.12213392682183592 and parameters: {'num_leaves': 75, 'max_depth': 9, 'min_child_samples': 80, 'learning_rate': 0.07628109945722504, 'subsample': 0.8277243706586127, 'colsample_bytree': 0.6639351238159993, 'reg_alpha': 0.6075448519014384, 'reg_lambda': 1.6666983286066417}. Best is trial 2 with value: 0.12143331811092095.
[I 2026-01-09 07:14:39,811] Trial 4 finished with value: 0.12325661913230106 and parameters: {'num_leaves': 37, 'max_depth': 10, 'min_child_samples': 195, 'learning_rate': 0.1027557613304815, 'subsample': 0.7413841307520113, 'colsample_bytree': 0.6793016342019151, 'reg_alpha': 0.6842330265121569, 'reg_lambda': 3.738105868191796}. Best is trial 2 with value: 0.12143331811092095.
[I 2026-01-09 07:14:49,589] Trial 5 finished with value: 0.1245325361590387 and parameters: {'num_leaves': 42, 'max_depth': 7, 'min_child_samples': 55, 'learning_rate': 0.11183883618709038, 'subsample': 0.7276339944800051, 'colsample_bytr

[HPO-2STAGE] PHASE1 NEW BEST: trial=7 logMAE=0.120945 | OOF_MAPE=0.122437 | HO_MAPE=0.122780


[I 2026-01-09 07:15:25,774] Trial 8 finished with value: 0.12396853864021068 and parameters: {'num_leaves': 58, 'max_depth': 7, 'min_child_samples': 71, 'learning_rate': 0.10219772826786357, 'subsample': 0.6723651931039313, 'colsample_bytree': 0.9460660809801551, 'reg_alpha': 0.7722447692966574, 'reg_lambda': 1.8135730867783397}. Best is trial 7 with value: 0.12094471177874984.
[I 2026-01-09 07:15:37,657] Trial 9 finished with value: 0.12315150250392144 and parameters: {'num_leaves': 31, 'max_depth': 9, 'min_child_samples': 156, 'learning_rate': 0.09561064512368886, 'subsample': 0.8813811040057837, 'colsample_bytree': 0.6722133955202271, 'reg_alpha': 0.3584657285442726, 'reg_lambda': 1.414976157494142}. Best is trial 7 with value: 0.12094471177874984.
[I 2026-01-09 07:15:52,538] Trial 10 finished with value: 0.120692863373012 and parameters: {'num_leaves': 47, 'max_depth': 5, 'min_child_samples': 55, 'learning_rate': 0.050540451698769064, 'subsample': 0.6537610918690696, 'colsample_byt

[HPO-2STAGE] PHASE1 NEW BEST: trial=10 logMAE=0.120693 | OOF_MAPE=0.122210 | HO_MAPE=0.123309


[I 2026-01-09 07:16:08,100] Trial 11 finished with value: 0.12075656852627575 and parameters: {'num_leaves': 33, 'max_depth': 5, 'min_child_samples': 69, 'learning_rate': 0.05557330416752304, 'subsample': 0.7613062360755578, 'colsample_bytree': 0.7560461445705187, 'reg_alpha': 0.9009551686189577, 'reg_lambda': 3.1432862812561972}. Best is trial 10 with value: 0.120692863373012.
[I 2026-01-09 07:16:20,979] Trial 12 finished with value: 0.12151937046729493 and parameters: {'num_leaves': 72, 'max_depth': 6, 'min_child_samples': 93, 'learning_rate': 0.06337841838276713, 'subsample': 0.6595652648832914, 'colsample_bytree': 0.667259855326242, 'reg_alpha': 0.6060559890252085, 'reg_lambda': 1.3034044501969555}. Best is trial 10 with value: 0.120692863373012.
[I 2026-01-09 07:16:36,109] Trial 13 finished with value: 0.12091090268402295 and parameters: {'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 70, 'learning_rate': 0.03918997076277981, 'subsample': 0.6674808566760786, 'colsample_byt

[HPO-2STAGE] PHASE1 NEW BEST: trial=17 logMAE=0.120681 | OOF_MAPE=0.122182 | HO_MAPE=0.123413


[I 2026-01-09 07:17:28,093] Trial 18 pruned. 
[I 2026-01-09 07:17:46,336] Trial 19 finished with value: 0.12045594864585728 and parameters: {'num_leaves': 104, 'max_depth': 4, 'min_child_samples': 94, 'learning_rate': 0.03293805506756523, 'subsample': 0.6751457447050042, 'colsample_bytree': 0.7480848723782374, 'reg_alpha': 0.8972661621428522, 'reg_lambda': 1.4083606825059687}. Best is trial 19 with value: 0.12045594864585728.
[I 2026-01-09 07:17:46,341] A new study created in memory with name: no-name-c2434729-efa0-4086-a56e-c5b57ae61925


[HPO-2STAGE] PHASE1 NEW BEST: trial=19 logMAE=0.120456 | OOF_MAPE=0.121975 | HO_MAPE=0.123895
[HPO-2STAGE] PHASE1 done: best_logMAE=0.120456
[HPO-2STAGE] PHASE1 best_params={'num_leaves': 104, 'max_depth': 4, 'min_child_samples': 94, 'learning_rate': 0.03293805506756523, 'subsample': 0.6751457447050042, 'colsample_bytree': 0.7480848723782374, 'reg_alpha': 0.8972661621428522, 'reg_lambda': 1.4083606825059687}

[HPO-2STAGE] PHASE2 start: trials=30, folds=3, est=4000, es=30


[I 2026-01-09 07:18:02,031] Trial 0 finished with value: 0.12061297439722268 and parameters: {'num_leaves': 92, 'max_depth': 6, 'min_child_samples': 113, 'learning_rate': 0.041618214652809585, 'subsample': 0.669723503080193, 'colsample_bytree': 0.6828052737444927, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 2.2383262123610987}. Best is trial 0 with value: 0.12061297439722268.


[HPO-2STAGE] PHASE2 NEW BEST: trial=0 logMAE=0.120613 | OOF_MAPE=0.122107 | HO_MAPE=0.123422


[I 2026-01-09 07:18:20,671] Trial 1 finished with value: 0.12088748478121869 and parameters: {'num_leaves': 105, 'max_depth': 6, 'min_child_samples': 55, 'learning_rate': 0.04882312061676891, 'subsample': 0.755235406124237, 'colsample_bytree': 0.6946544060486871, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 1.1860224975542433}. Best is trial 0 with value: 0.12061297439722268.
[I 2026-01-09 07:18:39,719] Trial 2 finished with value: 0.12047276628722965 and parameters: {'num_leaves': 89, 'max_depth': 5, 'min_child_samples': 88, 'learning_rate': 0.035651907979742915, 'subsample': 0.7273489784262948, 'colsample_bytree': 0.6793352245611177, 'reg_alpha': 0.29214464853521815, 'reg_lambda': 1.4060643633794512}. Best is trial 2 with value: 0.12047276628722965.


[HPO-2STAGE] PHASE2 NEW BEST: trial=2 logMAE=0.120473 | OOF_MAPE=0.121904 | HO_MAPE=0.122943


[I 2026-01-09 07:19:00,561] Trial 3 finished with value: 0.12029579801779074 and parameters: {'num_leaves': 97, 'max_depth': 6, 'min_child_samples': 70, 'learning_rate': 0.03997979022275068, 'subsample': 0.7248916317983987, 'colsample_bytree': 0.6597684104642908, 'reg_alpha': 0.6075448519014384, 'reg_lambda': 1.1718969565798714}. Best is trial 3 with value: 0.12029579801779074.


[HPO-2STAGE] PHASE2 NEW BEST: trial=3 logMAE=0.120296 | OOF_MAPE=0.121886 | HO_MAPE=0.123169


[I 2026-01-09 07:19:18,197] Trial 4 finished with value: 0.12090200628624598 and parameters: {'num_leaves': 75, 'max_depth': 6, 'min_child_samples': 132, 'learning_rate': 0.04568863410960672, 'subsample': 0.6885085435786557, 'colsample_bytree': 0.6705402114784356, 'reg_alpha': 0.6842330265121569, 'reg_lambda': 1.5059675718009737}. Best is trial 3 with value: 0.12029579801779074.
[I 2026-01-09 07:19:36,175] Trial 5 finished with value: 0.12025572639211063 and parameters: {'num_leaves': 78, 'max_depth': 5, 'min_child_samples': 56, 'learning_rate': 0.047647256154233764, 'subsample': 0.6827143458608933, 'colsample_bytree': 0.7893268484894016, 'reg_alpha': 0.31171107608941095, 'reg_lambda': 1.6221850934359126}. Best is trial 5 with value: 0.12025572639211063.


[HPO-2STAGE] PHASE2 NEW BEST: trial=5 logMAE=0.120256 | OOF_MAPE=0.121793 | HO_MAPE=0.122273


[I 2026-01-09 07:19:41,526] Trial 6 pruned. 
[I 2026-01-09 07:20:00,291] Trial 7 finished with value: 0.12031194087929524 and parameters: {'num_leaves': 76, 'max_depth': 4, 'min_child_samples': 57, 'learning_rate': 0.03631371260184649, 'subsample': 0.6991356526287639, 'colsample_bytree': 0.7070640510221808, 'reg_alpha': 0.8287375091519293, 'reg_lambda': 1.3935530020271982}. Best is trial 5 with value: 0.12025572639211063.
[I 2026-01-09 07:20:06,377] Trial 8 pruned. 
[I 2026-01-09 07:20:12,684] Trial 9 pruned. 
[I 2026-01-09 07:20:18,734] Trial 10 pruned. 
[I 2026-01-09 07:20:24,084] Trial 11 pruned. 
[I 2026-01-09 07:20:27,714] Trial 12 pruned. 
[I 2026-01-09 07:20:34,690] Trial 13 pruned. 
[I 2026-01-09 07:20:38,255] Trial 14 pruned. 
[I 2026-01-09 07:20:43,462] Trial 15 pruned. 
[I 2026-01-09 07:20:48,066] Trial 16 pruned. 
[I 2026-01-09 07:20:54,064] Trial 17 pruned. 
[I 2026-01-09 07:21:29,725] Trial 18 finished with value: 0.11956200962607516 and parameters: {'num_leaves': 87, 'ma

[HPO-2STAGE] PHASE2 NEW BEST: trial=18 logMAE=0.119562 | OOF_MAPE=0.121048 | HO_MAPE=0.122545


[I 2026-01-09 07:22:01,669] Trial 19 finished with value: 0.11953482077458695 and parameters: {'num_leaves': 84, 'max_depth': 6, 'min_child_samples': 81, 'learning_rate': 0.030489907734239114, 'subsample': 0.7373480809994508, 'colsample_bytree': 0.6977671371137035, 'reg_alpha': 0.7013659436962031, 'reg_lambda': 1.363365061625741}. Best is trial 19 with value: 0.11953482077458695.


[HPO-2STAGE] PHASE2 NEW BEST: trial=19 logMAE=0.119535 | OOF_MAPE=0.121038 | HO_MAPE=0.122709


[I 2026-01-09 07:22:30,241] Trial 20 finished with value: 0.11999924899527377 and parameters: {'num_leaves': 80, 'max_depth': 6, 'min_child_samples': 84, 'learning_rate': 0.03194188032890291, 'subsample': 0.741112425037547, 'colsample_bytree': 0.7277194835680821, 'reg_alpha': 0.9337716061483156, 'reg_lambda': 1.5074798910468705}. Best is trial 19 with value: 0.11953482077458695.
[I 2026-01-09 07:23:01,022] Trial 21 finished with value: 0.11970904313143323 and parameters: {'num_leaves': 89, 'max_depth': 6, 'min_child_samples': 84, 'learning_rate': 0.03138567049403701, 'subsample': 0.7572715339055287, 'colsample_bytree': 0.7251504986225761, 'reg_alpha': 0.8760320587469266, 'reg_lambda': 1.3600856721982202}. Best is trial 19 with value: 0.11953482077458695.
[I 2026-01-09 07:23:35,281] Trial 22 finished with value: 0.11985818759316798 and parameters: {'num_leaves': 90, 'max_depth': 6, 'min_child_samples': 55, 'learning_rate': 0.0302591130054256, 'subsample': 0.7688036031232437, 'colsample_

[HPO-2STAGE] PHASE2 done: best_logMAE=0.119535
[HPO-2STAGE] PHASE2 best_params={'num_leaves': 84, 'max_depth': 6, 'min_child_samples': 81, 'learning_rate': 0.030489907734239114, 'subsample': 0.7373480809994508, 'colsample_bytree': 0.6977671371137035, 'reg_alpha': 0.7013659436962031, 'reg_lambda': 1.363365061625741}

[PIPELINE-LIGHT] HPO split=mid_city | cv=54265 ho=15471 | n_features=280

[HPO-2STAGE] PHASE1 start: trials=20, folds=3, est=4000, es=30


[I 2026-01-09 07:26:18,784] Trial 0 finished with value: 0.14042470248964725 and parameters: {'num_leaves': 67, 'max_depth': 10, 'min_child_samples': 160, 'learning_rate': 0.08387926357773329, 'subsample': 0.696805592132731, 'colsample_bytree': 0.6967983561008608, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 13.39433470675048}. Best is trial 0 with value: 0.14042470248964725.


[HPO-2STAGE] PHASE1 NEW BEST: trial=0 logMAE=0.140425 | OOF_MAPE=0.142806 | HO_MAPE=0.142731


[I 2026-01-09 07:26:39,945] Trial 1 finished with value: 0.14227522076130514 and parameters: {'num_leaves': 89, 'max_depth': 8, 'min_child_samples': 53, 'learning_rate': 0.11729188669457949, 'subsample': 0.8997327922401265, 'colsample_bytree': 0.7137017332034828, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 1.7322667470546256}. Best is trial 0 with value: 0.14042470248964725.
[I 2026-01-09 07:27:15,047] Trial 2 finished with value: 0.13831968674556286 and parameters: {'num_leaves': 60, 'max_depth': 7, 'min_child_samples': 115, 'learning_rate': 0.056210622617823766, 'subsample': 0.8335558684167138, 'colsample_bytree': 0.6918481581956125, 'reg_alpha': 0.29214464853521815, 'reg_lambda': 2.9967309097101573}. Best is trial 2 with value: 0.13831968674556286.


[HPO-2STAGE] PHASE1 NEW BEST: trial=2 logMAE=0.138320 | OOF_MAPE=0.140713 | HO_MAPE=0.141371


[I 2026-01-09 07:27:45,949] Trial 3 finished with value: 0.1393790284886747 and parameters: {'num_leaves': 75, 'max_depth': 9, 'min_child_samples': 80, 'learning_rate': 0.07628109945722504, 'subsample': 0.8277243706586127, 'colsample_bytree': 0.6639351238159993, 'reg_alpha': 0.6075448519014384, 'reg_lambda': 1.6666983286066417}. Best is trial 2 with value: 0.13831968674556286.
[I 2026-01-09 07:28:04,051] Trial 4 finished with value: 0.14075281345668655 and parameters: {'num_leaves': 37, 'max_depth': 10, 'min_child_samples': 195, 'learning_rate': 0.1027557613304815, 'subsample': 0.7413841307520113, 'colsample_bytree': 0.6793016342019151, 'reg_alpha': 0.6842330265121569, 'reg_lambda': 3.738105868191796}. Best is trial 2 with value: 0.13831968674556286.
[I 2026-01-09 07:28:18,381] Trial 5 finished with value: 0.14151304530009778 and parameters: {'num_leaves': 42, 'max_depth': 7, 'min_child_samples': 55, 'learning_rate': 0.11183883618709038, 'subsample': 0.7276339944800051, 'colsample_bytr

[HPO-2STAGE] PHASE1 NEW BEST: trial=7 logMAE=0.138314 | OOF_MAPE=0.140667 | HO_MAPE=0.142631


[I 2026-01-09 07:29:17,595] Trial 8 finished with value: 0.14163109877152946 and parameters: {'num_leaves': 58, 'max_depth': 7, 'min_child_samples': 71, 'learning_rate': 0.10219772826786357, 'subsample': 0.6723651931039313, 'colsample_bytree': 0.9460660809801551, 'reg_alpha': 0.7722447692966574, 'reg_lambda': 1.8135730867783397}. Best is trial 7 with value: 0.13831379829923304.
[I 2026-01-09 07:29:34,828] Trial 9 finished with value: 0.14025044585473176 and parameters: {'num_leaves': 31, 'max_depth': 9, 'min_child_samples': 156, 'learning_rate': 0.09561064512368886, 'subsample': 0.8813811040057837, 'colsample_bytree': 0.6722133955202271, 'reg_alpha': 0.3584657285442726, 'reg_lambda': 1.414976157494142}. Best is trial 7 with value: 0.13831379829923304.
[I 2026-01-09 07:30:00,229] Trial 10 finished with value: 0.13823068424623144 and parameters: {'num_leaves': 47, 'max_depth': 5, 'min_child_samples': 55, 'learning_rate': 0.050540451698769064, 'subsample': 0.6537610918690696, 'colsample_b

[HPO-2STAGE] PHASE1 NEW BEST: trial=10 logMAE=0.138231 | OOF_MAPE=0.140554 | HO_MAPE=0.143000


[I 2026-01-09 07:30:29,964] Trial 11 finished with value: 0.13801642738784006 and parameters: {'num_leaves': 33, 'max_depth': 5, 'min_child_samples': 69, 'learning_rate': 0.05557330416752304, 'subsample': 0.7613062360755578, 'colsample_bytree': 0.7560461445705187, 'reg_alpha': 0.9009551686189577, 'reg_lambda': 3.1432862812561972}. Best is trial 11 with value: 0.13801642738784006.


[HPO-2STAGE] PHASE1 NEW BEST: trial=11 logMAE=0.138016 | OOF_MAPE=0.140340 | HO_MAPE=0.142110


[I 2026-01-09 07:30:51,520] Trial 12 finished with value: 0.1390054505492133 and parameters: {'num_leaves': 72, 'max_depth': 6, 'min_child_samples': 93, 'learning_rate': 0.06337841838276713, 'subsample': 0.6595652648832914, 'colsample_bytree': 0.667259855326242, 'reg_alpha': 0.6060559890252085, 'reg_lambda': 1.3034044501969555}. Best is trial 11 with value: 0.13801642738784006.
[I 2026-01-09 07:31:18,313] Trial 13 finished with value: 0.13839556790930954 and parameters: {'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 70, 'learning_rate': 0.03918997076277981, 'subsample': 0.6674808566760786, 'colsample_bytree': 0.699214081203387, 'reg_alpha': 0.3011815988734019, 'reg_lambda': 1.0794404011899306}. Best is trial 11 with value: 0.13801642738784006.
[I 2026-01-09 07:31:34,115] Trial 14 finished with value: 0.1395772927152276 and parameters: {'num_leaves': 48, 'max_depth': 4, 'min_child_samples': 88, 'learning_rate': 0.0810853353539809, 'subsample': 0.6735701371522077, 'colsample_byt

[HPO-2STAGE] PHASE1 done: best_logMAE=0.138016
[HPO-2STAGE] PHASE1 best_params={'num_leaves': 33, 'max_depth': 5, 'min_child_samples': 69, 'learning_rate': 0.05557330416752304, 'subsample': 0.7613062360755578, 'colsample_bytree': 0.7560461445705187, 'reg_alpha': 0.9009551686189577, 'reg_lambda': 3.1432862812561972}

[HPO-2STAGE] PHASE2 start: trials=30, folds=3, est=4000, es=30


[I 2026-01-09 07:33:39,827] Trial 0 finished with value: 0.13858722646053187 and parameters: {'num_leaves': 44, 'max_depth': 7, 'min_child_samples': 93, 'learning_rate': 0.0619443905262142, 'subsample': 0.685182542212202, 'colsample_bytree': 0.6842334758069287, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 4.48676223929695}. Best is trial 0 with value: 0.13858722646053187.


[HPO-2STAGE] PHASE2 NEW BEST: trial=0 logMAE=0.138587 | OOF_MAPE=0.140987 | HO_MAPE=0.141914


[I 2026-01-09 07:34:01,805] Trial 1 finished with value: 0.13946340722015318 and parameters: {'num_leaves': 52, 'max_depth': 6, 'min_child_samples': 51, 'learning_rate': 0.0817543472790539, 'subsample': 0.8377176231387774, 'colsample_bytree': 0.6965984689244403, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 1.3741703885495165}. Best is trial 0 with value: 0.13858722646053187.
[I 2026-01-09 07:34:34,327] Trial 2 finished with value: 0.1379836207489079 and parameters: {'num_leaves': 41, 'max_depth': 6, 'min_child_samples': 75, 'learning_rate': 0.045539974180066736, 'subsample': 0.7879741563904369, 'colsample_bytree': 0.6806123554439912, 'reg_alpha': 0.29214464853521815, 'reg_lambda': 1.886881391738385}. Best is trial 2 with value: 0.1379836207489079.


[HPO-2STAGE] PHASE2 NEW BEST: trial=2 logMAE=0.137984 | OOF_MAPE=0.140340 | HO_MAPE=0.141397


[I 2026-01-09 07:35:07,196] Trial 3 finished with value: 0.13789703371009535 and parameters: {'num_leaves': 46, 'max_depth': 7, 'min_child_samples': 61, 'learning_rate': 0.0574395271366542, 'subsample': 0.7835907716988612, 'colsample_bytree': 0.6601936855002647, 'reg_alpha': 0.6075448519014384, 'reg_lambda': 1.3438354090636417}. Best is trial 3 with value: 0.13789703371009535.


[HPO-2STAGE] PHASE2 NEW BEST: trial=3 logMAE=0.137897 | OOF_MAPE=0.140255 | HO_MAPE=0.141237


[I 2026-01-09 07:35:28,031] Trial 4 finished with value: 0.13913055261764448 and parameters: {'num_leaves': 33, 'max_depth': 7, 'min_child_samples': 107, 'learning_rate': 0.07313604712914881, 'subsample': 0.7186910664134017, 'colsample_bytree': 0.671434444906416, 'reg_alpha': 0.6842330265121569, 'reg_lambda': 2.1442904429757697}. Best is trial 3 with value: 0.13789703371009535.
[I 2026-01-09 07:35:32,812] Trial 5 pruned. 
[I 2026-01-09 07:35:37,847] Trial 6 pruned. 
[I 2026-01-09 07:36:06,038] Trial 7 finished with value: 0.13796428964484395 and parameters: {'num_leaves': 34, 'max_depth': 4, 'min_child_samples': 52, 'learning_rate': 0.047359612216743716, 'subsample': 0.7376475728326177, 'colsample_bytree': 0.7095483770484045, 'reg_alpha': 0.8287375091519293, 'reg_lambda': 1.855721030767088}. Best is trial 3 with value: 0.13789703371009535.
[I 2026-01-09 07:36:14,687] Trial 8 pruned. 
[I 2026-01-09 07:36:22,438] Trial 9 pruned. 
[I 2026-01-09 07:36:59,283] Trial 10 finished with value: 

[HPO-2STAGE] PHASE2 NEW BEST: trial=13 logMAE=0.137363 | OOF_MAPE=0.139669 | HO_MAPE=0.140986


[I 2026-01-09 07:38:51,825] Trial 14 finished with value: 0.1367896778292751 and parameters: {'num_leaves': 39, 'max_depth': 6, 'min_child_samples': 54, 'learning_rate': 0.0313651935682593, 'subsample': 0.6983900848067615, 'colsample_bytree': 0.6526130984393448, 'reg_alpha': 0.5683254849222301, 'reg_lambda': 2.7069261751102918}. Best is trial 14 with value: 0.1367896778292751.


[HPO-2STAGE] PHASE2 NEW BEST: trial=14 logMAE=0.136790 | OOF_MAPE=0.138967 | HO_MAPE=0.141172


[I 2026-01-09 07:39:15,331] Trial 15 pruned. 
[I 2026-01-09 07:39:22,496] Trial 16 pruned. 
[I 2026-01-09 07:39:29,246] Trial 17 pruned. 
[I 2026-01-09 07:40:22,476] Trial 18 finished with value: 0.13691227531703168 and parameters: {'num_leaves': 48, 'max_depth': 6, 'min_child_samples': 52, 'learning_rate': 0.031290728093612705, 'subsample': 0.751353777349883, 'colsample_bytree': 0.7197220611498398, 'reg_alpha': 0.5914369476972747, 'reg_lambda': 2.3732106070802264}. Best is trial 14 with value: 0.1367896778292751.
[I 2026-01-09 07:41:10,004] Trial 19 finished with value: 0.1367669355940985 and parameters: {'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 61, 'learning_rate': 0.031184219138361842, 'subsample': 0.7396122849938691, 'colsample_bytree': 0.6595455497217406, 'reg_alpha': 0.7467028182925457, 'reg_lambda': 4.195068665331079}. Best is trial 19 with value: 0.1367669355940985.


[HPO-2STAGE] PHASE2 NEW BEST: trial=19 logMAE=0.136767 | OOF_MAPE=0.139089 | HO_MAPE=0.142003


[I 2026-01-09 07:42:00,697] Trial 20 finished with value: 0.1369847728731672 and parameters: {'num_leaves': 36, 'max_depth': 6, 'min_child_samples': 61, 'learning_rate': 0.030225263232843944, 'subsample': 0.6943154183395582, 'colsample_bytree': 0.7763094558769664, 'reg_alpha': 0.8504598317215939, 'reg_lambda': 4.722629339374298}. Best is trial 19 with value: 0.1367669355940985.
[I 2026-01-09 07:42:46,445] Trial 21 finished with value: 0.13716250102674477 and parameters: {'num_leaves': 35, 'max_depth': 6, 'min_child_samples': 54, 'learning_rate': 0.03581332089690793, 'subsample': 0.7576306093716522, 'colsample_bytree': 0.6591374931404884, 'reg_alpha': 0.8774921379488253, 'reg_lambda': 5.418091926062006}. Best is trial 19 with value: 0.1367669355940985.
[I 2026-01-09 07:43:41,022] Trial 22 finished with value: 0.13759845441251672 and parameters: {'num_leaves': 51, 'max_depth': 6, 'min_child_samples': 50, 'learning_rate': 0.03070940069454622, 'subsample': 0.8251967033584078, 'colsample_by

[HPO-2STAGE] PHASE2 done: best_logMAE=0.136767
[HPO-2STAGE] PHASE2 best_params={'num_leaves': 31, 'max_depth': 6, 'min_child_samples': 61, 'learning_rate': 0.031184219138361842, 'subsample': 0.7396122849938691, 'colsample_bytree': 0.6595455497217406, 'reg_alpha': 0.7467028182925457, 'reg_lambda': 4.195068665331079}

[PIPELINE-LIGHT] HPO split=other | cv=62062 ho=20659 | n_features=279

[HPO-2STAGE] PHASE1 start: trials=20, folds=3, est=4000, es=30


[I 2026-01-09 07:46:58,914] Trial 0 finished with value: 0.1508174373792283 and parameters: {'num_leaves': 67, 'max_depth': 10, 'min_child_samples': 160, 'learning_rate': 0.08387926357773329, 'subsample': 0.696805592132731, 'colsample_bytree': 0.6967983561008608, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 13.39433470675048}. Best is trial 0 with value: 0.1508174373792283.


[HPO-2STAGE] PHASE1 NEW BEST: trial=0 logMAE=0.150817 | OOF_MAPE=0.153769 | HO_MAPE=0.135970


[I 2026-01-09 07:47:26,495] Trial 1 finished with value: 0.15152501623395384 and parameters: {'num_leaves': 89, 'max_depth': 8, 'min_child_samples': 53, 'learning_rate': 0.11729188669457949, 'subsample': 0.8997327922401265, 'colsample_bytree': 0.7137017332034828, 'reg_alpha': 0.18182496720710062, 'reg_lambda': 1.7322667470546256}. Best is trial 0 with value: 0.1508174373792283.
[I 2026-01-09 07:47:57,677] Trial 2 finished with value: 0.14887687154337725 and parameters: {'num_leaves': 60, 'max_depth': 7, 'min_child_samples': 115, 'learning_rate': 0.056210622617823766, 'subsample': 0.8335558684167138, 'colsample_bytree': 0.6918481581956125, 'reg_alpha': 0.29214464853521815, 'reg_lambda': 2.9967309097101573}. Best is trial 2 with value: 0.14887687154337725.


[HPO-2STAGE] PHASE1 NEW BEST: trial=2 logMAE=0.148877 | OOF_MAPE=0.151909 | HO_MAPE=0.135995


[I 2026-01-09 07:48:31,526] Trial 3 finished with value: 0.14936170127091022 and parameters: {'num_leaves': 75, 'max_depth': 9, 'min_child_samples': 80, 'learning_rate': 0.07628109945722504, 'subsample': 0.8277243706586127, 'colsample_bytree': 0.6639351238159993, 'reg_alpha': 0.6075448519014384, 'reg_lambda': 1.6666983286066417}. Best is trial 2 with value: 0.14887687154337725.
[I 2026-01-09 07:48:48,245] Trial 4 finished with value: 0.15121467508339018 and parameters: {'num_leaves': 37, 'max_depth': 10, 'min_child_samples': 195, 'learning_rate': 0.1027557613304815, 'subsample': 0.7413841307520113, 'colsample_bytree': 0.6793016342019151, 'reg_alpha': 0.6842330265121569, 'reg_lambda': 3.738105868191796}. Best is trial 2 with value: 0.14887687154337725.
[I 2026-01-09 07:49:05,581] Trial 5 finished with value: 0.15174275958743735 and parameters: {'num_leaves': 42, 'max_depth': 7, 'min_child_samples': 55, 'learning_rate': 0.11183883618709038, 'subsample': 0.7276339944800051, 'colsample_byt

[HPO-2STAGE] PHASE1 NEW BEST: trial=10 logMAE=0.148556 | OOF_MAPE=0.151679 | HO_MAPE=0.137738


[I 2026-01-09 07:51:33,573] Trial 11 finished with value: 0.14875846263688552 and parameters: {'num_leaves': 69, 'max_depth': 6, 'min_child_samples': 154, 'learning_rate': 0.05666416481000376, 'subsample': 0.8820560226149139, 'colsample_bytree': 0.682247992506388, 'reg_alpha': 0.5788984085155222, 'reg_lambda': 6.045757551325803}. Best is trial 10 with value: 0.14855644096905782.
[I 2026-01-09 07:52:10,415] Trial 12 finished with value: 0.14891318524885708 and parameters: {'num_leaves': 69, 'max_depth': 5, 'min_child_samples': 184, 'learning_rate': 0.0412126397061572, 'subsample': 0.9083096987339445, 'colsample_bytree': 0.7135523370935994, 'reg_alpha': 0.3607358635444904, 'reg_lambda': 5.378494987602776}. Best is trial 10 with value: 0.14855644096905782.
[I 2026-01-09 07:52:39,122] Trial 13 finished with value: 0.14889171279318592 and parameters: {'num_leaves': 54, 'max_depth': 7, 'min_child_samples': 160, 'learning_rate': 0.05407687149682746, 'subsample': 0.7656210019810323, 'colsample

[HPO-2STAGE] PHASE1 NEW BEST: trial=14 logMAE=0.148354 | OOF_MAPE=0.151355 | HO_MAPE=0.137134


[I 2026-01-09 07:54:00,199] Trial 15 finished with value: 0.14852543719919084 and parameters: {'num_leaves': 100, 'max_depth': 7, 'min_child_samples': 128, 'learning_rate': 0.04468893529305789, 'subsample': 0.814530575776954, 'colsample_bytree': 0.8213575256131221, 'reg_alpha': 0.22778702059031886, 'reg_lambda': 16.467079136820082}. Best is trial 14 with value: 0.1483537242452704.
[I 2026-01-09 07:54:07,979] Trial 16 pruned. 
[I 2026-01-09 07:54:17,121] Trial 17 pruned. 
[I 2026-01-09 07:54:51,913] Trial 18 finished with value: 0.1490940116117865 and parameters: {'num_leaves': 108, 'max_depth': 8, 'min_child_samples': 142, 'learning_rate': 0.05732949133811417, 'subsample': 0.8235348654297622, 'colsample_bytree': 0.8162562047786919, 'reg_alpha': 0.44144257021674516, 'reg_lambda': 9.275102715669616}. Best is trial 14 with value: 0.1483537242452704.
[I 2026-01-09 07:55:28,921] Trial 19 finished with value: 0.14869899082610957 and parameters: {'num_leaves': 80, 'max_depth': 6, 'min_child_s

[HPO-2STAGE] PHASE1 done: best_logMAE=0.148354
[HPO-2STAGE] PHASE1 best_params={'num_leaves': 82, 'max_depth': 5, 'min_child_samples': 105, 'learning_rate': 0.04280758615653949, 'subsample': 0.813173685821792, 'colsample_bytree': 0.7275010992069361, 'reg_alpha': 0.15560915150688848, 'reg_lambda': 19.744904505461903}

[HPO-2STAGE] PHASE2 start: trials=30, folds=3, est=4000, es=30


[I 2026-01-09 07:56:03,632] Trial 0 finished with value: 0.14878881157624735 and parameters: {'num_leaves': 74, 'max_depth': 7, 'min_child_samples': 124, 'learning_rate': 0.050480932435000875, 'subsample': 0.7292587088201674, 'colsample_bytree': 0.6791126745485414, 'reg_alpha': 0.018076683211897402, 'reg_lambda': 16.09700479149139}. Best is trial 0 with value: 0.14878881157624735.


[HPO-2STAGE] PHASE2 NEW BEST: trial=0 logMAE=0.148789 | OOF_MAPE=0.151833 | HO_MAPE=0.136119


[I 2026-01-09 07:56:33,250] Trial 1 finished with value: 0.14909871077363027 and parameters: {'num_leaves': 89, 'max_depth': 6, 'min_child_samples': 66, 'learning_rate': 0.06318195377589175, 'subsample': 0.8942737680849946, 'colsample_bytree': 0.68962805494565, 'reg_alpha': 0.056587257739729495, 'reg_lambda': 5.317429698234924}. Best is trial 0 with value: 0.14878881157624735.
[I 2026-01-09 07:57:13,572] Trial 2 finished with value: 0.1483564295624424 and parameters: {'num_leaves': 69, 'max_depth': 6, 'min_child_samples': 99, 'learning_rate': 0.03996335055954264, 'subsample': 0.8404604350231621, 'colsample_bytree': 0.6760332180766994, 'reg_alpha': 0.0909207617516869, 'reg_lambda': 7.15489763017556}. Best is trial 2 with value: 0.1483564295624424.


[HPO-2STAGE] PHASE2 NEW BEST: trial=2 logMAE=0.148356 | OOF_MAPE=0.151280 | HO_MAPE=0.136238


[I 2026-01-09 07:57:54,499] Trial 3 finished with value: 0.14814280752615433 and parameters: {'num_leaves': 79, 'max_depth': 7, 'min_child_samples': 81, 'learning_rate': 0.047592669388167225, 'subsample': 0.8357184144973455, 'colsample_bytree': 0.6586688669912778, 'reg_alpha': 0.1890790778135221, 'reg_lambda': 5.2074720257072}. Best is trial 3 with value: 0.14814280752615433.


[HPO-2STAGE] PHASE2 NEW BEST: trial=3 logMAE=0.148143 | OOF_MAPE=0.151012 | HO_MAPE=0.134946


[I 2026-01-09 07:58:26,000] Trial 4 finished with value: 0.1488341883586289 and parameters: {'num_leaves': 54, 'max_depth': 7, 'min_child_samples': 143, 'learning_rate': 0.05765638824882635, 'subsample': 0.7655088033777566, 'colsample_bytree': 0.6682281817425864, 'reg_alpha': 0.21294584137709413, 'reg_lambda': 8.064774999571034}. Best is trial 3 with value: 0.14814280752615433.
[I 2026-01-09 07:58:32,239] Trial 5 pruned. 
[I 2026-01-09 07:58:41,258] Trial 6 pruned. 
[I 2026-01-09 07:59:02,561] Trial 7 pruned. 
[I 2026-01-09 07:59:11,706] Trial 8 pruned. 
[I 2026-01-09 07:59:44,847] Trial 9 finished with value: 0.14838526496947235 and parameters: {'num_leaves': 50, 'max_depth': 7, 'min_child_samples': 122, 'learning_rate': 0.05494034069074452, 'subsample': 0.8793506581224216, 'colsample_bytree': 0.6638186767288285, 'reg_alpha': 0.11156109572614575, 'reg_lambda': 4.765626427965508}. Best is trial 3 with value: 0.14814280752615433.
[I 2026-01-09 07:59:56,996] Trial 10 pruned. 
[I 2026-01-

[HPO-2STAGE] PHASE2 NEW BEST: trial=15 logMAE=0.147764 | OOF_MAPE=0.150730 | HO_MAPE=0.134859


[I 2026-01-09 08:03:06,273] Trial 16 finished with value: 0.1482046121294213 and parameters: {'num_leaves': 53, 'max_depth': 7, 'min_child_samples': 71, 'learning_rate': 0.04554415834661815, 'subsample': 0.762130504946761, 'colsample_bytree': 0.6551587635603401, 'reg_alpha': 0.14351199479673765, 'reg_lambda': 7.173320002687351}. Best is trial 15 with value: 0.14776415286920286.
[I 2026-01-09 08:03:16,870] Trial 17 pruned. 
[I 2026-01-09 08:03:32,002] Trial 18 pruned. 
[I 2026-01-09 08:04:18,740] Trial 19 finished with value: 0.14798750346264303 and parameters: {'num_leaves': 55, 'max_depth': 6, 'min_child_samples': 70, 'learning_rate': 0.03841981574677708, 'subsample': 0.751441557898022, 'colsample_bytree': 0.6631606639682249, 'reg_alpha': 0.2831563285055117, 'reg_lambda': 15.420905059002145}. Best is trial 15 with value: 0.14776415286920286.
[I 2026-01-09 08:04:41,507] Trial 20 pruned. 
[I 2026-01-09 08:05:25,452] Trial 21 finished with value: 0.14812095719816112 and parameters: {'num

[HPO-2STAGE] PHASE2 NEW BEST: trial=22 logMAE=0.147756 | OOF_MAPE=0.150670 | HO_MAPE=0.134966


[I 2026-01-09 08:06:22,550] Trial 23 pruned. 
[I 2026-01-09 08:06:33,453] Trial 24 pruned. 
[I 2026-01-09 08:07:27,113] Trial 25 finished with value: 0.14780582252843913 and parameters: {'num_leaves': 95, 'max_depth': 6, 'min_child_samples': 93, 'learning_rate': 0.03166704111177063, 'subsample': 0.77373533445806, 'colsample_bytree': 0.6884359759708392, 'reg_alpha': 0.1905151102376661, 'reg_lambda': 12.056284954077169}. Best is trial 22 with value: 0.14775611098617594.
[I 2026-01-09 08:08:05,174] Trial 26 pruned. 
[I 2026-01-09 08:08:48,003] Trial 27 finished with value: 0.1478228236854508 and parameters: {'num_leaves': 86, 'max_depth': 5, 'min_child_samples': 90, 'learning_rate': 0.034679053242352646, 'subsample': 0.733682953801987, 'colsample_bytree': 0.6780575459630569, 'reg_alpha': 0.24725880112729645, 'reg_lambda': 12.835379432609756}. Best is trial 22 with value: 0.14775611098617594.
[I 2026-01-09 08:09:38,393] Trial 28 finished with value: 0.14784186021013446 and parameters: {'nu

[HPO-2STAGE] PHASE2 done: best_logMAE=0.147756
[HPO-2STAGE] PHASE2 best_params={'num_leaves': 89, 'max_depth': 7, 'min_child_samples': 88, 'learning_rate': 0.03774671099802793, 'subsample': 0.7403143621814818, 'colsample_bytree': 0.6806785927458112, 'reg_alpha': 0.24015452133865717, 'reg_lambda': 13.48337661307892}

[PIPELINE-LIGHT] Step4: Final training (no tuned re-CV)
[PIPELINE-LIGHT] Final training done.

[PIPELINE-LIGHT] DONE


## モデルの保存

In [47]:
def save_model_bundle_by_split(
    models: dict[str, object],
    fe_cols: dict[str, list[str]],
    cat_cols: dict[str, list[str]] | None,
    save_path: str,
    meta: dict | None = None,
):
    """Pickle the models together with their column lists into one file.

    Args:
        models: Model objects in a dict (presumably keyed by split name).
        fe_cols: Feature column names per key of ``models``.
        cat_cols: Categorical column names per key, or None.
        save_path: Destination file; missing parent directories are created.
        meta: Optional metadata stored next to the models. A falsy value is
            saved as an empty dict.
    """
    destination = Path(save_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    payload = dict(
        models=models,
        fe_cols=fe_cols,
        cat_cols=cat_cols,
        meta=meta if meta else {},
    )
    with destination.open('wb') as fh:
        pickle.dump(payload, fh)


In [48]:
def export_models_from_all_bundles(
    all_bundles: dict,
    model_dir: str,
    training_ver: int,
):
    """Write one model pickle per ``(property_type, alg)`` entry of all_bundles.

    Each bundle is expected to provide ``final_models_by_split`` and
    ``reduced_cols_by_split``; ``cat_cols_by_split`` is optional. The file is
    written to ``{model_dir}/model_{property_type}_{alg}_v{training_ver}.pkl``
    via ``save_model_bundle_by_split``.

    Args:
        all_bundles: Mapping of ``(property_type, alg)`` tuples to bundle dicts.
        model_dir: Output directory, without a trailing slash.
        training_ver: Version number embedded in the file name and metadata.

    Raises:
        KeyError: If a bundle has no usable ``final_models_by_split`` or
            ``reduced_cols_by_split`` entry (key missing or value None).
    """
    for (property_type, alg), bundle in all_bundles.items():
        # Look keys up with .get() so that a missing entry reaches the
        # descriptive error below instead of failing with a bare KeyError.
        models = bundle.get('final_models_by_split')
        fe_cols = bundle.get('reduced_cols_by_split')
        # Optional: the saver accepts None for the categorical columns.
        cat_cols = bundle.get('cat_cols_by_split')

        if models is None or fe_cols is None:
            raise KeyError(f'Cannot find models/fe_cols in bundle for {property_type},{alg}. keys={list(bundle.keys())}')

        # fe_cols is expected to be a dict; if it is a flat collection,
        # reuse the same column list for every model key.
        if not isinstance(fe_cols, dict):
            fe_cols = {k: list(fe_cols) for k in models.keys()}

        save_path = f'{model_dir}/model_{property_type}_{alg}_v{training_ver}.pkl'
        meta = {
            'property_type': property_type,
            'alg': alg,
            # create_tbl_ver is read from module scope (set in the config cell).
            'create_tbl_ver': create_tbl_ver,
            'training_ver': training_ver,
        }

        save_model_bundle_by_split(
            models=models,
            fe_cols=fe_cols,
            cat_cols=cat_cols,
            save_path=save_path,
            meta=meta,
        )

        print(f'[SAVED] {save_path}')

In [49]:
# Persist the final models of every bundle in all_bundles. The trailing '/'
# is stripped from model_path because the exporter builds the file path as
# f'{model_dir}/model_...' and would otherwise produce a double slash.
export_models_from_all_bundles(
    all_bundles=all_bundles,
    model_dir=model_path.rstrip('/'),
    training_ver=training_ver,
)

[SAVED] ../output/model/model_residential_lgb_v8.pkl


## 特徴量重要度・oof結果の出力

In [50]:
from __future__ import annotations

import os
from typing import Callable

import numpy as np
import pandas as pd


def _as_array(x):
    """Return ``x`` as a NumPy array; ``None`` is passed through unchanged."""
    if x is None:
        return None
    if isinstance(x, (pd.Series, pd.Index)):
        return x.to_numpy()
    return np.asarray(x)


def _collect_pred_series_try_index_then_idx_keys(
    cv_out: dict,
    pred_key: str,
    train_index: pd.Index,
    idx_key_for_oof: str = 'idx_cv',
    idx_key_for_ho: str = 'idx_ho',
    strict: bool = False,
) -> pd.Series:
    """Gather one kind of prediction from every split into a Series on train_index.

    Two strategies are tried in order:

    1. Predictions stored as ``pd.Series`` are aligned to ``train_index`` by
       their own index labels.
    2. If that wrote fewer than 1% of the Series predictions (or there were
       no Series predictions at all), the companion index arrays stored under
       ``idx_key_for_oof`` / ``idx_key_for_ho`` are used instead: first as
       labels of ``train_index``, then as integer positions.

    Args:
        cv_out: Mapping of split key to a per-split result dict (or None).
        pred_key: Key of the prediction inside each result dict. Must start
            with ``'oof'`` or ``'ho'`` when the fallback is reached.
        train_index: Index the returned Series is built on.
        idx_key_for_oof: Result-dict key holding row ids for ``oof*`` predictions.
        idx_key_for_ho: Result-dict key holding row ids for ``ho*`` predictions.
        strict: Only affects the fallback. When True, missing data, length
            mismatches and non-alignable ids raise instead of being skipped.

    Returns:
        float32 Series indexed by ``train_index``; NaN where nothing was written.

    Raises:
        ValueError: If the fallback is reached and ``pred_key`` starts with
            neither ``'oof'`` nor ``'ho'``, or (``strict=True``) when a split
            cannot be aligned.
    """
    # --- Pass 1: trust the index carried by Series predictions.
    out = pd.Series(np.nan, index=train_index, dtype='float32')

    any_written = 0
    total_pred = 0

    for res in cv_out.values():
        if res is None or pred_key not in res:
            continue
        pred = res.get(pred_key, None)
        if not isinstance(pred, pd.Series):
            continue

        total_pred += len(pred)
        inter = train_index.intersection(pred.index)
        if len(inter) == 0:
            continue

        out.loc[inter] = pred.loc[inter].astype('float32')
        any_written += len(inter)

    # --- Accept pass 1 when it covered a meaningful share of the predictions.
    # Below 1% overlap the Series most likely use a different index scheme,
    # so fall through to the idx-key based alignment.
    if total_pred > 0:
        fill_ratio = any_written / max(total_pred, 1)
        if fill_ratio >= 0.01:
            return out

    # --- Pass 2 (fallback): align through the stored idx_cv / idx_ho arrays.
    out2 = pd.Series(np.nan, index=train_index, dtype='float32')

    if pred_key.startswith('oof'):
        idx_key = idx_key_for_oof
    elif pred_key.startswith('ho'):
        idx_key = idx_key_for_ho
    else:
        raise ValueError(f'pred_key must start with oof/ho: {pred_key}')

    for split_key, res in cv_out.items():
        if res is None:
            continue
        pred = res.get(pred_key, None)
        idx = res.get(idx_key, None)

        if pred is None or idx is None:
            if strict:
                raise ValueError(f'{split_key}: missing {pred_key} or {idx_key}')
            continue

        pred_arr = _as_array(pred)
        idx_arr = _as_array(idx)

        if pred_arr is None or idx_arr is None:
            continue
        if len(pred_arr) != len(idx_arr):
            msg = f'{split_key}: len(pred)={len(pred_arr)} != len({idx_key})={len(idx_arr)}'
            if strict:
                raise ValueError(msg)
            print(f'[WARN] {msg}')
            continue

        if len(pred_arr) == 0:
            # Empty split: there is nothing to write. Skipping here also keeps
            # idx_arr.min() / idx_arr.max() below from raising on a zero-size
            # integer array.
            continue

        idx_idx = pd.Index(idx_arr)

        # (A) The ids exist as labels of train_index.
        inter = train_index.intersection(idx_idx)
        if len(inter) > 0:
            m = pd.Series(pred_arr, index=idx_idx)
            out2.loc[inter] = m.loc[inter].astype('float32')
            continue

        # (B) The ids can be interpreted as integer positions (iloc).
        if np.issubdtype(idx_arr.dtype, np.integer):
            if idx_arr.min() >= 0 and idx_arr.max() < len(train_index):
                out2.iloc[idx_arr] = pred_arr.astype('float32')
                continue

        msg = f'{split_key}: {idx_key} not alignable to train_index'
        if strict:
            raise ValueError(msg)
        print(f'[WARN] {msg}')

    return out2


def export_cv_outputs_for_each_property_type(
    all_bundles: dict,
    property_types: list[str],
    alg: str,
    prepare_training_inputs: Callable[..., tuple[pd.DataFrame, list[str], list[str], dict]],
    oof_path: str,
    fi_path: str,
    training_ver: int,
    include_feature_cols: bool = True,
    include_exp_preds: bool = True,
    fi_agg_how: str = 'mean',
    feature_col: str = 'feature',
    importance_col: str = 'importance',
    debug: bool = True,
) -> None:
    """Export per-row CV predictions and feature-importance tables as CSV.

    For every target_model, train_df is rebuilt with prepare_training_inputs()
    and the (target_model, alg) bundle is applied to that train_df. Rebuilding
    the frame per target_model keeps the index scheme of the bundle and the
    index scheme of train_df consistent.

    Args:
        all_bundles: Mapping of ``(target_model, alg)`` to a bundle dict that
            holds ``bundle['cv_fixed']['results_by_split']``.
        property_types: Target models to export; entries without a usable
            bundle are skipped with a ``[SKIP]`` message.
        alg: Algorithm name; used for the bundle key and for file/column names.
        prepare_training_inputs: Called as
            ``prepare_training_inputs(target_model=..., alg=...)``; must return
            ``(train_df, fe_cols, cat_cols, idx_dict)``. Only the first two
            are used here.
        oof_path: Directory for ``train_with_pred_*.csv`` (created if missing).
        fi_path: Directory for the ``fi_*.csv`` files (created if missing).
        training_ver: Version number embedded in the file names.
        include_feature_cols: When True, the prediction CSV is restricted to
            the feature columns present in train_df, the prediction columns
            and the module-level ``target_col``.
        include_exp_preds: When True, ``np.exp`` of the log predictions is
            added as extra columns.
        fi_agg_how: Aggregation applied to the importance per feature inside
            each split.
        feature_col: Feature-name column of the per-split ``fi`` frame.
        importance_col: Importance column of the per-split ``fi`` frame.
        debug: When True, print how many rows received a prediction.
    """
    os.makedirs(oof_path, exist_ok=True)
    os.makedirs(fi_path, exist_ok=True)

    for target_model in property_types:
        key = (target_model, alg)
        print('\n' + '#' * 100)
        print(f'[RUN] target_model={target_model} | alg={alg}')
        print('#' * 100)

        if key not in all_bundles:
            print(f'[SKIP] bundle not found: {key}')
            continue

        bundle = all_bundles[key]
        if 'cv_fixed' not in bundle or 'results_by_split' not in bundle['cv_fixed']:
            print(f'[SKIP] no cv_fixed/results_by_split in bundle: {key}')
            continue

        # Build train_df per target_model (this is the important part: it
        # keeps the frame's index in line with the one used by the bundle).
        # The categorical columns and the index dict are not needed here.
        train_df, fe_cols, _cat_cols, _idx_dict = prepare_training_inputs(
            target_model=target_model,
            alg=alg,
        )

        cv_out = bundle['cv_fixed']['results_by_split']
        prefix = f'{target_model}_{alg}_v{training_ver}'

        # =========================
        # Predictions: gather across splits -> attach to train_df
        # =========================
        oof_pred_log = _collect_pred_series_try_index_then_idx_keys(cv_out, 'oof_pred_log', train_df.index)
        ho_pred_log = _collect_pred_series_try_index_then_idx_keys(cv_out, 'ho_pred_log', train_df.index)

        col_oof_log = f'{target_model}_{alg}_oof_pred_log'
        col_ho_log = f'{target_model}_{alg}_ho_pred_log'

        out_df = train_df.copy()
        out_df[col_oof_log] = oof_pred_log
        out_df[col_ho_log] = ho_pred_log

        pred_cols = [col_oof_log, col_ho_log]

        if include_exp_preds:
            col_oof = f'{target_model}_{alg}_oof_pred'
            col_ho = f'{target_model}_{alg}_ho_pred'
            out_df[col_oof] = np.exp(out_df[col_oof_log])
            out_df[col_ho] = np.exp(out_df[col_ho_log])
            pred_cols += [col_oof, col_ho]

        if include_feature_cols:
            fe_cols_use = [c for c in fe_cols if c in out_df.columns]
            # dict.fromkeys removes duplicates while preserving column order.
            keep_cols = list(dict.fromkeys(fe_cols_use + pred_cols + [target_col]))
            out_df = out_df.loc[:, keep_cols]

        if debug:
            nn_oof = int(out_df[col_oof_log].notna().sum())
            nn_ho = int(out_df[col_ho_log].notna().sum())
            print(f'  pred filled: oof={nn_oof}/{len(out_df)} ho={nn_ho}/{len(out_df)}')

        out_df.to_csv(f'{oof_path}/train_with_pred_{prefix}.csv', index=False)

        # =========================
        # FI: aggregate inside each split + overall stats
        # =========================
        fi_all_list: list[pd.DataFrame] = []
        for split_key, res in cv_out.items():
            # A split may carry no result at all; the prediction collector
            # tolerates that, so the FI export must not crash on it either.
            if res is None:
                continue
            fi_raw = res.get('fi')
            if fi_raw is None or len(fi_raw) == 0:
                continue

            fi_agg = (
                fi_raw.groupby(feature_col, as_index=False)[importance_col]
                .agg(fi_agg_how)
                .sort_values(importance_col, ascending=False)
                .reset_index(drop=True)
            )

            fi_agg.to_csv(
                f'{fi_path}/fi_{target_model}_{alg}_{split_key}_v{training_ver}.csv',
                index=False
            )

            tmp = fi_agg.copy()
            tmp['split'] = split_key
            fi_all_list.append(tmp)

        if fi_all_list:
            fi_all = pd.concat(fi_all_list, axis=0, ignore_index=True)
            fi_stats = (
                fi_all.groupby(feature_col)[importance_col]
                .agg(['mean', 'std', 'count'])
                .reset_index()
                .sort_values('mean', ascending=False)
                .reset_index(drop=True)
            )
            # Coefficient of variation across splits; a zero mean yields NaN
            # instead of a division by zero.
            fi_stats['cv'] = fi_stats['std'] / fi_stats['mean'].replace(0, np.nan)
            fi_stats.to_csv(
                f'{fi_path}/fi_{target_model}_{alg}_ALL_v{training_ver}_stats.csv',
                index=False
            )

        print(f'[EXPORTED] {prefix}')


In [51]:
# Export the OOF / hold-out prediction CSV and the feature-importance CSVs
# for every entry of property_types, using the bundles held in all_bundles.
export_cv_outputs_for_each_property_type(
    all_bundles=all_bundles,
    property_types=property_types,
    alg=alg,
    prepare_training_inputs=prepare_training_inputs,
    oof_path=oof_path,
    fi_path=fi_path,
    training_ver=training_ver,
    include_feature_cols=True,
    include_exp_preds=True,
    fi_agg_how='mean',
    debug=True,
)


####################################################################################################
[RUN] target_model=residential | alg=lgb
####################################################################################################
  pred filled: oof=147954/195154 ho=47200/195154
[EXPORTED] residential_lgb_v8
