# モデル学習

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import japanize_matplotlib

from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# 自身がファイルを格納したディレクトリを指定
ROOT_DIR = '../input/'
data_definition_path = ROOT_DIR + 'data_definition.xlsx'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
oof_path = '../output/oof/'
fi_path = '../output/fi/'

# スクリプトのバージョン指定
create_tbl_ver = 2
training_ver = 7


today = dt.datetime.today().strftime("%Y%m%d")

In [3]:
target_model = 'residential'
# target_model = 'house'
# target_model = 'other'

alg = 'lgb'
# alg = 'cat'

## File Import

In [4]:
train_df = pd.read_parquet(f'{intermediate_path}train_df_{target_model}_v{create_tbl_ver}.parquet')

In [5]:
date_col = 'target_ym'
target_col = 'money_room'

In [6]:
fe_cols = train_df.columns.to_list()

idx_key_cols = [
    'Prefecture name',
    'City/town/village name',
    'zone_residential_rank'
]
drop_cols = set([target_col] + idx_key_cols)

fe_cols = [c for c in fe_cols if c not in drop_cols]

#### カテゴリ型へ変更

In [7]:
cat_cols_candidate = ['building_category', 'land_area_kind', 'walk_distance_bin', 'building_land_chimoku',
            'land_chisei','land_road_cond', 'access_zone', 'fireproof_x_structure', 'structure_group'
]

cat_cols = [c for c in cat_cols_candidate if c in fe_cols]
train_df[cat_cols] = train_df[cat_cols].astype('category')

In [8]:
# すべての category 列のリスト
obj_cols = train_df[fe_cols].select_dtypes(['object']).columns.tolist()
train_df[obj_cols] = train_df[obj_cols].astype('category')
cat_cols += obj_cols

## データ分割

In [9]:
# --- 東京23区 ---
TOKYO_23 = [
    '千代田区', '中央区', '港区', '新宿区', '文京区', '台東区',
    '墨田区', '江東区', '品川区', '目黒区', '大田区', '世田谷区',
    '渋谷区', '中野区', '杉並区', '豊島区', '北区', '荒川区',
    '板橋区', '練馬区', '足立区', '葛飾区', '江戸川区'
]

# --- 政令指定都市 ---
SEIREI_CITIES = [
    '札幌市', '仙台市', 'さいたま市', '千葉市', '横浜市', '川崎市', '相模原市',
    '新潟市', '静岡市', '浜松市', '名古屋市',
    '京都市', '大阪市', '堺市', '神戸市',
    '岡山市', '広島市', '北九州市', '福岡市', '熊本市'
]

# --- 首都圏（都道府県） ---
CAPITAL_PREFS = ['東京都', '神奈川県', '埼玉県', '千葉県']

# --- 県庁所在地（市名のみ） ---
PREF_CAPITALS = [
    '札幌市','青森市','盛岡市','仙台市','秋田市','山形市','福島市',
    '水戸市','宇都宮市','前橋市','さいたま市','千葉市','新宿区',
    '横浜市','新潟市','富山市','金沢市','福井市','甲府市','長野市',
    '岐阜市','静岡市','名古屋市','津市','大津市','京都市','大阪市',
    '神戸市','奈良市','和歌山市','鳥取市','松江市','岡山市','広島市',
    '山口市','徳島市','高松市','松山市','高知市','福岡市','佐賀市',
    '長崎市','熊本市','大分市','宮崎市','鹿児島市','那覇市'
]

In [10]:
main_city = train_df.index[
    (
        (train_df['Prefecture_name'] == '東京都') &
        (train_df['City/town/village_name'].isin(TOKYO_23))
    )
    |
    (train_df['City/town/village_name'].isin(['大阪市', '名古屋市']))
]

mid_city = train_df.index[
    (
        # 首都圏（23区除外）
        (
            train_df['Prefecture_name'].isin(CAPITAL_PREFS)
            &
            ~(
                (train_df['Prefecture_name'] == '東京都') &
                (train_df['City/town/village_name'].isin(TOKYO_23))
            )
        )
        |
        # 政令指定都市
        (train_df['City/town/village_name'].isin(SEIREI_CITIES))
        |
        # 県庁所在地
        (train_df['City/town/village_name'].isin(PREF_CAPITALS))
    )
    &
    ~train_df.index.isin(main_city)
]

other = train_df.index[
    ~train_df.index.isin(main_city)
    &
    ~train_df.index.isin(mid_city)
]

urban_idx_dict = {
    'main_city': main_city,
    'mid_city': mid_city,
    'other': other,
}

In [11]:
idx_low_density = train_df.index[
    train_df['zone_residential_rank'] == 1
]

idx_mid_density = train_df.index[
    train_df['zone_residential_rank'] == 2
]

idx_high_density = train_df.index[
    train_df['zone_residential_rank'].isin([3, 4, 0]) |
    train_df['zone_residential_rank'].isna()
]

density_idx_dict = {
    'low': idx_low_density,
    'mid': idx_mid_density,
    'high': idx_high_density,
}

In [12]:
if target_model == 'house':
    idx_dict = density_idx_dict
elif target_model == 'residential':
    idx_dict = urban_idx_dict
elif target_model == 'other':
    idx_dict = None

## モデル学習

In [13]:
train_df_for_oof = train_df.copy()

In [14]:
year_col = 'target_year'

# LightGBM のベースパラメータ（あなたの設定）
base_params = {
    'lgb': {
        # 'objective': 'regression_l1',  # ← log-MAEを直接最適化
        # 'metric': 'l1',
        'min_child_samples': 20,       # houseはノイズが出やすい
        'reg_alpha': 0.5,
        'reg_lambda': 1.0,
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'num_leaves': 100,
        'max_depth': -1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'n_jobs': -1
    },
    'cat': {
        'loss_function': 'MAE',          # ← 重要（歪み抑制）
        'iterations': 2000,
        'learning_rate': 0.03,
        'depth': 6,                      # 深くしすぎない
        'l2_leaf_reg': 5,
        'random_strength': 0.8,
        'bagging_temperature': 0.5,
        'eval_metric': 'MAE',
        'early_stopping_rounds': 100,
        'verbose': False
    },
    'enet': {
        'alpha': 0.001,        # 小さめ
        'l1_ratio': 0.2,       # Ridge寄り
        'fit_intercept': True
    }
}

#### 関数

In [15]:
def mape(y_true, y_pred):
    """MAPE計算（%ではなくratio）。"""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs(y_true - y_pred) / np.clip(y_true, 1e-6, None))

In [16]:
def _infer_te_source_col(te_col: str) -> str:
    # "xxx_te" -> "xxx"
    return te_col[:-3] if te_col.endswith("_te") else te_col

def fit_target_encoding_map(
    s_cat: pd.Series,
    y: pd.Series,
    smoothing: float = 50.0,
    min_samples_leaf: int = 1,
) -> tuple[pd.Series, float]:
    """
    1列のカテゴリ s_cat をターゲット y で target encoding するための mapping を作る。
    smoothing: 大きいほど全体平均に寄る（過学習防止）
    """
    s_cat = s_cat.astype("object")
    y = y.astype(float)

    prior = float(y.mean())

    stats = (
        pd.DataFrame({"cat": s_cat, "y": y})
          .groupby("cat")["y"]
          .agg(["count", "mean"])
    )

    # count が小さいカテゴリは prior に寄せる（smoothing + min_samples_leaf）
    count = stats["count"].astype(float)
    mean  = stats["mean"].astype(float)

    # smoothing 係数（一般的な ridge-like smoothing）
    # enc = (count*mean + smoothing*prior) / (count + smoothing)
    enc = (count * mean + smoothing * prior) / (count + smoothing)

    # さらに min_samples_leaf 未満は prior へ
    if min_samples_leaf > 1:
        enc[count < min_samples_leaf] = prior

    return enc, prior

def apply_target_encoding(
    s_cat: pd.Series,
    mapping: pd.Series,
    prior: float,
) -> pd.Series:
    s_cat = s_cat.astype("object")
    return s_cat.map(mapping).fillna(prior).astype(float)

def recompute_te_for_fold(
    train_df: pd.DataFrame,
    tr_idx: pd.Index,
    apply_idx_list: list[pd.Index],
    te_cols: list[str],
    y_tr: pd.Series,
    smoothing: float = 50.0,
    min_samples_leaf: int = 1,
    verbose: bool = False,
) -> dict:
    """
    te_cols（例: eki_name1_te）の各列について、
    学習fold（tr_idx）だけでTEをfitし、apply_idx_list の各Indexへ適用する。

    返り値: {te_col: {"src_col":..., "mapping":..., "prior":...}} （必要なら保存用）
    """
    te_meta = {}

    for te_col in te_cols:
        src_col = _infer_te_source_col(te_col)
        if src_col not in train_df.columns:
            if verbose:
                print(f"[WARN] TE元列が見つからないためスキップ: {te_col} (src={src_col})")
            continue

        mapping, prior = fit_target_encoding_map(
            train_df.loc[tr_idx, src_col],
            y_tr.loc[tr_idx],
            smoothing=smoothing,
            min_samples_leaf=min_samples_leaf,
        )

        # 各適用先（train/val/ho など）へ適用
        for idx in apply_idx_list:
            train_df.loc[idx, te_col] = apply_target_encoding(
                train_df.loc[idx, src_col],
                mapping,
                prior
            ).values

        te_meta[te_col] = {"src_col": src_col, "mapping": mapping, "prior": prior}

    return te_meta

In [17]:
from typing import Optional

from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet

import lightgbm as lgb
from catboost import CatBoostRegressor


def run_cv_by_separate(
    train_df: pd.DataFrame,
    base_cols: list[str],
    target_col: str,
    year_col: str,
    base_params: dict,
    alg: str,
    idx_dict: Optional[dict[str, pd.Index]] = None,
    n_splits: int = 5,
    te_smoothing: float = 50.0,
    te_min_samples_leaf: int = 1,
    cat_cols: list[str] | None = None,
):
    """
    （idx_dictがあれば）分割ごとに CV / 最終モデル学習 / HO 予測を行う
    idx_dictがNoneなら全データを1分割(all)として実行する

    追加対応:
      - alg == 'enet' のとき cat_cols を除外し、Scaling(Pipeline) を入れる
    """

    y = train_df[target_col].astype(float)
    y_log = np.log(y)

    results = {}

    te_cols_base = [c for c in base_cols if c.endswith('_te')]

    if idx_dict is None:
        idx_dict = {'all': train_df.index}

    bias_rows: list[dict] = []

    # ===== enet 用の特徴量リスト（cat_cols を除外） =====
    if alg == 'enet':
        cat_cols_set = set(cat_cols or [])
        base_cols_enet = [c for c in base_cols if c not in cat_cols_set]
        if len(base_cols_enet) == 0:
            raise ValueError('alg=="enet" ですが、cat_cols除外後に特徴量が0件になりました。')
    else:
        base_cols_enet = base_cols
    # ===================================================

    for split_key, urban_idx in idx_dict.items():
        print(f'\n==============================')
        print(f' Split: {split_key}')
        print(f'==============================')

        idx_cv = urban_idx.intersection(train_df.index[train_df[year_col] <= 2021])
        idx_ho = urban_idx.intersection(train_df.index[train_df[year_col] == 2022])

        if len(idx_cv) == 0 or len(idx_ho) == 0:
            print(f'Skip {split_key} (no data)')
            continue

        y_ho_log = np.log(y.loc[idx_ho])

        print(f'CV rows: {len(idx_cv)}')
        print(f'HO rows: {len(idx_ho)}')

        oof_pred_log = pd.Series(np.nan, index=idx_cv, dtype=float)
        ho_pred_log_accum = pd.Series(0.0, index=idx_ho, dtype=float)

        ho_log_by_fold = []
        fi_list = []

        # enet のときだけ base_cols を差し替える
        use_cols = base_cols_enet if alg == 'enet' else base_cols

        X_cv = train_df.loc[idx_cv, use_cols]
        y_cv_log = y_log.loc[idx_cv]
        groups_cv = train_df.loc[idx_cv, 'building_id']

        gkf = GroupKFold(n_splits=n_splits)

        for fold, (tr_pos, va_pos) in enumerate(gkf.split(X_cv, y_cv_log, groups_cv), 1):
            print(f'[{split_key}] Fold {fold}')

            tr_idx = idx_cv[tr_pos]
            va_idx = idx_cv[va_pos]

            # --- TE 再計算（TE列がある場合のみ）---
            if te_cols_base:
                recompute_te_for_fold(
                    train_df=train_df,
                    tr_idx=tr_idx,
                    apply_idx_list=[tr_idx, va_idx, idx_ho],
                    te_cols=te_cols_base,
                    y_tr=y_log,
                    smoothing=te_smoothing,
                    min_samples_leaf=te_min_samples_leaf,
                )

            # --- 学習 ---
            if alg == 'lgb':
                model = lgb.LGBMRegressor(**base_params[alg])
                model.fit(train_df.loc[tr_idx, base_cols], y_log.loc[tr_idx])

            elif alg == 'cat':
                # CatBoost は文字列化が必要になりがちなので、対象列だけ局所的に作る
                if cat_cols:
                    for c in cat_cols:
                        train_df.loc[:, c] = train_df.loc[:, c].astype('string').fillna('NA')
                model = CatBoostRegressor(**base_params[alg], cat_features=(cat_cols or []))
                model.fit(train_df.loc[tr_idx, base_cols], y_log.loc[tr_idx])

            elif alg == 'enet':
                # enet: cat_cols は除外済み(use_cols)、スケーリング＋欠損補完を必ず入れる
                model = Pipeline(
                    steps=[
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler(with_mean=True)),
                        ('enet', ElasticNet(**base_params[alg], random_state=42)),
                    ]
                )
                model.fit(train_df.loc[tr_idx, use_cols], y_log.loc[tr_idx])

            else:
                raise ValueError(f'Unknown alg: {alg}')

            # --- OOF ---
            va_pred_log = model.predict(train_df.loc[va_idx, use_cols if alg == "enet" else base_cols])
            oof_pred_log.loc[va_idx] = va_pred_log

            # --- HO ---
            ho_pred_log = model.predict(train_df.loc[idx_ho, use_cols if alg == "enet" else base_cols])
            ho_pred_log_accum += ho_pred_log / n_splits
            ho_log_by_fold.append(pd.Series(ho_pred_log, index=idx_ho))

            # --- FI ---
            if alg == 'lgb':
                fi_list.append(pd.DataFrame({
                    'feature': base_cols,
                    'importance': model.booster_.feature_importance(importance_type='gain'),
                    'fold': fold,
                    'urban': split_key,
                }))
            elif alg == 'cat':
                fi_list.append(pd.DataFrame({
                    'feature': base_cols,
                    'importance': model.get_feature_importance(type='PredictionValuesChange'),
                    'fold': fold,
                    'urban': split_key,
                }))
            elif alg == 'enet':
                # enet は係数を FI として保存（安全化）
                coef = model.named_steps['enet'].coef_

                # coef が 2次元の可能性に備える（念のため）
                coef = np.asarray(coef).ravel()

                n_feat = len(use_cols)
                n_coef = len(coef)

                if n_feat != n_coef:
                    print(f'[WARN] enet FI skipped: n_feat={n_feat}, n_coef={n_coef} (split={split_key}, fold={fold})')
                else:
                    fi_list.append(pd.DataFrame({
                        'feature': use_cols,
                        'importance': np.abs(coef),
                        'fold': fold,
                        'urban': split_key,
                    }))

        ho_log_stack = pd.concat(ho_log_by_fold, axis=1)
        ho_mu = ho_log_stack.mean(axis=1)
        ho_sigma = ho_log_stack.std(axis=1)

        print(f'[{split_key}] Final model training')

        if te_cols_base:
            recompute_te_for_fold(
                train_df=train_df,
                tr_idx=idx_cv,
                apply_idx_list=[idx_cv, idx_ho],
                te_cols=te_cols_base,
                y_tr=y_log,
                smoothing=te_smoothing,
                min_samples_leaf=te_min_samples_leaf,
                verbose=True,
            )

        if alg == 'lgb':
            final_model = lgb.LGBMRegressor(**base_params[alg])
            final_model.fit(train_df.loc[idx_cv, base_cols], y_log.loc[idx_cv])

        elif alg == 'cat':
            if cat_cols:
                for c in cat_cols:
                    train_df.loc[:, c] = train_df.loc[:, c].astype('string').fillna('NA')
            final_model = CatBoostRegressor(**base_params[alg], cat_features=(cat_cols or []))
            final_model.fit(train_df.loc[idx_cv, base_cols], y_log.loc[idx_cv])

        elif alg == 'enet':
            final_model = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler(with_mean=True)),
                    ('enet', ElasticNet(**base_params[alg], random_state=42)),
                ]
            )
            final_model.fit(train_df.loc[idx_cv, use_cols], y_log.loc[idx_cv])

        ho_pred_log_final = pd.Series(
            final_model.predict(train_df.loc[idx_ho, use_cols if alg == "enet" else base_cols]),
            index=idx_ho
        )

        ho_residual_log_final = (y_ho_log - ho_pred_log_final)
        bias_ho_log_final = float(ho_residual_log_final.mean())

        bias_rows.append({
            'split': split_key,
            'bias_ho_log_final': bias_ho_log_final,
            'n_ho': int(len(idx_ho)),
        })

        results[split_key] = {
            'idx_cv': idx_cv,
            'idx_ho': idx_ho,
            'y_ho_log': y_ho_log,
            'oof_pred_log': oof_pred_log,
            'ho_pred_log_cv_mean': ho_pred_log_accum,
            'ho_pred_log_final': ho_pred_log_final,
            'ho_residual_log_final': ho_residual_log_final,
            'bias_ho_log_final': bias_ho_log_final,
            'ho_mu': ho_mu,
            'ho_sigma': ho_sigma,
            'final_model': final_model,
            'fi': pd.concat(fi_list, ignore_index=True) if fi_list else pd.DataFrame(),
            'used_cols': use_cols,
        }

    bias_table = pd.DataFrame(bias_rows).sort_values('split').reset_index(drop=True)

    return {
        'results_by_split': results,
        'bias_table_ho_log_final': bias_table,
    }


#### 学習

In [18]:
out_lgb = run_cv_by_separate(
    train_df=train_df_for_oof,
    base_cols=fe_cols,
    target_col=target_col,
    year_col=year_col,
    alg=alg,
    base_params=base_params,
    idx_dict=idx_dict,
    cat_cols=cat_cols,
)

results_lgb = out_lgb['results_by_split']


 Split: main_city
CV rows: 31627
HO rows: 11070
[main_city] Fold 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34183
[LightGBM] [Info] Number of data points in the train set: 25301, number of used features: 290
[LightGBM] [Info] Start training from score 17.378418
[main_city] Fold 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34146
[LightGBM] [Info] Number of data points in the train set: 25301, number of used features: 290
[LightGBM] [Info] Start training from score 17.385160
[main_city] Fold 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of te

In [19]:
# from sklearn.linear_model import Ridge

# X_oof = pd.concat([
#     pd.concat([r['oof_pred_log'] for r in results_lgb.values()]),
#     pd.concat([r['oof_pred_log'] for r in results_cat.values()]),
#     pd.concat([r['oof_pred_log'] for r in results_enet.values()]),
# ], axis=1).sort_index()

# X_oof.columns = ['lgb', 'cat', 'enet']

# y_oof_log = np.log(train_df_for_oof.loc[X_oof.index, target_col])

# ridge = Ridge(alpha=1.0, fit_intercept=False)
# ridge.fit(X_oof, y_oof_log)

# print(dict(zip(X_oof.columns, ridge.coef_)))

In [20]:
# w_lgb, w_cat, w_enet = ridge.coef_  # fit_intercept=False の場合
# # intercept があるなら別途

# ho_log = (
#     w_lgb * pd.concat([r['ho_pred_log_final'] for r in results_lgb.values()]) +
#     w_cat * pd.concat([r['ho_pred_log_final'] for r in results_cat.values()]) +
#     w_enet * pd.concat([r['ho_pred_log_final'] for r in results_enet.values()])
# ).sort_index()

# y_ho = train_df_for_oof.loc[ho_log.index, target_col]
# print('HO MAPE (stacked):', mape(y_ho, np.exp(ho_log)))


#### 評価

In [21]:
def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def fit_low_calibrator_by_effective_land_price(
    oof_df: pd.DataFrame,
    y_col: str,
    pred_log_col: str,
    elp_col: str = 'effective_land_price',
    n_bins: int = 20,
    min_bin: int = 200,
    low_q: float = 0.2,
):
    d = oof_df[[y_col, pred_log_col, elp_col]].copy()
    mask = d[y_col].notna() & d[pred_log_col].notna() & d[elp_col].notna() & (d[y_col] > 0)
    d = d.loc[mask].copy()

    low_th = float(d[y_col].quantile(low_q))
    d_low = d[d[y_col] <= low_th].copy()

    y = d_low[y_col].astype('float64').to_numpy()
    pred = np.exp(d_low[pred_log_col].astype('float64').to_numpy())
    elp = d_low[elp_col].astype('float64').to_numpy()

    ratio = y / np.clip(pred, 1e-12, None)

    qs = np.linspace(0.0, 1.0, n_bins + 1)
    edges = np.unique(np.nanquantile(elp, qs))
    bins = pd.IntervalIndex.from_breaks(edges, closed='right')
    b = pd.cut(elp, bins=bins)

    s = pd.Series(ratio)
    med = s.groupby(b).median()
    cnt = s.groupby(b).size()

    global_med = float(np.nanmedian(ratio))
    global_log_delta = float(np.log(np.clip(global_med, 1e-12, None)))

    bin_log_delta = np.full(len(bins), global_log_delta, dtype='float64')
    for i, iv in enumerate(bins):
        c = int(cnt.get(iv, 0))
        m = float(med.get(iv, np.nan))
        if (c >= min_bin) and np.isfinite(m):
            bin_log_delta[i] = float(np.log(np.clip(m, 1e-12, None)))

    return {
        'elp_col': elp_col,
        'edges': edges,
        'bin_log_delta': bin_log_delta,
        'global_log_delta': global_log_delta,
        'low_q': low_q,
        'low_th': low_th,
    }

def _lookup_bin_log_delta(elp: np.ndarray, edges: np.ndarray, bin_log_delta: np.ndarray, global_log_delta: float):
    idx = np.digitize(elp, edges, right=True) - 1
    idx = np.clip(idx, 0, len(edges) - 2)
    out = bin_log_delta[idx]
    out = np.where(np.isfinite(elp), out, global_log_delta)
    return out

def apply_low_gate_and_calibration_log(
    df: pd.DataFrame,
    base_pred_log: np.ndarray,
    calibrator: dict,
    gate_q_lo: float = 0.5,
    gate_q_hi: float = 0.8,
    k: float = 10.0,
):
    elp = df[calibrator['elp_col']].astype('float64').to_numpy()
    q_lo = float(np.nanquantile(elp, gate_q_lo))
    q_hi = float(np.nanquantile(elp, gate_q_hi))
    denom = max(q_hi - q_lo, 1e-6)

    z = (elp - q_lo) / denom
    w = _sigmoid(k * (z - 0.5))  # elpが高いほど補正を強く

    # is_low_like = (
    #     (df['listing_months_log'] >= df['listing_months_log'].quantile(0.7)) &
    #     (df['livability_score'] <= df['livability_score'].quantile(0.6))
    # )

    is_low_like = (
        (df['effective_land_price_div_median_price_1000m'] >= df['effective_land_price_div_median_price_1000m'].quantile(0.7)) &
        (df['livability_score'] <= df['livability_score'].quantile(0.6))
    )


    w = w * is_low_like.astype('float64')

    add_log = _lookup_bin_log_delta(
        elp,
        calibrator['edges'],
        calibrator['bin_log_delta'],
        calibrator['global_log_delta'],
    )

    base_pred_log = np.asarray(base_pred_log, dtype='float64')
    calibrated_log = base_pred_log + add_log
    final_pred_log = base_pred_log * (1.0 - w) + calibrated_log * w
    return final_pred_log, w

In [22]:
def eval_results(results, train_df, target_col, label):
    print(f'\n===== {label} =====')

    print('--- OOF MAPE ---')
    for urban_key, res in results.items():
        idx_cv = res['idx_cv']
        y_true = train_df.loc[idx_cv, target_col]
        y_pred = np.exp(res['oof_pred_log'])
        print(f'{urban_key:10s} | {mape(y_true, y_pred):.6f}')

    print('--- HO MAPE ---')
    for urban_key, res in results.items():
        idx_ho = res['idx_ho']
        y_true = train_df.loc[idx_ho, target_col]
        y_pred = np.exp(res['ho_pred_log_final'])
        print(f'{urban_key:10s} | {mape(y_true, y_pred):.6f}')

    # all
    oof_log_all = pd.concat([r['oof_pred_log'] for r in results.values()]).sort_index()
    y_oof_all = train_df.loc[oof_log_all.index, target_col]
    print(f'OOF MAPE (all): {mape(y_oof_all, np.exp(oof_log_all)):.6f}')

    ho_log_all = pd.concat([r['ho_pred_log_final'] for r in results.values()]).sort_index()
    y_ho_all = train_df.loc[ho_log_all.index, target_col]
    print(f'HO  MAPE (all): {mape(y_ho_all, np.exp(ho_log_all)):.6f}')

In [23]:
eval_results(results_lgb, train_df_for_oof, target_col, 'LightGBM')
# eval_results(results_cat,  train_df_for_oof, target_col, 'CatBoost')
# eval_results(results_enet, train_df_for_oof, target_col, 'ElasticNet')


===== LightGBM =====
--- OOF MAPE ---
main_city  | 0.121656
mid_city   | 0.140458
other      | 0.150801
--- HO MAPE ---
main_city  | 0.120234
mid_city   | 0.139134
other      | 0.129812
OOF MAPE (all): 0.140778
HO  MAPE (all): 0.130621


In [24]:
# weights = {
#     'lgb':  0.55,
#     'cat':  0.30,
#     'enet': 0.15,
# }

In [25]:
# def eval_ensemble_by_split(
#     results_lgb,
#     results_cat,
#     results_enet,
#     train_df,
#     target_col,
#     weights,
# ):
#     print('\n===== Ensemble =====')

#     print('--- OOF MAPE ---')
#     for urban_key in results_lgb.keys():
#         idx_cv = results_lgb[urban_key]['idx_cv']

#         oof_log = (
#             weights['lgb']  * results_lgb[urban_key]['oof_pred_log'] +
#             weights['cat']  * results_cat[urban_key]['oof_pred_log'] +
#             weights['enet'] * results_enet[urban_key]['oof_pred_log']
#         )

#         y_true = train_df.loc[idx_cv, target_col]
#         y_pred = np.exp(oof_log)
#         print(f'{urban_key:10s} | {mape(y_true, y_pred):.6f}')

#     print('--- HO MAPE ---')
#     for urban_key in results_lgb.keys():
#         idx_ho = results_lgb[urban_key]['idx_ho']

#         ho_log = (
#             weights['lgb']  * results_lgb[urban_key]['ho_pred_log_final'] +
#             weights['cat']  * results_cat[urban_key]['ho_pred_log_final'] +
#             weights['enet'] * results_enet[urban_key]['ho_pred_log_final']
#         )

#         y_true = train_df.loc[idx_ho, target_col]
#         y_pred = np.exp(ho_log)
#         print(f'{urban_key:10s} | {mape(y_true, y_pred):.6f}')


# def eval_ensemble_all(
#     results_lgb,
#     results_cat,
#     results_enet,
#     train_df,
#     target_col,
#     weights,
# ):
#     # --- OOF ---
#     oof_log = (
#         weights['lgb']  * pd.concat([r['oof_pred_log'] for r in results_lgb.values()]) +
#         weights['cat']  * pd.concat([r['oof_pred_log'] for r in results_cat.values()]) +
#         weights['enet'] * pd.concat([r['oof_pred_log'] for r in results_enet.values()])
#     ).sort_index()

#     y_oof = train_df.loc[oof_log.index, target_col]
#     print(f'OOF MAPE (ensemble, all): {mape(y_oof, np.exp(oof_log)):.6f}')

#     # --- HO ---
#     ho_log = (
#         weights['lgb']  * pd.concat([r['ho_pred_log_final'] for r in results_lgb.values()]) +
#         weights['cat']  * pd.concat([r['ho_pred_log_final'] for r in results_cat.values()]) +
#         weights['enet'] * pd.concat([r['ho_pred_log_final'] for r in results_enet.values()])
#     ).sort_index()

#     y_ho = train_df.loc[ho_log.index, target_col]
#     print(f'HO  MAPE (ensemble, all): {mape(y_ho, np.exp(ho_log)):.6f}')


In [26]:
# eval_ensemble_by_split(
#     results_lgb,
#     results_cat,
#     results_enet,
#     train_df_for_oof,
#     target_col,
#     weights,
# )

# eval_ensemble_all(
#     results_lgb,
#     results_cat,
#     results_enet,
#     train_df_for_oof,
#     target_col,
#     weights,
# )

#### 特徴量重要度

In [27]:
fi_dict = {}

for key, res in results_lgb.items():
    fi_df_raw = res['fi']  # ← これは DataFrame

    fi_df = (
        fi_df_raw
        .groupby('feature', as_index=False)['importance']
        .mean()
        .sort_values('importance', ascending=False)
    )

    fi_dict[key] = fi_df
    print(key)
    display(fi_dict[key].head(20))

main_city


Unnamed: 0,feature,importance
86,has_ldk,18737.204115
215,log_weighted_land_price_3_x_livability_score,11814.949881
302,senyu_area,7748.294436
248,money_kyoueki_std,6466.40972
211,log_weighted_land_price_3_x_age_decay_30,2288.777053
209,log_land_price_x_livability_score,1872.668973
359,year_built,1489.074419
26,building_senyu_area_median,1443.141045
356,wet_area_upgrade_count,1322.530986
0,City/town/village_name,1270.08635


mid_city


Unnamed: 0,feature,importance
211,log_weighted_land_price_3_x_age_decay_30,28020.154994
205,log_land_price_x_age_decay_30,14655.026858
215,log_weighted_land_price_3_x_livability_score,13839.162457
0,City/town/village_name,7327.796269
86,has_ldk,6415.30139
302,senyu_area,5423.895033
268,premium_equipment_count,5080.761378
209,log_land_price_x_livability_score,4019.958909
238,mean_price_500m_mansion_log,2995.897241
232,mean_price_1000m_mansion_log,2650.141231


other


Unnamed: 0,feature,importance
211,log_weighted_land_price_3_x_age_decay_30,45504.278106
205,log_land_price_x_age_decay_30,15199.736111
215,log_weighted_land_price_3_x_livability_score,13394.109102
0,City/town/village_name,11380.676344
86,has_ldk,10872.404782
218,low_price_proxy,5546.67258
268,premium_equipment_count,5184.110216
302,senyu_area,5175.457046
209,log_land_price_x_livability_score,4975.039903
248,money_kyoueki_std,3202.413422


In [28]:
fi_all = (
    pd.concat(
        [
            res['fi'].assign(key=key)
            for key, res in results_lgb.items()
        ],
        ignore_index=True,
    )
    .groupby('feature', as_index=False)['importance']
    .mean()
    .sort_values('importance', ascending=False)
)

In [29]:
fi_all.to_csv(f'{fi_path}feature_importance_{target_model}.csv', index=False)

#### 特徴量重要度==0を削除

In [30]:
# excluded_fe_cols = fi_all.query('importance == 0')['feature'].tolist()
excluded_fe_cols = fi_all.query('importance <= 1')['feature'].tolist()

print(f'Removed {len(excluded_fe_cols)} features')

fe_cols_filtered = [c for c in fe_cols if c not in excluded_fe_cols]

Removed 121 features


In [31]:
cat_cols_filtered = [c for c in cat_cols if c not in excluded_fe_cols]

#### 特徴量選択して再学習

In [32]:
out_lgb_re = run_cv_by_separate(
    train_df=train_df_for_oof,
    base_cols=fe_cols_filtered,
    target_col=target_col,
    year_col=year_col,
    alg=alg,
    base_params=base_params,
    idx_dict=idx_dict,
    cat_cols=cat_cols_filtered,
)

results_lgb_re = out_lgb_re['results_by_split']


 Split: main_city
CV rows: 31627
HO rows: 11070
[main_city] Fold 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 33881
[LightGBM] [Info] Number of data points in the train set: 25301, number of used features: 237
[LightGBM] [Info] Start training from score 17.378418
[main_city] Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33854
[LightGBM] [Info] Number of data points in the train set: 25301, number of used features: 239
[LightGBM] [Info] Start training from score 17.385160
[main_city] Fold 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010727 seconds.
You can set `force_row_wise=true` to

In [33]:
eval_results(results_lgb_re,  train_df_for_oof, target_col, f'LightGBM（{target_model}）')


===== LightGBM（residential） =====
--- OOF MAPE ---
main_city  | 0.121730
mid_city   | 0.140349
other      | 0.150698
--- HO MAPE ---
main_city  | 0.118958
mid_city   | 0.138846
other      | 0.129870
OOF MAPE (all): 0.140710
HO  MAPE (all): 0.130253


#### 学習データの予測結果出力

In [34]:
oof_log_all = pd.concat([r['oof_pred_log'] for r in results_lgb_re.values()]).sort_index()
ho_log_all = pd.concat([r['ho_pred_log_final'] for r in results_lgb_re.values()]).sort_index()

oof_pred_all = np.exp(oof_log_all)
ho_pred_all = np.exp(ho_log_all)

In [35]:
# ============================================================
# 1) 予測結果を格納する DataFrame
# ============================================================
train_result_df = train_df_for_oof[fe_cols + [target_col]].copy()

train_result_df['oof_pred'] = np.nan
train_result_df['ho_pred'] = np.nan

# --- HO のマスク（2022年） ---
mask_ho_all = train_result_df['target_ym'].astype(str).str.startswith('2022')

# index が完全一致するのでそのまま代入可
train_result_df.loc[~mask_ho_all,     'oof_pred'] = oof_pred_all
train_result_df.loc[mask_ho_all,      'ho_pred'] = ho_pred_all

# 保存
train_result_df.to_csv(f'{oof_path}oof_{target_model}_{today}.csv', index=False)

## 最終モデル

In [36]:
y_all     = train_df[target_col].astype(float)
y_all_log = np.log(y_all)

In [37]:
from typing import Optional, Any

def train_final_models_by_split(
    train_df: pd.DataFrame,
    feature_cols: list[str],
    y_log: pd.Series,
    alg: str,
    params: dict,
    idx_dict: Optional[dict[str, pd.Index]] = None,
    cat_cols: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    splitごとに最終モデルを学習して返す。
    idx_dict=None の場合は全データを 'all' として1モデル学習。
    """

    models: dict[str, Any] = {}

    # ===== Noneフォールバック =====
    if idx_dict is None:
        idx_dict = {'all': train_df.index}

    # ===== cat_cols: 存在する列だけ使う =====
    cat_cols_exist: list[str] = []
    if cat_cols:
        cat_cols_exist = [c for c in cat_cols if c in train_df.columns]

    for split_key, split_idx in idx_dict.items():
        if split_idx is None or len(split_idx) == 0:
            print(f'Skip {split_key} (empty split_idx)')
            continue

        idx_use = pd.Index(split_idx).intersection(train_df.index)

        if len(idx_use) == 0:
            print(f'Skip {split_key} (no rows after intersection)')
            continue

        print(f'=== Final model training: {split_key} | rows={len(idx_use)} ===')

        X_tr = train_df.loc[idx_use, feature_cols]
        y_tr = y_log.loc[idx_use]

        if alg == 'lgb':
            model = lgb.LGBMRegressor(**params[alg])
            model.fit(X_tr, y_tr)

        elif alg == 'cat':
            model = CatBoostRegressor(**params[alg], cat_features=cat_cols_exist)

            # CatBoostはカテゴリ列を文字列化しておくのが安全（既存方針に合わせる）
            if cat_cols_exist:
                X_tr = X_tr.copy()
                for c in cat_cols_exist:
                    X_tr[c] = X_tr[c].astype('string').fillna('NA')

            model.fit(X_tr, y_tr)

        else:
            raise ValueError(f'Unknown alg: {alg}')

        models[split_key] = model

    return models


In [38]:
final_models = train_final_models_by_split(
    train_df=train_df,
    feature_cols=fe_cols_filtered,
    y_log=np.log(train_df[target_col].astype(float)),
    alg=alg,
    params=base_params,
    idx_dict=idx_dict,
    cat_cols=cat_cols_filtered,
)

=== Final model training: main_city | rows=42697 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34187
[LightGBM] [Info] Number of data points in the train set: 42697, number of used features: 241
[LightGBM] [Info] Start training from score 17.408726
=== Final model training: mid_city | rows=69736 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35655
[LightGBM] [Info] Number of data points in the train set: 69736, number of used features: 246
[LightGBM] [Info] Start training from score 16.867233
=== Final model training: other | rows=82721 ===
[LightGBM] [Info] Auto-choos

## モデルの出力

In [39]:
import pickle
from pathlib import Path

def save_model_bundle(
    model_dict: dict,
    base_cols: list,
    cat_cols: list | None,
    save_path: str,
):
    bundle = {
        'models': model_dict,
        'base_cols': base_cols,
        'cat_cols': cat_cols,
    }

    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, 'wb') as f:
        pickle.dump(bundle, f)

In [40]:
save_model_bundle(
    model_dict=final_models,
    base_cols=fe_cols_filtered,
    cat_cols=cat_cols_filtered,
    save_path=f'{model_path}/{target_model}_model_v{training_ver}.pkl',
)