# residential: low-classifier + low-regressor ブレンド（v1）

このノートブックは、既存の `03_02_training_v8.ipynb` / `04_01_inference_v8.ipynb` / `04_02_post_revision_v1.ipynb` の出力を流用し、

- 低価格帯 classifier（確率）
- 低価格帯 regressor
- 確率ブレンドによる後補正
- 提出ファイル作成

までを一気通貫で実行します。

## 0. 設定・読み込み

In [21]:
import os
import pickle
import datetime as dt

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# =========================
# Path / Version
# =========================
ROOT_DIR = '../input/'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
oof_path = '../output/oof/'
pred_path = '../output/pred/'

submit_file_path = ROOT_DIR + 'sample_submit.csv'

# 既存ノートブックに合わせる
create_tbl_ver = 2
training_ver = 8
inference_ver = 8
submit_ver = 1

# 本ノートブックのバージョン
low_blend_ver = 1
today = dt.datetime.today().strftime('%Y%m%d')

# 対象
target_model = 'residential'
alg = 'lgb'

# low の閾値（residential）
LOW_THRESHOLD = 10_000_000

# CV（既存 training_v8 の main CV に合わせる）
N_SPLITS = 3

print('today:', today)
print('LOW_THRESHOLD:', LOW_THRESHOLD, 'N_SPLITS:', N_SPLITS)


today: 20260109
LOW_THRESHOLD: 10000000 N_SPLITS: 3


In [22]:
# =========================
# 既存モデル辞書（fe_cols/cat_cols を取得）
# =========================
with open(f'{model_path}/model_{target_model}_{alg}_v{training_ver}.pkl', "rb") as f:
    models_dict = pickle.load(f)

fe_cols = models_dict['fe_cols']
cat_cols = models_dict['cat_cols']

def _union_if_dict(x):
    if isinstance(x, dict):
        out = set()
        for v in x.values():
            out |= set(v)
        return sorted(out)
    return list(x)

fe_cols_union = _union_if_dict(fe_cols)
cat_cols_union = _union_if_dict(cat_cols)

print('n_features(fe_cols):', len(fe_cols_union))
print('n_cat_cols:', len(cat_cols_union))


n_features(fe_cols): 308
n_cat_cols: 6


In [23]:
# OOF 学習用
oof_csv_path = f'{oof_path}train_with_pred_residential_lgb_v{training_ver}.csv'
train_with_pred = pd.read_csv(oof_csv_path)

TARGET_COL = 'money_room'
GROUP_COL = 'building_id'

cand = [c for c in train_with_pred.columns if ('oof' in c) and ('pred' in c) and ('residential' in c)]
MAIN_OOF_COL = cand[0] if len(cand) > 0 else 'residential_lgb_oof_pred'

assert TARGET_COL in train_with_pred.columns
assert GROUP_COL in train_with_pred.columns
assert MAIN_OOF_COL in train_with_pred.columns

print('MAIN_OOF_COL:', MAIN_OOF_COL)
print('train_with_pred:', train_with_pred.shape)

MAIN_OOF_COL: residential_lgb_oof_pred_log
train_with_pred: (195154, 392)


In [24]:
# test 特徴量 & main test 予測
test_df = pd.read_parquet(f'{intermediate_path}test_df_{target_model}_v{create_tbl_ver}.parquet')
pred_df_main = pd.read_parquet(f'{intermediate_path}pred_df_{target_model}_v{inference_ver}.parquet').reindex(test_df.index)

assert 'pred' in pred_df_main.columns
pred_main_test = pred_df_main['pred'].astype(float).values

print('test_df:', test_df.shape, 'pred_df_main:', pred_df_main.shape)

test_df: (58834, 374) pred_df_main: (58834, 382)


## 1. 学習データの feature セットを確定（train/test の共通部分のみ使用）

In [25]:
def detect_feature_cols(train_df, test_df, target_col, group_col, extra_drop=None):
    extra_drop = extra_drop or []
    drop_cols = set([target_col, group_col] + extra_drop)
    pred_like = [c for c in train_df.columns if ('pred' in c) or ('oof' in c) or ('ho' in c)]
    drop_cols.update(pred_like)
    id_like = [c for c in train_df.columns if c.endswith('_id') or c in ['id', 'unit_id', 'property_id']]
    drop_cols.update(id_like)
    return [c for c in train_df.columns if (c in test_df.columns) and (c not in drop_cols)]

common_cols = [c for c in fe_cols_union if (c in train_with_pred.columns) and (c in test_df.columns)]
if len(common_cols) == 0:
    common_cols = detect_feature_cols(train_with_pred, test_df, TARGET_COL, GROUP_COL, extra_drop=[MAIN_OOF_COL])

print('common_cols:', len(common_cols))
print('common_cols head:', common_cols[:30])

common_cat_cols = [c for c in cat_cols_union if c in common_cols]
if len(common_cat_cols) == 0:
    common_cat_cols = [c for c in common_cols if train_with_pred[c].dtype == 'object']

print('common_cat_cols:', len(common_cat_cols))
print('common_cat_cols head:', common_cat_cols[:30])

assert len(common_cols) > 0

X_train = train_with_pred[common_cols]
y_train = train_with_pred[TARGET_COL].astype(float)
g_train = train_with_pred[GROUP_COL]
X_test = test_df[common_cols]
y_low = (y_train < LOW_THRESHOLD).astype('int8')
print('low_rate:', float(y_low.mean()))

common_cols: 282
common_cols head: ['City/town/village_name_te', 'PTN_2020_nn', 'Prefecture_name_te', 'RTA_2025_nn', 'RTB_2025_nn', 'RTC_2025_nn', 'RTD_2025_nn', 'RTE_2025_nn', 'access_zone', 'ame_dist_log', 'amenity_count_within_1000m', 'amenity_count_within_500m', 'area_per_room', 'area_per_room_x_built_diff', 'balcony_area', 'balcony_area_log', 'basement_floor_count', 'building_area_kind_missing', 'building_category', 'building_id', 'building_room_floor_max', 'building_senyu_area_median', 'building_unit_count', 'built_diff', 'cert_score', 'cert_strong_flag', 'clinic_500m', 'convenience_distance', 'convenience_distance_log', 'count_neighbors_1000m']
common_cat_cols: 6
common_cat_cols head: ['access_zone', 'building_category', 'fireproof_x_structure', 'land_road_cond', 'structure_group', 'walk_distance_bin']
low_rate: 0.11277760127898992


## 2. 評価関数（MAPE）と p のキャリブレーション

In [None]:
def mape(y_true, y_pred, eps=1e-9):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom)

def calibrate_p_temperature(p, t=0.8):
    eps = 1e-6
    p = np.clip(p, eps, 1 - eps)
    logit = np.log(p / (1 - p))
    return 1.0 / (1.0 + np.exp(-logit / t))

def blend_pred(p_low, pred_low, pred_main, t=0.8, clip_min=0.02, clip_max=0.98):
    p = calibrate_p_temperature(p_low, t=t)
    p = np.clip(p, clip_min, clip_max)
    return p * pred_low + (1 - p) * pred_main


## 3. OOF: low classifier（確率）

In [30]:
def _force_lgbm_train_categories(X: pd.DataFrame, cat_cols_use: list[str], na_token: str = 'NA') -> pd.DataFrame:
    X = X.copy()
    for c in cat_cols_use:
        if c not in X.columns:
            continue
        s = X[c].astype('string').fillna(na_token)
        X[c] = s.astype('category')
    return X

In [None]:

clf_params = dict(
    objective='binary',
    learning_rate=0.05,
    n_estimators=3000,
    num_leaves=64,
    min_data_in_leaf=300,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    random_state=42,
    n_jobs=-1,
    forced_splits='',
    verbosity=-1,
)

gkf = GroupKFold(n_splits=N_SPLITS)
p_low_oof = np.zeros(len(X_train), dtype=float)

for fold, (tr_pos, va_pos) in enumerate(gkf.split(X_train, y_low, groups=g_train), 1):
    X_tr, y_tr = X_train.iloc[tr_pos], y_low.iloc[tr_pos]
    X_va, y_va = X_train.iloc[va_pos], y_low.iloc[va_pos]
    
    clf = lgb.LGBMClassifier(**clf_params)
    X_tr = _force_lgbm_train_categories(X_tr, common_cat_cols, na_token='NA')
    X_va = _force_lgbm_train_categories(X_va, common_cat_cols, na_token='NA')
    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='binary_logloss',
        categorical_feature=common_cat_cols if len(common_cat_cols) > 0 else 'auto',
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    p_low_oof[va_pos] = clf.predict_proba(X_va)[:, 1]
    print(f'[clf] fold={fold} best_iter={clf.best_iteration_} p_low_mean={p_low_oof[va_pos].mean():.4f}')


[clf] fold=1 best_iter=345 p_low_mean=0.1102
[clf] fold=2 best_iter=429 p_low_mean=0.1077
[clf] fold=3 best_iter=340 p_low_mean=0.1088


## 4. OOF: low regressor（低価格帯専用モデル）

In [34]:

low_params = dict(
    objective='regression_l1',
    learning_rate=0.03,
    n_estimators=6000,
    num_leaves=64,
    min_data_in_leaf=300,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=3.0,
    random_state=42,
    n_jobs=-1,
    forced_splits='',
    verbosity=-1,
)

pred_low_oof = np.zeros(len(X_train), dtype=float)

X_tr, y_tr = X_train.iloc[tr_pos], y_low.iloc[tr_pos]
X_va, y_va = X_train.iloc[va_pos], y_low.iloc[va_pos]

for fold, (tr_pos, va_pos) in enumerate(gkf.split(X_train, y_train, groups=g_train), 1):
    tr_idx = train_with_pred.index[tr_pos]
    va_idx = train_with_pred.index[va_pos]
    tr_low_idx = y_train.loc[tr_idx][y_train.loc[tr_idx] < LOW_THRESHOLD].index
    X_tr, y_tr = X_train.loc[tr_low_idx], y_train.loc[tr_low_idx]
    X_va, y_va = X_train.loc[va_idx], y_train.loc[va_idx]

    reg = lgb.LGBMRegressor(**low_params)
    X_tr = _force_lgbm_train_categories(X_tr, common_cat_cols, na_token='NA')
    X_va = _force_lgbm_train_categories(X_va, common_cat_cols, na_token='NA')
    reg.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='l1',
        categorical_feature=common_cat_cols if len(common_cat_cols) > 0 else 'auto',
        callbacks=[lgb.early_stopping(stopping_rounds=150, verbose=False)]
    )
    pred_low_oof[va_pos] = reg.predict(X_va)
    print(f'[lowreg] fold={fold} best_iter={reg.best_iteration_} n_tr_low={len(tr_low_idx)}')


[lowreg] fold=1 best_iter=5969 n_tr_low=14577
[lowreg] fold=2 best_iter=5997 n_tr_low=14751
[lowreg] fold=3 best_iter=5996 n_tr_low=14690


## 5. OOF: 既存 main OOF と blend（温度 t を探索）

In [37]:
y_train.values

array([18800000., 16900000., 16700000., ..., 18900000., 15900000.,
       33900000.])

In [56]:
pred_main_oof_log = train_with_pred[MAIN_OOF_COL].astype(float).values
mask = ~np.isnan(pred_main_oof_log)

# ★ 全部 mask に揃える
y_true = y_train.values[mask]

pred_main_oof = np.exp(pred_main_oof_log[mask])
p_low_use = p_low_oof[mask]
pred_low_use = pred_low_oof[mask]

base_mape = mape(y_true, pred_main_oof)
print('BASE(main oof) MAPE:', base_mape)

ts = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]
rows = []
for t in ts:
    pred_bl = blend_pred(p_low_use, pred_low_use, pred_main_oof, t=t)
    rows.append({'t': t, 'mape': mape(y_true, pred_bl)})

score_df = pd.DataFrame(rows).sort_values('mape')
score_df

BASE(main oof) MAPE: 0.13833738495997044


Unnamed: 0,t,mape
6,1.2,0.136144
5,1.1,0.136155
4,1.0,0.136171
3,0.9,0.136197
2,0.8,0.136233
1,0.7,0.136275
0,0.6,0.136328


In [59]:
best_t = float(score_df.iloc[0]['t'])

In [61]:
HO_COL = 'residential_lgb_ho_pred'  # 実際の列名に合わせる
pred_main_ho = train_with_pred[HO_COL].astype(float).values
mask_ho = ~np.isnan(pred_main_ho)

# HO でも同じtを使う（OOFで選んだ best_t を固定）
pred_bl_ho = blend_pred(p_low_oof, pred_low_oof, pred_main_ho, t=best_t)

print('BASE(main ho) MAPE:', mape(y_train.values[mask_ho], pred_main_ho[mask_ho]))
print('BLEND(ho) MAPE:', mape(y_train.values[mask_ho], pred_bl_ho[mask_ho]))

BASE(main ho) MAPE: 0.1300691303572458
BLEND(ho) MAPE: 0.13611610657757903


## 6. 最終学習（full train）→ test 推論 → residential を blend

In [None]:
# full train で classifier 学習
clf_final = lgb.LGBMClassifier(**clf_params)
clf_final.fit(
    X_train, y_low,
    categorical_feature=common_cat_cols if len(common_cat_cols) > 0 else 'auto',
)

# full train (low subset) で low regressor 学習
low_idx_full = train_with_pred.index[y_train < LOW_THRESHOLD]
X_train_low_full = X_train.loc[low_idx_full]
y_train_low_full = y_train.loc[low_idx_full]

reg_low_final = lgb.LGBMRegressor(**low_params)
reg_low_final.fit(
    X_train_low_full, y_train_low_full,
    categorical_feature=common_cat_cols if len(common_cat_cols) > 0 else 'auto',
)

# save artifacts
os.makedirs(model_path, exist_ok=True)
clf_path = f'{model_path}/lowclf_{target_model}_{alg}_thr{int(LOW_THRESHOLD)}_v{training_ver}_lbv{low_blend_ver}.pkl'
reg_path = f'{model_path}/lowreg_{target_model}_{alg}_thr{int(LOW_THRESHOLD)}_v{training_ver}_lbv{low_blend_ver}.pkl'
meta_path = f'{model_path}/lowblend_meta_{target_model}_{alg}_thr{int(LOW_THRESHOLD)}_v{training_ver}_lbv{low_blend_ver}.pkl'

with open(clf_path, 'wb') as f:
    pickle.dump(clf_final, f)
with open(reg_path, 'wb') as f:
    pickle.dump(reg_low_final, f)
with open(meta_path, 'wb') as f:
    pickle.dump({
        'LOW_THRESHOLD': LOW_THRESHOLD,
        'best_t': best_t,
        'common_cols': common_cols,
        'common_cat_cols': common_cat_cols,
        'training_ver': training_ver,
        'create_tbl_ver': create_tbl_ver,
        'inference_ver': inference_ver,
    }, f)

print('saved:', clf_path)
print('saved:', reg_path)
print('saved:', meta_path)

# test 推論
p_low_test = clf_final.predict_proba(X_test)[:, 1]
pred_low_test = reg_low_final.predict(X_test)

pred_blend_test = blend_pred(
    p_low=p_low_test,
    pred_low=pred_low_test,
    pred_main=pred_main_test,
    t=best_t
)

print(pd.Series(pred_main_test).describe())
print(pd.Series(pred_blend_test).describe())


In [None]:
# pred_df_main を residential blend で置き換えて保存
pred_df_blend = pred_df_main.copy()
pred_df_blend['pred_main'] = pred_df_main['pred'].astype(float)
pred_df_blend['p_low'] = p_low_test
pred_df_blend['pred_low'] = pred_low_test
pred_df_blend['pred'] = pred_blend_test

out_pred_path = f'{intermediate_path}pred_df_{target_model}_v{inference_ver}_lowblend_v{low_blend_ver}.parquet'
pred_df_blend.to_parquet(out_pred_path)
print('saved:', out_pred_path)


## 7. 提出ファイル作成（04_02_post_revision_v1 を流用）

In [None]:
# 他ターゲットの pred_df は既存推論のまま読み込む
test_df_house = pd.read_parquet(f'{intermediate_path}pred_df_house_v{inference_ver}.parquet')
test_df_other = pd.read_parquet(f'{intermediate_path}pred_df_other_v{inference_ver}.parquet')

# residential は lowblend を使う
test_df_residential = pred_df_blend

submit_df = pd.read_csv(submit_file_path, index_col=0)
submit_df['pred'] = np.nan

for df_part in [test_df_residential, test_df_house, test_df_other]:
    idx = submit_df.index.intersection(df_part.index)
    submit_df.loc[idx, 'pred'] = df_part.loc[idx, 'pred'].astype(float)

assert submit_df['pred'].isna().sum() == 0, 'submit_df に欠損が残っています'
print(submit_df['pred'].describe())


In [None]:
# 出力
os.makedirs(pred_path, exist_ok=True)

out_submit_path = f'{pred_path}submit_v{submit_ver}_res_lowblend_thr{int(LOW_THRESHOLD)}_t{best_t}_lbv{low_blend_ver}_{today}.csv'
submit_df.to_csv(out_submit_path)
print('saved:', out_submit_path)

out_submit_path
