# 学習データ・予測データの作成

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import japanize_matplotlib

from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory where the input files are stored (adjust to your own layout)
ROOT_DIR = '../input/'
data_definition_path = ROOT_DIR + 'data_definition.xlsx'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
oof_path = '../output/oof/'
fi_path = '../output/fi/'

# Script versions:
#   fe_ver         -> selects which feature-engineered parquet files are read
#   create_tbl_ver -> embedded in the names of the parquet files written below
fe_ver = 3
create_tbl_ver = 1

## File Import

In [3]:
# Load the feature-engineered train/test tables for version `fe_ver`
train_df = pd.read_parquet(f'{intermediate_path}train_df_fe_v{fe_ver}.parquet')
test_df = pd.read_parquet(f'{intermediate_path}test_df_fe_v{fe_ver}.parquet')

In [4]:
# Column names: the date column and the prediction target
date_col = 'target_ym'
target_col = 'money_room'

## テーブルの分割

In [5]:
# building_category values routed to the 'residential' group
RESIDENTIAL_SET = {
    'mansion',
    'apartment',
    'townhouse',
    'terrace_house',
    'dormitory',
}

# building_category values routed to the 'house' group
HOUSE_SET = {'house'}

# building_category values that are expected to end up in the 'other' group
OTHER_SET = {
    'land',
    'parking',
    'office_building',
    'shop',
    'warehouse',
    'factory',
    'hotel',
    'ryokan',
    'other',
    'unknown',
    'other_unknown',
}


def assign_property_group(df: pd.DataFrame, col: str = 'building_category') -> pd.Series:
    """Map each row's category to 'residential', 'house' or 'other'.

    Missing categories are treated as 'other_unknown'. Every value that is
    not in RESIDENTIAL_SET or HOUSE_SET ends up as 'other', including values
    that appear in none of the three sets.

    Args:
        df: Table holding the category column.
        col: Name of the category column.

    Returns:
        A string-dtype Series aligned to ``df.index``.
    """
    category = df[col].astype('string').fillna('other_unknown')

    is_residential = category.isin(RESIDENTIAL_SET)
    is_house = category.isin(HOUSE_SET)
    known_values = RESIDENTIAL_SET | HOUSE_SET | OTHER_SET
    is_unexpected = ~category.isin(known_values)

    result = pd.Series('other', index=df.index, dtype='string')
    result[is_residential] = 'residential'
    result[is_house] = 'house'

    # Detecting unexpected categories matters operationally: they are folded
    # into 'other', and this branch is the place to hook in logging.
    if is_unexpected.any():
        result[is_unexpected] = 'other'

    return result


In [6]:
# Attach the model group ('residential' / 'house' / 'other') to both tables
train_df['property_group'] = assign_property_group(train_df)
test_df['property_group'] = assign_property_group(test_df)

In [7]:
def make_effective_area_log(df):
    """Build one log-area value per row, chosen by ``property_group``.

    * 'residential': ``senyu_area_log``
    * 'house': ``0.7 * nobeyuka_area_log + 0.3 * tochi_area_log``
    * 'other': ``nobeyuka_area_log``, replaced by ``tochi_area_log`` where
      the former is NaN

    Rows belonging to none of these groups stay NaN.

    Returns:
        A float numpy array with ``len(df)`` entries.
    """
    group = df['property_group']
    result = np.full(len(df), np.nan)

    # residential: exclusive (unit) area
    in_residential = group == 'residential'
    result[in_residential] = df.loc[in_residential, 'senyu_area_log']

    # house: weighted blend of total floor area and land area
    in_house = group == 'house'
    floor_part = 0.7 * df.loc[in_house, 'nobeyuka_area_log']
    land_part = 0.3 * df.loc[in_house, 'tochi_area_log']
    result[in_house] = floor_part + land_part

    # other: prefer total floor area, fall back to land area when missing
    in_other = group == 'other'
    result[in_other] = df.loc[in_other, 'nobeyuka_area_log']
    floor_missing = np.isnan(result[in_other])
    result[in_other] = np.where(
        floor_missing,
        df.loc[in_other, 'tochi_area_log'],
        result[in_other],
    )

    return result

In [8]:
# Add the group-dependent effective log-area to both tables
train_df['effective_area_log'] = make_effective_area_log(train_df)
test_df['effective_area_log']  = make_effective_area_log(test_df)

In [9]:
# Row masks for each property group in the training table
house_idx = train_df['property_group'] == 'house'
residential_idx = train_df['property_group'] == 'residential'
other_idx = train_df['property_group'] == 'other'

# Take explicit copies: later cells assign new columns to these per-group
# tables, and assigning to a slice of train_df triggers pandas'
# chained-assignment warnings (which are globally silenced in this notebook).
train_df_house = train_df[house_idx].copy()
train_df_residential = train_df[residential_idx].copy()
train_df_other = train_df[other_idx].copy()

In [10]:
# Row masks for each property group in the test table
house_idx = test_df['property_group'] == 'house'
residential_idx = test_df['property_group'] == 'residential'
other_idx = test_df['property_group'] == 'other'

# Take explicit copies: later cells assign new columns to these per-group
# tables, and assigning to a slice of test_df triggers pandas'
# chained-assignment warnings (which are globally silenced in this notebook).
test_df_house = test_df[house_idx].copy()
test_df_residential = test_df[residential_idx].copy()
test_df_other = test_df[other_idx].copy()

In [11]:
# Row counts per property group (NaN included) for train and test
print(train_df['property_group'].value_counts(dropna=False))
print(test_df['property_group'].value_counts(dropna=False))

property_group
residential    195154
house          153456
other           15314
Name: count, dtype: Int64
property_group
residential    58834
house          48594
other           5009
Name: count, dtype: Int64


## 個別特徴量の作成

#### residential用

In [None]:
def _col_or_false(df: pd.DataFrame, col: str) -> pd.Series:
    """Return ``df[col]`` when the column exists, else an all-False Series."""
    if col in df.columns:
        return df[col]
    return pd.Series(False, index=df.index)


def _to_int8_bool(values) -> pd.Series:
    """Convert flag-like values to an int8 Series of 0/1.

    Values are coerced to numbers; anything missing, non-numeric or equal to
    zero becomes 0 and every other value becomes 1.
    """
    series = values if isinstance(values, pd.Series) else pd.Series(values)
    numeric = pd.to_numeric(series, errors='coerce').astype('float64')
    return numeric.fillna(0.0).ne(0.0).astype('int8')


def add_residential_flags_and_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with residential-specific derived features.

    Columns added (each only when its inputs are present, unless noted):

    * ``is_top_floor``, ``is_south_facing``, ``is_north_facing``: always
      added; int8 0/1 flags.
    * ``area_efficiency_ratio``: exp(senyu_area_log) / exp(effective_area_log).
    * ``has_storage``: always added; 1 when any storage flag column is set.
    * ``kyoueki_per_m2_log`` / ``shuuzen_per_m2_log``: log1p of the value
      clipped at 0.
    * ``management_age_adjusted``: management_total_score / (1 + age / 30).

    Args:
        df: Input table; it is not modified.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    df = df.copy()
    all_false = pd.Series(False, index=df.index)

    # -----------------------
    # is_top_floor
    # relative_floor == 1, or the top-floor flag column is set
    # -----------------------
    if 'relative_floor' in df.columns:
        top_by_rel = pd.to_numeric(df['relative_floor'], errors='coerce').eq(1)
    else:
        top_by_rel = all_false
    top_by_flag = _to_int8_bool(_col_or_false(df, '建物構造・性能_最上階'))
    df['is_top_floor'] = _to_int8_bool(top_by_rel | (top_by_flag == 1))

    # -----------------------
    # is_south_facing / is_north_facing
    # snapshot_window_angle code (5 is treated as south, 1 as north) combined
    # with the orientation flag columns when they exist
    # -----------------------
    if 'snapshot_window_angle' in df.columns:
        angle = pd.to_numeric(df['snapshot_window_angle'], errors='coerce')
        south_by_angle = angle == 5
        north_by_angle = angle == 1
    else:
        south_by_angle = all_false
        north_by_angle = all_false

    south_by_flag = _to_int8_bool(_col_or_false(df, '建物構造・性能_南向き'))
    df['is_south_facing'] = _to_int8_bool(south_by_angle | (south_by_flag == 1))

    # The north-facing flag column is used only if it exists
    north_by_flag = _to_int8_bool(_col_or_false(df, '建物構造・性能_北向き'))
    df['is_north_facing'] = _to_int8_bool(north_by_angle | (north_by_flag == 1))

    # -----------------------
    # area_efficiency_ratio
    # Dividing log areas is meaningless, so convert back with exp first
    # (senyu / effective_area)
    # NOTE(review): where effective_area_log was filled from senyu_area_log
    # (as make_effective_area_log does for 'residential' rows) this ratio is
    # 1 whenever both values are present - confirm the feature is intended.
    # -----------------------
    if ('senyu_area_log' in df.columns) and ('effective_area_log' in df.columns):
        senyu = np.exp(pd.to_numeric(df['senyu_area_log'], errors='coerce'))
        eff = np.exp(pd.to_numeric(df['effective_area_log'], errors='coerce'))
        df['area_efficiency_ratio'] = senyu / np.where((eff <= 0) | np.isnan(eff), np.nan, eff)

    # -----------------------
    # has_storage (OR over several flag columns)
    # Built so that missing columns do not raise
    # -----------------------
    storage_cols = [
        '専有部分設備_収納_ウォークインクローゼット',
        '専有部分設備_収納_クローゼット',
        '専有部分設備_収納_シューズインクローゼット',
        '専有部分設備_収納_シューズクローク',
        '専有部分設備_収納_シューズボックス',
        '専有部分設備_収納_トランクルーム',
        '専有部分設備_収納_パントリー',
        '専有部分設備_収納_全居室収納',
        '専有部分設備_収納_床下収納',
    ]
    storage_exist = [c for c in storage_cols if c in df.columns]
    if storage_exist:
        storage_flags = pd.concat([_to_int8_bool(df[c]) for c in storage_exist], axis=1)
        df['has_storage'] = _to_int8_bool(storage_flags.max(axis=1) == 1)
    else:
        df['has_storage'] = pd.Series(0, index=df.index, dtype='int8')

    # -----------------------
    # kyoueki_per_m2_log / shuuzen_per_m2_log
    # Robust to negative and missing values (negatives are clipped to 0)
    # -----------------------
    for source_col in ('kyoueki_per_m2', 'shuuzen_per_m2'):
        if source_col in df.columns:
            clipped = pd.to_numeric(df[source_col], errors='coerce').clip(lower=0)
            df[f'{source_col}_log'] = np.log1p(clipped)

    # -----------------------
    # management_age_adjusted
    # Management score discounted by age (negative ages are clipped to 0)
    # -----------------------
    if ('management_total_score' in df.columns) and ('effective_age' in df.columns):
        score = pd.to_numeric(df['management_total_score'], errors='coerce')
        age = pd.to_numeric(df['effective_age'], errors='coerce').clip(lower=0)
        df['management_age_adjusted'] = score / (1.0 + age / 30.0)

    return df


# Apply to both train and test
train_df_residential = add_residential_flags_and_scores(train_df_residential)
test_df_residential = add_residential_flags_and_scores(test_df_residential)


#### house用

In [13]:
# =========================================================
# expected_building_value(age, area) を train から作る（リーク回避）
#  - age と area をビン分割して、building_value_proxy の代表値を学習
#  - test へは mapping を適用
# =========================================================

def _build_expected_building_value_map(train_df: pd.DataFrame) -> tuple[pd.IntervalIndex, pd.IntervalIndex, dict]:
    # ビンは分位点ベースにするとデータ密度が安定しやすい
    # ただし effective_age の取り得る範囲が狭い場合は重複cutになるのでdrop_duplicates
    age_q = np.unique(np.nanquantile(train_df['effective_age'].to_numpy(), [0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]))
    area_q = np.unique(np.nanquantile(train_df['effective_area_log'].to_numpy(), [0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]))

    # ビン境界が少なすぎる場合の保険
    if len(age_q) < 4:
        age_q = np.unique(np.nanquantile(train_df['effective_age'].to_numpy(), [0.0, 0.33, 0.66, 1.0]))
    if len(area_q) < 4:
        area_q = np.unique(np.nanquantile(train_df['effective_area_log'].to_numpy(), [0.0, 0.33, 0.66, 1.0]))

    age_bins = pd.IntervalIndex.from_breaks(age_q, closed='right')
    area_bins = pd.IntervalIndex.from_breaks(area_q, closed='right')

    tr = train_df.copy()
    tr['age_bin'] = pd.cut(tr['effective_age'], bins=age_bins)
    tr['area_bin'] = pd.cut(tr['effective_area_log'], bins=area_bins)

    # 各セルの代表値（中央値推奨：外れに強い）
    cell_med = (
        tr.groupby(['age_bin', 'area_bin'])['building_value_proxy']
          .median()
    )

    # (age_bin, area_bin) -> expected_value の辞書
    exp_map = cell_med.to_dict()

    # フォールバック（セル欠損用）
    global_med = np.nanmedian(tr['building_value_proxy'].to_numpy())
    exp_map[('__GLOBAL__', '__GLOBAL__')] = global_med

    return age_bins, area_bins, exp_map


def _apply_expected_building_value(df: pd.DataFrame, age_bins, area_bins, exp_map: dict) -> pd.Series:
    age_bin = pd.cut(df['effective_age'], bins=age_bins)
    area_bin = pd.cut(df['effective_area_log'], bins=area_bins)

    # まずはセル参照
    key = list(zip(age_bin.astype('object'), area_bin.astype('object')))
    out = np.empty(len(df), dtype='float64')

    global_med = exp_map.get(('__GLOBAL__', '__GLOBAL__'), 0.0)
    for i, k in enumerate(key):
        out[i] = exp_map.get(k, global_med)

    return pd.Series(out, index=df.index, name='expected_building_value')


# =========================================================
# The expected-value map has to be built after building_value_proxy exists
# =========================================================

# Create building_value_proxy on the train side first, assuming the required columns are present
# (the feature loop further down creates it too; it is needed here before the map is built)
train_df_house['building_value_proxy'] = (
    train_df_house['effective_area_log'] - train_df_house['log_land_price']
)

age_bins, area_bins, exp_map = _build_expected_building_value_map(train_df_house)

In [14]:
# Small constant added to denominators to avoid division by zero
eps = 1e-6

# Add the house-specific features in place to both the train and test tables
for df in [train_df_house, test_df_house]:
    # 1 when density is 'high': zone_residential_rank is 3, 4, 0 or missing
    df['is_high_density'] = (
        df['zone_residential_rank'].isin([3, 4, 0]) |
        df['zone_residential_rank'].isna()
    ).astype('int8')

    # Area features (zeroed outside high-density rows)
    df['effective_area_log_x_high_density'] = (
        df['effective_area_log'] * df['is_high_density']
    )
    df['tochi_area_log_x_high_density'] = (
        df['tochi_area_log'] * df['is_high_density']
    )
    df['nobeyuka_area_log_x_high_density'] = (
        df['nobeyuka_area_log'] * df['is_high_density']
    )

    # Road access
    df['is_no_road_x_high_density'] = (
        df['is_no_road'] * df['is_high_density']
    )
    df['has_shidou_x_high_density'] = (
        df['has_shidou'] * df['is_high_density']
    )

    # Building value made relative: log effective area minus log land price
    df['building_value_proxy'] = (
        df['effective_area_log'] - df['log_land_price']
    )

    # =====================================================
    # Added: liquidity adjustment
    # =====================================================
    df['listing_months_log_x_effective_land_price'] = (
        df['listing_months_log'] * df['effective_land_price']
    )
    df['listing_months_log_x_iqr_price_1000m'] = (
        df['listing_months_log'] * df['iqr_price_1000m']
    )

    # =====================================================
    # Added: skew of the surrounding price distribution
    #  - assumes 'median_price_1000m' exists
    # =====================================================
    if 'median_price_1000m' in df.columns:
        denom = df['median_price_1000m'].astype('float64')
        df['effective_land_price_div_median_price_1000m'] = (
            df['effective_land_price'] / (denom.abs() + eps)
        )
    else:
        # When absent, create the column as all-missing (decide downstream, e.g. drop it at training time)
        df['effective_land_price_div_median_price_1000m'] = np.nan

    # =====================================================
    # Added: building value ratio
    # =====================================================
    df['building_value_proxy_div_effective_land_price'] = (
        df['building_value_proxy'] / (df['effective_land_price'].abs() + eps)
    )

    # Apply expected_building_value(age, area) learned from train and build the difference feature
    df['expected_building_value'] = _apply_expected_building_value(df, age_bins, area_bins, exp_map)
    df['building_value_proxy_minus_expected'] = (
        df['building_value_proxy'] - df['expected_building_value']
    )

    # =====================================================
    # Added: non-linearity dedicated to recently built properties
    # =====================================================
    df['is_new_building'] = (df['effective_age'] <= 3).astype('int8')
    df['effective_age_sq'] = (df['effective_age'].astype('float64') ** 2)

    # =====================================================
    # Added: emphasise the effect of renovation
    # =====================================================
    df['renovation_recency_x_effective_area_log'] = (
        df['renovation_recency'] * df['effective_area_log']
    )

    # =====================================================
    # Added: dispersion within the area
    # =====================================================
    df['iqr_price_500m_div_log_land_price'] = (
        df['iqr_price_500m'] / (df['log_land_price'].abs() + eps)
    )
    df['count_neighbors_1000m_house_x_iqr_price_1000m'] = (
        df['count_neighbors_1000m_house'] * df['iqr_price_1000m']
    )


In [15]:
def add_regulation_used_and_gaps_house(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with coverage / floor-area-ratio usage features.

    The site area is taken row by row from the first available value among
    ``shikichi_area_log``, ``tochi_area_log`` and ``kukaku_area_log`` (each
    converted back from log scale). A higher-priority column that exists but
    is NaN for a row falls through to the next column for that row.

    Columns added when their inputs exist:

    * ``kenpei_used`` / ``youseki_used``: 100 * area / site (percent),
      clipped to [0, 300] and [0, 2000].
    * ``kenpei_gap`` / ``youseki_gap``: limit minus the unclipped usage,
      clipped to [-100, 100] and [-500, 500].
    * ``kenpei_utilization`` / ``youseki_utilization``: unclipped usage
      divided by the limit (NaN when the limit is <= 0 or missing), clipped
      to [0, 5] and [0, 10].

    Args:
        df: Input table; it is not modified.

    Returns:
        A new DataFrame. When none of the site-area columns exists it is an
        unchanged copy of ``df``.
    """
    df = df.copy()

    def _num(col):
        """Column coerced to float64, or None when the column is absent."""
        if col not in df.columns:
            return None
        return pd.to_numeric(df[col], errors='coerce').astype('float64')

    def _exp_logcol(col):
        """exp() of a log-scale column, or None when the column is absent."""
        values = _num(col)
        return None if values is None else np.exp(values)

    def _safe_ratio(numer, denom):
        """numer / denom with non-positive or missing denominators as NaN."""
        return numer / np.where((denom <= 0) | np.isnan(denom), np.nan, denom)

    # ---- Site area (priority: shikichi -> tochi -> kukaku), per row ----
    site = None
    for col in ('shikichi_area_log', 'tochi_area_log', 'kukaku_area_log'):
        candidate = _exp_logcol(col)
        if candidate is None:
            continue
        if site is None:
            site = candidate
        else:
            # Positional fill keeps this safe even with a non-unique index
            site = pd.Series(
                np.where(site.isna(), candidate.to_numpy(), site.to_numpy()),
                index=df.index,
            )

    if site is None:
        # Without any site area nothing can be derived
        return df

    # ---- Areas (provisional assignment based on the validation so far) ----
    # footprint (building area) ~= exp(nobeyuka_area_log)
    footprint = _exp_logcol('nobeyuka_area_log')

    # total_floor (total floor area) ~= exp(max_floor_area_log)
    total_floor = _exp_logcol('max_floor_area_log')

    # ---- Actual usage (used, %) ----
    if footprint is not None:
        df['kenpei_used'] = 100.0 * _safe_ratio(footprint, site)

    if total_floor is not None:
        df['youseki_used'] = 100.0 * _safe_ratio(total_floor, site)

    # ---- Regulation limits ----
    kenpei_limit = _num('kenpei')
    youseki_limit = _num('youseki')

    # ---- Gap and utilization (computed from the unclipped usage) ----
    if ('kenpei_used' in df.columns) and (kenpei_limit is not None):
        df['kenpei_gap'] = (kenpei_limit - df['kenpei_used']).clip(lower=-100, upper=100)
        df['kenpei_utilization'] = _safe_ratio(df['kenpei_used'], kenpei_limit)

    if ('youseki_used' in df.columns) and (youseki_limit is not None):
        df['youseki_gap'] = (youseki_limit - df['youseki_used']).clip(lower=-500, upper=500)
        df['youseki_utilization'] = _safe_ratio(df['youseki_used'], youseki_limit)

    # ---- Clipping (keep outliers from wrecking training) ----
    # These bounds are provisional; tune them after inspecting the distributions.
    clip_bounds = {
        'kenpei_used': (0, 300),
        'youseki_used': (0, 2000),
        'kenpei_utilization': (0, 5),
        'youseki_utilization': (0, 10),
    }
    for col, (lower, upper) in clip_bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower=lower, upper=upper)

    return df

In [16]:
# Add the regulation usage / gap features to the house train and test tables
train_df_house = add_regulation_used_and_gaps_house(train_df_house)
test_df_house  = add_regulation_used_and_gaps_house(test_df_house)

## 使用特徴量の選択

In [17]:
# All column names of the full training table (the target is left in)
fe_cols = train_df.columns.to_list()
# fe_cols.remove(target_col)

#### 共通特徴量

In [18]:
# Core group: key columns (prefecture, municipality, residential zoning rank)
idx_key_cols = [
    'Prefecture name',
    'City/town/village name',
    'zone_residential_rank'
]

In [19]:
# Core group: time-related columns (building_id is kept in this group too)
core_time_cols = [
    'target_ym',
    'target_year',
    'listing_months_log',
    'building_id'
]

In [20]:
# Core group: location / land-price columns
core_location_cols = [
    'log_land_price',
    'log_weighted_land_price_3',
    'effective_land_price',
    'distance_to_landpoint_m',
    'land_price_yoy_w3',
]

In [21]:
# Core group: administrative-area columns plus station and area scores
core_admin_cols = [
    'Prefecture name_te',
    'City/town/village name_te',
    'eki_name1',
    'urban_score',
    'livability_score',
]

In [22]:
# Core group: age / renovation columns
core_age_cols = [
    'effective_age',
    'renovation_recency',
]

In [23]:
# Core group: building / floor columns
core_building_cols = [
    'floor_count',
    'basement_floor_count',
    'room_floor',
    'relative_floor',
    'is_basement',
]

In [24]:
# Core group: raw log-area columns (removed again when core_cols_final is built)
core_area_raw_cols = [
    'senyu_area_log',
    'nobeyuka_area_log',
    'tochi_area_log',
    'shikichi_area_log',
    'kukaku_area_log',
]

In [25]:
# Core group: missing-value indicator columns
core_missing_cols = [
    'floor_count_missing',
    'max_floor_area_missing',
]

In [26]:
# Concatenate every core group into one list (group order preserved)
core_cols = (
    idx_key_cols
  + core_time_cols
  + core_location_cols
  + core_admin_cols
  + core_age_cols
  + core_building_cols
  + core_area_raw_cols
  + core_missing_cols
)

In [27]:
# Final core list: drop the raw area columns and append effective_area_log instead
core_cols_final = [
    c for c in core_cols
    if c not in core_area_raw_cols
] + ['effective_area_log']

#### 個別特徴量

In [28]:
from __future__ import annotations
from typing import Iterable

def keep_existing_cols(df, cols: Iterable[str]) -> list[str]:
    """Return the entries of ``cols`` that are columns of ``df``.

    The input order is preserved and repeated names are kept as given; names
    that are not columns of ``df`` are dropped without any notice.
    """
    available = df.columns
    kept = []
    for name in cols:
        if name in available:
            kept.append(name)
    return kept

In [29]:
# Candidate residential features, v1
add_cols_residential_v1_base = [
    # Floor plan / unit composition
    'madori_kind_all',
    'madori_number_all',
    'is_one_room',
    'area_per_room',
    'area_ratio',
    # Unit position / orientation / balcony (useful when available)
    'balcony_area',
    'dwelling_unit_window_angle',
    'snapshot_window_angle',
    # Management / common-service fee / repair reserve (scores and per-* values)
    'management_total_score',
    'has_management_association',
    'has_kyoueki',
    'has_shuuzen',
    'kyoueki_per_m2',
    'shuuzen_per_m2',
    'kyoueki_per_unit',
    'shuuzen_per_unit',
    # Vacancy / listing status (if the data can be trusted)
    'empty_ratio',
]

# Candidate residential features, v2
add_cols_residential_v2_base = [
    # ====================
    # A. Floor x unit characteristics (deeper)
    # ====================
    'relative_floor',
    'is_top_floor',
    '建物構造・性能_角部屋',
    'is_south_facing',
    'is_north_facing',

    # ====================
    # B. How the area is used
    # ====================
    'area_per_room',                # carried over from v1
    'area_efficiency_ratio',        # exclusive area / total floor area or similar
    'has_storage',                  # storage room / trunk room

    # ====================
    # C. Quality of management and repairs (deeper)
    # ====================
    'kyoueki_per_m2_log',
    'shuuzen_per_m2_log',
    'management_total_score',     # higher-level concept than v1
    'management_age_adjusted',      # management x building age

    # ====================
    # D. Hand-picked equipment (kept to a small number)
    # ====================
    '建物構造・性能_オートロック',
    '建物構造・性能_エレベーター',
    '建物構造・性能_宅配ボックス',
    '専有部分設備_空調・暖房_エアコン',
]

# Candidate residential features, v3
add_cols_residential_v3_base = [
    # Land price x deterioration
    'log_land_price_x_age_decay_30',
    'log_weighted_land_price_3_x_age_decay_30',

    # Land price x livability
    'log_land_price_x_livability_score',
    'log_weighted_land_price_3_x_livability_score',

    # Station / floor plan
    'eki_name1_te',
    'madori_kind_all',
    'madori_number_all',
]

# Keep only the candidates that exist in the residential training table.
# NOTE(review): candidates that are missing are dropped silently, so a feature
# cell that was not run shows up only as a shorter list here.
add_cols_residential_v1 = keep_existing_cols(train_df_residential, add_cols_residential_v1_base)
add_cols_residential_v2 = keep_existing_cols(train_df_residential, add_cols_residential_v2_base)
add_cols_residential_v3 = keep_existing_cols(train_df_residential, add_cols_residential_v3_base)

In [30]:
# =========================
# 3) house add-on v1
# Purpose: land, regulation and road access (the core of detached-house price differences)
# =========================
add_cols_house_v1_base = [
    # Regulation / zoning
    'kenpei',
    'youseki',
    'zone_residential_rank',
    'is_urbanized_area',
    'is_urban_control_area',
    'is_quasi_fireproof_area',
    'is_land_readjustment_area',
    'is_urban_renaissance_area',
    'is_special_far_area',
    'is_highrise_residential_area',
    'is_disaster_prevention_block',
    'is_redevelopment_core_area',

    # Road access / private road / setback
    'has_shidou',
    'land_shidou_ratio',
    'shidou_area_ratio',
    'land_setback_flg',
    'land_setback',
    'land_road_cond',
    'is_no_road',
    'dist_to_road_any_m',
    'dist_to_road_major_m',
    'dist_to_road_highway_m',
    'road_len_density',
    'road_len_density_gap',
    'road_cnt_any_in_100m',
    'road_cnt_major_in_100m',
    'road_cnt_any_in_300m',
    'road_cnt_major_in_300m',
    'road_cnt_any_in_500m',
    'road_cnt_major_in_500m',

    # Land category / terrain (only categories unlikely to overfit)
    'building_land_chimoku',
    'land_chisei',
    'land_area_kind',

    # Additional land-constraint score (strong if it has already been built)
    'land_constraint_score',
]

# Candidate house features for high-density areas
add_cols_house_high_density_base = [
    # ====================
    # A. Expressing small lots / constraints
    # ====================
    'is_high_density',
    'effective_area_log_x_high_density',   # interaction (explicit)
    'tochi_area_log_x_high_density',
    'nobeyuka_area_log_x_high_density',

    # ====================
    # B. Imbalance between building and land
    # ====================
    'area_building_land_ratio',            # total floor area / land
    'area_gap_land_building',              # land - total floor area

    # ====================
    # C. Emphasising road-access constraints
    # ====================
    'is_no_road_x_high_density',
    # NOTE(review): not created in this notebook's feature loop; it is dropped
    # by keep_existing_cols unless it already exists upstream.
    'land_road_cond_x_high_density',
    'has_shidou_x_high_density',

    # ====================
    # D. How the regulation binds
    # ====================
    'youseki_used',
    'youseki_utilization',
    'kenpei_used',
    'kenpei_utilization',

    # Building value made relative
    'building_value_proxy'
]

# Candidate house features, v2
add_cols_house_v2_base = [
    # Topography
    'elev_mean',
    'elev_range',
    'slope_mean',
    'slope_range',

    # Dispersion of nearby prices
    'iqr_price_500m',
    'iqr_price_1000m',

    # Character of the location
    'count_neighbors_1000m_house',
    'count_neighbors_1000m_mansion',
]

# Candidate house features, v3
add_cols_house_v3_base = [
    # Land price x deterioration
    'log_land_price_x_age_decay_30',
    'log_weighted_land_price_3_x_age_decay_30',

    # Land price x livability
    'log_land_price_x_livability_score',
    'log_weighted_land_price_3_x_livability_score',

    # Station strength
    'eki_name1_te',

    # Theoretical land price
    'land_theoretical_price',
    'land_theoretical_price_weighted',
]

# Candidate house features, v4
add_cols_house_v4_base = [
    # Liquidity adjustment
    'listing_months_log_x_effective_land_price',
    'listing_months_log_x_iqr_price_1000m',

    # Skew of the surrounding distribution
    'effective_land_price_div_median_price_1000m',

    # Building value ratio
    'building_value_proxy_div_effective_land_price',
    'building_value_proxy_minus_expected',

    # Non-linearity dedicated to recently built properties
    'is_new_building',
    'effective_age_sq',

    # Emphasising the effect of renovation
    'renovation_recency_x_effective_area_log',

    # Dispersion within the area
    'iqr_price_500m_div_log_land_price',
    'count_neighbors_1000m_house_x_iqr_price_1000m'
]


# Keep only the candidates that exist in the house training table
add_cols_house_v1 = keep_existing_cols(train_df_house, add_cols_house_v1_base)
add_cols_house_high_density = keep_existing_cols(train_df_house, add_cols_house_high_density_base)
add_cols_house_v2 = keep_existing_cols(train_df_house, add_cols_house_v2_base)
add_cols_house_v3 = keep_existing_cols(train_df_house, add_cols_house_v3_base)
add_cols_house_v4 = keep_existing_cols(train_df_house, add_cols_house_v4_base)

In [31]:
# =========================
# 4) other add-on v1
# Purpose: absorb usage differences with features instead of splitting the
# model further (category + minimal supporting features)
# =========================
add_cols_other_v1_base = [
    # Usage category (absorbed here because the group is not split)
    'building_category',
    'model_category',

    # Support for usage differences (urbanity / access)
    'access_zone',
    'walk_distance_bin',
    'urban_score',          # duplicates core when present there; kept here anyway
    'livability_score',     # same as above

    # Parking, investment flag, etc. (sometimes effective for 'other')
    'parking_kubun',
    'parking_number',
    'parking_keiyaku',
    'flg_investment',
]

# Keep only the candidates that exist in the 'other' training table
add_cols_other_v1 = keep_existing_cols(train_df_other, add_cols_other_v1_base)

#### 結合

In [32]:
def dedupe_keep_order(cols: list[str]) -> list[str]:
    """Return ``cols`` without repeats, keeping each name's first position."""
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(cols))

In [33]:
# Final feature list per group: core columns followed by the group's add-on
# lists, with repeated names removed (first occurrence wins)
house_use_cols = dedupe_keep_order(
    core_cols_final +
    add_cols_house_v1 +
    add_cols_house_v2 +
    add_cols_house_v3 +
    add_cols_house_v4 +
    add_cols_house_high_density
)
residential_use_cols = dedupe_keep_order(
    core_cols_final +
    add_cols_residential_v1 +
    add_cols_residential_v2 +
    add_cols_residential_v3
)
other_use_cols = dedupe_keep_order(
    core_cols_final +
    add_cols_other_v1
)

## 出力

In [34]:
# Write the per-group training tables (selected features + target)
train_df_house[house_use_cols + [target_col]].to_parquet(f'{intermediate_path}train_df_house_v{create_tbl_ver}.parquet')
train_df_residential[residential_use_cols +  [target_col]].to_parquet(f'{intermediate_path}train_df_residential_v{create_tbl_ver}.parquet')
train_df_other[other_use_cols +  [target_col]].to_parquet(f'{intermediate_path}train_df_other_v{create_tbl_ver}.parquet')

# Write the per-group test tables (selected features only; the column lists
# were derived from the training tables, so every name must exist in test too)
test_df_house[house_use_cols].to_parquet(f'{intermediate_path}test_df_house_v{create_tbl_ver}.parquet')
test_df_residential[residential_use_cols].to_parquet(f'{intermediate_path}test_df_residential_v{create_tbl_ver}.parquet')
test_df_other[other_use_cols].to_parquet(f'{intermediate_path}test_df_other_v{create_tbl_ver}.parquet')

In [35]:
# === 【residential】OOF MAPE(2019–2021) ===
# main_city  | OOF MAPE: 0.153888
# mid_city   | OOF MAPE: 0.159294
# other      | OOF MAPE: 0.170715

# === 【residential】HO MAPE (2022) ===
# main_city  | HO MAPE: 0.139459
# mid_city   | HO MAPE: 0.155595
# other      | HO MAPE: 0.146493
# 【residential】OOF MAPE (all, 2019–2021): 0.162929
# 【residential】HO MAPE (all, 2022): 0.147827

In [36]:
# === 【house】OOF MAPE(2019–2021) ===
# low        | OOF MAPE: 0.192370
# mid        | OOF MAPE: 0.202282
# high       | OOF MAPE: 0.214667

# === 【house】HO MAPE (2022) ===
# low        | HO MAPE: 0.176612
# mid        | HO MAPE: 0.175276
# high       | HO MAPE: 0.186546
# 【house】OOF MAPE (all, 2019–2021): 0.205168
# 【house】HO MAPE (all, 2022): 0.181413

In [37]:
# === 【other】OOF MAPE(2019–2021) ===
# all        | OOF MAPE: 0.306664

# === 【other】HO MAPE (2022) ===
# all        | HO MAPE: 0.245024
# 【other】OOF MAPE (all, 2019–2021): 0.306664
# 【other】HO MAPE (all, 2022): 0.245024