# 特徴量エンジニアリング

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import warnings
warnings.filterwarnings('ignore')

import common_func as func

In [2]:
# Directory that holds the intermediate files (point this at your own location)
intermediate_path = '../output/intermediate_file/'

# Script version numbers; preprocessing_ver and geo_ver are interpolated into
# the parquet file names that this notebook loads
preprocessing_ver = 4
geo_ver = 2
fe_ver = 3

## File Import

In [3]:
# Load the preprocessed train / test tables for the selected preprocessing version
train_df_fe = pd.read_parquet(f'{intermediate_path}train_df_preprocessed_v{preprocessing_ver}.parquet')
test_df_fe = pd.read_parquet(f'{intermediate_path}test_df_preprocessed_v{preprocessing_ver}.parquet')

# Column names of the test table as loaded, captured before any feature is added
fe_cols = test_df_fe.columns.to_list()

In [4]:
# Names of the date column and of the prediction target column
date_col = 'target_ym'
target_col = 'money_room'

In [5]:
# Natural log of the target; used as the encoding target by the target-encoding cells.
# NOTE(review): np.log yields -inf for a target of 0 and NaN for negatives - confirm
# the target is strictly positive.
train_df_fe['y_log'] = np.log(train_df_fe[target_col])

## 国土数値情報と結合

In [6]:
# Load the geo feature tables for the selected geo version
train_df_geo = pd.read_parquet(f'{intermediate_path}train_df_geo_v{geo_ver}.parquet')
test_df_geo = pd.read_parquet(f'{intermediate_path}test_df_geo_v{geo_ver}.parquet')

In [7]:
# Key columns shared by the base table and the geo table
pkey_cols = ['target_ym', 'building_id', 'unit_id']

# Attach the geo features on the key columns.
# NOTE(review): DataFrame.merge defaults to how='inner', so rows without a matching
# key in the geo table are dropped, and duplicated keys in the geo table would
# multiply rows - confirm the row counts are unchanged after this cell.
train_df_fe = train_df_fe.merge(train_df_geo, on=pkey_cols)
test_df_fe = test_df_fe.merge(test_df_geo, on=pkey_cols)

## 都道府県・市区町村情報のTE

In [8]:
# Features created: 'Prefecture name_te', 'City/town/village name_te'
adress_cols = ['Prefecture name', 'City/town/village name']

# Mean of the log target over all train rows; prior for the smoothing and
# fallback for categories that have no encoding
global_mean_log = train_df_fe['y_log'].mean()

for col in adress_cols:
    # Per-category mean and row count of the log target, computed on train only
    stats = (
        train_df_fe
        .groupby(col)['y_log']
        .agg(['mean', 'count'])
    )

    smoothing = 50  # hyperparameter: weight of the global mean, in pseudo-rows
    # Smoothed mean: (category_sum + global_mean * smoothing) / (count + smoothing),
    # so small categories are pulled toward the global mean
    smooth_mean = (
        (stats['mean'] * stats['count'] + global_mean_log * smoothing) /
        (stats['count'] + smoothing)
    )

    # Values without an entry in smooth_mean (unseen or missing categories) get the global mean.
    # NOTE(review): train rows are encoded with statistics that include their own
    # target (no out-of-fold scheme) - confirm this is intended.
    train_df_fe[col + '_te'] = train_df_fe[col].map(smooth_mean).fillna(global_mean_log)
    test_df_fe[col + '_te']  = test_df_fe[col].map(smooth_mean).fillna(global_mean_log)


## 面積比

In [9]:
# Feature created: 'area_ratio' ('senyu_area' divided by 'nobeyuka_area')
area_ratio_train = train_df_fe['senyu_area'] / train_df_fe['nobeyuka_area']
area_ratio_test = test_df_fe['senyu_area'] / test_df_fe['nobeyuka_area']

# Cap the ratio at 1.5.
# NOTE(review): a zero 'nobeyuka_area' with a positive numerator gives +inf, which
# the clip turns into 1.5; the other ratio cells set such rows to NaN instead.
train_df_fe['area_ratio'] = area_ratio_train.clip(upper=1.5)
test_df_fe['area_ratio'] = area_ratio_test.clip(upper=1.5)

## 相対階数

In [10]:
# Flag basement units explicitly: 1 when 'room_floor' is negative, else 0
# (a missing floor compares False and therefore gets 0)
train_df_fe['is_basement'] = (train_df_fe['room_floor'] < 0).astype('int8')
test_df_fe['is_basement']  = (test_df_fe['room_floor'] < 0).astype('int8')

In [11]:
# Feature created: 'relative_floor'
# 'room_floor' / 'floor_count', clipped to [0, 1]; negative floors therefore map to 0
# and ratios above 1 (including +inf from a zero 'floor_count') map to 1
train_df_fe['relative_floor'] = (
    train_df_fe['room_floor'] / train_df_fe['floor_count']
).clip(lower=0, upper=1)

test_df_fe['relative_floor'] = (
    test_df_fe['room_floor'] / test_df_fe['floor_count']
).clip(lower=0, upper=1)

## 密度

In [12]:
# Features created: 'unit_land_density', 'area_per_room'
# (plus the '_raw', '_over1' and '_log' helper columns written below)
for df in [train_df_fe, test_df_fe]:
    # Floor area per lot area: 'senyu_area' / 'kukaku_area'
    df['unit_land_density_raw'] = df['senyu_area'] / df['kukaku_area']
    # Capped at 1.5; rows with a non-positive lot area are set to NaN afterwards
    df['unit_land_density'] = df['unit_land_density_raw'].clip(upper=1.5)
    df.loc[df['kukaku_area'] <= 0, 'unit_land_density'] = np.nan
    # Flag computed on the raw (unclipped, un-nulled) ratio
    df['unit_land_density_over1'] = (df['unit_land_density_raw'] > 1).astype('int8')

    # Floor area per room: 'senyu_area' / 'room_count'
    df['area_per_room_raw'] = df['senyu_area'] / df['room_count']
    # Capped at 50; rows with a non-positive room count are set to NaN afterwards
    df['area_per_room'] = df['area_per_room_raw'].clip(upper=50)
    df.loc[df['room_count'] <= 0, 'area_per_room'] = np.nan
    df['area_per_room_log'] = np.log1p(df['area_per_room'])

## 豪邸検出

In [13]:
# Feature created: 'land_building_ratio'
CAP = 20.0  # upper clip; values between 10 and 30 are candidates (tune after inspecting the distribution)

for df in [train_df_fe, test_df_fe]:
    # Lot area / total floor area; a non-positive floor area gives NaN
    # (a missing operand already propagates as NaN through the division)
    df['land_building_ratio_raw'] = df['kukaku_area'] / df['nobeyuka_area']
    df.loc[df['nobeyuka_area'] <= 0, 'land_building_ratio_raw'] = np.nan

    # Suppress extreme values
    df['land_building_ratio'] = df['land_building_ratio_raw'].clip(upper=CAP)

    # Flag rows whose raw ratio exceeded the cap (denominator too small or
    # inconsistent area definitions); NaN ratios compare False and get 0
    df['land_building_ratio_hi'] = (df['land_building_ratio_raw'] > CAP).astype('int8')

## 面積と築年の交互作用

In [14]:
# Features created: 'senyu_area_x_built_diff', 'area_per_room_x_built_diff'
for df in [train_df_fe, test_df_fe]:
    # Floor area x 'built_diff'
    # (assumes 'built_diff' is the building age in years - TODO confirm upstream)
    df['senyu_area_x_built_diff'] = df['senyu_area'] * df['built_diff']

    # Area per room x 'built_diff'
    #   -> meant to express the premium of a spacious layout at the same age
    df['area_per_room_x_built_diff'] = df['area_per_room'] * df['built_diff']

## building_idごとの統合特徴量

In [15]:
# Features created: 'building_senyu_area_median', 'building_room_floor_max', 'building_unit_count'
# Built from train only (leak prevention)
# One row per (building_id, unit_id): rows without a unit_id are dropped and
# repeated rows of the same unit are collapsed to the first one
base_units = (
    train_df_fe
    .dropna(subset=['unit_id'])
    .drop_duplicates(subset=['building_id', 'unit_id'])
)

# Per-building aggregates.
# Note: 'building_senyu_area_median' is the median of 'senyu_area_log', not of the raw area.
building_stats = base_units.groupby('building_id').agg(
    building_senyu_area_median=('senyu_area_log', 'median'),
    building_room_floor_max=('room_floor', 'max'),
    building_unit_count=('unit_id', 'nunique'),
)

# DataFrame.join is a left join by default: buildings absent from building_stats get NaN
train_df_fe = train_df_fe.join(building_stats, on='building_id')
test_df_fe  = test_df_fe.join(building_stats, on='building_id')

## 近傍価格特徴量

In [16]:
from sklearn.neighbors import BallTree

def add_multi_radius_neighbor_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    *,
    target_col: str = 'money_room',
    lat_col: str = 'lat',
    lon_col: str = 'lon',
    building_id_col: str = 'building_id',
    building_category_col: str = 'building_category',
    radius_list_m: list[int] | None = None,
):
    """
    Add neighbourhood price aggregates for several search radii.

    For every radius ``r`` (metres) the train rows within that haversine
    distance are aggregated, overall and per building category
    ('house' / 'mansion'). Columns created per radius:

      - mean_price_{r}m, median_price_{r}m, std_price_{r}m, iqr_price_{r}m
      - count_neighbors_{r}m
      - mean_price_{r}m_house, count_neighbors_{r}m_house
      - mean_price_{r}m_mansion, count_neighbors_{r}m_mansion
      - mean_price_{r}m_log, median_price_{r}m_log,
        mean_price_{r}m_house_log, mean_price_{r}m_mansion_log (log1p of the above)

    Notes:
      - Test rows only use train rows as neighbours.
      - Train rows never use themselves or rows with the same building id.
      - Rows with no usable neighbour, or with non-finite coordinates, keep the
        global statistics of the reference set and a neighbour count of 0.
        std / iqr also keep the global value when there is exactly one neighbour.
      - Train rows that cannot be part of the reference set (missing target or
        coordinates) receive the column mean of the reference rows' features.
      - Meant for the final fit (features built from the full train set and
        applied to test); the target-dependent features leak into CV evaluation.

    Args:
        train_df: Training frame; must contain the target, coordinate,
            building-id and building-category columns. Its index must be unique
            because the features are written back by index label.
        test_df: Test frame; must contain the coordinate columns.
        target_col: Name of the price column aggregated over neighbours.
        lat_col: Latitude column (degrees).
        lon_col: Longitude column (degrees).
        building_id_col: Column used to exclude same-building neighbours on train.
        building_category_col: Column holding the building category.
        radius_list_m: Search radii in metres; defaults to [300, 500, 1000, 2000].

    Returns:
        ``(train, test, created_cols)``: copies of both frames with the new
        columns, and the sorted list of created column names.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If no train row has target and coordinates available.
    """
    # Avoid a shared mutable default argument.
    if radius_list_m is None:
        radius_list_m = [300, 500, 1000, 2000]

    tr = train_df.copy()
    te = test_df.copy()

    # --- required columns ---
    req_tr = [target_col, lat_col, lon_col, building_id_col, building_category_col]
    for c in req_tr:
        if c not in tr.columns:
            raise KeyError(f'train_df missing column: {c}')
    req_te = [lat_col, lon_col]
    for c in req_te:
        if c not in te.columns:
            raise KeyError(f'test_df missing column: {c}')

    # --- coerce coordinates to numeric (unparseable values become NaN) ---
    for frame in (tr, te):
        frame[lat_col] = pd.to_numeric(frame[lat_col], errors='coerce')
        frame[lon_col] = pd.to_numeric(frame[lon_col], errors='coerce')

    # Reference set: train rows that have both a target and coordinates
    tr_ref = tr.dropna(subset=[target_col, lat_col, lon_col]).copy()
    if len(tr_ref) == 0:
        raise ValueError('No valid rows for neighbor reference (target/lat/lon all NaN).')

    y_ref = pd.to_numeric(tr_ref[target_col], errors='coerce').astype(float).to_numpy()

    cats = tr_ref[building_category_col].astype('string').fillna('missing').to_numpy()
    bids = tr_ref[building_id_col].to_numpy()

    # BallTree with the haversine metric expects [lat, lon] in radians
    train_coords = np.radians(tr_ref[[lat_col, lon_col]].to_numpy())
    tree = BallTree(train_coords, metric='haversine')
    R = 6371.0  # Earth radius in km, used to convert metres to radians

    # Fallback statistics (computed over the reference set)
    g_mean = float(np.nanmean(y_ref))
    g_median = float(np.nanmedian(y_ref))
    g_std = float(np.nanstd(y_ref))
    g_iqr = float(np.nanpercentile(y_ref, 75) - np.nanpercentile(y_ref, 25))

    # Names of the generated columns (handy for extending the training column list)
    created_cols: list[str] = []

    def _compute_for_apply(df_apply: pd.DataFrame, exclude_same_building: bool) -> pd.DataFrame:
        """Aggregate reference prices around each row of ``df_apply``.

        ``exclude_same_building=True`` is only used with ``df_apply`` being the
        reference rows themselves, in the same order, so position ``i`` in
        ``df_apply`` is also the row's own position in the tree.
        """
        out = df_apply.copy()
        n = len(out)

        apply_coords = np.radians(out[[lat_col, lon_col]].to_numpy(dtype=float))
        # query_radius rejects NaN / inf input, so only rows with finite
        # coordinates are queried; the others keep the fallback values.
        queryable = np.flatnonzero(np.isfinite(apply_coords).all(axis=1))

        bids_apply = None
        if exclude_same_building and building_id_col in out.columns:
            bids_apply = out[building_id_col].to_numpy()

        for r in radius_list_m:
            r_rad = (r / 1000.0) / R
            if len(queryable) > 0:
                neigh_list = tree.query_radius(
                    apply_coords[queryable], r=r_rad, return_distance=False
                )
            else:
                neigh_list = []

            mean_all = np.full(n, g_mean, dtype=float)
            median_all = np.full(n, g_median, dtype=float)
            std_all = np.full(n, g_std, dtype=float)
            iqr_all = np.full(n, g_iqr, dtype=float)
            cnt_all = np.zeros(n, dtype=int)

            mean_house = np.full(n, g_mean, dtype=float)
            mean_mansion = np.full(n, g_mean, dtype=float)
            cnt_house = np.zeros(n, dtype=int)
            cnt_mansion = np.zeros(n, dtype=int)

            for i, neigh_idx in zip(queryable, neigh_list):
                if len(neigh_idx) == 0:
                    continue

                if bids_apply is not None:
                    # Drop neighbours from the same building, and the row itself:
                    # a NaN building id never compares equal, so without the
                    # explicit self check such a row would see its own target.
                    keep = (bids[neigh_idx] != bids_apply[i]) & (neigh_idx != i)
                    neigh_idx = neigh_idx[keep]
                    if len(neigh_idx) == 0:
                        continue

                prices = y_ref[neigh_idx]
                mean_all[i] = float(prices.mean())
                median_all[i] = float(np.median(prices))
                cnt_all[i] = int(len(neigh_idx))

                # Dispersion needs at least two neighbours; otherwise keep the fallback
                if len(neigh_idx) > 1:
                    std_all[i] = float(prices.std())
                    q75, q25 = np.percentile(prices, [75, 25])
                    iqr_all[i] = float(q75 - q25)

                neigh_cat = cats[neigh_idx]
                idx_h = neigh_idx[neigh_cat == 'house']
                if len(idx_h) > 0:
                    p = y_ref[idx_h]
                    mean_house[i] = float(p.mean())
                    cnt_house[i] = int(len(idx_h))

                idx_m = neigh_idx[neigh_cat == 'mansion']
                if len(idx_m) > 0:
                    p = y_ref[idx_m]
                    mean_mansion[i] = float(p.mean())
                    cnt_mansion[i] = int(len(idx_m))

            # Raw aggregates
            out[f'mean_price_{r}m'] = mean_all
            out[f'median_price_{r}m'] = median_all
            out[f'std_price_{r}m'] = std_all
            out[f'iqr_price_{r}m'] = iqr_all
            out[f'count_neighbors_{r}m'] = cnt_all

            out[f'mean_price_{r}m_house'] = mean_house
            out[f'count_neighbors_{r}m_house'] = cnt_house

            out[f'mean_price_{r}m_mansion'] = mean_mansion
            out[f'count_neighbors_{r}m_mansion'] = cnt_mansion

            # log1p of the raw aggregates (negative values clipped to 0 first)
            out[f'mean_price_{r}m_log'] = np.log1p(np.clip(mean_all, 0.0, None))
            out[f'median_price_{r}m_log'] = np.log1p(np.clip(median_all, 0.0, None))
            out[f'mean_price_{r}m_house_log'] = np.log1p(np.clip(mean_house, 0.0, None))
            out[f'mean_price_{r}m_mansion_log'] = np.log1p(np.clip(mean_mansion, 0.0, None))

        return out

    # Train side: compute on the reference rows and write back by index label
    tr_ref_feat = _compute_for_apply(tr_ref[[lat_col, lon_col, building_id_col]].copy(), exclude_same_building=True)
    for c in tr_ref_feat.columns:
        if c in [lat_col, lon_col, building_id_col]:
            continue
        tr.loc[tr_ref.index, c] = tr_ref_feat[c].to_numpy()
        created_cols.append(c)

    # Train rows outside the reference set (missing coordinates / target) get the
    # mean of the reference rows' feature rather than 0. This is a policy choice;
    # adding a missing flag as well would make it more robust.
    for c in created_cols:
        tr[c] = pd.to_numeric(tr[c], errors='coerce')
        tr[c] = tr[c].fillna(tr_ref_feat[c].mean() if c in tr_ref_feat.columns else np.nan)

    # Test side: neighbours come from the train reference set only
    te_feat = _compute_for_apply(te[[lat_col, lon_col]].copy(), exclude_same_building=False)
    for c in te_feat.columns:
        if c in [lat_col, lon_col]:
            continue
        te[c] = te_feat[c].to_numpy()

    created_cols = sorted(set(created_cols))
    return tr, te, created_cols

In [17]:
# Add the multi-radius neighbour price features with the default radii and column names
train_df_fe, test_df_fe, created_cols = add_multi_radius_neighbor_features(train_df_fe, test_df_fe)

## 交通情報

In [18]:
def add_access_zone_features(df, far_thresh=5000, error_thresh=20000):
    """
    Attach station-access features to a copy of ``df``.

    Columns added, in this order:
      - walk_distance1_raw      : copy of 'walk_distance1'
      - access_zone             : 'walk', 'bus', 'car', 'error' or 'unknown'
      - door_to_station_min     : walking minutes (distance / 80) plus bus minutes
      - door_to_station_min_log : log1p of the above
      - walk_distance_bin       : walking distance bucketed into four ranges

    Args:
        df: Frame with 'walk_distance1', 'eki_name1', 'bus_stop1',
            'bus_time1' and 'traffic_other'.
        far_thresh: Distance from which a row becomes a 'car' candidate.
        error_thresh: Distance from which a row is labelled 'error'.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    df = df.copy()

    # Keep the original distance under a separate name
    df['walk_distance1_raw'] = df['walk_distance1']
    walk_dist = df['walk_distance1_raw']

    station_known = df['eki_name1'].notnull()
    bus_stop_known = df['bus_stop1'].notnull()
    bus_time_known = df['bus_time1'].notnull()
    other_note_known = df['traffic_other'].notnull()

    # ---- 1) access_zone ----
    # walk : a station is given and no bus stop
    walk_only = station_known & ~bus_stop_known
    # bus  : a bus stop is given (with or without a station)
    by_bus = bus_stop_known & ~walk_only
    # car  : very far from the station, with either a free-text note or a
    #        distance still below the error threshold
    far_away = walk_dist >= far_thresh
    car_candidate = far_away & (other_note_known | (walk_dist < error_thresh))
    # NOTE(review): only rows that are neither 'walk' nor 'bus' are relabelled
    # 'car', i.e. only rows that would otherwise stay 'unknown'. The original
    # comment said walk/bus rows were overwritten too - confirm the intent.
    by_car = car_candidate & ~walk_only & ~by_bus
    # error: distances at or above error_thresh are treated as suspected input errors
    implausible = walk_dist >= error_thresh

    df['access_zone'] = 'unknown'
    # Later entries win where masks overlap
    for label, rows in (
        ('walk', walk_only),
        ('bus', by_bus),
        ('car', by_car),
        ('error', implausible),
    ):
        df.loc[rows, 'access_zone'] = label

    # ---- 2) door-to-station time ----
    walk_minutes = walk_dist / 80

    df['door_to_station_min'] = np.nan

    # Station + bus stop + bus time: walking plus bus ride
    with_bus_ride = station_known & bus_stop_known & bus_time_known
    df.loc[with_bus_ride, 'door_to_station_min'] = (walk_minutes + df['bus_time1'])[with_bus_ride]

    # Station only: walking time
    df.loc[walk_only, 'door_to_station_min'] = walk_minutes[walk_only]

    # Every other combination (e.g. bus stop without a station) stays NaN

    # Log version to absorb non-linearity
    df['door_to_station_min_log'] = np.log1p(df['door_to_station_min'])

    # ---- 3) distance buckets; values outside (-1, 5000] become NaN ----
    df['walk_distance_bin'] = pd.cut(
        walk_dist,
        bins=[-1, 300, 700, 1500, 5000],
        labels=['0-300', '300-700', '700-1500', '1500-5000'],
    ).astype('category')

    return df

In [19]:
# Add the access-zone features to both tables with the default thresholds
train_df_fe = add_access_zone_features(train_df_fe)
test_df_fe  = add_access_zone_features(test_df_fe)

In [20]:
# Station / railway-line columns to target-encode (each gets a '<col>_te' column)
traffic_te_cols = ['eki_name1', 'eki_name2', 'rosen_name1', 'rosen_name2']

# Mean of the log target over train; fallback for values without an encoding
global_mean_log = float(train_df_fe['y_log'].mean())

for col in traffic_te_cols:
    # Plain per-category mean of the log target, computed on train only.
    # NOTE(review): unlike the address encoding cell, no smoothing is applied and
    # train rows are encoded with statistics that include their own target -
    # confirm this is intended.
    mapping = train_df_fe.groupby(col)['y_log'].mean()

    # Unseen or missing categories fall back to the global mean
    train_df_fe[col + '_te'] = train_df_fe[col].map(mapping).fillna(global_mean_log)
    test_df_fe[col + '_te']  = test_df_fe[col].map(mapping).fillna(global_mean_log)

## 建物情報の拡充

In [21]:
def add_building_area_features(df):
    """
    Add building-scale features to a copy of ``df``.

    Columns added, in this order:
      - density_floor_area   : 'nobeyuka_area' / 'tochi_area' (NaN when the land area is <= 0)
      - senyu_area_range     : 'unit_area_max' - 'unit_area_min' (0 when unit_count <= 1)
      - has_senyu_area_range : 1 when the range is positive
      - senyu_area_range_log : log1p of the range, floored at 0
      - empty_ratio          : 'empty_number' / 'unit_count', NaN for invalid
                               rows, clipped to [0, 1]
    """
    df = df.copy()
    units = df['unit_count']

    # 1) Floor-area density; a non-positive land area makes the ratio meaningless
    floor_density = df['nobeyuka_area'] / df['tochi_area']
    df['density_floor_area'] = floor_density.mask(df['tochi_area'] <= 0)

    # 2) Spread of unit floor areas; buildings with at most one unit have none
    df['senyu_area_range'] = df['unit_area_max'] - df['unit_area_min']
    df.loc[units <= 1, 'senyu_area_range'] = 0.0

    area_range = df['senyu_area_range']
    df['has_senyu_area_range'] = (area_range > 0).astype('int8')
    df['senyu_area_range_log'] = np.log1p(area_range.clip(lower=0))

    # 3) Vacancy ratio; undefined without units or with more vacancies than units
    vacancy = df['empty_number'] / units
    invalid = (units <= 0) | (df['empty_number'] > units)
    df['empty_ratio'] = vacancy.mask(invalid).clip(0, 1)

    return df


In [22]:
# Add the building-scale features to both tables
train_df_fe = add_building_area_features(train_df_fe)
test_df_fe  = add_building_area_features(test_df_fe)

## リフォーム・リノベ情報の拡充

In [23]:
def add_effective_age(df, train_year):
    """
    Add renovation-aware age features to a copy of ``df``.

    The four reform / renovation date columns are parsed to datetimes
    (unparseable values become NaT), then these columns are added:
      - last_reform_date   : latest of the four dates
      - last_reform_year   : its calendar year
      - has_renovation     : 1 when any of the dates is present
      - renovation_recency : train_year - last_reform_year (NaN when negative)
      - effective_age      : 'built_diff' minus a discount of at most 3 that
                             decays as exp(-recency / 5), floored at 0

    Args:
        df: Frame with the four date columns and 'built_diff'.
        train_year: Reference year used to measure the time since the last reform.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    df = df.copy()

    reform_date_cols = [
        'renovation_date',
        'reform_interior_date',
        'reform_wet_area_date',
        'reform_exterior_date'
    ]
    for name in reform_date_cols:
        df[name] = pd.to_datetime(df[name], errors='coerce')

    # Most recent work of any kind
    latest = df[reform_date_cols].max(axis=1)
    df['last_reform_date'] = latest
    df['last_reform_year'] = latest.dt.year
    df['has_renovation'] = latest.notnull().astype('int8')

    # Years since the last work; a reform dated after train_year is treated as unknown
    years_since = train_year - df['last_reform_year']
    df['renovation_recency'] = years_since
    df.loc[years_since < 0, 'renovation_recency'] = np.nan

    # Mild correction: up to -3 years right after a reform, none without one
    age_discount = (np.exp(-df['renovation_recency'] / 5) * 3).fillna(0)
    df['effective_age'] = (df['built_diff'] - age_discount).clip(lower=0)

    return df

In [24]:
# Derive the reference year from the largest target_ym in train
train_max_ym = train_df_fe['target_ym'].max()  # e.g. 202210
train_year = train_max_ym // 100            # -> 2022 (integer division drops the month digits)

# The same train-derived year is used for both tables
train_df_fe = add_effective_age(train_df_fe, train_year)
test_df_fe  = add_effective_age(test_df_fe, train_year)

## 掲載期間

In [25]:
# 作成する特徴量：'listing_months'
def calculate_listing_months(df):
    """
    Add the number of months between 'building_create_date' and 'target_ym'.

    Note: ``df`` is modified in place (and also returned). Besides the new
    columns, 'building_create_date' is replaced by its parsed datetime version.

    Columns added:
      - target_ym_date     : 'target_ym' (yyyymm) parsed as the first day of that month
      - listing_months     : (target year - create year) * 12 + (target month - create month);
                             may be negative when the creation date is later than target_ym
      - listing_months_log : log1p of listing_months floored at 0, so negative
                             durations give 0 instead of -inf / NaN

    Args:
        df: Frame with 'building_create_date' and 'target_ym'.

    Returns:
        The same DataFrame object, with the columns above.
    """
    # 1) Parse the creation date (unparseable values become NaT)
    df['building_create_date'] = pd.to_datetime(df['building_create_date'], errors='coerce')

    # 2) target_ym (e.g. 201901 -> 2019-01-01)
    df['target_ym_date'] = pd.to_datetime(df['target_ym'].astype(str), format='%Y%m', errors='coerce')

    # 3) Month difference using year and month only (day of month is ignored)
    created = df['building_create_date'].dt
    target = df['target_ym_date'].dt
    df['listing_months'] = (target.year - created.year) * 12 + (target.month - created.month)

    # 4) Floor at 0 before log1p: log1p(-1) is -inf and log1p(x < -1) is NaN
    df['listing_months_log'] = np.log1p(df['listing_months'].clip(lower=0))

    return df

In [26]:
# Add the listing-duration features to both tables
train_df_fe = calculate_listing_months(train_df_fe)
test_df_fe  = calculate_listing_months(test_df_fe)

## 地価の比率

In [27]:
def add_land_price_ratio_features_full(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    *,
    clip_q_ratio: float = 0.999,   # upper clip quantile for the ratio features
    clip_q_logdiff: float = 0.999, # upper clip quantile for the log-difference features
    eps: float = 1e-6,
    add_flags: bool = True,
    flag_thr_ratio: float = 5.0,   # threshold of the gap flags
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Relate the 1 km neighbourhood prices to the land-price columns.

    Columns added to copies of both frames, in this order:
      - ratio_mean_land, ratio_weighted_land
      - logdiff_mean_nearest, logdiff_median_weighted
      - ratio_mean_land_clip, ratio_weighted_land_clip
      - ratio_mean_land_log, ratio_weighted_land_log
      - logdiff_mean_nearest_clip, logdiff_median_weighted_clip
      - (when add_flags) is_land_price_gap_ratio_mean,
        is_land_price_gap_ratio_weighted

    All clip thresholds are quantiles of the train columns and are applied
    unchanged to test.

    Returns:
        ``(train, test)`` copies with the new columns.
    """
    tr = train_df.copy()
    te = test_df.copy()
    frames = (tr, te)

    def _write_ratio(frame, out_col, numerator_col, denominator_col):
        # The ratio is only defined for a positive denominator and a present numerator
        frame[out_col] = np.nan
        usable = (frame[denominator_col] > 0) & frame[numerator_col].notna()
        top = frame.loc[usable, numerator_col].astype(float)
        bottom = np.maximum(frame.loc[usable, denominator_col].astype(float), eps)
        frame.loc[usable, out_col] = top / bottom

    def _train_cap(col, q):
        # Quantile of the train column, or NaN when the column has no value at all
        observed = tr[col].dropna()
        return float(observed.quantile(q)) if len(observed) > 0 else np.nan

    def _clip_both(col, clipped_col, cap):
        # A non-finite cap means "do not clip"
        for frame in frames:
            frame[clipped_col] = frame[col].clip(upper=cap) if np.isfinite(cap) else frame[col]

    # ---- base ratios and log differences ----
    for frame in frames:
        _write_ratio(frame, 'ratio_mean_land', 'mean_price_1000m', 'nearest_land_price')
        _write_ratio(frame, 'ratio_weighted_land', 'median_price_1000m', 'weighted_land_price_3')

        # Difference of log1p-style columns: not an exact log ratio, but stable
        frame['logdiff_mean_nearest'] = frame['mean_price_1000m_log'] - frame['log_land_price']
        frame['logdiff_median_weighted'] = frame['median_price_1000m_log'] - frame['log_weighted_land_price_3']

    # ---- 1) ratios: clip at the train quantile, then log1p ----
    ratio_specs = (
        ('ratio_mean_land', 'ratio_mean_land_clip', 'ratio_mean_land_log'),
        ('ratio_weighted_land', 'ratio_weighted_land_clip', 'ratio_weighted_land_log'),
    )
    ratio_caps = [_train_cap(raw_col, clip_q_ratio) for raw_col, _, _ in ratio_specs]
    for (raw_col, clipped_col, _), cap in zip(ratio_specs, ratio_caps):
        _clip_both(raw_col, clipped_col, cap)
    for _, clipped_col, log_col in ratio_specs:
        for frame in frames:
            frame[log_col] = np.log1p(np.maximum(frame[clipped_col], 0))

    # ---- 2) log differences: clip at the train quantile ----
    logdiff_specs = (
        ('logdiff_mean_nearest', 'logdiff_mean_nearest_clip'),
        ('logdiff_median_weighted', 'logdiff_median_weighted_clip'),
    )
    logdiff_caps = [_train_cap(raw_col, clip_q_logdiff) for raw_col, _ in logdiff_specs]
    for (raw_col, clipped_col), cap in zip(logdiff_specs, logdiff_caps):
        _clip_both(raw_col, clipped_col, cap)

    # ---- 3) optional gap flags on the unclipped ratios (NaN compares False -> 0) ----
    if add_flags:
        for flag_col, raw_col in (
            ('is_land_price_gap_ratio_mean', 'ratio_mean_land'),
            ('is_land_price_gap_ratio_weighted', 'ratio_weighted_land'),
        ):
            for frame in frames:
                frame[flag_col] = (frame[raw_col] > flag_thr_ratio).astype('int8')

    return tr, te

In [28]:
# Add the land-price ratio features; the arguments repeat the function defaults explicitly
train_df_fe, test_df_fe = add_land_price_ratio_features_full(
    train_df_fe,
    test_df_fe,
    clip_q_ratio=0.999,
    clip_q_logdiff=0.999,
    add_flags=True,
    flag_thr_ratio=5.0,
)

## 共益費・修繕費関連

In [29]:
def add_fee_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add common-service-fee ('kyoueki') and repair-reserve ('shuuzen') features.

    Columns added to a copy of ``df`` when their inputs are available:
      - kyoueki_per_m2, shuuzen_per_m2     : fee / 'senyu_area'
      - kyoueki_per_unit, shuuzen_per_unit : fee / 'unit_count'
      - has_kyoueki, has_shuuzen           : 1 when the fee is > 0 (missing counts as 0)

    Inputs are read through ``func._as_numeric``; a result of None disables the
    features that depend on it (presumably returned for an absent column -
    TODO confirm against common_func).
    """
    out = df.copy()

    floor_area = func._as_numeric(out.get('senyu_area'))
    n_units = func._as_numeric(out.get('unit_count'))
    common_fee = func._as_numeric(out.get('money_kyoueki_std'))
    repair_fee = func._as_numeric(out.get('money_shuuzen'))

    # 1) per floor area, 2) per unit; NaN where the base is not positive
    per_base_specs = (
        ('kyoueki_per_m2', common_fee, floor_area),
        ('shuuzen_per_m2', repair_fee, floor_area),
        ('kyoueki_per_unit', common_fee, n_units),
        ('shuuzen_per_unit', repair_fee, n_units),
    )
    for col_name, fee, base in per_base_specs:
        if base is None or fee is None:
            continue
        out[col_name] = np.where(base > 0, fee / base, np.nan)

    # 3) presence flags; a missing fee is treated as 0
    for col_name, fee in (('has_kyoueki', common_fee), ('has_shuuzen', repair_fee)):
        if fee is not None:
            out[col_name] = (fee.fillna(0) > 0).astype('Int8')

    return out

In [30]:
# Add the fee features to both tables
train_df_fe = add_fee_features(train_df_fe)
test_df_fe = add_fee_features(test_df_fe)

## 地価のギャップ

In [31]:
def add_land_gap_flags(df: pd.DataFrame,
                       cheap_th: float = 0.8,
                       expensive_th: float = 1.2,
                       ratio_col: str = 'ratio_mean_land') -> pd.DataFrame:
    """
    Flag rows whose price-to-land-price ratio is unusually low or high.

    Columns added to a copy of ``df`` when ``ratio_col`` exists:
      - land_cheap_flag     : ratio < cheap_th
      - land_expensive_flag : ratio > expensive_th

    When ``ratio_col`` is absent the copy is returned unchanged.
    """
    out = df.copy()

    # Nothing to derive without the ratio column
    if ratio_col not in out.columns:
        return out

    ratio_values = func._as_numeric(out[ratio_col])
    is_cheap = ratio_values < cheap_th
    is_expensive = ratio_values > expensive_th
    out['land_cheap_flag'] = is_cheap.astype('Int8')
    out['land_expensive_flag'] = is_expensive.astype('Int8')

    return out

In [32]:
# Add the land-gap flags to both tables with the default thresholds (0.8 / 1.2)
train_df_fe = add_land_gap_flags(train_df_fe)
test_df_fe = add_land_gap_flags(test_df_fe)

## 理論土地価格

In [33]:
def add_land_theoretical_price_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add log1p "theoretical land value" features (unit price x 'kukaku_area').

    Columns added to a copy of ``df``, depending on which price inputs exist:
      - land_theoretical_price                  (from 'nearest_land_price')
      - land_theoretical_price_x_senyu          (times 'senyu_area_log')
      - senyu_to_land_value_ratio               ('senyu_area_log' minus the log value)
      - land_theoretical_price_weighted         (from 'weighted_land_price_3')
      - land_theoretical_price_weighted_x_senyu (times 'senyu_area_log')
      - land_theoretical_price_within1km_mp     (from 'mean_price_1000m')

    Rows without a positive 'kukaku_area' get NaN. When the land area is not
    available at all (``func._as_numeric`` gives None) nothing is added.
    """
    out = df.copy()

    land_area = func._as_numeric(out.get('kukaku_area'))
    nearest_lp = func._as_numeric(out.get('nearest_land_price'))
    weighted_lp = func._as_numeric(out.get('weighted_land_price_3'))
    within1km_mp = func._as_numeric(out.get('mean_price_1000m'))

    if land_area is None:
        return out

    has_area = land_area > 0

    def _log_land_value(unit_price):
        # log1p(unit price x area), NaN where the area is not positive
        return np.where(has_area, np.log1p(unit_price * land_area), np.nan)

    if nearest_lp is not None:
        out['land_theoretical_price'] = _log_land_value(nearest_lp)
        log_floor_area = out['senyu_area_log']
        out['land_theoretical_price_x_senyu'] = out['land_theoretical_price'] * log_floor_area
        out['senyu_to_land_value_ratio'] = log_floor_area - out['land_theoretical_price']

    if weighted_lp is not None:
        out['land_theoretical_price_weighted'] = _log_land_value(weighted_lp)
        out['land_theoretical_price_weighted_x_senyu'] = (
            out['land_theoretical_price_weighted'] * out['senyu_area_log']
        )

    if within1km_mp is not None:
        out['land_theoretical_price_within1km_mp'] = _log_land_value(within1km_mp)

    return out

In [34]:
# Add the theoretical land value features to both tables
train_df_fe= add_land_theoretical_price_features(train_df_fe)
test_df_fe= add_land_theoretical_price_features(test_df_fe)

## 周辺施設カテゴリ数

In [35]:
def add_life_convenience_features(
    train_df_fe: pd.DataFrame,
    test_df_fe: pd.DataFrame,
    amenity_cols: list[str] | None = None,
    thresholds: tuple[int, ...] = (500, 1000),
) -> tuple[pd.DataFrame, pd.DataFrame]:
    train = train_df_fe.copy()
    test = test_df_fe.copy()

    combined = pd.concat([train, test], ignore_index=True)

    if amenity_cols is None:
        amenity_cols = [
            c for c in [
                'convenience_distance',
                'super_distance',
                'hospital_distance',
                'park_distance',
                'drugstore_distance',
                'bank_distance',
                'shopping_street_distance',
                'est_other_distance',
            ]
            if c in combined.columns
        ]

    if len(amenity_cols) == 0:
        print('[add_life_convenience_features] amenity_cols is empty. Skip.')
        return train_df_fe, test_df_fe

    # 距離を数値化（欠損は遠い扱い）
    dist = combined[amenity_cols].apply(pd.to_numeric, errors='coerce')
    dist = dist.clip(lower=0).fillna(np.inf)

    # thresholds ごとに「施設カテゴリ数」のみ作成
    for th in thresholds:
        combined[f'amenity_count_within_{th}m'] = (dist <= th).sum(axis=1).astype('int16')

    n_train = len(train_df_fe)
    train_out = combined.iloc[:n_train].reset_index(drop=True)
    test_out = combined.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out

In [36]:
# Add the amenity counts with the default amenity columns and thresholds (500, 1000)
train_df_fe, test_df_fe = add_life_convenience_features(train_df_fe, test_df_fe)

## 現況・引渡し、管理スコアなど

In [37]:
# 作成する特徴量：'genkyo_flex_score', 'usable_status_score', 'usable_months_delay', 'has_management_association', 'management_form_score', 'manager_presence_score', 'management_total_score',
def add_status_and_management_features(
    train_df_fe: pd.DataFrame,
    test_df_fe: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Re-encode current status, handover and management codes as scores and flags.

    Reads 'genkyo_code', 'usable_status', 'usable_date', 'target_ym',
    'management_form', 'management_association_flg' and 'house_kanrinin'
    (an absent column is treated as all-missing) and adds, in this order:
      genkyo_flex_score, usable_immediate_flag, usable_fixed_date_flag,
      usable_status_score, usable_months_delay, has_management_association,
      has_professional_management, has_manager, management_form_score,
      manager_presence_score, management_total_score.

    Train and test are stacked, processed together and split again, so both
    returned frames carry the union of the input columns and a fresh 0..n-1 index.
    """
    combined = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

    def _code_column(name):
        # Column as float; an all-NaN series stands in for an absent column
        placeholder = pd.Series(np.nan, index=combined.index)
        return combined.get(name, placeholder).astype(float)

    def _score(codes, table):
        # Codes that are not in the table (including missing) score 0
        return codes.map(table).fillna(0.0)

    # ---- 1) current status: how freely the property can be used ----
    status_code = _code_column('genkyo_code')
    combined['genkyo_flex_score'] = _score(status_code, {
        1: 0.5,   # occupied by owner, or vacant lot (shared code, so kept modest)
        2: 1.5,   # vacant house
        3: 0.0,   # rented out -> low flexibility
        4: 1.0,   # not yet completed
        10: 1.5,  # old house on site, can be handed over as a cleared lot
    })

    # ---- 2) handover ----
    handover_code = _code_column('usable_status')
    handover_ym = _code_column('usable_date')
    listing_ym = _code_column('target_ym')

    combined['usable_immediate_flag'] = (handover_code == 1).astype(int)
    combined['usable_fixed_date_flag'] = (handover_code == 3).astype(int)

    # immediate > fixed date > negotiable / undecided
    combined['usable_status_score'] = _score(handover_code, {
        1: 2.0,   # immediate
        3: 1.0,   # fixed date
        2: 0.5,   # negotiable
        4: 0.0,   # undecided
    })

    # Rough month gap between two yyyymm values,
    # e.g. target_ym=202201, usable_date=202204 -> 3
    if 'usable_date' in combined.columns and 'target_ym' in combined.columns:
        def _year_month(ym):
            return (ym // 100).astype('Int64'), (ym % 100).astype('Int64')

        handover_year, handover_month = _year_month(handover_ym.copy())
        listing_year, listing_month = _year_month(listing_ym.copy())
        months_until_usable = (
            (handover_year - listing_year) * 12 + (handover_month - listing_month)
        )
    else:
        months_until_usable = np.nan

    combined['usable_months_delay'] = months_until_usable

    # ---- 3) management ----
    form_code = _code_column('management_form')
    association_code = _code_column('management_association_flg')
    manager_code = _code_column('house_kanrinin')

    # Management association present
    combined['has_management_association'] = (association_code == 2).astype(int)
    # Professionally managed (partly or fully outsourced)
    combined['has_professional_management'] = form_code.isin([2, 3]).astype(int)
    # Some kind of manager present
    combined['has_manager'] = manager_code.isin([1, 2, 3, 5]).astype(int)

    combined['management_form_score'] = _score(form_code, {
        1: 1.0,   # self-managed
        2: 2.0,   # partly outsourced
        3: 3.0,   # fully outsourced
    })
    combined['manager_presence_score'] = _score(manager_code, {
        4: 0.0,   # none
        5: 0.5,   # non-resident
        3: 1.0,   # patrolling
        2: 1.5,   # daytime
        1: 2.0,   # resident
    })

    # Total: form + manager + a small bonus for having an association
    combined['management_total_score'] = (
        combined['management_form_score']
        + combined['manager_presence_score']
        + combined['has_management_association'] * 0.5
    )

    # ---- 4) split back at the original train length ----
    split_at = len(train_df_fe)
    train_part = combined.iloc[:split_at].reset_index(drop=True)
    test_part = combined.iloc[split_at:].reset_index(drop=True)

    return train_part, test_part


In [38]:
# Add the status / handover / management scores to both tables
train_df_fe, test_df_fe = add_status_and_management_features(train_df_fe, test_df_fe)

## 持分比率

In [39]:
def make_mochibun_features(df):
    """Return a copy of *df* with land ownership-share features added.

    Columns written: ``mochibun_ratio``, ``has_mochibun``,
    ``mochibun_area`` and ``mochibun_area_log``. The input frame is not
    modified.
    """
    out = df.copy()

    # Ownership share = b / a. Infinite results (division by zero) and
    # missing values are both treated as full ownership (ratio 1.0).
    ratio = (
        (out['land_mochibun_b'] / out['land_mochibun_a'])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(1.0)
    )
    out['mochibun_ratio'] = ratio

    # Flag rows that own less than the whole.
    out['has_mochibun'] = ratio.lt(1.0).astype(int)

    # Effective land area; stays NaN when the land-area column is absent.
    has_land_area = 'tochi_area' in out.columns
    out['mochibun_area'] = out['tochi_area'] * ratio if has_land_area else np.nan

    out['mochibun_area_log'] = np.log1p(out['mochibun_area'])

    return out

In [40]:
# make_mochibun_features works row-wise on a single frame, so train and test
# are processed independently.
train_df_fe = make_mochibun_features(train_df_fe)
test_df_fe = make_mochibun_features(test_df_fe)

## 私道比率

In [41]:
def make_shidou_features(df):
    """Return a copy of *df* with private-road (shidou) burden features.

    Columns written: ``shidou_area_eff``, ``land_shidou_ratio``,
    ``has_shidou`` and, only when ``tochi_area`` exists,
    ``shidou_area_ratio``. The input frame is not modified.
    """
    out = df.copy()

    # Private-road burden area; missing means no burden (0).
    burden_area = out['snapshot_land_shidou'].fillna(0)
    out['shidou_area_eff'] = burden_area

    # Burden share (numerator b / denominator a); inf and NaN become 0.
    share = out['land_shidou_b'] / out['land_shidou_a']
    out['land_shidou_ratio'] = share.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Flag rows with any private-road burden area.
    out['has_shidou'] = burden_area.gt(0).astype(int)

    if 'tochi_area' not in out.columns:
        return out

    # Burden area relative to land area, NaN -> 0, then bounded to [0, 1].
    out['shidou_area_ratio'] = (burden_area / out['tochi_area']).fillna(0).clip(0, 1)
    return out

In [42]:
# make_shidou_features works row-wise on a single frame, so train and test
# are processed independently.
train_df_fe = make_shidou_features(train_df_fe)
test_df_fe = make_shidou_features(test_df_fe)

## 住みやすさスコア

In [43]:
from __future__ import annotations
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge


# Hand-tuned weights per tag column. 'bonus' entries carry positive weights,
# 'penalty' entries negative ones; keys must match DataFrame column names.
TAG_RULES = {
    'bonus': {
        '環境プレミアム_コンビニ 400ｍ以内': 2.0,
        '環境プレミアム_スーパー 800ｍ以内': 3.0,
        '環境プレミアム_総合病院 800ｍ以内': 2.0,
        '環境プレミアム_公園 400ｍ以内': 1.5,
        '環境プレミアム_子育てに嬉しい環境': 2.0,
        '建物構造・性能_オートロック': 2.0,
        '建物構造・性能_宅配ボックス': 2.5,
        '建物構造・性能_防犯カメラ': 1.5,
        '建物構造・性能_エレベーター': 1.0,
        '建物構造・性能_管理人常駐': 1.5,
        '建物構造・性能_免震構造': 2.0,
        '建物構造・性能_耐震・制震・免震構造': 2.0,
        '専有部分設備_浴室・洗面_浴室乾燥機': 1.5,
        '専有部分設備_浴室・洗面_追焚機能': 1.5,
        '専有部分設備_浴室・洗面_洗面所独立': 1.0,
        '専有部分設備_キッチン_システムキッチン': 1.5,
        '専有部分設備_キッチン_食器洗い乾燥機': 1.5,
        '専有部分設備_収納_ウォークインクローゼット': 1.5,
        '専有部分設備_収納_全居室収納': 1.0,
        '専有部分設備_空調・暖房_床暖房': 2.0,
    },
    'penalty': {
        '専有部分設備_トイレ_トイレなし': -10.0,
        '専有部分設備_浴室・洗面_バスなし': -10.0,
        '専有部分設備_トイレ_共同トイレ': -5.0,
        '専有部分設備_浴室・洗面_共同バス': -5.0,
        '建物設備（給排水・インフラ）_汲取': -3.0,
        '建物設備（給排水・インフラ）_浄化槽': -2.0,
    },
}

def compute_tag_rule_score(df: pd.DataFrame, rules: dict = TAG_RULES) -> pd.Series:
    """Return the weighted sum of the rule-tag columns present in *df*.

    Each column listed under ``rules['bonus']`` and ``rules['penalty']`` is
    multiplied by its weight (missing values count as 0) and accumulated,
    bonus group first. Columns absent from *df* are ignored. The result is
    indexed like *df*.
    """
    total = pd.Series(0.0, index=df.index)
    for section in ('bonus', 'penalty'):
        for column, weight in rules.get(section, {}).items():
            if column not in df.columns:
                continue
            total += df[column].fillna(0).astype(float) * float(weight)
    return total

def _add_tag_score(df: pd.DataFrame, prefixes: list[str], new_col: str) -> pd.DataFrame:
    """Write the row-wise mean of all prefix-matching columns into *new_col*.

    A column matches when its name starts with any entry of *prefixes*.
    Rows whose mean is NaN get 0.0, and so does every row when nothing
    matches. *df* is modified in place and returned.
    """
    wanted = tuple(prefixes)
    matched = [name for name in df.columns if name.startswith(wanted)]
    df[new_col] = df[matched].mean(axis=1).fillna(0.0) if matched else 0.0
    return df

def _standardize_train_apply_test(
    train_s: pd.Series,
    test_s: pd.Series,
    fill_value: float = 0.0,
) -> tuple[pd.Series, pd.Series]:
    """Z-score both series using the mean and std of the train series only.

    Infinite values are treated as missing. Missing z-scores are replaced by
    *fill_value*. When the train std is zero or NaN, both outputs are all
    zeros.
    """
    def _finite_float(series: pd.Series) -> pd.Series:
        # Cast to float and turn +/-inf into NaN.
        return series.astype(float).replace([np.inf, -np.inf], np.nan)

    train_clean = _finite_float(train_s)
    test_clean = _finite_float(test_s)

    center = train_clean.mean(skipna=True)
    spread = train_clean.std(skipna=True)

    # Degenerate distribution: nothing to standardize.
    if spread == 0 or np.isnan(spread):
        zeros_train = pd.Series(0.0, index=train_s.index)
        zeros_test = pd.Series(0.0, index=test_s.index)
        return zeros_train, zeros_test

    train_z = ((train_clean - center) / spread).fillna(fill_value)
    test_z = ((test_clean - center) / spread).fillna(fill_value)
    return train_z, test_z


# =========================================================
# ① urban_score
# =========================================================

def make_urban_score(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    urban_features: list[str] | None = None,
    impute_strategy: str = 'median',
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Add an ``urban_score`` column: the first principal component of the
    standardized urban-related features.

    Parameters
    ----------
    train_df, test_df:
        Input frames; copies are returned, the inputs are not modified.
    urban_features:
        Columns fed to the PCA. When ``None``, a built-in candidate list is
        filtered down to the columns present in *train_df*.
    impute_strategy:
        ``'median'`` or ``'mean'``; how missing values are filled before
        scaling.

    Returns
    -------
    (train, test, meta) where ``meta`` always has the keys ``scaler``,
    ``pca``, ``urban_features`` and ``impute_values``. When no usable
    feature exists, ``urban_score`` is 0.0 everywhere and ``meta`` holds
    ``None`` / empty placeholders.

    Raises
    ------
    ValueError
        If *impute_strategy* is not supported (only checked once at least
        one usable feature exists).

    Notes
    -----
    Imputation values, the scaler and the PCA are fitted on train and test
    concatenated, not on train alone.
    """
    if urban_features is None:
        candidate_cols = [
            'count_neighbors_1000m',
            'door_to_station_min_log',
            'tochi_area_log',
            'shikichi_area_log',
        ]
        urban_features = [c for c in candidate_cols if c in train_df.columns]

    train_df = train_df.copy()
    test_df = test_df.copy()

    def _no_feature_result() -> tuple[pd.DataFrame, pd.DataFrame, dict]:
        # Same meta schema as the normal path so callers can read every key
        # without special-casing the degenerate case.
        train_df['urban_score'] = 0.0
        test_df['urban_score'] = 0.0
        empty_meta = {
            'scaler': None,
            'pca': None,
            'urban_features': [],
            'impute_values': {},
        }
        return train_df, test_df, empty_meta

    if not urban_features:
        return _no_feature_result()

    combined = pd.concat(
        [train_df[urban_features], test_df[urban_features]],
        axis=0, ignore_index=True
    ).astype(float)

    # Infinite values are treated as missing; drop columns with no data at all.
    combined = combined.replace([np.inf, -np.inf], np.nan)
    used_features = [c for c in combined.columns if combined[c].notna().any()]
    combined = combined[used_features]

    if not used_features:
        return _no_feature_result()

    if impute_strategy == 'median':
        fill_values = combined.median(numeric_only=True)
    elif impute_strategy == 'mean':
        fill_values = combined.mean(numeric_only=True)
    else:
        raise ValueError('impute_strategy must be \'median\' or \'mean\'')

    combined_imputed = combined.fillna(fill_values)

    scaler = StandardScaler()
    combined_scaled = scaler.fit_transform(combined_imputed)

    pca = PCA(n_components=1, random_state=42)
    urban_component = pca.fit_transform(combined_scaled).ravel()

    # Rows were concatenated train-first, so split positionally.
    n_train = len(train_df)
    train_df['urban_score'] = urban_component[:n_train]
    test_df['urban_score'] = urban_component[n_train:]

    meta = {
        'scaler': scaler,
        'pca': pca,
        'urban_features': used_features,
        'impute_values': fill_values.to_dict(),
    }
    return train_df, test_df, meta


# =========================================================
# ② livability subscores（KSI + land_road_cond 統合版）
# =========================================================

def make_livability_subscores_ksi(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    use_rule_tags: bool = True,
    rule_weight_in_daily: float = 0.6,
    rule_weight_in_building: float = 0.8,
    # Which KSI-derived score groups to use, and their weights
    use_ksi_access: bool = True,
    use_ksi_zoning: bool = True,
    use_ksi_geo_risk: bool = True,
    ksi_access_weight: float = 0.7,
    ksi_zoning_weight: float = 0.6,
    ksi_geo_risk_weight: float = 0.5,
    # === land_road_cond: added ===
    use_land_road_cond: bool = True,
    land_road_weight: float = 0.6,
    land_road_map: dict[int, float] | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Build the four livability sub-scores on copies of train and test.

    Intermediate columns written to both frames: ``tag_*_score`` (one per tag
    group), ``tag_rule_score`` (plus ``tag_rule_score_raw`` when
    *use_rule_tags* is true), ``access_station_score``,
    ``neighbor_density_score``, ``room_space_score``,
    ``land_road_cond_score``, ``ksi_road_access_score``,
    ``ksi_zoning_score`` and ``ksi_geo_risk_score``.

    Final columns: ``score_access``, ``score_daily``, ``score_room`` and
    ``score_building``.

    Each ``use_*`` flag set to False makes the corresponding score column 0.0.
    *land_road_map* maps ``land_road_cond`` codes to raw scores; ``None``
    selects the built-in mapping below.

    Returns ``(train, test, meta)`` where ``meta`` records the configuration
    that was used. The input frames are not modified.
    """

    train_df = train_df.copy()
    test_df = test_df.copy()

    # ---------- Tag-based scores ----------
    # Group name -> column-name prefixes; each group becomes '<name>_score'.
    tag_groups = {
        'tag_land': ['土地価格_'],
        'tag_unit': ['専有部分設備_'],
        'tag_building': ['建物構造・性能_'],
        'tag_infra': ['建物設備（給排水・インフラ）_'],
        'tag_env': ['環境プレミアム_'],
        'tag_cert': ['用途・投資セグメント_不動産の証明書・性能評価_'],
    }

    for name, prefixes in tag_groups.items():
        train_df = _add_tag_score(train_df, prefixes, f'{name}_score')
        test_df = _add_tag_score(test_df, prefixes, f'{name}_score')

    # ---------- Rule-based tags ----------
    if use_rule_tags:
        train_df['tag_rule_score_raw'] = compute_tag_rule_score(train_df)
        test_df['tag_rule_score_raw'] = compute_tag_rule_score(test_df)

        # Standardize with train statistics and apply them to test.
        tr_z, te_z = _standardize_train_apply_test(
            train_df['tag_rule_score_raw'],
            test_df['tag_rule_score_raw'],
            fill_value=0.0,
        )
        train_df['tag_rule_score'] = tr_z
        test_df['tag_rule_score'] = te_z
    else:
        train_df['tag_rule_score'] = 0.0
        test_df['tag_rule_score'] = 0.0

    # ---------- Numeric scores (station proximity / density / area) ----------
    # NOTE(review): func._zscore is called on each frame separately, unlike
    # the train-fit/test-apply standardization used elsewhere in this
    # function — presumably it standardizes within the given frame; confirm
    # that this is intended.
    def _add_base_numeric_scores(df: pd.DataFrame) -> pd.DataFrame:
        if 'door_to_station_min_log' in df.columns:
            # Negated so that a shorter time to the station scores higher.
            df['access_station_score'] = func._zscore(-df['door_to_station_min_log'].astype(float))
        else:
            df['access_station_score'] = 0.0

        if 'count_neighbors_1000m' in df.columns:
            df['neighbor_density_score'] = func._zscore(df['count_neighbors_1000m'].astype(float))
        else:
            df['neighbor_density_score'] = 0.0

        # Substring match: any column whose name contains one of these keys.
        numeric_area_cols = [
            c for c in df.columns
            if any(k in c for k in ['senyu_area_log', 'area_per_room', 'nobeyuka_area_log'])
        ]
        df['room_space_score'] = func._safe_mean(df, numeric_area_cols)
        if df['room_space_score'].std() > 0:
            df['room_space_score'] = func._zscore(df['room_space_score'])
        else:
            df['room_space_score'] = 0.0

        return df

    train_df = _add_base_numeric_scores(train_df)
    test_df = _add_base_numeric_scores(test_df)

    # =====================================================
    # === land_road_cond: score the road-frontage condition ===
    # =====================================================
    if land_road_map is None:
        # Naive ordinal score based on the code definitions:
        # one side (1) is the baseline 0, corner lot (2) and two sides (5)
        # get a bonus, three sides (3) and four sides (4) a larger bonus,
        # and no road frontage (10) a strong penalty.
        land_road_map = {
            1: 0.0,   # one side
            2: 0.7,   # corner lot
            5: 0.6,   # two sides (excluding corner lots)
            3: 0.9,   # three sides
            4: 1.0,   # four sides
            10: -2.0, # no road frontage (strong penalty)
        }

    if use_land_road_cond and 'land_road_cond' in train_df.columns and 'land_road_cond' in test_df.columns:
        tr_code = train_df['land_road_cond'].copy()
        te_code = test_df['land_road_cond'].copy()

        # Numeric conversion: unparseable or missing codes become NaN, are
        # not matched by the map, and therefore end up as 0 below.
        tr_code = pd.to_numeric(tr_code, errors='coerce')
        te_code = pd.to_numeric(te_code, errors='coerce')

        tr_raw = tr_code.map(land_road_map).fillna(0.0)
        te_raw = te_code.map(land_road_map).fillna(0.0)

        tr_z, te_z = _standardize_train_apply_test(tr_raw, te_raw, fill_value=0.0)
        train_df['land_road_cond_score'] = tr_z
        test_df['land_road_cond_score'] = te_z
    else:
        train_df['land_road_cond_score'] = 0.0
        test_df['land_road_cond_score'] = 0.0

    # =====================================================
    # KSI: Access (road proximity)
    # =====================================================
    # NOTE(review): func._get_or_zeros presumably returns the column, or a
    # zero series when it is absent — confirm against common_func.
    if use_ksi_access:
        tr_dist_major = func._get_or_zeros(train_df, 'dist_to_road_major_m')
        te_dist_major = func._get_or_zeros(test_df, 'dist_to_road_major_m')

        tr_len_den = func._get_or_zeros(train_df, 'road_len_density')
        te_len_den = func._get_or_zeros(test_df, 'road_len_density')

        tr_cnt_major_300 = func._get_or_zeros(train_df, 'road_cnt_major_in_300m')
        te_cnt_major_300 = func._get_or_zeros(test_df, 'road_cnt_major_in_300m')

        # Distance is negated so that a smaller distance scores higher.
        tr_dist_z, te_dist_z = _standardize_train_apply_test(-tr_dist_major, -te_dist_major)
        tr_den_z, te_den_z = _standardize_train_apply_test(tr_len_den, te_len_den)
        tr_cnt_z, te_cnt_z = _standardize_train_apply_test(tr_cnt_major_300, te_cnt_major_300)

        # Weighted average; 2.5 is the sum of the weights (1 + 1 + 0.5).
        train_df['ksi_road_access_score'] = (tr_dist_z + tr_den_z + 0.5 * tr_cnt_z) / 2.5
        test_df['ksi_road_access_score'] = (te_dist_z + te_den_z + 0.5 * te_cnt_z) / 2.5
    else:
        train_df['ksi_road_access_score'] = 0.0
        test_df['ksi_road_access_score'] = 0.0

    # =====================================================
    # KSI: Zoning
    # =====================================================
    if use_ksi_zoning:
        tr_zone_rank = func._get_or_zeros(train_df, 'zone_residential_rank')
        te_zone_rank = func._get_or_zeros(test_df, 'zone_residential_rank')

        tr_lowrise = func._get_or_zeros(train_df, 'is_lowrise_residential')
        te_lowrise = func._get_or_zeros(test_df, 'is_lowrise_residential')

        tr_kenpei = func._get_or_zeros(train_df, 'kenpei')
        te_kenpei = func._get_or_zeros(test_df, 'kenpei')

        tr_youseki = func._get_or_zeros(train_df, 'youseki')
        te_youseki = func._get_or_zeros(test_df, 'youseki')

        tr_zone_z, te_zone_z = _standardize_train_apply_test(tr_zone_rank, te_zone_rank)
        tr_k_z, te_k_z = _standardize_train_apply_test(tr_kenpei, te_kenpei)
        tr_y_z, te_y_z = _standardize_train_apply_test(tr_youseki, te_youseki)

        # The low-rise flag enters unstandardized; 2.1 is the sum of the
        # weights (1 + 0.7 + 0.2 + 0.2).
        train_df['ksi_zoning_score'] = (tr_zone_z + 0.7 * tr_lowrise + 0.2 * tr_k_z + 0.2 * tr_y_z) / 2.1
        test_df['ksi_zoning_score'] = (te_zone_z + 0.7 * te_lowrise + 0.2 * te_k_z + 0.2 * te_y_z) / 2.1
    else:
        train_df['ksi_zoning_score'] = 0.0
        test_df['ksi_zoning_score'] = 0.0

    # =====================================================
    # KSI: Geo-risk
    # =====================================================
    if use_ksi_geo_risk:
        tr_slope_mean = func._get_or_zeros(train_df, 'slope_mean')
        te_slope_mean = func._get_or_zeros(test_df, 'slope_mean')

        tr_elev_range = func._get_or_zeros(train_df, 'elev_range')
        te_elev_range = func._get_or_zeros(test_df, 'elev_range')

        tr_slope_max = func._get_or_zeros(train_df, 'slope_max')
        te_slope_max = func._get_or_zeros(test_df, 'slope_max')

        tr_disaster = func._get_or_zeros(train_df, 'is_disaster_prevention_block')
        te_disaster = func._get_or_zeros(test_df, 'is_disaster_prevention_block')

        tr_sm_z, te_sm_z = _standardize_train_apply_test(tr_slope_mean, te_slope_mean)
        tr_er_z, te_er_z = _standardize_train_apply_test(tr_elev_range, te_elev_range)
        tr_sx_z, te_sx_z = _standardize_train_apply_test(tr_slope_max, te_slope_max)

        # The sum is negated, so larger slope/elevation/flag values lower the
        # score. The disaster flag enters unstandardized.
        train_df['ksi_geo_risk_score'] = - (0.5 * tr_sm_z + 0.3 * tr_er_z + 0.2 * tr_sx_z + 0.6 * tr_disaster)
        test_df['ksi_geo_risk_score'] = - (0.5 * te_sm_z + 0.3 * te_er_z + 0.2 * te_sx_z + 0.6 * te_disaster)
    else:
        train_df['ksi_geo_risk_score'] = 0.0
        test_df['ksi_geo_risk_score'] = 0.0

    # =====================================================
    # The four sub-scores (with KSI + land_road_cond injected)
    # =====================================================
    # Note: 'tag_env_score' contributes to both score_access and score_daily.
    def _calc_subscores(df: pd.DataFrame) -> pd.DataFrame:
        df['score_access'] = func._safe_mean(df, [
            'access_station_score',
            'neighbor_density_score',
            'tag_env_score',
        ]) + ksi_access_weight * df['ksi_road_access_score'] + land_road_weight * df['land_road_cond_score']

        df['score_daily'] = func._safe_mean(df, [
            'tag_env_score',
            'tag_land_score',
        ]) + rule_weight_in_daily * df['tag_rule_score'] + ksi_geo_risk_weight * df['ksi_geo_risk_score']

        df['score_room'] = func._safe_mean(df, [
            'tag_unit_score',
            'room_space_score',
        ])

        df['score_building'] = func._safe_mean(df, [
            'tag_building_score',
            'tag_infra_score',
            'tag_cert_score',
        ]) + rule_weight_in_building * df['tag_rule_score'] + ksi_zoning_weight * df['ksi_zoning_score']

        return df

    train_df = _calc_subscores(train_df)
    test_df = _calc_subscores(test_df)

    # Record the configuration actually used, for later inspection.
    meta = {
        'tag_groups': tag_groups,
        'subscores': ['score_access', 'score_daily', 'score_room', 'score_building'],
        'use_rule_tags': use_rule_tags,
        'rule_weights': {
            'daily': rule_weight_in_daily,
            'building': rule_weight_in_building,
        },
        'ksi': {
            'use_ksi_access': use_ksi_access,
            'use_ksi_zoning': use_ksi_zoning,
            'use_ksi_geo_risk': use_ksi_geo_risk,
            'weights': {
                'ksi_access_weight': ksi_access_weight,
                'ksi_zoning_weight': ksi_zoning_weight,
                'ksi_geo_risk_weight': ksi_geo_risk_weight,
            },
            'scores_added': [
                'ksi_road_access_score',
                'ksi_zoning_score',
                'ksi_geo_risk_score',
            ],
        },
        'land_road_cond': {
            'use_land_road_cond': use_land_road_cond,
            'land_road_weight': land_road_weight,
            'land_road_map': land_road_map,
            'score_col': 'land_road_cond_score',
        },
        # Rule tags from TAG_RULES that exist as columns in the train frame.
        'rule_tags_used': {
            'bonus': [k for k in TAG_RULES['bonus'].keys() if k in train_df.columns],
            'penalty': [k for k in TAG_RULES['penalty'].keys() if k in train_df.columns],
        },
    }
    return train_df, test_df, meta


def fit_livability_weight_model(
    train_df: pd.DataFrame,
    target_col: str = 'money_room',
    alpha: float = 1.0,
) -> tuple[Ridge, list[str]]:
    """Fit a Ridge model of the standardized target on the livability scores.

    Features are ``urban_score``, the four sub-scores, and the product of
    each sub-score with ``urban_score``. Rows with a missing target are
    dropped before fitting.

    Returns the fitted model and the feature names in design-matrix order.

    Raises
    ------
    KeyError
        If any required score column or *target_col* is missing.
    """
    required_cols = ['urban_score', 'score_access', 'score_daily', 'score_room', 'score_building']
    absent = next(
        (c for c in [*required_cols, target_col] if c not in train_df.columns),
        None,
    )
    if absent is not None:
        raise KeyError(f'必要な列が存在しません: {absent}')

    fit_df = train_df.dropna(subset=[target_col]).copy()
    target = fit_df[target_col].astype(float)

    # The model is trained on the z-scored target.
    target_z = (target - target.mean()) / target.std()

    # Every required column except 'urban_score' gets an interaction term.
    subscore_cols = required_cols[1:]

    blocks = [fit_df[required_cols].astype(float).values]
    blocks += [
        (fit_df[col] * fit_df['urban_score']).values.reshape(-1, 1)
        for col in subscore_cols
    ]
    feature_names = required_cols + [f'{col}_x_urban' for col in subscore_cols]

    model = Ridge(alpha=alpha, random_state=42)
    model.fit(np.hstack(blocks), target_z)

    return model, feature_names


def apply_livability_score(
    df: pd.DataFrame,
    model: Ridge,
    train_min: float,
    train_max: float,
) -> pd.Series:
    """Predict with *model* and rescale the prediction to the range 0-100.

    The design matrix is ``urban_score``, the four sub-scores, then each
    sub-score multiplied by ``urban_score``. Predictions are min-max scaled
    with *train_min* / *train_max* and clipped to [0, 100]. When the two
    bounds are equal, every row gets 50.0.
    """
    base_cols = ['urban_score', 'score_access', 'score_daily', 'score_room', 'score_building']
    subscore_cols = base_cols[1:]

    blocks = [df[base_cols].astype(float).values]
    blocks.extend(
        (df[col] * df['urban_score']).values.reshape(-1, 1)
        for col in subscore_cols
    )
    raw_pred = model.predict(np.hstack(blocks))

    # Degenerate training range: fall back to the midpoint.
    if train_max == train_min:
        return pd.Series(50.0, index=df.index)

    span = train_max - train_min
    unit_scaled = ((raw_pred - train_min) / span).clip(0, 1)
    return pd.Series(unit_scaled * 100.0, index=df.index)


def add_livability_features_ksi(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = 'money_room',
    alpha: float = 1.0,
    use_rule_tags: bool = True,
    rule_weight_in_daily: float = 0.6,
    rule_weight_in_building: float = 0.8,
    use_ksi_access: bool = True,
    use_ksi_zoning: bool = True,
    use_ksi_geo_risk: bool = True,
    ksi_access_weight: float = 0.7,
    ksi_zoning_weight: float = 0.6,
    ksi_geo_risk_weight: float = 0.5,
    # === land_road_cond: added ===
    use_land_road_cond: bool = True,
    land_road_weight: float = 0.6,
    land_road_map: dict[int, float] | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Run the full livability pipeline and add ``livability_score``.

    Steps: compute ``urban_score``, build the four sub-scores, fit the Ridge
    weight model on train, then scale the model output on both frames with
    the min and max of the train predictions.

    Returns ``(train, test, meta)``; ``meta`` bundles the metadata from each
    step, the fitted model, its feature names and the scaling bounds.
    """
    urban_train, urban_test, urban_meta = make_urban_score(train_df, test_df)

    subscore_options = {
        'use_rule_tags': use_rule_tags,
        'rule_weight_in_daily': rule_weight_in_daily,
        'rule_weight_in_building': rule_weight_in_building,
        'use_ksi_access': use_ksi_access,
        'use_ksi_zoning': use_ksi_zoning,
        'use_ksi_geo_risk': use_ksi_geo_risk,
        'ksi_access_weight': ksi_access_weight,
        'ksi_zoning_weight': ksi_zoning_weight,
        'ksi_geo_risk_weight': ksi_geo_risk_weight,
        'use_land_road_cond': use_land_road_cond,
        'land_road_weight': land_road_weight,
        'land_road_map': land_road_map,
    }
    train_s, test_s, subs_meta = make_livability_subscores_ksi(
        urban_train, urban_test, **subscore_options
    )

    model, feature_names = fit_livability_weight_model(
        train_s, target_col=target_col, alpha=alpha
    )

    # Raw train predictions define the min/max used for 0-100 scaling.
    base_cols = ['urban_score', 'score_access', 'score_daily', 'score_room', 'score_building']
    interaction_blocks = [
        (train_s[col] * train_s['urban_score']).values.reshape(-1, 1)
        for col in base_cols[1:]
    ]
    train_design = np.hstack(
        [train_s[base_cols].astype(float).values] + interaction_blocks
    )
    raw_train_pred = model.predict(train_design)
    train_min = float(np.nanmin(raw_train_pred))
    train_max = float(np.nanmax(raw_train_pred))

    train_s['livability_score'] = apply_livability_score(train_s, model, train_min, train_max)
    test_s['livability_score'] = apply_livability_score(test_s, model, train_min, train_max)

    return train_s, test_s, {
        'urban_meta': urban_meta,
        'subscores_meta': subs_meta,
        'model': model,
        'feature_names': feature_names,
        'train_min': train_min,
        'train_max': train_max,
    }


In [44]:
# Run the livability pipeline; `meta` keeps the fitted model and the
# configuration returned by add_livability_features_ksi.
train_df_fe, test_df_fe, meta = add_livability_features_ksi(
    train_df=train_df_fe,
    test_df=test_df_fe,
    target_col='money_room',
    use_land_road_cond=True,
    land_road_weight=0.6,  # if this turns out too strong, try 0.3
)

## 道路関連

In [45]:
# Columns are assigned on the frames in place (no reassignment needed).
for df in [train_df_fe, test_df_fe]:
    # 1 when land_road_cond equals code 10, else 0.
    df['is_no_road'] = (df['land_road_cond'] == 10).astype('int8')
    # Interactions: the road metrics are kept only where is_no_road == 1.
    df['road_len_density_x_no_road'] = (df['road_len_density'] * df['is_no_road']).astype('float32')
    df['road_narrow_ratio_gap_x_no_road'] = (df['road_narrow_ratio_gap'] * df['is_no_road']).astype('float32')

## 用途地域関連

In [46]:
def add_max_floor_area_features(
    df: pd.DataFrame,
    *,
    tochi_area_col: str = 'tochi_area',
    youseki_col: str = 'youseki',
) -> pd.DataFrame:
    """Return a copy of *df* with maximum buildable floor-area features.

    ``max_floor_area`` is land area x floor-area ratio / 100. It is NaN when
    either input is missing or not positive. Also adds
    ``max_floor_area_log`` (log1p, NaN preserved) and
    ``max_floor_area_missing`` (int8 flag).

    Both input columns are coerced to numeric in the returned frame, and are
    created as all-NaN columns when they do not exist.
    """
    out = df.copy()

    # Make sure both inputs exist and are numeric.
    for col in (tochi_area_col, youseki_col):
        if col not in out.columns:
            out[col] = np.nan
        out[col] = pd.to_numeric(out[col], errors='coerce')

    land = out[tochi_area_col]
    ratio = out[youseki_col]

    # Usable only when both values are present and strictly positive.
    usable = land.notna() & (land > 0) & ratio.notna() & (ratio > 0)
    out['max_floor_area'] = (land * ratio / 100.0).where(usable)

    # log1p keeps NaN as NaN (missing is never turned into 0).
    floor_area = out['max_floor_area']
    out['max_floor_area_log'] = np.where(
        floor_area.notna(),
        np.log1p(floor_area),
        np.nan,
    )

    out['max_floor_area_missing'] = floor_area.isna().astype('int8')
    return out


def add_lowrise_station_interaction(
    df: pd.DataFrame,
    *,
    lowrise_col: str = 'is_lowrise_residential',
    station_min_col: str = 'door_to_station_min_log',
) -> pd.DataFrame:
    """Return a copy of *df* with the low-rise flag x station-time interaction.

    Adds ``lowrise_x_station_log`` (flag times station value, with a missing
    station value treated as 0) and ``lowrise_x_station_missing`` (int8, 1
    when the flag is 1 but the station value is missing).

    A column that does not exist is treated as an all-zero column, so the
    interaction is 0 and the missing flag is 0 for every row.
    """
    out = df.copy()

    def _numeric_column(col: str) -> pd.Series:
        # Always return a Series aligned to `out`. The previous scalar
        # fallback broke because pd.to_numeric(scalar) returns a scalar,
        # which has no .fillna/.isna.
        if col in out.columns:
            return pd.to_numeric(out[col], errors='coerce')
        return pd.Series(0.0, index=out.index, dtype='float64')

    low = _numeric_column(lowrise_col).fillna(0).astype('int8')
    sta = _numeric_column(station_min_col)

    # A missing station value contributes 0 ("no information -> no effect").
    out['lowrise_x_station_log'] = low * sta.fillna(0.0)
    out['lowrise_x_station_missing'] = (low.eq(1) & sta.isna()).astype('int8')

    return out


def add_lowrise_landprice_interaction(
    df: pd.DataFrame,
    *,
    lowrise_col: str = 'is_lowrise_residential',
    land_price_log_col: str = 'log_land_price',
) -> pd.DataFrame:
    """Return a copy of *df* with the low-rise flag x land-price interaction.

    Adds ``lowrise_x_landprice_log`` (flag times land-price value, with a
    missing price treated as 0) and ``lowrise_x_landprice_missing`` (int8, 1
    when the flag is 1 but the price is missing).

    A column that does not exist is treated as an all-zero column, so the
    interaction is 0 and the missing flag is 0 for every row.
    """
    out = df.copy()

    def _numeric_column(col: str) -> pd.Series:
        # Always return a Series aligned to `out`. The previous scalar
        # fallback broke because pd.to_numeric(scalar) returns a scalar,
        # which has no .fillna/.isna.
        if col in out.columns:
            return pd.to_numeric(out[col], errors='coerce')
        return pd.Series(0.0, index=out.index, dtype='float64')

    low = _numeric_column(lowrise_col).fillna(0).astype('int8')
    lp = _numeric_column(land_price_log_col)

    # A missing price contributes 0 to the interaction.
    out['lowrise_x_landprice_log'] = low * lp.fillna(0.0)
    out['lowrise_x_landprice_missing'] = (low.eq(1) & lp.isna()).astype('int8')

    return out


def add_house_key_interactions(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the max-floor-area and both low-rise interaction steps in order.

    Returns a new frame; *df* is not modified.
    """
    pipeline = (
        add_max_floor_area_features,
        add_lowrise_station_interaction,
        add_lowrise_landprice_interaction,
    )
    out = df.copy()
    for step in pipeline:
        out = step(out)
    return out

In [47]:
# add_house_key_interactions works on a single frame, so train and test are
# processed independently.
train_df_fe = add_house_key_interactions(train_df_fe)
test_df_fe = add_house_key_interactions(test_df_fe)

## 都市計画関連

In [48]:
def add_cityplan_interactions(
    df: pd.DataFrame,
    *,
    land_price_log_col: str = 'log_land_price',
    max_floor_area_log_col: str = 'max_floor_area_log',
    station_log_col: str = 'door_to_station_min_log',
    youseki_col: str = 'youseki',
    is_urban_control_col: str = 'is_urban_control_area',
    is_fireproof_col: str = 'is_fireproof_area',
    has_district_plan_col: str = 'has_district_plan',
    is_high_util_col: str = 'is_high_utilization_area',
    has_height_limit_col: str = 'has_height_limit',
    is_urban_renaissance_col: str = 'is_urban_renaissance_area',
    clip_youseki: float | None = None,
    station_sign: int = -1,          # -1: closer is positive, +1: farther is positive
    youseki_scale: str = 'ratio',    # 'raw' | 'ratio' | 'log'
) -> pd.DataFrame:
    """Return a copy of *df* with city-planning flag x numeric interactions.

    Each interaction is added only when both its flag column and its numeric
    column exist in the frame. The station value is multiplied by
    *station_sign*. The floor-area ratio is optionally clipped at
    *clip_youseki* and then rescaled according to *youseki_scale*.

    Raises
    ------
    ValueError
        If *youseki_scale* is not one of ``'raw'``, ``'ratio'``, ``'log'``.
    """
    out = df.copy()

    # Station term with its direction aligned via station_sign.
    signed_station = func._num0(out, station_log_col) * float(station_sign)

    # Floor-area ratio, optionally clipped and rescaled.
    scaled_youseki = func._num0(out, youseki_col)
    if clip_youseki is not None:
        scaled_youseki = scaled_youseki.clip(upper=float(clip_youseki))
    if youseki_scale == 'ratio':
        scaled_youseki = scaled_youseki / 100.0
    elif youseki_scale == 'log':
        scaled_youseki = np.log1p(scaled_youseki)
    elif youseki_scale != 'raw':
        raise ValueError('youseki_scale must be one of: raw, ratio, log')

    # (new feature, flag column, numeric column, lazy numeric factor).
    # The factor is a callable so it is only evaluated once both columns
    # are known to exist.
    interaction_specs = (
        ('urban_control_x_land_price_log', is_urban_control_col, land_price_log_col,
         lambda: func._num0(out, land_price_log_col)),
        ('fireproof_x_max_floor_area_log', is_fireproof_col, max_floor_area_log_col,
         lambda: func._num0(out, max_floor_area_log_col)),
        ('district_plan_x_station_log', has_district_plan_col, station_log_col,
         lambda: signed_station),
        ('high_util_x_youseki', is_high_util_col, youseki_col,
         lambda: scaled_youseki),
        ('height_limit_x_max_floor_area_log', has_height_limit_col, max_floor_area_log_col,
         lambda: func._num0(out, max_floor_area_log_col)),
        ('urban_renaissance_x_station_log', is_urban_renaissance_col, station_log_col,
         lambda: signed_station),
    )

    for feature_name, flag_col, value_col, numeric_factor in interaction_specs:
        if flag_col not in out.columns or value_col not in out.columns:
            continue
        func._add(out, feature_name, func._flag(out, flag_col) * numeric_factor())

    return out

In [49]:
def add_fireproof_x_structure_cat_both(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    *,
    is_fireproof_col: str = 'is_fireproof_area',
    structure_col: str = 'building_structure',
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add the categorical ``fireproof_x_structure`` to copies of both frames.

    Values look like ``'fire_<structure>'`` or ``'nonfire_<structure>'``.
    A missing structure value becomes ``'missing'`` and a missing or
    unparseable fireproof flag counts as 0. Both frames share one category
    set, built from train followed by test.

    A source column that does not exist is created in the returned frame
    (flag 0, structure ``'missing'``).
    """
    def _prepare(frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
        # Copy the frame, make sure the source columns exist, and build the
        # combined label as a string Series.
        frame = frame.copy()
        if is_fireproof_col not in frame.columns:
            frame[is_fireproof_col] = 0
        if structure_col not in frame.columns:
            frame[structure_col] = 'missing'

        is_fire = pd.to_numeric(frame[is_fireproof_col], errors='coerce').fillna(0).astype('int8')

        # Use the string dtype and fill gaps so that no NA survives.
        structure = frame[structure_col].astype('string').fillna('missing')

        zone = pd.Series(
            np.where(is_fire.to_numpy() == 1, 'fire', 'nonfire'),
            index=frame.index,
            dtype='string',
        )
        # Concatenate with the pandas string accessor to stay in one dtype.
        return frame, zone.str.cat(structure, sep='_')

    tr, tr_label = _prepare(train_df)
    te, te_label = _prepare(test_df)

    # Shared category set (train + test) so both frames agree.
    cats = pd.Index(pd.concat([tr_label, te_label], axis=0).unique())

    tr['fireproof_x_structure'] = pd.Categorical(tr_label, categories=cats)
    te['fireproof_x_structure'] = pd.Categorical(te_label, categories=cats)

    return tr, te


In [50]:
# Same settings for both frames: floor-area ratio clipped at 800 and scaled
# by /100, station term negated.
train_df_fe = add_cityplan_interactions(train_df_fe, clip_youseki=800, station_sign=-1, youseki_scale='ratio')
test_df_fe  = add_cityplan_interactions(test_df_fe,  clip_youseki=800, station_sign=-1, youseki_scale='ratio')

# Built jointly so that train and test share one category set.
train_df_fe, test_df_fe = add_fireproof_x_structure_cat_both(train_df_fe, test_df_fe)

## セットバック比率

In [51]:
# Columns are assigned on the frames in place (no reassignment needed).
for df in [train_df_fe, test_df_fe]:
    # Denominator: shikichi_area, falling back to tochi_area, then kukaku_area.
    denom = df['shikichi_area'].fillna(df['tochi_area']).fillna(df['kukaku_area'])
    df['setback_ratio'] = df['land_setback_clean'] / denom
    # A non-positive denominator makes the ratio undefined.
    df.loc[denom <= 0, 'setback_ratio'] = np.nan
    df['setback_ratio'] = df['setback_ratio'].clip(0, 1)  # suppress physically implausible ratios

## 土地制約・再建築リスクスコア

In [52]:
def add_land_constraint_score(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with an additive ``land_constraint_score``.

    The score is the sum of these terms, each skipped (or counted as 0) when
    its column is absent:

    * ``(100 - kenpei clipped to [0, 100]) / 100``
    * ``(200 - youseki clipped to [0, 200]) / 200``
    * ``has_setback * 1.0``
    * ``land_setback_log * 0.3`` (NaN -> 0)
    * ``setback_ratio * 2.0`` (NaN -> 0)
    * ``is_no_road * 2.0``
    * ``shidou_area_ratio`` (NaN -> 0, clipped to [0, 1])

    NaN in ``kenpei``, ``youseki``, ``has_setback`` or ``is_no_road`` is not
    filled and makes that row's score NaN.
    """
    out = df.copy()

    def _column_or_zero(name: str) -> pd.Series:
        # Always return a Series so .fillna is safe. The previous
        # `out.get(name, 0).fillna(0)` raised AttributeError when the column
        # was missing, because the default was a plain int.
        if name in out.columns:
            return out[name]
        return pd.Series(0.0, index=out.index, dtype='float64')

    score = pd.Series(0.0, index=out.index, dtype='float64')

    # Building-coverage and floor-area ratios: lower means more constrained.
    if 'kenpei' in out:
        score = score + (100 - out['kenpei'].clip(0, 100)) / 100

    if 'youseki' in out:
        score = score + (200 - out['youseki'].clip(0, 200)) / 200

    # Road frontage and setback.
    score = score + _column_or_zero('has_setback') * 1.0
    score = score + _column_or_zero('land_setback_log').fillna(0) * 0.3
    score = score + _column_or_zero('setback_ratio').fillna(0) * 2.0
    score = score + _column_or_zero('is_no_road') * 2.0

    # Private-road burden.
    if 'shidou_area_ratio' in out:
        score = score + out['shidou_area_ratio'].fillna(0).clip(0, 1)

    out['land_constraint_score'] = score

    return out

In [53]:
# add_land_constraint_score works row-wise on a single frame, so train and
# test are processed independently.
train_df_fe = add_land_constraint_score(train_df_fe)
test_df_fe = add_land_constraint_score(test_df_fe)

## premium 設備カウント

In [54]:
# Tag columns counted as premium in-unit equipment.
PREMIUM_EQUIP_COLS = [
    '専有部分設備_空調・暖房_床暖房',
    '専有部分設備_キッチン_食器洗い乾燥機',
    '専有部分設備_浴室・洗面_浴室乾燥機',
    '専有部分設備_収納_ウォークインクローゼット',
]

def add_premium_equipment_count(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``premium_equipment_count``.

    The value is the row-wise sum of whichever ``PREMIUM_EQUIP_COLS`` columns
    exist in *df* (0 for every row when none exist).
    """
    out = df.copy()
    present = list(filter(out.columns.__contains__, PREMIUM_EQUIP_COLS))
    out['premium_equipment_count'] = out.loc[:, present].sum(axis=1)
    return out

In [55]:
# add_premium_equipment_count works row-wise on a single frame, so train and
# test are processed independently.
train_df_fe = add_premium_equipment_count(train_df_fe)
test_df_fe = add_premium_equipment_count(test_df_fe)

## 地価関連の交互作用

In [56]:
def add_land_price_interactions(
    df: pd.DataFrame,
    land_log_cols: tuple[str, ...] = (
        'log_land_price',
        'log_weighted_land_price_3',
        'land_theoretical_price',
        'land_theoretical_price_weighted',
        'land_theoretical_price_within1km_mp',
    ),
    age_col: str = 'effective_age',
    livability_col: str = 'livability_score',
    urban_col: str = 'urban_score',
    structure_col: str = 'building_structure',
    taus: tuple[float, ...] = (30.0,),
    max_age_clip: float = 200.0,
    add_structure_interactions: bool = True,
    add_structure_group_cat: bool = True,
) -> pd.DataFrame:
    """Add land-price x building-attribute interaction columns to a copy of ``df``.

    ``building_structure`` is expected to hold numeric codes:
      1: wood, 2: block, 3: steel frame, 4: RC, 5: SRC, 6: PC, 7: HPC, 9: other,
      10: light-gauge steel, 11: ALC, 12: reinforced block, 13: CFT

    Columns written (in this order):
      - ``age_decay_{t}`` for every tau, where ``t = int(float(tau))``; the
        truncated ``t`` is also the decay constant in ``exp(-age / t)``
      - ``is_wood`` / ``is_steel`` / ``is_rcsrc`` / ``is_precast`` / ``is_block``
        / ``is_other_structure`` (int8), when structure interactions are enabled
        and the structure column converts to numeric
      - ``structure_group`` (string category, ``'missing'`` when no group
        matches), when ``add_structure_group_cat`` is also set
      - for every land column present: ``{land}_x_effective_age``,
        ``{land}_x_age_log1p``, ``{land}_x_age_decay_{t}``,
        ``{land}_x_{livability_col}``, ``{land}_x_{urban_col}`` and
        ``{land}_x_is_*`` for the six structure flags

    If ``func._as_numeric`` yields ``None`` for the age column, the copy is
    returned without any new column.

    NOTE(review): ``max_age_clip`` is accepted but never read by this function.
    """
    result = df.copy()

    # --- age ---
    age_values = func._as_numeric(result.get(age_col))
    if age_values is None:
        return result
    log_age = func._safe_log1p(age_values)

    decay_by_tau: dict[int, pd.Series] = {}
    for tau in taus:
        tau_key = int(float(tau))
        decay_by_tau[tau_key] = np.exp(-age_values / float(tau_key)).astype(float)
        result[f'age_decay_{tau_key}'] = decay_by_tau[tau_key]

    # --- other continuous inputs ---
    livability = func._as_numeric(result.get(livability_col))
    urbanity = func._as_numeric(result.get(urban_col))

    # --- structure flags derived from the numeric code ---
    structure = func._as_numeric(result.get(structure_col))
    use_structure = add_structure_interactions and structure is not None
    flag_names = (
        'is_wood',
        'is_steel',
        'is_rcsrc',
        'is_precast',
        'is_block',
        'is_other_structure',
    )
    if use_structure:
        # All masks are evaluated before the first column is written.
        masks = {
            'is_wood': structure == 1,
            # steel family: steel frame (3) + light-gauge steel (10) + CFT (13)
            'is_steel': structure.isin([3, 10, 13]),
            # RC / SRC
            'is_rcsrc': structure.isin([4, 5]),
            # precast family: PC (6) + HPC (7)
            'is_precast': structure.isin([6, 7]),
            # block family: block (2) + reinforced block (12)
            'is_block': structure.isin([2, 12]),
            # other (9) and ALC (11)
            'is_other_structure': structure.isin([9, 11]),
        }
        flags = {name: mask.astype('int8') for name, mask in masks.items()}
        for name in flag_names:
            result[name] = flags[name]

        if add_structure_group_cat:
            # Categorical label; later assignments take priority, although the
            # code sets above are disjoint so at most one flag is set per row.
            labels = pd.Series('missing', index=result.index, dtype='string')
            for name, label in (
                ('is_wood', 'wood'),
                ('is_rcsrc', 'rcsrc'),
                ('is_steel', 'steel'),
                ('is_precast', 'precast'),
                ('is_block', 'block'),
                ('is_other_structure', 'other'),
            ):
                labels = labels.mask(flags[name] == 1, label)
            result['structure_group'] = labels

    # --- multipliers applied to every land column, in output order ---
    multipliers = [('effective_age', age_values), ('age_log1p', log_age)]
    multipliers.extend(
        (f'age_decay_{tau_key}', decay) for tau_key, decay in decay_by_tau.items()
    )
    if livability is not None:
        multipliers.append((livability_col, livability))
    if urbanity is not None:
        multipliers.append((urban_col, urbanity))
    if use_structure:
        multipliers.extend((name, result[name]) for name in flag_names)

    # --- land columns ---
    land_cols = [col for col in land_log_cols if col in result.columns]
    for land_col in land_cols:
        land_values = func._as_numeric(result.get(land_col))
        if land_values is None:
            continue
        for suffix, factor in multipliers:
            result[f'{land_col}_x_{suffix}'] = land_values * factor

    return result

def add_land_x_area_interactions(
    df: pd.DataFrame,
    land_log_cols: tuple[str, ...] = (
        'land_theoretical_price',
        'land_theoretical_price_weighted',
        'land_theoretical_price_within1km_mp',
        'log_land_price',
        'log_weighted_land_price_3',
    ),
    senyu_area_log_col: str = 'senyu_area_log',
    kukaku_area_log_col: str = 'kukaku_area_log',
) -> pd.DataFrame:
    """Add "land price (log) x area (log)" products to a copy of ``df``.

    For every land column present in the frame this writes
    ``{land_col}_x_{senyu_area_log_col}`` and
    ``{land_col}_x_{kukaku_area_log_col}``; an area term is skipped when
    ``func._as_numeric`` returns ``None`` for it.
    """
    result = df.copy()

    area_terms = [
        (area_col, func._as_numeric(result.get(area_col)))
        for area_col in (senyu_area_log_col, kukaku_area_log_col)
    ]

    # Decide which land columns qualify before any product column is written.
    original_columns = set(result.columns)
    for land_col in land_log_cols:
        if land_col not in original_columns:
            continue
        land_values = func._as_numeric(result.get(land_col))
        if land_values is None:
            continue
        for area_col, area_values in area_terms:
            if area_values is not None:
                result[f'{land_col}_x_{area_col}'] = land_values * area_values

    return result


def add_land_interactions(
    df: pd.DataFrame,
    taus: tuple[float, ...] = (30.0,),
) -> pd.DataFrame:
    """Convenience wrapper applying the minimal set of land interactions.

    Delegates to ``add_land_price_interactions`` with structure flags and the
    structure-group category enabled, which covers:
    - land x age (linear, log1p and decay terms)
    - land x livability / urban score
    - land x structure flags

    The land x area step (``add_land_x_area_interactions``) is currently
    disabled and therefore not applied.
    """
    return add_land_price_interactions(
        df.copy(),
        taus=taus,
        add_structure_interactions=True,
        add_structure_group_cat=True,
    )


In [57]:
# Add the land interaction features (tau = 30) to both splits.
train_df_fe, test_df_fe = (
    add_land_interactions(frame, taus=(30.0,)) for frame in (train_df_fe, test_df_fe)
)

## 駅力スコアの交互作用

In [58]:
# def add_station_power_x_land(df: pd.DataFrame) -> pd.DataFrame:
#     out = df.copy()

#     land_cols = [
#         'log_weighted_land_price_3',
#         'land_theoretical_price_weighted',
#     ]

#     for lc in land_cols:
#         if lc in out.columns:
#             out[f'station_power_max3_x_{lc}'] = (
#                 out['station_power_max3'] * out[lc]
#             )
#             out[f'station_power_sum3_x_{lc}'] = (
#                 out['station_power_sum3'] * out[lc]
#             )

#     return out

# def add_station_power_x_age(df: pd.DataFrame) -> pd.DataFrame:
#     out = df.copy()

#     if 'age_decay_30' in out.columns:
#         out['station_power_max3_x_age_decay_30'] = (
#             out['station_power_max3'] * out['age_decay_30']
#         )

#     return out

# def add_station_power_x_livability(df: pd.DataFrame) -> pd.DataFrame:
#     out = df.copy()

#     if 'livability_score' in out.columns:
#         out['station_power_max3_x_livability'] = (
#             out['station_power_max3'] * out['livability_score']
#         )

#     return out

# def add_station_power_x_structure(df: pd.DataFrame) -> pd.DataFrame:
#     out = df.copy()

#     for g in ['is_wood', 'is_rcsrc']:
#         if g in out.columns:
#             out[f'station_power_max3_x_{g}'] = (
#                 out['station_power_max3'] * out[g]
#             )

#     return out


In [59]:
# train_df_fe = add_station_power_x_land(train_df_fe)
# test_df_fe = add_station_power_x_land(test_df_fe)

# train_df_fe = add_station_power_x_age(train_df_fe)
# test_df_fe = add_station_power_x_age(test_df_fe)

# train_df_fe = add_station_power_x_livability(train_df_fe)
# test_df_fe = add_station_power_x_livability(test_df_fe)

# train_df_fe = add_station_power_x_structure(train_df_fe)
# test_df_fe = add_station_power_x_structure(test_df_fe)

## タグ情報の集約

In [60]:
def add_price_improving_features_from_tags(
    df: pd.DataFrame,
    *,
    keep_original_cols: bool = True,
) -> pd.DataFrame:
    """
    Build aggregated features from tag columns (expected to be 0/1), grouped
    along the way prices are formed.

    Goals:
      - Make "reasons for a discount" explicit to curb over-prediction
      - Compress tags into per-meaning scores instead of exploding the column count
      - Run safely using only the columns that exist

    Assumptions:
      - Tag columns are basically 0/1 (or True/False)
      - df may be train or test (per the original note, missing tags count as 0;
        this depends on the helpers in ``func`` — TODO confirm)

    Args:
        df: Input frame; the function works on a copy.
        keep_original_cols: When False, every tag column that was found and
            used for an aggregate is dropped from the result.

    Returns:
        The copy with the aggregated columns added.

    NOTE(review): ``func._existing_cols``, ``func._sum_cols`` and
    ``func._any_cols`` are defined outside this file; from their names they
    presumably filter to existing columns, sum row-wise and test "any" row-wise.
    """
    out = df.copy()

    # ----------------------------
    # 1) Strong discount signals (individual columns are kept)
    # ----------------------------
    hard_penalty_cols = func._existing_cols(
        out,
        [
            '専有部分設備_トイレ_トイレなし',
            '専有部分設備_浴室・洗面_バスなし',
            '専有部分設備_トイレ_共同トイレ',
            '専有部分設備_浴室・洗面_共同バス',
            '建物設備（給排水・インフラ）_汲取',
            '建物設備（給排水・インフラ）_浄化槽',
        ],
    )
    # The individual columns may be dropped when keep_original_cols=False,
    # so an aggregated flag is created as well.
    out['hard_penalty_any'] = func._any_cols(out, hard_penalty_cols, 'hard_penalty_any')

    # ----------------------------
    # 2) Land tags: shape / road-frontage premium and penalty
    # ----------------------------
    land_premium_cols = func._existing_cols(
        out,
        [
            '土地価格_角地',
            '土地価格_南道路',
            '土地価格_整形地',
            '土地価格_低層住宅地',
        ],
    )
    land_penalty_cols = func._existing_cols(out, ['土地価格_敷地延長・変形地'])

    out['tag_land_premium_score'] = func._sum_cols(out, land_premium_cols, 'tag_land_premium_score')
    out['tag_land_penalty_flag'] = func._any_cols(out, land_penalty_cols, 'tag_land_penalty_flag')

    # ----------------------------
    # 3) Kitchen grade (tiered)
    # ----------------------------
    kitchen_base_cols = func._existing_cols(
        out,
        [
            '専有部分設備_キッチン_システムキッチン',
            '専有部分設備_キッチン_カウンターキッチン',
            '専有部分設備_キッチン_食器洗い乾燥機',
            '専有部分設備_キッチン_ディスポーザー',
            '専有部分設備_キッチン_浄水器・活水器',
            '専有部分設備_キッチン_冷蔵庫あり',
            '専有部分設備_キッチン_IHコンロ',
            '専有部分設備_キッチン_ガスコンロ',
            '専有部分設備_キッチン_電気コンロ',
            '専有部分設備_キッチン_給湯',
        ],
    )
    # Burner count is ordinal (only existing columns are evaluated).
    # One burner maps to 0, so it scores the same as "no burner tag".
    burner_map = {
        '専有部分設備_キッチン_コンロ一口': 0,
        '専有部分設備_キッチン_コンロ二口': 1,
        '専有部分設備_キッチン_コンロ三口': 2,
        '専有部分設備_キッチン_コンロ四口以上': 3,
    }
    burner_cols = func._existing_cols(out, burner_map.keys())

    out['kitchen_upgrade_count'] = func._sum_cols(out, kitchen_base_cols, 'kitchen_upgrade_count')

    if burner_cols:
        # Take the ordinal value of whichever burner tag is set; if several are
        # set (not normally expected) the largest value wins.
        burner_score = np.zeros(len(out), dtype=np.float32)
        for c in burner_cols:
            burner_score = np.maximum(burner_score, out[c].fillna(0).astype('float32') * burner_map[c])
        out['kitchen_burner_score'] = burner_score
    else:
        out['kitchen_burner_score'] = 0.0

    # Overall kitchen score (unweighted sum; adjust later if a stronger weight is wanted).
    out['kitchen_grade_score'] = (
        out['kitchen_upgrade_count'].astype('float32') + out['kitchen_burner_score'].astype('float32')
    )

    # Low-grade signal (intended to curb over-prediction under MAPE).
    out['kitchen_low_flag'] = func._any_cols(out, func._existing_cols(out, ['専有部分設備_キッチン_コンロ一口']), 'kitchen_low_flag')
    out['kitchen_high_flag'] = (
        (out['kitchen_grade_score'].astype('float32') >= 3.0).astype('int8')
    )

    # ----------------------------
    # 4) Wet-area (bathroom / washroom) grade
    # ----------------------------
    wet_positive_cols = func._existing_cols(
        out,
        [
            '専有部分設備_浴室・洗面_追焚機能',
            '専有部分設備_浴室・洗面_浴室乾燥機',
            '専有部分設備_浴室・洗面_浴室暖房',
            '専有部分設備_浴室・洗面_オートバス',
            '専有部分設備_浴室・洗面_洗面所独立',
            '専有部分設備_浴室・洗面_シャワー付洗面化粧台',
            '専有部分設備_浴室・洗面_浴室TV',
            '専有部分設備_浴室・洗面_高温差湯式',
            '専有部分設備_浴室・洗面_浴室1.6×1.8M以上',
            '専有部分設備_浴室・洗面_浴室1.6×2.0M以上',
        ],
    )
    wet_separation_cols = func._existing_cols(out, ['専有部分設備_浴室・洗面_バス・トイレ別'])
    # "No bath" / "shared bath" are already covered by the hard-penalty columns.

    out['wet_area_upgrade_count'] = func._sum_cols(out, wet_positive_cols, 'wet_area_upgrade_count')
    out['wet_area_separation_flag'] = func._any_cols(out, wet_separation_cols, 'wet_area_separation_flag')
    out['wet_area_grade_score'] = (
        out['wet_area_upgrade_count'].astype('float32') + 1.0 * out['wet_area_separation_flag'].astype('float32')
    )

    # ----------------------------
    # 5) Toilet grade (e.g. bidet seat)
    # ----------------------------
    toilet_positive_cols = func._existing_cols(out, ['専有部分設備_トイレ_温水洗浄便座', '専有部分設備_トイレ_専用トイレ'])
    out['toilet_upgrade_count'] = func._sum_cols(out, toilet_positive_cols, 'toilet_upgrade_count')

    # ----------------------------
    # 6) Storage score
    # ----------------------------
    storage_cols = func._existing_cols(
        out,
        [
            '専有部分設備_収納_ウォークインクローゼット',
            '専有部分設備_収納_シューズインクローゼット',
            '専有部分設備_収納_シューズクローク',
            '専有部分設備_収納_パントリー',
            '専有部分設備_収納_トランクルーム',
            '専有部分設備_収納_全居室収納',
            '専有部分設備_収納_床下収納',
            '専有部分設備_収納_クローゼット',
            '専有部分設備_収納_シューズボックス',
        ],
    )
    out['storage_score'] = func._sum_cols(out, storage_cols, 'storage_score')
    out['storage_high_flag'] = (out['storage_score'].astype('float32') >= 3.0).astype('int8')

    # ----------------------------
    # 7) Air conditioning / heating (floor heating is weighted more)
    # ----------------------------
    hvac_cols = func._existing_cols(
        out,
        [
            '専有部分設備_空調・暖房_エアコン',
            '専有部分設備_空調・暖房_冷房',
            '専有部分設備_空調・暖房_ガス暖房',
            '専有部分設備_空調・暖房_石油暖房',
            '専有部分設備_空調・暖房_床暖房',
        ],
    )
    out['hvac_count'] = func._sum_cols(out, hvac_cols, 'hvac_count')
    # Floor heating is part of hvac_cols and is added once more here, so it
    # effectively counts double in hvac_grade_score.
    if '専有部分設備_空調・暖房_床暖房' in out.columns:
        out['hvac_grade_score'] = out['hvac_count'].astype('float32') + out['専有部分設備_空調・暖房_床暖房'].fillna(0).astype('float32')
    else:
        out['hvac_grade_score'] = out['hvac_count'].astype('float32')

    # ----------------------------
    # 8) Communication / internet (aggregated)
    # ----------------------------
    net_cols = func._existing_cols(
        out,
        [
            '専有部分設備_通信_インターネット対応',
            '専有部分設備_通信_光ファイバー',
            '専有部分設備_通信_高速インターネット',
            '専有部分設備_通信_インターネット使用料無料',
            '専有部分設備_通信_CATV',
            '専有部分設備_通信_CATV利用料無料',
            '専有部分設備_通信_BSアンテナ',
            '専有部分設備_通信_CSアンテナ',
            '専有部分設備_通信_有線放送',
        ],
    )
    out['net_ready_score'] = func._sum_cols(out, net_cols, 'net_ready_score')

    # ----------------------------
    # 9) Security / shared facilities (condominium-oriented items, kept compressed)
    # ----------------------------
    security_cols = func._existing_cols(
        out,
        [
            '建物構造・性能_オートロック',
            '建物構造・性能_防犯カメラ',
            '建物構造・性能_TVモニタ付インターホン',
            '建物構造・性能_セキュリティ会社加入済み',
            '建物構造・性能_セキュリティー充実',
            '建物構造・性能_24時間有人管理',
            '建物構造・性能_管理人常駐',
        ],
    )
    service_cols = func._existing_cols(
        out,
        [
            '建物構造・性能_コンシェルジュサービス',
            '建物構造・性能_フロントサービス',
            '建物構造・性能_内廊下',
            '建物構造・性能_宅配ボックス',
            '建物構造・性能_ごみ出し24時間OK',
            '建物構造・性能_キッズルーム',
            '建物構造・性能_ゲストルーム',
        ],
    )
    out['security_score'] = func._sum_cols(out, security_cols, 'security_score')
    out['shared_service_score'] = func._sum_cols(out, service_cols, 'shared_service_score')

    # "Luxury condominium" signal (a single flag, since it tends to be noise for houses).
    luxury_cols = func._existing_cols(
        out,
        [
            '建物構造・性能_タワーマンション',
            '建物構造・性能_ハイグレードマンション',
            '建物構造・性能_リゾートマンション',
            '建物構造・性能_内廊下',
            '建物構造・性能_コンシェルジュサービス',
        ],
    )
    out['luxury_mansion_flag'] = func._any_cols(out, luxury_cols, 'luxury_mansion_flag')

    # ----------------------------
    # 10) Infrastructure (discount / modern)
    # ----------------------------
    infra_penalty_cols = func._existing_cols(
        out,
        [
            '建物設備（給排水・インフラ）_汲取',
            '建物設備（給排水・インフラ）_浄化槽',
            '建物設備（給排水・インフラ）_井戸',
            '建物設備（給排水・インフラ）_プロパンガス',
            '建物設備（給排水・インフラ）_ガスその他',
            '建物設備（給排水・インフラ）_排水その他',
            '建物設備（給排水・インフラ）_水道その他',
        ],
    )
    infra_modern_cols = func._existing_cols(
        out,
        [
            '建物設備（給排水・インフラ）_都市ガス',
            '建物設備（給排水・インフラ）_下水',
            '建物設備（給排水・インフラ）_公営水道',
            '建物設備（給排水・インフラ）_オール電化',
            '建物設備（給排水・インフラ）_太陽光発電システム',
            '建物設備（給排水・インフラ）_家庭用燃料電池',
        ],
    )
    out['infra_penalty_score'] = func._sum_cols(out, infra_penalty_cols, 'infra_penalty_score')
    out['infra_modern_score'] = func._sum_cols(out, infra_modern_cols, 'infra_modern_score')

    # ----------------------------
    # 11) Environment premium (overlaps with livability_score, so kept compressed)
    # ----------------------------
    env400_cols = func._existing_cols(
        out,
        [
            '環境プレミアム_コンビニ 400ｍ以内',
            '環境プレミアム_保育園・幼稚園 400m以内',
            '環境プレミアム_公園 400ｍ以内',
        ],
    )
    env800_cols = func._existing_cols(
        out,
        [
            '環境プレミアム_コンビニ 800ｍ以内',
            '環境プレミアム_スーパー 800ｍ以内',
            '環境プレミアム_フィットネス施設（プール含む）800m以内',
            '環境プレミアム_中学校 800m以内',
            '環境プレミアム_小学校 800ｍ以内',
            '環境プレミアム_総合病院 800ｍ以内',
            '環境プレミアム_子育てに嬉しい環境',
        ],
    )
    out['env_premium_count_400m'] = func._sum_cols(out, env400_cols, 'env_premium_count_400m')
    out['env_premium_count_800m'] = func._sum_cols(out, env800_cols, 'env_premium_count_800m')
    # Amenities within 400 m weigh twice as much as those within 800 m.
    out['env_premium_weighted'] = (
        2.0 * out['env_premium_count_400m'].astype('float32') + 1.0 * out['env_premium_count_800m'].astype('float32')
    )

    # ----------------------------
    # 12) Certificates / performance evaluations / records (proxy for assured quality)
    # ----------------------------
    cert_cols = func._existing_cols(
        out,
        [
            '用途・投資セグメント_不動産の証明書・性能評価_インスペクション（建物検査）報告書',
            '用途・投資セグメント_不動産の証明書・性能評価_住宅性能保証制度証明書',
            '用途・投資セグメント_不動産の証明書・性能評価_修繕・点検の記録',
            '用途・投資セグメント_不動産の証明書・性能評価_地盤調査済',
            '用途・投資セグメント_不動産の証明書・性能評価_建築確認完了検査済証',
            '用途・投資セグメント_不動産の証明書・性能評価_建設住宅性能評価書（新築時）',
            '用途・投資セグメント_不動産の証明書・性能評価_建設住宅性能評価書（既存住宅）',
            '用途・投資セグメント_不動産の証明書・性能評価_新築時・増改築時の設計図書',
            '用途・投資セグメント_不動産の証明書・性能評価_瑕疵保証（不動産会社独自）付',
            '用途・投資セグメント_不動産の証明書・性能評価_瑕疵保険（国交省指定）による保証付',
            '用途・投資セグメント_不動産の証明書・性能評価_瑕疵保険（国交省指定）による保証利用可',
            '用途・投資セグメント_不動産の証明書・性能評価_耐震基準適合証明書',
            '用途・投資セグメント_不動産の証明書・性能評価_設計住宅性能評価書',
            '用途・投資セグメント_不動産の証明書・性能評価_長期優良住宅認定通知書',
        ],
    )
    out['cert_score'] = func._sum_cols(out, cert_cols, 'cert_score')
    # Presence of a "strong" certificate (meant to reduce large misses).
    cert_strong_cols = func._existing_cols(
        out,
        [
            '用途・投資セグメント_不動産の証明書・性能評価_建築確認完了検査済証',
            '用途・投資セグメント_不動産の証明書・性能評価_地盤調査済',
            '用途・投資セグメント_不動産の証明書・性能評価_建設住宅性能評価書（既存住宅）',
            '用途・投資セグメント_不動産の証明書・性能評価_建設住宅性能評価書（新築時）',
        ],
    )
    out['cert_strong_flag'] = func._any_cols(out, cert_strong_cols, 'cert_strong_flag')

    # ----------------------------
    # 13) Sale status / investment segment
    # ----------------------------
    out['is_rented_full_flag'] = func._any_cols(
        out,
        func._existing_cols(out, ['用途・投資セグメント_売買ステータス_満室賃貸中']),
        'is_rented_full_flag',
    )

    # ----------------------------
    # 14) Renovation (reform_* columns are kept as counts)
    # ----------------------------
    reform_interior_cols = func._existing_cols(out, [f'reform_interior {i}' for i in range(1, 7)])
    # The exterior list includes a variant of 'reform_exterior 1' with a trailing space.
    reform_exterior_cols = func._existing_cols(out, [c for c in ['reform_exterior 1', 'reform_exterior 1 ', 'reform_exterior 2'] if c in out.columns])
    reform_wet_cols = func._existing_cols(out, [f'reform_wet_area {i}' for i in range(1, 7)])

    out['reform_interior_cnt'] = func._sum_cols(out, reform_interior_cols, 'reform_interior_cnt')
    out['reform_exterior_cnt'] = func._sum_cols(out, reform_exterior_cols, 'reform_exterior_cnt')
    out['reform_wet_cnt'] = func._sum_cols(out, reform_wet_cols, 'reform_wet_cnt')
    out['reform_total_cnt'] = (
        out['reform_interior_cnt'].astype('float32')
        + out['reform_exterior_cnt'].astype('float32')
        + out['reform_wet_cnt'].astype('float32')
    )
    out['reform_any_flag'] = (out['reform_total_cnt'].astype('float32') > 0).astype('int8')
    out['reform_wet_heavy_flag'] = (out['reform_wet_cnt'].astype('float32') >= 2).astype('int8')

    # ----------------------------
    # 15) Final: overall equipment (premium / semi-premium / discount tiers)
    # ----------------------------
    # Works whether or not premium_equipment_count was already created upstream.
    if 'premium_equipment_count' not in out.columns:
        premium_cols = func._existing_cols(
            out,
            [
                '専有部分設備_空調・暖房_床暖房',
                '専有部分設備_キッチン_食器洗い乾燥機',
                '専有部分設備_浴室・洗面_浴室乾燥機',
                '専有部分設備_収納_ウォークインクローゼット',
            ],
        )
        out['premium_equipment_count'] = func._sum_cols(out, premium_cols, 'premium_equipment_count')

    # Semi-premium: not luxury-level, but items that tend to differentiate.
    semi_premium_cols = func._existing_cols(
        out,
        [
            '専有部分設備_浴室・洗面_追焚機能',
            '専有部分設備_浴室・洗面_洗面所独立',
            '専有部分設備_キッチン_システムキッチン',
            '建物構造・性能_宅配ボックス',
            '建物構造・性能_オートロック',
        ],
    )
    out['semi_premium_count'] = func._sum_cols(out, semi_premium_cols, 'semi_premium_count')

    # Combine the discount signals (to curb over-prediction).
    out['discount_pressure_score'] = (
        2.0 * out['hard_penalty_any'].astype('float32')
        + 1.0 * out['infra_penalty_score'].astype('float32')
        + 1.0 * out['tag_land_penalty_flag'].astype('float32')
    )

    # ----------------------------
    # 16) Optionally drop the original tag columns
    # ----------------------------
    if not keep_original_cols:
        # NOTE(review): the list concatenation assumes func._existing_cols
        # returns lists — confirm.
        drop_cols = set(
            hard_penalty_cols
            + land_premium_cols
            + land_penalty_cols
            + kitchen_base_cols
            + burner_cols
            + wet_positive_cols
            + wet_separation_cols
            + toilet_positive_cols
            + storage_cols
            + hvac_cols
            + net_cols
            + security_cols
            + service_cols
            + luxury_cols
            + infra_penalty_cols
            + infra_modern_cols
            + env400_cols
            + env800_cols
            + cert_cols
            + cert_strong_cols
            + reform_interior_cols
            + reform_exterior_cols
            + reform_wet_cols
            + semi_premium_cols
        )
        out = out.drop(columns=[c for c in drop_cols if c in out.columns])

    return out


In [61]:
# Add the aggregated tag features to both splits, keeping the raw tag columns.
train_df_fe, test_df_fe = (
    add_price_improving_features_from_tags(frame, keep_original_cols=True)
    for frame in (train_df_fe, test_df_fe)
)

## 面積・地価の共通特徴量の作成

In [62]:
def make_effective_area_log(df: pd.DataFrame) -> np.ndarray:
    """Compute a per-row "effective" log-area that depends on ``property_group``.

    - ``'residential'``: ``senyu_area_log``
    - ``'house'``: ``0.7 * nobeyuka_area_log + 0.3 * tochi_area_log``; when that
      blend is NaN, ``nobeyuka_area_log`` is used if present, else
      ``tochi_area_log``
    - ``'other'``: ``nobeyuka_area_log``, falling back to ``tochi_area_log``
    - any other or missing group: NaN

    Missing values in nullable dtypes (``<NA>``) are treated like NaN, and a
    missing group label matches no group.

    Args:
        df: Frame containing ``property_group``, ``senyu_area_log``,
            ``nobeyuka_area_log`` and ``tochi_area_log``.

    Returns:
        A float64 NumPy array of length ``len(df)``, in row order.
    """
    group = df['property_group']

    def _float_column(name: str) -> np.ndarray:
        """Column as float64 with every missing marker mapped to NaN."""
        return df[name].to_numpy(dtype='float64', na_value=np.nan)

    def _group_mask(label: str) -> np.ndarray:
        """Plain boolean mask; a missing group label counts as "no match"."""
        return (group == label).to_numpy(dtype=bool, na_value=False)

    senyu_log = _float_column('senyu_area_log')
    floor_log = _float_column('nobeyuka_area_log')
    land_log = _float_column('tochi_area_log')

    # Floor area first, land area when the floor area is missing.
    floor_or_land = np.where(np.isnan(floor_log), land_log, floor_log)

    # house: weighted blend of building floor area and land area, with a
    # fallback to whichever single value is available.
    blended = 0.7 * floor_log + 0.3 * land_log
    house_value = np.where(np.isnan(blended), floor_or_land, blended)

    return np.select(
        [_group_mask('residential'), _group_mask('house'), _group_mask('other')],
        [senyu_log, house_value, floor_or_land],
        default=np.nan,
    )

In [63]:
# Store the group-dependent effective log-area on both splits.
train_df_fe['effective_area_log'], test_df_fe['effective_area_log'] = (
    make_effective_area_log(frame) for frame in (train_df_fe, test_df_fe)
)

In [64]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import QuantileTransformer


def add_effective_land_price_with_group_calibration(
    train_df_fe: pd.DataFrame,
    test_df_fe: pd.DataFrame,
    *,
    group_col: str = 'property_group',
    groups_to_fit: tuple[str, ...] = ('house', 'other'),
    use_quantile_for_urban: bool = True,
    urban_clip: tuple[float, float] | None = (-5.0, 5.0),
    ridge_alpha: float = 1.0,
    random_state: int = 42,
    area_clip: tuple[float, float] = (0.0, 500.0),
    clip_calibrated: bool = True,
    clip_quantiles: tuple[float, float] = (0.001, 0.999),
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Add a calibrated proxy land price and ``effective_land_price`` to both splits.

    Works on copies of the inputs. All statistics (scaler, fill values, Ridge
    models, clip bounds) are derived from the train split and applied to test.

    Steps:
      0. Availability flags: ``has_land_theoretical_price``, ``has_eki2``,
         ``has_density``.
      1. ``urban_score_scaled``: optional clip, then a QuantileTransformer
         (uniform output) or min-max scaling of ``urban_score``.
      2. ``area_for_proxy`` (+ ``has_area_for_proxy``, ``area_for_proxy_filled``):
         an area value chosen through group-specific fallbacks.
      3. A 3-column design matrix: ``log1p(station_power_sum3)``,
         ``urban_score_scaled``, ``log1p(area_for_proxy_filled)``.
      4. Ridge regression of ``land_theoretical_price_weighted`` on that matrix:
         one overall model plus one per group in ``groups_to_fit``; the result
         is stored as ``proxy_land_price_calibrated`` (optionally clipped to
         train-target quantiles).
      5. ``effective_land_price``: the observed ``land_theoretical_price_weighted``
         where available, otherwise the calibrated proxy.

    Args:
        train_df_fe: Training frame.
        test_df_fe: Test frame.
        group_col: Column holding the property group label.
        groups_to_fit: Groups that get their own Ridge model.
        use_quantile_for_urban: Use quantile scaling (True) or min-max (False).
        urban_clip: Optional (low, high) clip applied to ``urban_score`` first.
        ridge_alpha: Ridge regularisation strength.
        random_state: Seed passed to the scaler and the Ridge models.
        area_clip: (low, high) clip applied to ``area_for_proxy``.
        clip_calibrated: Whether to clip the calibrated proxy.
        clip_quantiles: Train-target quantiles used as clip bounds.

    Returns:
        ``(train, test, info)`` where ``info`` records the settings, fitted
        coefficients (``fit_stats``) and clip bounds.

    Raises:
        ValueError: If the overall model cannot be fitted (fewer than 200
            usable training rows).
    """
    tr = train_df_fe.copy()
    te = test_df_fe.copy()

    # =========
    # 0) flags
    # =========
    for df in [tr, te]:
        df['has_land_theoretical_price'] = (~df['land_theoretical_price_weighted'].isna()).astype('int8')
        df['has_eki2'] = (~df['eki_name2'].isna()).astype('int8')
        df['has_density'] = (~df['unit_land_density'].isna()).astype('int8')

    # ============================
    # 1) scaling of urban_score
    # ============================
    tr_u = tr['urban_score'].astype('float64')
    te_u = te['urban_score'].astype('float64')

    if urban_clip is not None:
        lo, hi = urban_clip
        tr_u = tr_u.clip(lo, hi)
        te_u = te_u.clip(lo, hi)

    if use_quantile_for_urban:
        qt = QuantileTransformer(
            n_quantiles=min(2000, max(10, int(tr_u.notna().sum()))),
            output_distribution='uniform',
            subsample=int(1e9),
            random_state=random_state,
        )
        # Missing scores are imputed with the train median before transforming.
        fill_val = tr_u.median()
        tr_u_2d = tr_u.fillna(fill_val).to_numpy().reshape(-1, 1)
        te_u_2d = te_u.fillna(fill_val).to_numpy().reshape(-1, 1)

        tr['urban_score_scaled'] = qt.fit_transform(tr_u_2d).ravel()
        te['urban_score_scaled'] = qt.transform(te_u_2d).ravel()
    else:
        # Min-max scaling with train bounds; missing values map to 0.5.
        u_min = np.nanmin(tr_u.to_numpy())
        u_max = np.nanmax(tr_u.to_numpy())
        denom = (u_max - u_min) if (u_max - u_min) != 0 else 1.0
        tr['urban_score_scaled'] = ((tr_u - u_min) / denom).fillna(0.5).clip(0.0, 1.0)
        te['urban_score_scaled'] = ((te_u - u_min) / denom).fillna(0.5).clip(0.0, 1.0)

    # ==================================
    # 2) area used by the proxy, area_for_proxy (property_group aware)
    # ==================================
    def build_area_for_proxy(df: pd.DataFrame) -> pd.Series:
        # Start from area_per_room for every group; the fallbacks below only
        # fill rows that are still missing.
        area = df.get('area_per_room')
        area = area.astype('float64') if area is not None else pd.Series(np.nan, index=df.index, dtype='float64')

        if group_col in df.columns:
            g = df[group_col].astype('string')
            is_res = g.eq('residential')
            is_house = g.eq('house')
            is_other = g.eq('other')
        else:
            is_res = pd.Series(False, index=df.index)
            is_house = pd.Series(False, index=df.index)
            is_other = pd.Series(False, index=df.index)

        # residential: area_per_room -> fall back to senyu_area (exclusive area)
        if 'senyu_area' in df.columns:
            area = area.where(~(is_res & area.isna()), df['senyu_area'].astype('float64'))

        # house: land-area columns -> house_area -> nobeyuka_area / total_floor_area
        for c in ['shikichi_area', 'tochi_area', 'kukaku_area', 'house_area', 'nobeyuka_area', 'total_floor_area']:
            if c in df.columns:
                area = area.where(~(is_house & area.isna()), df[c].astype('float64'))

        # other: floor/building area -> land-area columns (fallback robust to mixed data)
        for c in ['nobeyuka_area', 'total_floor_area', 'house_area', 'shikichi_area', 'tochi_area', 'kukaku_area']:
            if c in df.columns:
                area = area.where(~(is_other & area.isna()), df[c].astype('float64'))

        lo, hi = area_clip
        area = area.clip(lower=lo, upper=hi)
        return area

    tr['area_for_proxy'] = build_area_for_proxy(tr)
    te['area_for_proxy'] = build_area_for_proxy(te)

    tr['has_area_for_proxy'] = (~tr['area_for_proxy'].isna()).astype('int8')
    te['has_area_for_proxy'] = (~te['area_for_proxy'].isna()).astype('int8')

    # Remaining gaps are filled with the train median.
    area_fill = tr['area_for_proxy'].median()
    tr['area_for_proxy_filled'] = tr['area_for_proxy'].fillna(area_fill)
    te['area_for_proxy_filled'] = te['area_for_proxy'].fillna(area_fill)

    # ==================================
    # 3) build the calibration X (3 columns)
    # ==================================
    def build_calib_X(df: pd.DataFrame) -> np.ndarray:
        # Non-finite inputs are replaced by neutral defaults so X is always finite.
        sps = df['station_power_sum3'].astype('float64').to_numpy()
        sps = np.where(np.isfinite(sps), sps, 0.0)

        ups = df['urban_score_scaled'].astype('float64').to_numpy()
        ups = np.where(np.isfinite(ups), ups, 0.5)

        area = df['area_for_proxy_filled'].astype('float64').to_numpy()
        area = np.where(np.isfinite(area), area, 0.0)

        x1 = np.log1p(np.clip(sps, 0.0, None))
        x2 = ups
        x3 = np.log1p(np.clip(area, 0.0, None))
        return np.vstack([x1, x2, x3]).T

    X_tr_all = build_calib_X(tr)
    X_te_all = build_calib_X(te)

    # ==================================
    # 4) per-group Ridge calibration (property_group)
    # ==================================
    tr_pred = np.full(len(tr), np.nan, dtype='float64')
    te_pred = np.full(len(te), np.nan, dtype='float64')

    models = {}
    fit_stats = {}

    y_all = tr['land_theoretical_price_weighted'].to_numpy()
    m_y = np.isfinite(y_all)

    def fit_one(mask_fit: np.ndarray, key: str) -> Ridge | None:
        # Fit on the masked train rows; returns None (no model) when fewer
        # than 200 usable rows remain. On success the coefficients are
        # recorded in fit_stats[key].
        if int(mask_fit.sum()) < 200:
            return None

        X = X_tr_all[mask_fit]
        y = y_all[mask_fit]
        ok = np.all(np.isfinite(X), axis=1) & np.isfinite(y)
        X = X[ok]
        y = y[ok]

        if len(y) < 200:
            return None

        model = Ridge(alpha=ridge_alpha, random_state=random_state)
        model.fit(X, y)

        fit_stats[key] = {
            'n_fit': int(len(y)),
            'coef': model.coef_.astype('float64').tolist(),
            'intercept': float(model.intercept_),
        }
        return model

    # Overall model (used as the fallback)
    model_all = fit_one(m_y, 'all')
    if model_all is None:
        raise ValueError('全体校正モデルの学習に失敗しました（有効行が少ない/特徴量が壊れている可能性）。')
    models['all'] = model_all

    if group_col in tr.columns and group_col in te.columns:
        for g in groups_to_fit:
            mask_g = tr[group_col].astype('string').to_numpy() == g
            mask_fit = mask_g & m_y
            model_g = fit_one(mask_fit, g)

            if model_g is None:
                # Too few rows for this group: reuse the overall model.
                models[g] = model_all
                fit_stats[g] = {
                    'fallback_to': 'all',
                    'n_fit': int(mask_fit.sum()),
                }
            else:
                models[g] = model_g

        g_tr = tr[group_col].astype('string').to_numpy()
        g_te = te[group_col].astype('string').to_numpy()

        # Rows belonging to groups_to_fit are predicted with their own model
        for g, model in models.items():
            if g == 'all':
                continue

            idx_tr = np.where(g_tr == g)[0]
            if len(idx_tr) > 0:
                tr_pred[idx_tr] = model.predict(X_tr_all[idx_tr])

            idx_te = np.where(g_te == g)[0]
            if len(idx_te) > 0:
                te_pred[idx_te] = model.predict(X_te_all[idx_te])

        # Everything else (e.g. residential) is filled by the overall model
        idx_tr_nan = np.where(~np.isfinite(tr_pred))[0]
        if len(idx_tr_nan) > 0:
            tr_pred[idx_tr_nan] = model_all.predict(X_tr_all[idx_tr_nan])

        idx_te_nan = np.where(~np.isfinite(te_pred))[0]
        if len(idx_te_nan) > 0:
            te_pred[idx_te_nan] = model_all.predict(X_te_all[idx_te_nan])
    else:
        tr_pred = model_all.predict(X_tr_all)
        te_pred = model_all.predict(X_te_all)

    tr['proxy_land_price_calibrated'] = tr_pred
    te['proxy_land_price_calibrated'] = te_pred

    # =========================================
    # 4.5) limit extrapolation: clip the calibrated values (optional, recommended)
    # =========================================
    clip_lo = None
    clip_hi = None
    if clip_calibrated:
        # Bounds come from the observed (finite) train target.
        q_lo, q_hi = clip_quantiles
        y_fit = tr.loc[m_y, 'land_theoretical_price_weighted'].to_numpy()
        clip_lo = float(np.quantile(y_fit, q_lo))
        clip_hi = float(np.quantile(y_fit, q_hi))

        tr['proxy_land_price_calibrated'] = tr['proxy_land_price_calibrated'].clip(clip_lo, clip_hi)
        te['proxy_land_price_calibrated'] = te['proxy_land_price_calibrated'].clip(clip_lo, clip_hi)

    # =========================================
    # 5) build effective_land_price (switch between observed and proxy)
    # =========================================
    for df in [tr, te]:
        df['effective_land_price'] = np.where(
            df['has_land_theoretical_price'].to_numpy() == 1,
            df['land_theoretical_price_weighted'].to_numpy(),
            df['proxy_land_price_calibrated'].to_numpy(),
        ).astype('float64')

    info = {
        'urban_scaler': 'quantile' if use_quantile_for_urban else 'minmax',
        'urban_clip': urban_clip,
        'ridge_alpha': ridge_alpha,
        'area_clip': area_clip,
        'area_fill': float(area_fill) if np.isfinite(area_fill) else None,
        'groups_to_fit': list(groups_to_fit),
        'fit_stats': fit_stats,
        'clip_calibrated': clip_calibrated,
        'clip_quantiles': clip_quantiles,
        'clip_lo': clip_lo,
        'clip_hi': clip_hi,
    }
    return tr, te, info


In [65]:
# Fit the land-price calibration on train and apply it to both splits.
train_df_fe, test_df_fe, cal_info = add_effective_land_price_with_group_calibration(
    train_df_fe, test_df_fe,
    group_col='property_group', groups_to_fit=('house', 'other'),
    use_quantile_for_urban=True, urban_clip=(-5.0, 5.0),
    ridge_alpha=1.0,
)

# Inspect the fitted coefficients and the resulting price distributions.
print(cal_info['fit_stats'])
print(
    train_df_fe.loc[
        :,
        ['effective_land_price', 'proxy_land_price_calibrated', 'land_theoretical_price_weighted'],
    ].describe()
)


{'all': {'n_fit': 165577, 'coef': [0.5629432991025102, 0.8627245482313626, 0.2807017472547172], 'intercept': 14.026545512863272}, 'house': {'n_fit': 153455, 'coef': [0.5017758790242636, 0.6778290018119314, 0.5156742585140934], 'intercept': 13.452951307726979}, 'other': {'n_fit': 12122, 'coef': [1.0863762732953717, 2.2266349104538605, -0.44849790127655187], 'intercept': 15.595617187477462}}
       effective_land_price  proxy_land_price_calibrated  \
count         363924.000000                363924.000000   
mean              16.572026                    16.572026   
std                0.561901                     0.360342   
min               11.262605                    14.267928   
25%               16.315927                    16.383697   
50%               16.638352                    16.618029   
75%               16.826680                    16.781873   
max               24.439706                    19.803590   

       land_theoretical_price_weighted  
count                    

## 土砂災害

In [66]:
# Raw landslide-hazard source columns: two distance columns (*_dist_m), the
# dosham_in indicator, the dosham_A30a5_* mesh values and the dosham_phen_*
# phenomenon columns.
# NOTE(review): this list is not referenced in the visible part of the file —
# confirm where it is used.
DISASTER_RAW_COLS = [
    'keikai_dist_m',
    'kyuusha_dist_m',
    'dosham_in',
    'dosham_A30a5_005_max',
    'dosham_A30a5_006_max',
    'dosham_A30a5_007_max',
    'dosham_A30a5_008_max',
    'dosham_A30a5_009_max',
    'dosham_A30a5_010_max',
    'dosham_phen_gake',
    'dosham_phen_doseki',
    'dosham_phen_jisuberi',
    'dosham_phen_nadare',
]

def _add_dist_pack(df: pd.DataFrame, dist_col: str, prefix: str, clip_m: float = 10000.0) -> pd.DataFrame:
    """Expand one distance column into a compact set of derived features.

    The frame is modified in place and also returned. When ``dist_col`` is
    absent the frame is returned untouched.

    Columns written (all prefixed with ``prefix``):
      * ``_dist_clip``   - distance with NaN replaced by ``clip_m`` and
        limited to ``[0, clip_m]`` (float32)
      * ``_dist_log1p``  - ``log1p`` of the clipped distance (float32)
      * ``_within_{100,300,1000,5000}m`` - int8 threshold flags
      * ``_no_near``     - int8 flag, 1 where the raw value was NaN
    """
    if dist_col in df.columns:
        raw = pd.to_numeric(df[dist_col], errors='coerce').astype('float32')
        was_missing = raw.isna()

        # Missing (no neighbour found) is treated as "far away" so that
        # training and inference see the same stable value.
        bounded = raw.fillna(clip_m).clip(lower=0, upper=clip_m)

        df[f'{prefix}_dist_clip'] = bounded.astype('float32')
        df[f'{prefix}_dist_log1p'] = np.log1p(bounded).astype('float32')

        # Threshold flags, written in ascending threshold order.
        threshold_flags = {
            f'{prefix}_within_{limit}m': (bounded <= limit).astype('int8')
            for limit in (100, 300, 1000, 5000)
        }
        for flag_name, flag_values in threshold_flags.items():
            df[flag_name] = flag_values

        # The fact that the raw distance was missing is kept as its own signal.
        df[f'{prefix}_no_near'] = was_missing.astype('int8')

    return df


def _add_mesh_num_pack(df: pd.DataFrame, col: str, prefix: str) -> pd.DataFrame:
    """Derive a presence flag and a log1p value from one numeric mesh column.

    The frame is modified in place and also returned; it is returned
    unchanged when ``col`` does not exist.

    Columns written:
      * ``{prefix}_{col}_has``   - int8, 1 where the value parses as a number
      * ``{prefix}_{col}_log1p`` - float32 ``log1p`` of the value, with NaN
        and negative values treated as 0
    """
    if col in df.columns:
        values = pd.to_numeric(df[col], errors='coerce').astype('float32')
        present = values.notna()
        df[f'{prefix}_{col}_has'] = present.astype('int8')

        # Negative values are not expected, so they are pulled up to 0.
        non_negative = values.fillna(0.0).clip(lower=0)
        df[f'{prefix}_{col}_log1p'] = np.log1p(non_negative).astype('float32')
    return df


def add_disaster_features_other(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the landslide-disaster feature packs added.

    Every step is skipped silently when its source column is missing.
    The raw distance columns are left in the frame; per the original notes
    the intent is to feed the derived pack to the model instead of them.
    """
    out = df.copy()

    # Distances -> clipped / log / threshold-flag pack.
    distance_sources = (
        ('keikai_dist_m', 'keikai'),
        ('kyuusha_dist_m', 'kyuusha'),
    )
    for source_col, pack_prefix in distance_sources:
        out = _add_dist_pack(out, source_col, prefix=pack_prefix, clip_m=10000.0)

    # Mesh values: per the original notes A30a5_005/006 are the main
    # (rainfall) ones and A30a5_007 (gradient) is an optional extra.
    for mesh_col in ('dosham_A30a5_005_max', 'dosham_A30a5_006_max', 'dosham_A30a5_007_max'):
        out = _add_mesh_num_pack(out, mesh_col, prefix='dosham')

    # `dosham_in` is used as is; of the phenomenon flags only `gake` is
    # cleaned here (the original notes call the others likely noise).
    for flag_col in ('dosham_in', 'dosham_phen_gake'):
        if flag_col in out.columns:
            out[flag_col] = pd.to_numeric(out[flag_col], errors='coerce').fillna(0).astype('int8')

    return out

In [67]:
# Add the disaster feature packs to each frame independently.
train_df_fe = add_disaster_features_other(train_df_fe)
test_df_fe = add_disaster_features_other(test_df_fe)

## 一軒家の築古物件へのディスカウント特徴量

In [68]:
def add_house_error_fix_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add building-age discount features for old detached houses.

    Both frames are concatenated (train rows first), the features are
    computed row-wise, and the result is split back by position. The inputs
    are not modified; the returned frames have a fresh 0..n-1 index.

    Columns added:
      * ``effective_age_missing`` - 1 where ``effective_age`` is NaN/absent
      * ``age_0_10`` ... ``age_60_plus`` - piecewise-linear age segments
        (missing age is treated as 0 for these)
      * ``age_40_over`` / ``age_60_over`` - int8 flags (0 for missing age)
      * ``age_*_over_x_is_wood`` - only when ``is_wood`` exists
      * ``no_renovation_flag`` and ``age_*_over_x_no_renov``

    A frame without an ``effective_age`` column is handled as "age missing
    everywhere" instead of failing with an AttributeError.
    """
    n_train = len(train_df)
    # concat already builds a new frame, so the inputs stay untouched.
    combined = pd.concat([train_df, test_df], ignore_index=True)

    # =====================
    # 1. Building age, piecewise (continuous)
    # =====================
    if 'effective_age' in combined.columns:
        age = pd.to_numeric(combined['effective_age'], errors='coerce')
    else:
        # No age column at all: every row counts as missing.
        age = pd.Series(np.nan, index=combined.index, dtype='float64')
    combined['effective_age_missing'] = age.isna().astype('int8')
    age_fill = age.fillna(0.0).clip(lower=0.0)

    def _piece(x, start, width):
        # Years of age that fall inside [start, start + width].
        return (x - start).clip(lower=0, upper=width)

    combined['age_0_10']    = _piece(age_fill, 0, 10)
    combined['age_10_20']   = _piece(age_fill, 10, 10)
    combined['age_20_30']   = _piece(age_fill, 20, 10)
    combined['age_30_40']   = _piece(age_fill, 30, 10)
    combined['age_40_60']   = _piece(age_fill, 40, 20)
    combined['age_60_plus'] = _piece(age_fill, 60, 80)

    # Flags use the un-filled age, so a missing age yields 0.
    combined['age_40_over'] = (age >= 40).astype('int8')
    combined['age_60_over'] = (age >= 60).astype('int8')

    # =====================
    # 2. Structure x old building
    # =====================
    if 'is_wood' in combined.columns:
        is_wood = pd.to_numeric(combined['is_wood'], errors='coerce').fillna(0).astype('int8')
        combined['age_40_over_x_is_wood'] = combined['age_40_over'].gt(0).astype('int8') * is_wood
        combined['age_60_over_x_is_wood'] = combined['age_60_over'].gt(0).astype('int8') * is_wood

    # =====================
    # 3. No renovation x old building
    # =====================
    if 'renovation_recency' in combined.columns:
        no_renov = combined['renovation_recency'].isna().astype('int8')
    else:
        # Without the column every row is treated as "unknown / no renovation".
        no_renov = pd.Series(1, index=combined.index, dtype='int8')
    combined['no_renovation_flag'] = no_renov
    combined['age_40_over_x_no_renov'] = combined['age_40_over'].gt(0).astype('int8') * no_renov
    combined['age_60_over_x_no_renov'] = combined['age_60_over'].gt(0).astype('int8') * no_renov

    # =====================
    # split back
    # =====================
    train_out = combined.iloc[:n_train].reset_index(drop=True)
    test_out  = combined.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out


In [69]:
# Add the age / structure / renovation discount features to both frames.
train_df_fe, test_df_fe = add_house_error_fix_features(train_df_fe, test_df_fe)

## 土地支配スコア

In [70]:
def add_land_dominance_proxy_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add a rough "land dominance" score to both frames.

    A proxy building value is built as
    ``senyu_area * structure coefficient * age decay * renovation factor``
    and ``land_theoretical_price_weighted`` is divided by it.

    Requires ``senyu_area``, ``effective_age``, ``renovation_recency`` and
    ``land_theoretical_price_weighted``; ``is_wood`` / ``is_rcsrc`` are
    optional. The frames are concatenated (train first), processed, and
    split back by position with a fresh index.

    Columns added: ``theoretical_building_value``, ``land_dominance_score``
    (clipped to [0, 10]), ``land_dominance_score_log`` and
    ``land_dominant_flag`` (score > 2).
    """
    train = train_df.copy()
    test = test_df.copy()
    n_train = len(train)
    combined = pd.concat([train, test], ignore_index=True)

    # -------------------------
    # 1. Proxy for the theoretical building value
    # -------------------------
    floor_area = combined['senyu_area']
    building_age = combined['effective_age']

    # Structure coefficient (deliberately crude): start at 1 and shift it
    # by a fixed amount for each structure flag that exists.
    structure_coef = np.ones(len(combined))
    for flag_col, shift in (('is_wood', -0.3), ('is_rcsrc', 0.2)):
        if flag_col in combined.columns:
            structure_coef += combined[flag_col] * shift

    # Depreciation with age, bounded to [0.05, 1.0].
    age_decay = np.exp(-building_age / 40).clip(0.05, 1.0)

    # Renovation adjustment: decays with recency, fixed 0.6 when unknown.
    recency = combined['renovation_recency']
    renov_factor = np.where(recency.isna(), 0.6, np.exp(-recency / 30))

    building_value = floor_area * structure_coef * age_decay * renov_factor
    combined['theoretical_building_value'] = building_value.fillna(0)

    # -------------------------
    # 2. Land dominance score
    # -------------------------
    land_price = combined['land_theoretical_price_weighted']
    denominator = combined['theoretical_building_value'] + 1e-6

    combined['land_dominance_score'] = (land_price / denominator).clip(0, 10)
    combined['land_dominance_score_log'] = np.log1p(combined['land_dominance_score'])

    # NOTE: the threshold of 2 is a candidate for tuning (original author's note).
    is_dominant = combined['land_dominance_score'] > 2
    combined['land_dominant_flag'] = is_dominant.astype('int8')

    # -------------------------
    # split back
    # -------------------------
    train_out = combined.iloc[:n_train].reset_index(drop=True)
    test_out = combined.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out


In [71]:
# Add the proxy building value and land dominance score to both frames.
train_df_fe, test_df_fe = add_land_dominance_proxy_features(train_df_fe, test_df_fe)

## building_id単位の近傍物件カウント

In [72]:
from sklearn.neighbors import BallTree

def add_neighbor_count_features_by_building(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    *,
    building_id_col: str = 'building_id',
    lat_col: str = 'lat',
    lon_col: str = 'lon',
    radii_m: tuple[int, ...] = (300, 500, 1000, 2000),
    prefix: str = 'nb_build_cnt',
    agg: str = 'mean',  # 'mean' or 'first'
    fillna_value: int = 0,
) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
    """Count neighbouring buildings within several radii of each building.

    One representative point per ``building_id`` (mean or first lat/lon over
    train and test combined) is indexed in a haversine BallTree, and for each
    radius the number of other representative points within that distance is
    counted. The building itself is excluded (count - 1). The target variable
    is never used.

    Returns ``(train_out, test_out, created_cols)`` where the frames carry one
    int32 column ``{prefix}_{radius}m`` per radius. Rows that could not be
    matched (missing id or coordinates) get ``fillna_value``.

    Raises:
        KeyError: ``building_id_col`` is not a column.
        ValueError: no row has id, latitude and longitude, or ``agg`` is
            neither ``'mean'`` nor ``'first'``.
    """
    tr = train_df.copy()
    te = test_df.copy()
    n_train = len(tr)
    combined = pd.concat([tr, te], ignore_index=True)

    # Coerce coordinates to numbers (unparseable values become NaN).
    for coord_col in (lat_col, lon_col):
        combined[coord_col] = pd.to_numeric(combined[coord_col], errors='coerce')

    if building_id_col not in combined.columns:
        raise KeyError(f'{building_id_col} not found in dataframe.')

    # Representative points are built only from rows with id and coordinates.
    key_cols = [building_id_col, lat_col, lon_col]
    usable = combined[key_cols].notna().all(axis=1)
    base = combined.loc[usable, key_cols].copy()

    if base.empty:
        raise ValueError('No valid rows with building_id and lat/lon.')
    if agg not in ('mean', 'first'):
        raise ValueError("agg must be 'mean' or 'first'.")

    per_building = base.groupby(building_id_col, as_index=False)[[lat_col, lon_col]]
    bld = per_building.mean() if agg == 'mean' else per_building.first()

    # The tree is built on the per-building points, in radians for haversine.
    coords = np.radians(bld[[lat_col, lon_col]].to_numpy())
    tree = BallTree(coords, metric='haversine')
    r_earth = 6_371_000.0

    created_cols: list[str] = []
    for radius in radii_m:
        col = f'{prefix}_{radius}m'
        within = tree.query_radius(coords, r=radius / r_earth, count_only=True)
        bld[col] = (within - 1).astype('int32')  # drop the building itself
        created_cols.append(col)

    # Attach the per-building counts back to every row.
    combined = combined.merge(bld[[building_id_col] + created_cols], on=building_id_col, how='left')

    # Rows without a match (missing id / coordinates) get the fallback value.
    for col in created_cols:
        filled = combined[col].fillna(fillna_value)
        combined[col] = filled.astype('int32')

    tr_out = combined.iloc[:n_train].reset_index(drop=True)
    te_out = combined.iloc[n_train:].reset_index(drop=True)
    return tr_out, te_out, created_cols

In [73]:
# Add neighbouring-building counts using the default radii and column names.
train_df_fe, test_df_fe, created_cols = add_neighbor_count_features_by_building(train_df_fe, test_df_fe)

In [74]:
# Display the names of the neighbour-count columns returned above.
created_cols

['nb_build_cnt_300m',
 'nb_build_cnt_500m',
 'nb_build_cnt_1000m',
 'nb_build_cnt_2000m']

## geohash TE

In [75]:
def make_geohash(train_df, test_df, precision=6, lat_col='lat', lon_col='lon'):
    """Return copies of both frames with a geohash column, plus its name.

    The column is called ``geohash_{precision}`` and holds
    ``pygeohash.encode(lat, lon, precision=precision)`` for every row, with
    latitude and longitude cast to float first.
    """
    import pygeohash as pgh

    gh_col = f'geohash_{precision}'

    def _with_geohash(frame):
        out = frame.copy()
        lats = out[lat_col].astype(float)
        lons = out[lon_col].astype(float)
        out[gh_col] = [pgh.encode(lat, lon, precision=precision) for lat, lon in zip(lats, lons)]
        return out

    return _with_geohash(train_df), _with_geohash(test_df), gh_col

def _te_map(s_cat, y, smoothing=200.0, min_samples_leaf=5):
    """Build a smoothed target-encoding lookup for one categorical series.

    Each category's encoding blends the overall mean of ``y`` (the prior)
    with the category mean, using the sigmoid weight
    ``1 / (1 + exp(-(count - min_samples_leaf) / smoothing))`` on the
    category mean.

    Returns ``(encoding, prior)``: a Series indexed by category and the
    prior as a float.
    """
    prior = float(np.mean(y))

    frame = pd.DataFrame({'cat': s_cat.astype('object'), 'y': y.astype(float)})
    per_category = frame.groupby('cat')['y'].agg(['mean', 'count'])

    # Weight on the category mean grows with the number of samples.
    exponent = -(per_category['count'] - min_samples_leaf) / smoothing
    weight = 1.0 / (1.0 + np.exp(exponent))

    encoding = prior * (1.0 - weight) + per_category['mean'] * weight
    return encoding, prior

def add_geohash_te_time_oof(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    y_log: pd.Series,
    *,
    target_year_col: str = 'target_year',
    lat_col: str = 'lat',
    lon_col: str = 'lon',
    precision: int = 6,
    out_col: str = 'geohash6_te_logy',
    smoothing: float = 200.0,
    min_samples_leaf: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add a time-aware target encoding of the geohash cell.

    Train rows of year ``Y`` are encoded with a map fitted only on train rows
    from years strictly before ``Y`` ("past -> future"), so a row never sees
    targets from its own or later years. Test rows are encoded with a map
    fitted on the whole training set.

    Rows of the earliest year, and rows whose ``target_year_col`` is NaN,
    receive the overall mean of ``y_log``. (Previously NaN-year rows were
    left at 0.0, far outside the range of the other encoded values.)
    Note that this overall mean is taken over all training years.

    ``y_log`` must share its index with ``train_df``.

    Returns copies of both frames with the geohash column and ``out_col``.
    """
    tr, te, gh_col = make_geohash(train_df, test_df, precision=precision, lat_col=lat_col, lon_col=lon_col)

    # Start from the global prior so that any row not covered by the yearly
    # loop (NaN year, earliest year) still gets a sensible value.
    global_prior = float(y_log.mean())
    oof = np.full(len(tr), global_prior, dtype=float)
    years = np.sort(tr[target_year_col].dropna().unique())

    # Build the encoding year by year, using past years only.
    for year in years:
        va_mask = (tr[target_year_col] == year)
        past_mask = (tr[target_year_col] < year)
        if not past_mask.any():
            # Earliest year: no history available, keep the global prior.
            continue
        te_map, prior = _te_map(
            tr.loc[past_mask, gh_col],
            y_log.loc[past_mask],
            smoothing=smoothing,
            min_samples_leaf=min_samples_leaf,
        )
        # Cells unseen in the past fall back to that fit's prior.
        oof[va_mask.values] = tr.loc[va_mask, gh_col].map(te_map).fillna(prior).astype(float).values

    tr[out_col] = oof

    # Test rows use a map fitted on all of train (for the final model).
    te_map_full, prior_full = _te_map(tr[gh_col], y_log, smoothing=smoothing, min_samples_leaf=min_samples_leaf)
    te[out_col] = te[gh_col].map(te_map_full).fillna(prior_full).astype(float)

    return tr, te


In [76]:
# Add the geohash target encoding, using the log target from the train frame.
train_df_fe, test_df_fe = add_geohash_te_time_oof(train_df_fe, test_df_fe, train_df_fe['y_log'])

## houseモデル用の特徴量

In [77]:
def add_land_trend_interactions_house(train_df, test_df):
    """Add interactions between the land price trend and other features.

    The frames are concatenated (train first), processed and split back by
    position with a fresh index. Listed source columns that exist are
    coerced to numeric. When ``land_price_dlog_w3`` exists (NaN -> 0) it is
    multiplied with, for each source column that exists:

      * ``log_land_price`` (NaN -> 0)          -> ``dlog_w3_x_log_land_price``
      * ``log1p(land_dominance_score)`` with NaN/negatives -> 0
                                               -> ``dlog_w3_x_land_dom``
      * ``age_40_over`` (as is)                -> ``dlog_w3_x_age40``
    """
    tr = train_df.copy()
    te = test_df.copy()
    n_train = len(tr)
    df = pd.concat([tr, te], ignore_index=True)

    numeric_sources = (
        'land_price_dlog_w3',
        'land_price_dlog_nearest',
        'log_land_price',
        'land_dominance_score',
        'effective_age',
    )
    for name in numeric_sources:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    if 'land_price_dlog_w3' in df.columns:
        trend = df['land_price_dlog_w3'].fillna(0)

        # (source column, output column, transform applied to the source)
        interactions = (
            ('log_land_price', 'dlog_w3_x_log_land_price',
             lambda s: s.fillna(0)),
            ('land_dominance_score', 'dlog_w3_x_land_dom',
             lambda s: np.log1p(s.fillna(0).clip(lower=0))),
            ('age_40_over', 'dlog_w3_x_age40',
             lambda s: s),
        )
        for source, target, transform in interactions:
            if source in df.columns:
                df[target] = trend * transform(df[source])

    tr_out = df.iloc[:n_train].reset_index(drop=True)
    te_out = df.iloc[n_train:].reset_index(drop=True)
    return tr_out, te_out


In [78]:
# Add the land-price-trend interaction features to both frames.
train_df_fe, test_df_fe = add_land_trend_interactions_house(train_df_fe, test_df_fe)

In [79]:
def add_house_mape_fix_features(df):
    """Add house-model interaction features to ``df`` in place and return it.

    Requires ``nobeyuka_area``, ``age_40_over``, ``age_60_over``,
    ``log_land_price``, ``nb_build_cnt_500m``, ``nb_build_cnt_1000m``,
    ``road_len_density_gap`` and ``area_per_room``.

    ``nobeyuka_area_x_age_60_over`` now multiplies by the ``age_60_over``
    flag, matching its name and the parallel ``age_40_over`` feature; it
    previously used the continuous ``age_60_plus`` segment.
    """
    # Floor area x old-building flags.
    df['nobeyuka_area_x_age_40_over'] = df['nobeyuka_area'] * df['age_40_over']
    df['nobeyuka_area_x_age_60_over'] = df['nobeyuka_area'] * df['age_60_over']

    # Suburban correction: land price x building density within 1000 m.
    df['log_land_price_x_nb_build'] = (
        df['log_land_price'] * np.log1p(df['nb_build_cnt_1000m'])
    )

    # Market liquidity: log of the building count within 500 m.
    df['nb_build_cnt_500m_log'] = np.log1p(df['nb_build_cnt_500m'])

    # Land x road (falls back to the road density gap).
    df['land_x_road_density'] = df['log_land_price'] * df['road_len_density_gap']

    # Flags: NaN comparisons evaluate to False and therefore give 0.
    df['over_divided'] = (df['area_per_room'] < 20).astype(int)

    df['small_building'] = (df['nobeyuka_area'] < 50).astype(int)
    df['small_x_old'] = df['small_building'] * df['age_40_over']

    return df


In [80]:
# Add the house-model interaction features to each frame (modified in place).
train_df_fe = add_house_mape_fix_features(train_df_fe)
test_df_fe  = add_house_mape_fix_features(test_df_fe)

## otherモデル用の特徴量

In [81]:
def add_other_tail_fix_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add shape and land-price x penalty features for the "other" model.

    The frames are concatenated (train first), processed and split back by
    position with a fresh index. Every group of features is created only
    when its source columns exist. The median used to fill
    ``floor_area_per_floor`` is computed over train and test together.
    """
    tr = train_df.copy()
    te = test_df.copy()
    n_train = len(tr)
    df = pd.concat([tr, te], ignore_index=True)

    # --- Coerce the numeric source columns that are present.
    numeric_sources = (
        'max_floor_area',
        'floor_count',
        'infra_penalty_score',
        'shidou_area_ratio',
        'land_theoretical_price_weighted',
    )
    for name in numeric_sources:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    # --- Shape features
    if {'max_floor_area', 'floor_count'}.issubset(df.columns):
        floors = df['floor_count']
        # A floor count of 0 would divide by zero, so treat it as missing.
        per_floor = df['max_floor_area'] / floors.replace(0, np.nan)
        per_floor = per_floor.replace([np.inf, -np.inf], np.nan)
        per_floor = per_floor.fillna(per_floor.median())
        df['floor_area_per_floor'] = per_floor
        df['floor_area_per_floor_log'] = np.log1p(per_floor.clip(lower=0))

        df['is_midrise'] = floors.between(3, 5).astype('int8')
        df['is_highrise'] = (floors >= 10).astype('int8')
        df['is_lowrise'] = (floors == 1).astype('int8')

    # --- Land price x penalties (meant to curb over-prediction at the low end)
    if 'land_theoretical_price_weighted' in df.columns:
        land_log = np.log1p(df['land_theoretical_price_weighted'].clip(lower=0))
        df['land_tp_w_log'] = land_log

        if 'max_floor_area' in df.columns:
            area_log = np.log1p(df['max_floor_area'].clip(lower=0))
            df['land_tp_w_log_x_max_floor_area_log'] = land_log * area_log

        penalty_products = (
            ('infra_penalty_score', 'land_tp_w_log_x_infra_penalty'),
            ('tag_land_penalty_flag', 'land_tp_w_log_x_land_penalty_flag'),
            ('shidou_area_ratio', 'land_tp_w_log_x_shidou_area_ratio'),
        )
        for source, target in penalty_products:
            if source in df.columns:
                df[target] = land_log * df[source].fillna(0)

    # --- "Any penalty" flag over whichever penalty columns exist.
    available_penalties = [
        name
        for name in ('infra_penalty_score', 'tag_land_penalty_flag', 'shidou_area_ratio')
        if name in df.columns
    ]
    if available_penalties:
        penalty_total = df[available_penalties].fillna(0).sum(axis=1)
        df['penalty_any_flag'] = (penalty_total > 0).astype('int8')

    if {'is_midrise', 'penalty_any_flag'}.issubset(df.columns):
        df['is_midrise_x_penalty'] = df['is_midrise'] * df['penalty_any_flag']

    tr_out = df.iloc[:n_train].reset_index(drop=True)
    te_out = df.iloc[n_train:].reset_index(drop=True)
    return tr_out, te_out

In [82]:
# Add the "other" model shape / penalty features to both frames.
train_df_fe, test_df_fe = add_other_tail_fix_features(train_df_fe, test_df_fe)

## Residential用の特徴量

In [83]:
def add_other_tail_fix_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add floor-premium, maintenance-cost and building-grade features.

    NOTE(review): this definition reuses the name of the function defined
    earlier in this file (the "other" model features) and therefore replaces
    it from this point on. The name is kept because the call that follows
    relies on it; consider renaming both together.

    The frames are concatenated (train first), processed and split back by
    position with a fresh index. Each feature group is created only when its
    source columns exist. The maintenance block works when just one of
    ``kyoueki_per_m2`` / ``shuuzen_per_m2`` is present: the missing one
    counts as 0 (this case previously raised an AttributeError).
    """
    tr = train_df.copy()
    te = test_df.copy()
    df = pd.concat([tr, te], ignore_index=True)

    # =====================================================
    # Coerce to numeric (only the columns that exist)
    # =====================================================
    num_cols = [
        'relative_floor', 'room_floor', 'floor_count',
        'senyu_area', 'effective_age',
        'door_to_station_min_log',
        'kyoueki_per_m2', 'shuuzen_per_m2',
        'money_kyoueki_std',
        'elev_mean',
    ]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    # =====================================================
    # 1) Floor premium (to reduce under-prediction at the high end)
    # =====================================================
    if 'relative_floor' in df.columns:
        rf = df['relative_floor'].clip(0, 1)

        df['rel_floor_sq'] = rf * rf
        df['rel_floor_over_0_8'] = (rf >= 0.8).astype('int8')
        df['rel_floor_over_0_9'] = (rf >= 0.9).astype('int8')

        if 'floor_count' in df.columns:
            fc = df['floor_count'].fillna(0)
            df['is_tower_20'] = (fc >= 20).astype('int8')
            df['floor_premium_interaction'] = rf * np.log1p(fc.clip(lower=0))

    if {'room_floor', 'floor_count'}.issubset(df.columns):
        rf_abs = df['room_floor']
        # A floor count of 0 is treated as unknown so it never matches.
        fc = df['floor_count'].replace(0, np.nan)
        df['is_top_floor'] = ((rf_abs == fc) & rf_abs.notna()).astype('int8')
        df['is_high_floor_20'] = (rf_abs >= 20).fillna(False).astype('int8')

    # =====================================================
    # 2) Maintenance burden (to curb over-prediction at the low end)
    # =====================================================
    if ('kyoueki_per_m2' in df.columns) or ('shuuzen_per_m2' in df.columns):
        def _fee_or_zero(col: str) -> pd.Series:
            # A fee column that does not exist contributes 0 for every row.
            if col in df.columns:
                return pd.to_numeric(df[col], errors='coerce').fillna(0)
            return pd.Series(0.0, index=df.index)

        kpm = _fee_or_zero('kyoueki_per_m2')
        spm = _fee_or_zero('shuuzen_per_m2')

        df['maint_per_m2_total'] = kpm + spm
        df['maint_per_m2_total_log'] = np.log1p(df['maint_per_m2_total'].clip(lower=0))

        if 'door_to_station_min_log' in df.columns:
            df['maint_x_station'] = (
                df['maint_per_m2_total_log'] * df['door_to_station_min_log'].fillna(0)
            )

        if 'effective_age' in df.columns:
            # Missing age is filled with the median over train and test.
            age = df['effective_age'].fillna(df['effective_age'].median())
            df['maint_x_age'] = df['maint_per_m2_total_log'] * np.log1p(age.clip(lower=0))

        if 'money_kyoueki_std' in df.columns:
            df['maint_x_kyoueki_std_log'] = (
                df['maint_per_m2_total_log']
                * np.log1p(df['money_kyoueki_std'].fillna(0).clip(lower=0))
            )

    # =====================================================
    # 3) Building grade (flags are always used as multipliers)
    # =====================================================
    # Tower
    if '建物構造・性能_タワーマンション' in df.columns:
        tower = df['建物構造・性能_タワーマンション'].astype('int8')
        df['tower_x_relative_floor'] = tower * df.get('relative_floor', 0)
        df['tower_x_high_floor'] = tower * df.get('is_high_floor_20', 0)

    # High grade
    if '建物構造・性能_ハイグレードマンション' in df.columns:
        hg = df['建物構造・性能_ハイグレードマンション'].astype('int8')
        df['highgrade_x_maint'] = hg * df.get('maint_per_m2_total_log', 0)

    # Resort (a single feature only)
    if '建物構造・性能_リゾートマンション' in df.columns and 'elev_mean' in df.columns:
        rs = df['建物構造・性能_リゾートマンション'].astype('int8')
        df['resort_x_elev'] = rs * np.log1p(df['elev_mean'].fillna(0).clip(lower=0))

    # =====================================================
    # split
    # =====================================================
    tr_out = df.iloc[:len(tr)].reset_index(drop=True)
    te_out = df.iloc[len(tr):].reset_index(drop=True)
    return tr_out, te_out

In [84]:
# Add the residential floor / maintenance / grade features. This resolves to
# the definition in the preceding cell, which reuses the name of the earlier
# "other" model function.
train_df_fe, test_df_fe = add_other_tail_fix_features(train_df_fe, test_df_fe)

In [85]:
def add_residential_luxury_maint_features(train_df, test_df):
    """Add "luxury maintenance fee" features built on ``maint_per_m2_total_log``.

    The frames are concatenated (train first), processed and split back by
    position with a fresh index. Nothing is added when
    ``maint_per_m2_total_log`` is missing. Otherwise:

      * ``is_luxury_fee_flag``     - fee at or above the 90th percentile
      * ``maint_x_city_te``        - fee x city target encoding (NaN -> median)
      * ``maint_x_log_land_price`` - fee x log land price (NaN -> median)
      * ``rel_floor_x_maint``      - relative floor clipped to [0, 1] x fee

    The percentile and the medians are computed over train and test
    together; if that is a leakage concern, derive them from train only.
    """
    tr = train_df.copy()
    te = test_df.copy()
    n_train = len(tr)
    df = pd.concat([tr, te], ignore_index=True)

    for name in ('maint_per_m2_total_log', 'relative_floor', 'City/town/village name_te', 'log_land_price'):
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    if 'maint_per_m2_total_log' in df.columns:
        fee_log = df['maint_per_m2_total_log']

        # High-fee flag, thresholded at the pooled 90th percentile.
        threshold = fee_log.quantile(0.90)
        df['is_luxury_fee_flag'] = (fee_log >= threshold).astype('int8')

        # Fee in expensive areas expressed as a premium: multiply by each
        # area-level price signal, filling its gaps with its own median.
        area_signals = (
            ('City/town/village name_te', 'maint_x_city_te'),
            ('log_land_price', 'maint_x_log_land_price'),
        )
        for source, target in area_signals:
            if source in df.columns:
                signal = df[source]
                df[target] = fee_log * signal.fillna(signal.median())

        # Upper floors x fee (shared signal for view / luxury).
        if 'relative_floor' in df.columns:
            floor_share = df['relative_floor'].clip(0, 1).fillna(0)
            df['rel_floor_x_maint'] = floor_share * fee_log.fillna(0)

    tr_out = df.iloc[:n_train].reset_index(drop=True)
    te_out = df.iloc[n_train:].reset_index(drop=True)
    return tr_out, te_out


In [86]:
# Add the maintenance-fee premium features to both frames.
train_df_fe, test_df_fe = add_residential_luxury_maint_features(train_df_fe, test_df_fe)

In [87]:
def add_residential_rights_penalty_features(train_df, test_df):
    """Add ownership-share and penalty x land-price features.

    The frames are concatenated (train first), processed and split back by
    position with a fresh index. Listed source columns that exist are
    coerced to numeric first.

      * ``mochibun_ratio`` -> ``mochibun_ratio_log`` and
        ``mochibun_anomaly_flag`` (ratio > 1.5, NaN counted as 0)
      * ``log_land_price`` (NaN -> pooled median) multiplied with each of
        ``infra_penalty_score``, ``hard_penalty_any`` and ``has_shidou``
        (NaN -> 0) that exists
    """
    tr = train_df.copy()
    te = test_df.copy()
    n_train = len(tr)
    df = pd.concat([tr, te], ignore_index=True)

    numeric_sources = (
        'hard_penalty_any', 'infra_penalty_score', 'tag_land_penalty_flag',
        'mochibun_ratio', 'has_mochibun', 'shidou_area_eff', 'has_shidou',
        'log_land_price', 'City/town/village name_te',
    )
    for name in numeric_sources:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    if 'mochibun_ratio' in df.columns:
        share = df['mochibun_ratio'].fillna(0)
        df['mochibun_ratio_log'] = np.log1p(share.clip(lower=0))
        # Threshold needs tuning: raise it if ratios above 1 are common.
        df['mochibun_anomaly_flag'] = (share > 1.5).astype('int8')

    # Price level x penalty (meant to pull down over-prediction at the low end).
    if 'log_land_price' in df.columns:
        price_level = df['log_land_price'].fillna(df['log_land_price'].median())
        penalty_products = (
            ('infra_penalty_score', 'infra_penalty_x_log_land_price'),
            ('hard_penalty_any', 'hard_penalty_x_log_land_price'),
            ('has_shidou', 'has_shidou_x_log_land_price'),
        )
        for source, target in penalty_products:
            if source in df.columns:
                df[target] = df[source].fillna(0) * price_level

    tr_out = df.iloc[:n_train].reset_index(drop=True)
    te_out = df.iloc[n_train:].reset_index(drop=True)
    return tr_out, te_out


In [88]:
# Add the ownership-share and penalty x land-price features to both frames.
train_df_fe, test_df_fe = add_residential_rights_penalty_features(train_df_fe, test_df_fe)

In [89]:
def add_residential_mape_boost_features(df):
    """Add residential-model helper features to ``df`` in place and return it.

    Requires ``senyu_area``, ``building_senyu_area_median``, ``area_per_room``,
    ``nearest_land_price``, ``distance_to_landpoint_m`` and
    ``livability_score``.
    """
    unit_area = df['senyu_area']
    building_median_area = df['building_senyu_area_median']

    # --- B. Unit size relative to the other units of the same building ---
    df['senyu_area_ratio_to_median'] = unit_area / (building_median_area + 1e-6)
    df['area_per_room_x_senyu_diff'] = df['area_per_room'] * (unit_area - building_median_area)

    # --- C. Low price band proxy: cheap land or a small unit ---
    cheap_land = df['nearest_land_price'] < 60000
    small_unit = unit_area < 40
    df['low_price_proxy'] = (cheap_land | small_unit).astype('int8')

    # --- D. Distance x livability (distance limited to [1, 3000]) ---
    bounded_distance = df['distance_to_landpoint_m'].clip(1, 3000)
    df['log_dist_x_livability'] = np.log1p(bounded_distance) * df['livability_score']

    return df


In [90]:
# Add the residential helper features to each frame (modified in place).
train_df_fe = add_residential_mape_boost_features(train_df_fe)
test_df_fe  = add_residential_mape_boost_features(test_df_fe)

## 都市スコア

In [91]:
# --- 東京23区 ---
# --- The 23 special wards of Tokyo ---
TOKYO_23 = [
    '千代田区', '中央区', '港区', '新宿区', '文京区', '台東区',
    '墨田区', '江東区', '品川区', '目黒区', '大田区', '世田谷区',
    '渋谷区', '中野区', '杉並区', '豊島区', '北区', '荒川区',
    '板橋区', '練馬区', '足立区', '葛飾区', '江戸川区'
]

# --- Ordinance-designated cities ---
# NOTE(review): matching is by exact city name; presumably the city column
# holds the bare city name without a ward suffix -- confirm against the data.
SEIREI_CITIES = [
    '札幌市', '仙台市', 'さいたま市', '千葉市', '横浜市', '川崎市', '相模原市',
    '新潟市', '静岡市', '浜松市', '名古屋市',
    '京都市', '大阪市', '堺市', '神戸市',
    '岡山市', '広島市', '北九州市', '福岡市', '熊本市'
]

# --- Greater Tokyo area (prefectures) ---
CAPITAL_PREFS = ['東京都', '神奈川県', '埼玉県', '千葉県']

# --- Prefectural capitals (city name only; Tokyo is listed as 新宿区) ---
PREF_CAPITALS = [
    '札幌市','青森市','盛岡市','仙台市','秋田市','山形市','福島市',
    '水戸市','宇都宮市','前橋市','さいたま市','千葉市','新宿区',
    '横浜市','新潟市','富山市','金沢市','福井市','甲府市','長野市',
    '岐阜市','静岡市','名古屋市','津市','大津市','京都市','大阪市',
    '神戸市','奈良市','和歌山市','鳥取市','松江市','岡山市','広島市',
    '山口市','徳島市','高松市','松山市','高知市','福岡市','佐賀市',
    '長崎市','熊本市','大分市','宮崎市','鹿児島市','那覇市'
]

# Numeric score assigned to each urban class label.
URBAN_SCORE_MAP = {
    'main_city': 1.0,
    'mid_city': 0.6,
    'other': 0.3,
}

In [92]:
def assign_urban_class(df: pd.DataFrame):
    """Return a copy of ``df`` with an urban class label and its score.

    Classes, in decreasing priority:
      * ``main_city`` - Tokyo's 23 wards, or the city is 大阪市 / 名古屋市
      * ``mid_city``  - rest of the Greater Tokyo prefectures, any
        ordinance-designated city, or any prefectural capital
      * ``other``     - everything else

    Adds ``UrbanClass`` (category) and ``urban_class_score`` (float32, looked
    up in ``URBAN_SCORE_MAP``).
    """
    prefecture = df['Prefecture name']
    city = df['City/town/village name']

    in_tokyo_wards = (prefecture == '東京都') & city.isin(TOKYO_23)

    cond_main = in_tokyo_wards | city.isin(['大阪市', '名古屋市'])
    cond_mid = (
        (prefecture.isin(CAPITAL_PREFS) & ~in_tokyo_wards)
        | city.isin(SEIREI_CITIES)
        | city.isin(PREF_CAPITALS)
    )

    out = df.copy()

    # Assign from the lowest to the highest priority so that main_city wins
    # wherever both conditions hold.
    out['UrbanClass'] = 'other'
    for label, condition in (('mid_city', cond_mid), ('main_city', cond_main)):
        out.loc[condition, 'UrbanClass'] = label

    out['UrbanClass'] = out['UrbanClass'].astype('category')

    scores = out['UrbanClass'].map(URBAN_SCORE_MAP)
    out['urban_class_score'] = scores.astype('float32')

    return out


In [93]:
# Label each frame with its urban class and the matching score.
train_df_fe = assign_urban_class(train_df_fe)
test_df_fe  = assign_urban_class(test_df_fe)


## ディスカウントスコア

In [94]:
def fit_minmax(train_s):
    """Return ``(min, max)`` of a series, ignoring NaN values."""
    values = train_s.values
    return np.nanmin(values), np.nanmax(values)

def apply_minmax(s, mn, mx):
    """Scale ``s`` with a fitted min/max; values outside the range are not clipped."""
    # The small constant keeps the division defined when mx == mn.
    span = mx - mn + 1e-9
    return (s - mn) / span

# --- Fit the min/max of every score component on the training frame only ---
cols_norm = [
    'is_lowland',
    'ame_depth_max_log1p',
    'dosham_dosham_A30a5_005_max_log1p',
    'listing_months_log',
    'empty_number',
    'mochibun_ratio',
    'shidou_area_eff',
    'money_shuuzenkikin',
    # Needed for the additional house-only score.
    'land_x_road_density',
]

# column name -> (min, max)
mm = {name: fit_minmax(train_df_fe[name]) for name in cols_norm}

def add_scores(df):
    """Add the weighted risk scores to ``df`` in place (returns None).

    Every component is min/max scaled with the ranges stored in ``mm`` and
    combined with fixed weights into ``risk_disaster_score``,
    ``risk_liquidity_score`` and ``risk_uncertainty_score``, which are then
    blended into ``discount_risk_score``.
    """
    def scaled(col):
        # Component scaled with the range fitted for that column.
        return apply_minmax(df[col], *mm[col])

    df['risk_disaster_score'] = (
        0.4 * scaled('is_lowland')
        + 0.3 * scaled('ame_depth_max_log1p')
        + 0.3 * scaled('dosham_dosham_A30a5_005_max_log1p')
    )

    df['risk_liquidity_score'] = (
        0.6 * scaled('listing_months_log')
        + 0.4 * scaled('empty_number')
    )

    df['risk_uncertainty_score'] = (
        0.4 * scaled('mochibun_ratio')
        + 0.3 * scaled('shidou_area_eff')
        + 0.3 * scaled('money_shuuzenkikin')
    )

    # Overall score: blend of the three partial scores.
    df['discount_risk_score'] = (
        0.4 * df['risk_disaster_score']
        + 0.3 * df['risk_liquidity_score']
        + 0.3 * df['risk_uncertainty_score']
    )

for _df in (train_df_fe, test_df_fe):
    # Generic risk scores (written in place).
    add_scores(_df)

    # --- House-only addition (normalisation is preferred over tanh) ---
    _over_divided = _df['over_divided'].astype('float32')
    _small_and_old = _df['small_x_old'].astype('float32')
    _road_density = apply_minmax(_df['land_x_road_density'], *mm['land_x_road_density'])
    _df['house_discount_score'] = (
        0.4 * _over_divided
        + 0.3 * _small_and_old
        + 0.3 * _road_density
    )

## 近傍価格の集約

In [95]:
def add_mean_price_aggregate(df):
    """Aggregate the neighbourhood mean-price columns; modifies and returns ``df``.

    ``nbhd_price_level`` is a weighted average of whichever of the 500 m /
    1000 m / 2000 m ``mean_price_*_house_log`` columns exist. Each weight is
    tied to its column by name (0.2 / 0.3 / 0.5, larger radii weigh more for
    stability) and the weights of the available columns are renormalised to
    sum to 1. Previously the weights were taken by position, so a missing
    column shifted the wrong weights onto the remaining ones.

    ``nbhd_price_local_premium`` (500 m minus 2000 m) is added only when both
    of those columns exist. When none of the three columns exists,
    ``nbhd_price_level`` is set to 0, as before.
    """
    # Column names should be adjusted to the actual data if they differ.
    radius_weights = {
        'mean_price_500m_house_log': 0.2,
        'mean_price_1000m_house_log': 0.3,
        'mean_price_2000m_house_log': 0.5,
    }
    cols = [c for c in radius_weights if c in df.columns]

    # Level: renormalise the weights of the columns that are present.
    w = np.array([radius_weights[c] for c in cols], dtype='float32')
    w = w / w.sum()
    df['nbhd_price_level'] = sum(wi * df[ci].astype('float32') for wi, ci in zip(w, cols))

    # Gradient: local minus wide area (positive = local premium).
    if 'mean_price_500m_house_log' in df.columns and 'mean_price_2000m_house_log' in df.columns:
        df['nbhd_price_local_premium'] = (
            df['mean_price_500m_house_log'].astype('float32') - df['mean_price_2000m_house_log'].astype('float32')
        )
    return df

In [96]:
# Add the aggregated neighbourhood price features to each frame (in place).
train_df_fe = add_mean_price_aggregate(train_df_fe)
test_df_fe  = add_mean_price_aggregate(test_df_fe)

## 出力

In [97]:
# Write both feature frames as parquet files, versioned by `fe_ver`.
train_df_fe.to_parquet(f'{intermediate_path}train_df_fe_v{fe_ver}.parquet')
test_df_fe.to_parquet(f'{intermediate_path}test_df_fe_v{fe_ver}.parquet')

In [98]:
# Display every column name of the final training frame.
train_df_fe.columns.to_list()

['target_ym',
 'building_id',
 'building_create_date',
 'building_type',
 'building_name',
 'homes_building_name',
 'unit_count',
 'lon',
 'lat',
 'building_structure',
 'total_floor_area',
 'building_area',
 'floor_count',
 'basement_floor_count',
 'year_built',
 'building_land_area',
 'land_area_all',
 'unit_area_min',
 'unit_area_max',
 'building_land_chimoku',
 'land_youto',
 'land_toshi',
 'land_chisei',
 'land_area_kind',
 'land_setback_flg',
 'land_setback',
 'land_kenpei',
 'land_youseki',
 'land_road_cond',
 'land_seigen',
 'building_area_kind',
 'management_form',
 'management_association_flg',
 'reform_exterior_date',
 'unit_id',
 'room_floor',
 'balcony_area',
 'dwelling_unit_window_angle',
 'room_count',
 'unit_area',
 'reform_date',
 'reform_wet_area_date',
 'reform_interior_date',
 'renovation_date',
 'snapshot_modify_date',
 'bukken_type',
 'flg_investment',
 'empty_number',
 'post1',
 'post2',
 'nl',
 'el',
 'rosen_name1',
 'eki_name1',
 'bus_stop1',
 'bus_time1',
 'wa