# 特徴量エンジニアリング

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import japanize_matplotlib

import geopandas as gpd
from sklearn.neighbors import NearestNeighbors

from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import gc
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the intermediate files (point this at your own location)
intermediate_path = '../../output/intermediate_file/'

# Script versions; preprocessing_ver is embedded in the CSV file names that are read.
# NOTE(review): fe_ver is presumably used when this notebook writes its output — confirm.
preprocessing_ver = 1
fe_ver = 1

## File Import

In [3]:
# Load the preprocessed train / test tables for the selected preprocessing version
train_df_fe = pd.read_csv(f'{intermediate_path}train_df_preprocessed_v{preprocessing_ver}.csv')
test_df_fe = pd.read_csv(f'{intermediate_path}test_df_preprocessed_v{preprocessing_ver}.csv')

# Snapshot of the test columns taken before any engineered feature is added
fe_cols = test_df_fe.columns.to_list()

In [4]:
# Column names referenced throughout the notebook
date_col = 'target_ym'      # year-month column (used as YYYYMM further below)
target_col = 'money_room'   # prediction target

## 都道府県・市区町村情報のTE

In [6]:
# Mean target encoding of the prefecture / municipality columns.
# NOTE(review): the train-side encoding is computed on the full training set
# (no out-of-fold scheme), so each row's encoding includes its own target —
# confirm this is acceptable for the validation strategy used downstream.
adress_cols = ['Prefecture name', 'City/town/village name']

global_mean = train_df_fe[target_col].mean()

for col in adress_cols:
    # Step 1: per-category mean of the target, learned from train only
    mapping = train_df_fe.groupby(col)[target_col].mean()

    # Step 2: apply to train (rows whose key is missing stay NaN)
    train_df_fe[col + '_te'] = train_df_fe[col].map(mapping)

    # Step 3: apply to test. Only categories that are present but unseen in
    # train fall back to the global mean; rows whose key is missing stay NaN,
    # exactly like the train side, so both splits encode "missing" the same way.
    test_te = test_df_fe[col].map(mapping)
    unseen = test_df_fe[col].notna() & test_te.isna()
    test_df_fe[col + '_te'] = test_te.mask(unseen, global_mean)

## 面積比

In [7]:
# Ratio of the (log) private area to the (log) lot area, added to both splits.
for frame in (train_df_fe, test_df_fe):
    frame['area_ratio'] = frame['senyu_area_log'].div(frame['kukaku_area_log'])

## 相対階数

In [8]:
# Floor of the unit relative to the number of floors in the building.
for frame in (train_df_fe, test_df_fe):
    frame['relative_floor'] = frame['room_floor'].div(frame['floor_count'])

## 密度

In [9]:
for frame in (train_df_fe, test_df_fe):
    private_area = frame['senyu_area_log']

    # Private area relative to lot area; NaN unless the lot area is positive
    lot_area = frame['kukaku_area_log']
    frame['unit_land_density'] = (private_area / lot_area).where(lot_area > 0)

    # Private area per room; NaN unless the room count is positive
    room_count = frame['madori_number_all']
    frame['area_per_room'] = (private_area / room_count).where(room_count > 0)

## 豪邸検出

In [10]:
for frame in (train_df_fe, test_df_fe):
    # Lot area relative to private area; NaN unless the private area is positive
    private_area = frame['senyu_area_log']
    frame['land_building_ratio'] = (frame['kukaku_area_log'] / private_area).where(private_area > 0)

## 面積と築年の交互作用

In [11]:
# Interaction terms between area features and built_diff.
# NOTE(review): built_diff is presumably the building age from preprocessing — confirm.
for df in [train_df_fe, test_df_fe]:
    # Private area (log) x built_diff
    df['senyu_area_x_built_diff'] = df['senyu_area_log'] * df['built_diff']

    # Area per room x built_diff
    #   -> meant to express the premium of a spacious layout at the same age
    df['area_per_room_x_built_diff'] = df['area_per_room'] * df['built_diff']

## building_idごとの統合特徴量

In [12]:
# Row count of train, kept so the combined frame can be split back afterwards
len_train = len(train_df_fe)

# --- Stack train + test so the building statistics use every listing ---
combined_df = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

# --- 1) median(senyu_area_log) per building_id ---
building_senyu_area_log_median = (
    combined_df.groupby('building_id')['senyu_area_log']
               .median()
               .rename('building_senyu_area_log_median')
)

# --- 2) max(room_floor) per building_id ---
building_room_floor_max = (
    combined_df.groupby('building_id')['room_floor']
               .max()
               .rename('building_room_floor_max')
)

# --- 3) number of non-null unit_id values per building_id ---
building_unit_count = (
    combined_df.groupby('building_id')['unit_id']  # without unit_id, counting any per-building index would also work
               .count()
               .rename('building_unit_count')
)

# --- Attach the three aggregates to every row via building_id ---
combined_df = combined_df.join(building_senyu_area_log_median, on='building_id')
combined_df = combined_df.join(building_room_floor_max,   on='building_id')
combined_df = combined_df.join(building_unit_count,       on='building_id')

# --- Split back into train / test ---
train_df_fe = combined_df.iloc[:len_train].copy()  # uses the original train row count
test_df_fe  = combined_df.iloc[len_train:].copy()


## 近傍価格特徴量

In [13]:
from sklearn.neighbors import BallTree

def _radius_price_stats(tree, query_coords_rad, prices, radius_rad, exclude_self=False):
    """Aggregate the prices of tree points lying within radius_rad of each query point.

    Returns a dict of arrays (one entry per query row) with keys
    'mean', 'median', 'count', 'std' and 'iqr'. Rows without neighbours keep
    NaN (count 0); 'std' and 'iqr' need at least two neighbours.
    When exclude_self is True, the tree point whose index equals the query
    row index is dropped (use it when the queries are the tree points themselves).
    """
    n_rows = len(query_coords_rad)
    stats = {
        'mean': np.full(n_rows, np.nan, dtype=float),
        'median': np.full(n_rows, np.nan, dtype=float),
        'count': np.zeros(n_rows, dtype=int),
        'std': np.full(n_rows, np.nan, dtype=float),
        'iqr': np.full(n_rows, np.nan, dtype=float),
    }

    neighbor_lists = tree.query_radius(
        query_coords_rad,
        r=radius_rad,
        return_distance=False,
        sort_results=False
    )

    for i, idx in enumerate(neighbor_lists):
        idx = np.asarray(idx, dtype=int)
        if exclude_self:
            # Only the row itself is removed; other listings at the same
            # coordinates still count as neighbours.
            idx = idx[idx != i]
        if len(idx) == 0:
            continue

        neigh_prices = prices[idx]
        stats['mean'][i] = neigh_prices.mean()
        stats['median'][i] = np.median(neigh_prices)
        stats['count'][i] = len(idx)

        if len(idx) > 1:
            stats['std'][i] = np.std(neigh_prices)
            q75, q25 = np.percentile(neigh_prices, [75, 25])
            stats['iqr'][i] = q75 - q25

    return stats


def add_neighbor_price_features(
    train_df_fe,
    test_df_fe,
    target_col='money_room',
    lat_col='lat',
    lon_col='lon',
    radius_km=1.0
):
    """
    Add price statistics of nearby *training* listings to train and test.

    Neighbours are the training rows within radius_km (haversine distance);
    for training rows the row itself is excluded. Copies of both frames are
    returned; the inputs are not modified.

    Added columns (the names always say "1km", whatever radius_km is):
      - mean_price_within_1km    : log of the mean neighbour price
      - median_price_within_1km  : log of the median neighbour price
      - count_neighbors_1km      : number of neighbours (0 if none)
      - neighbor_price_std_1km   : std of neighbour prices (raw scale)
      - neighbor_price_iqr_1km   : inter-quartile range of neighbour prices (raw scale)

    Rows without enough neighbours are filled with train-wide values:
    log of the overall mean / median price, and the mean of the per-row
    train std / IQR.
    """
    train_df_fe = train_df_fe.copy()
    test_df_fe  = test_df_fe.copy()

    # BallTree's haversine metric expects [lat, lon] in radians
    train_coords_rad = np.radians(train_df_fe[[lat_col, lon_col]].values.astype(float))
    test_coords_rad  = np.radians(test_df_fe[[lat_col, lon_col]].values.astype(float))

    tree = BallTree(train_coords_rad, metric='haversine')
    R_earth_km = 6371.0
    radius_rad = radius_km / R_earth_km  # haversine distances are angles on the unit sphere

    y_train = train_df_fe[target_col].to_numpy()

    # Train rows query the tree built from themselves, so drop the self-match;
    # test rows only ever see training listings.
    train_stats = _radius_price_stats(tree, train_coords_rad, y_train, radius_rad, exclude_self=True)
    test_stats = _radius_price_stats(tree, test_coords_rad, y_train, radius_rad, exclude_self=False)

    for frame, stats in ((train_df_fe, train_stats), (test_df_fe, test_stats)):
        frame['mean_price_within_1km']   = np.log(stats['mean'])
        frame['median_price_within_1km'] = np.log(stats['median'])
        frame['count_neighbors_1km']     = stats['count']
        frame['neighbor_price_std_1km']  = stats['std']
        frame['neighbor_price_iqr_1km']  = stats['iqr']

    # Fallback values for listings without (enough) neighbours, all derived from train
    global_mean_price   = np.log(np.nanmean(y_train))
    global_median_price = np.log(np.nanmedian(y_train))
    global_std  = np.nanmean(train_stats['std'])
    global_iqr  = np.nanmean(train_stats['iqr'])

    for frame in (train_df_fe, test_df_fe):
        frame['mean_price_within_1km']   = frame['mean_price_within_1km'].fillna(global_mean_price)
        frame['median_price_within_1km'] = frame['median_price_within_1km'].fillna(global_median_price)
        frame['neighbor_price_std_1km']  = frame['neighbor_price_std_1km'].fillna(global_std)
        frame['neighbor_price_iqr_1km']  = frame['neighbor_price_iqr_1km'].fillna(global_iqr)
        # count_neighbors_1km intentionally stays 0 for isolated listings

    return train_df_fe, test_df_fe


In [14]:
# Attach the neighbour price statistics (1 km radius) to both splits
train_df_fe, test_df_fe = add_neighbor_price_features(
    train_df_fe,
    test_df_fe,
    target_col='money_room',
    lat_col='lat',
    lon_col='lon',
    radius_km=1.0
)

## 市区町村ごとの緯度・経度の中心

In [15]:
city_col = 'City/town/village name'

# Capture the train length BEFORE anything is reassigned, so the final split
# never depends on the length of an already-replaced frame.
n_train_rows = len(train_df_fe)

# --- 1) Stack train + test ---
combined_df = pd.concat([train_df_fe, test_df_fe], axis=0, ignore_index=True)

# Drop stale city_lat / city_lon columns if the cell is re-run
combined_df = combined_df.drop(columns=['city_lat', 'city_lon'], errors='ignore')

combined_df['lat'] = combined_df['lat'].astype(float)
combined_df['lon'] = combined_df['lon'].astype(float)

# --- 2) Median lat / lon per municipality ---
city_lat_median = combined_df.groupby(city_col)['lat'].median()
city_lon_median = combined_df.groupby(city_col)['lon'].median()

# --- 3) Attach the municipality centre to every row (as float) ---
combined_df['city_lat'] = combined_df[city_col].map(city_lat_median).astype('float')
combined_df['city_lon'] = combined_df[city_col].map(city_lon_median).astype('float')

# --- 4) Rows without a municipality centre fall back to the overall median ---
combined_df['city_lat'] = combined_df['city_lat'].fillna(combined_df['lat'].median())
combined_df['city_lon'] = combined_df['city_lon'].fillna(combined_df['lon'].median())

# --- 5) Split back into train / test ---
train_df_fe = combined_df.iloc[:n_train_rows].copy()
test_df_fe  = combined_df.iloc[n_train_rows:].copy()

## タグ情報のPCA

In [16]:
from sklearn.decomposition import PCA

# --- 1) Stack train + test ---
combined_df = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

# --- 2) Tag column groups, each selected by a column-name prefix ---
tag_prefixes = {
    'land_price': '土地価格_',                                      # land-related tags
    'building_struct': '建物構造・性能_',                            # building structure / performance
    'infra': '建物設備（給排水・インフラ）_',                         # plumbing / infrastructure equipment
    'location_premium': '立地プレミアム_',                           # location premium tags
    'environment': '環境プレミアム_',                                # environment premium tags
    'senyu': '専有部分設備',                                         # private-area equipment
    'sales_status': '用途・投資セグメント_売買ステータス_',           # usage / investment: sales status
    'certificate': '用途・投資セグメント_不動産の証明書・性能評価_',  # usage / investment: certificates
}
tag_groups = {
    group: [c for c in combined_df.columns if c.startswith(prefix)]
    for group, prefix in tag_prefixes.items()
}

# --- 2.5) Number of PCA components per group ---
# 'sales_status' has no entry here, so the driver loop below skips that group.
pca_dims = dict(
    land_price=1,         # few land-price tags, so 1-2 components
    building_struct=3,    # structure / performance
    infra=3,              # plumbing / infrastructure
    location_premium=1,   # location premium tags carry little information
    environment=2,        # environment tags
    senyu=5,              # private-area equipment
    certificate=3,        # certificates / evaluations
)

# --- 3) PCA features plus an explained-variance report ---
def add_pca_features_and_report(df, cols, prefix, n_components):
    """Fit a PCA on df[cols] and append the component scores to df.

    The scores are written in place as '<prefix>_pca_1', '<prefix>_pca_2', ...
    and the per-component / cumulative explained variance ratio is printed.
    Missing values are treated as 0. The component count is capped at the
    number of columns; when there is nothing to fit, df is returned untouched.
    """
    if len(cols) == 0:
        print(f'[SKIP] {prefix}: No columns')
        return df

    # PCA cannot produce more components than there are input columns
    n_components = min(n_components, len(cols))
    if n_components <= 0:
        print(f'[SKIP] {prefix}: n_components <= 0 after adjustment')
        return df

    tag_matrix = df[cols].fillna(0).astype(float)
    model = PCA(n_components=n_components, random_state=42)
    scores = model.fit_transform(tag_matrix)

    # One new column per principal component
    for k in range(n_components):
        df[f'{prefix}_pca_{k+1}'] = scores[:, k]

    ratios = model.explained_variance_ratio_
    running_total = np.cumsum(ratios)

    print(f'\n=== {prefix} PCA Explained Variance (n_components={n_components}) ===')
    for rank, (ratio, total) in enumerate(zip(ratios, running_total), start=1):
        print(f'PC{rank}: {ratio:.4f},  Cumulative: {total:.4f}')
    print('========================================\n')

    return df


# --- 4) Run the PCA for every tag group and print its explained variance ---
for prefix, cols in tag_groups.items():
    # Groups without an entry in pca_dims get 0 components and are skipped
    n_comp = pca_dims.get(prefix, 0)
    if n_comp <= 0:
        print(f'[SKIP] {prefix}: n_components <= 0 (not defined)')
        continue

    combined_df = add_pca_features_and_report(
        combined_df,
        cols,
        prefix,
        n_components=n_comp
    )

# --- 5) Split back into train / test ---
# train_df_fe still holds the pre-PCA frame here, so its length is the split point
train_len = len(train_df_fe)
train_df_fe = combined_df.iloc[:train_len].copy()
test_df_fe  = combined_df.iloc[train_len:].copy()


=== land_price PCA Explained Variance (n_components=1) ===
PC1: 0.9976,  Cumulative: 0.9976


=== building_struct PCA Explained Variance (n_components=3) ===
PC1: 0.3238,  Cumulative: 0.3238
PC2: 0.1057,  Cumulative: 0.4295
PC3: 0.0551,  Cumulative: 0.4846


=== infra PCA Explained Variance (n_components=3) ===
PC1: 0.4508,  Cumulative: 0.4508
PC2: 0.1842,  Cumulative: 0.6350
PC3: 0.1323,  Cumulative: 0.7673


=== location_premium PCA Explained Variance (n_components=1) ===
PC1: 0.9414,  Cumulative: 0.9414


=== environment PCA Explained Variance (n_components=2) ===
PC1: 0.6547,  Cumulative: 0.6547
PC2: 0.0933,  Cumulative: 0.7480


=== senyu PCA Explained Variance (n_components=5) ===
PC1: 0.3034,  Cumulative: 0.3034
PC2: 0.0859,  Cumulative: 0.3893
PC3: 0.0522,  Cumulative: 0.4415
PC4: 0.0413,  Cumulative: 0.4828
PC5: 0.0365,  Cumulative: 0.5193

[SKIP] sales_status: n_components <= 0 (not defined)

=== certificate PCA Explained Variance (n_components=3) ===
PC1: 0.3737,  Cumulativ

## 交通情報

In [17]:
def add_access_zone_features(df, far_thresh=5000, error_thresh=20000, walk_speed=80):
    """
    Add transport-access features and return a modified copy of df.

    Added columns:
      - walk_distance1_raw      : copy of walk_distance1
      - access_zone             : category, one of 'walk', 'bus', 'car', 'error', 'unknown'
      - door_to_station_min     : walk_distance1 / walk_speed, plus bus_time1 when a
                                  station, a bus stop and a bus time are all present
      - door_to_station_min_log : log1p of door_to_station_min

    access_zone rules (later rules override earlier ones):
      walk  : station present, no bus stop
      bus   : bus stop present
      car   : distance >= far_thresh and (traffic_other present or distance < error_thresh)
      error : distance >= error_thresh
    """
    out = df.copy()

    # Keep the untouched distance next to the derived features
    out['walk_distance1_raw'] = out['walk_distance1']
    distance = out['walk_distance1_raw']

    station_known  = out['eki_name1'].notnull()
    bus_stop_known = out['bus_stop1'].notnull()
    bus_time_known = out['bus_time1'].notnull()
    other_known    = out['traffic_other'].notnull()

    # ---- 1) access_zone ----
    station_only = station_known & ~bus_stop_known
    beyond_far   = distance >= far_thresh
    beyond_error = distance >= error_thresh
    car_like     = beyond_far & (other_known | (distance < error_thresh))

    out['access_zone'] = 'unknown'
    # Applied in order, so a later label wins over an earlier one
    zone_rules = (
        ('walk', station_only),
        ('bus', bus_stop_known),
        ('car', car_like),
        ('error', beyond_error),
    )
    for label, rule in zone_rules:
        out.loc[rule, 'access_zone'] = label
    out['access_zone'] = out['access_zone'].astype('category')

    # ---- 2) door_to_station_min ----
    walk_minutes = distance / walk_speed
    out['door_to_station_min'] = np.nan

    # Station + bus stop + bus time: walking time plus the bus ride
    walk_and_ride = station_known & bus_stop_known & bus_time_known
    out.loc[walk_and_ride, 'door_to_station_min'] = (
        walk_minutes[walk_and_ride] + out.loc[walk_and_ride, 'bus_time1']
    )

    # Station without a bus stop: walking time only
    out.loc[station_only, 'door_to_station_min'] = walk_minutes[station_only]

    # Every other combination (e.g. bus stop without a station) stays NaN

    out['door_to_station_min_log'] = np.log1p(out['door_to_station_min'])

    return out

# Apply the access features to each split separately (default thresholds and walking speed)
train_df_fe = add_access_zone_features(train_df_fe)
test_df_fe  = add_access_zone_features(test_df_fe)

In [18]:
# Make sure the raw distance column exists before binning it
for df in [train_df_fe, test_df_fe]:
    if 'walk_distance1_raw' not in df.columns:
        df['walk_distance1_raw'] = df['walk_distance1']

# Bin edges for the walking distance; values above the last edge (5000)
# fall outside every bin and become NaN.
bins = [-1, 300, 700, 1500, 5000]
labels = ['0-300', '300-700', '700-1500', '1500-5000']

for df in [train_df_fe, test_df_fe]:
    df['walk_distance_bin'] = pd.cut(
        df['walk_distance1_raw'],
        bins=bins,
        labels=labels
    ).astype('category')

In [19]:
max_min = 60  # upper cap for the access time; 45 / 60 / 90 etc. are worth trying

for df in [train_df_fe, test_df_fe]:
    # Cap the access time to [0, max_min] and add its log1p version
    df['door_to_station_min_clip'] = df['door_to_station_min'].clip(0, max_min)
    df['door_to_station_min_clip_log'] = np.log1p(df['door_to_station_min_clip'])

In [20]:
# Mean target encoding of the station / line name columns.
# NOTE(review): the train-side encoding uses the full training set (no
# out-of-fold scheme) — confirm this matches the validation strategy.
traffic_te_cols = ['eki_name1', 'eki_name2', 'rosen_name1', 'rosen_name2']

global_mean = train_df_fe[target_col].mean()

for col in traffic_te_cols:
    # Per-category mean of the target, learned from train only
    mapping = train_df_fe.groupby(col)[target_col].mean()

    # Train rows without a value in this column stay NaN
    train_df_fe[col + '_te'] = train_df_fe[col].map(mapping)

    # Test: only categories that are present but unseen in train fall back to
    # the global mean. Rows without a value stay NaN, as on the train side;
    # filling them with the global mean would encode "missing" differently
    # in train and test.
    test_te = test_df_fe[col].map(mapping)
    unseen = test_df_fe[col].notna() & test_te.isna()
    test_df_fe[col + '_te'] = test_te.mask(unseen, global_mean)

In [21]:
# 精度が上がらなかったため不採用だが、改善の余地はありそう（時間かけられなかった）
# train_df_fe['walk_distance1_clean'] = train_df_fe['walk_distance1_raw'].clip(0, error_thresh)
# train_df_fe['walk_distance1_log']   = np.log1p(train_df_fe['walk_distance1'])

# test_df_fe['walk_distance1_clean'] = test_df_fe['walk_distance1_raw'].clip(0, error_thresh)
# test_df_fe['walk_distance1_log']   = np.log1p(test_df_fe['walk_distance1'])

# fe_cols += ['walk_distance1_log']


# def add_bus_only_flag(df):
#     """
#     rosen_name1 に「バス」が含まれる → bus_only_flg = 1
#     それ以外は 0
#     """
#     df = df.copy()

#     # 'バス' を含む路線名 → bus_only
#     is_bus_line = df['rosen_name1'].fillna('').str.contains('バス')

#     df['bus_only_flg'] = is_bus_line.astype(int)

#     return df

# train_df_fe = add_bus_only_flag(train_df_fe)
# test_df_fe  = add_bus_only_flag(test_df_fe)

# fe_cols += ['bus_only_flg']

## 建物情報の拡充

In [22]:
def add_building_area_features(df):
    """
    Add building-scale features and return a modified copy of df.

    Added columns:
      - density_floor_area : nobeyuka_area_log / tochi_area_log
      - unit_count_density : unit_count_log / tochi_area_log
      - unit_area_range    : unit_area_max - unit_area_min
      - empty_ratio        : empty_number / unit_count_log

    Every ratio is NaN when its denominator is not strictly positive, so a
    zero denominator never produces inf (same convention as the other ratio
    features in this notebook).
    """
    df = df.copy()

    land_area = df['tochi_area_log']
    unit_count = df['unit_count_log']

    # 1) Total floor area relative to land area
    df['density_floor_area'] = (df['nobeyuka_area_log'] / land_area).where(land_area > 0)

    # 2) Unit count relative to land area
    df['unit_count_density'] = (unit_count / land_area).where(land_area > 0)

    # 3) Spread of the unit areas in the building (max - min)
    df['unit_area_range'] = df['unit_area_max'] - df['unit_area_min']

    # 4) Vacancy ratio; NaN when unit_count_log is 0 (or otherwise non-positive)
    df['empty_ratio'] = (df['empty_number'] / unit_count).where(unit_count > 0)

    return df

# Add the building-scale features to both splits
train_df_fe = add_building_area_features(train_df_fe)
test_df_fe  = add_building_area_features(test_df_fe)

## リフォーム・リノベ情報の拡充

In [23]:
def add_effective_age(df, train_year, train_month):
    """
    Add reform / renovation based age features and return a modified copy of df.

    Parameters:
      - train_year, train_month : reference year and month used for the age
        and "months since reform" calculations.

    Added columns:
      - first_reform_yearmonth : smallest YYYYMM among reform_interior_date,
                                 reform_wet_area_date and renovation_date
      - first_reform_year      : year part of first_reform_yearmonth
      - effective_build_year   : first_reform_year if present, else year_built
      - effective_age          : train_year - effective_build_year
      - reform_total_count     : row-wise sum of the reform flag columns
      - reform_newness         : months from first_reform_yearmonth to the reference month
      - reform_newness_log     : log1p of reform_newness (negative values clipped to 0)

    NOTE(review): effective_age assumes year_built holds a plain year (YYYY) — confirm.
    """
    df = df.copy()

    # -------------------------
    # 1) Earliest reform date
    # -------------------------
    date_cols = ['reform_interior_date',
                 'reform_wet_area_date',
                 'renovation_date']

    # Values are expected as YYYYMM; anything non-numeric becomes NaN
    reform_ym = df[date_cols].apply(
        pd.to_numeric, errors='coerce'
    )

    df['first_reform_yearmonth'] = reform_ym.min(axis=1)
    df['first_reform_year'] = (df['first_reform_yearmonth'] // 100).astype('float')

    # -------------------------
    # 2) Effective build year and effective age
    # -------------------------
    df['effective_build_year'] = np.where(
        df['first_reform_year'].notnull(),
        df['first_reform_year'],     # year of the first reform
        df['year_built']             # otherwise the original build year
    )

    df['effective_age'] = train_year - df['effective_build_year']

    # -------------------------
    # 3) Number of reformed parts (reform_total_count)
    # -------------------------
    # The prefix pattern also matches the YYYYMM date columns
    # (e.g. reform_interior_date); summing those would add values like 202110
    # to the count, so date columns are excluded explicitly.
    reform_flag_cols = [
        c for c in df.filter(
            regex=r'^reform_(exterior|wet_area|interior)'
        ).columns
        if c not in date_cols and not c.endswith('_date')
    ]

    if len(reform_flag_cols) > 0:
        # Coerce '1' / '0' / None style values to integers (non-numeric -> 0)
        reform_flags = df[reform_flag_cols].apply(
            pd.to_numeric, errors='coerce'
        ).fillna(0).astype(int)

        df['reform_total_count'] = reform_flags.sum(axis=1)
    else:
        df['reform_total_count'] = 0

    # -------------------------
    # 4) Months elapsed since the first reform
    # -------------------------
    # Both sides must be running month counts (year * 12 + month); subtracting
    # the raw YYYYMM integer from a month count gives a meaningless number.
    ref_total_months = train_year * 12 + train_month
    first_ym = df['first_reform_yearmonth']
    first_reform_total_months = (first_ym // 100) * 12 + (first_ym % 100)

    # NaN propagates for rows without any reform date
    df['reform_newness'] = ref_total_months - first_reform_total_months

    # Reforms dated after the reference month give negative values; clip them
    # so log1p stays defined.
    df['reform_newness_log'] = np.log1p(df['reform_newness'].clip(lower=0))

    return df


In [24]:
# Derive the reference year / month from the largest target_ym in train.
# The same reference is used for the test split as well.
train_max_ym = train_df_fe['target_ym'].max()  # e.g. 202210
train_year = train_max_ym // 100            # -> 2022
train_month = train_max_ym % 100            # -> 10

train_df_fe = add_effective_age(train_df_fe, train_year, train_month)
test_df_fe  = add_effective_age(test_df_fe, train_year, train_month)

In [25]:
for df in [train_df_fe, test_df_fe]:
    # +1 keeps the denominator away from zero when effective_age is 0
    # (rows whose effective_age is NaN simply stay NaN).
    df['area_age_ratio'] = df['senyu_area_log'] / (df['effective_age'] + 1.0)
    # Negative ratios are clipped to 0 before log1p
    df['area_age_ratio_log'] = np.log1p(df['area_age_ratio'].clip(lower=0))

## 近傍単価

In [26]:
eps = 1e-6  # keeps the denominator away from exactly zero

for df in [train_df_fe, test_df_fe]:
    # Neighbourhood price level divided by the building's median (log) private area
    df['unit_price_neighbor'] = df['mean_price_within_1km'] / (df['building_senyu_area_log_median'] + eps)
    # Negative values are clipped to 0 before log1p
    df['unit_price_neighbor_log'] = np.log1p(df['unit_price_neighbor'].clip(lower=0))

## 掲載期間

In [27]:
def calculate_listing_months(df):
    """
    Add the number of months between building_create_date and target_ym.

    Returns a modified copy of df (the input frame is left untouched) with:
      - building_create_date : parsed to datetime (unparseable values -> NaT)
      - target_ym_date       : target_ym (YYYYMM) parsed to the first day of that month
      - listing_months       : whole-month difference as float; NaN when either
                               date is missing or the difference is negative
    """
    df = df.copy()

    # 1) Parse building_create_date (already-datetime input passes through)
    df['building_create_date'] = pd.to_datetime(df['building_create_date'], errors='coerce')

    # 2) target_ym such as 201901 -> 2019-01-01
    df['target_ym_date'] = pd.to_datetime(df['target_ym'].astype(str), format='%Y%m', errors='coerce')

    # 3) Month difference based on year and month only (the day is ignored)
    created = df['building_create_date'].dt
    observed = df['target_ym_date'].dt
    months = (observed.year - created.year) * 12 + (observed.month - created.month)

    # 4) A negative duration is implausible, so it becomes NaN. Casting to
    #    float first keeps the column numeric; writing pd.NA into an integer
    #    column could turn it into object dtype.
    df['listing_months'] = months.astype('float').where(months >= 0)

    return df


# Add listing_months (and the parsed date columns) to both splits
train_df_fe = calculate_listing_months(train_df_fe)
test_df_fe  = calculate_listing_months(test_df_fe)


## 地価の比率・平均値

In [28]:
def add_land_price_ratios(df: pd.DataFrame) -> pd.DataFrame:
    """Add neighbourhood-price / land-price ratios and return a modified copy.

    Added columns:
      - ratio_mean_land     : mean_price_within_1km / log_land_price
      - ratio_weighted_land : median_price_within_1km / log_weighted_land_price_3

    A ratio is NaN unless its denominator is strictly positive and its
    numerator is present.
    """
    out = df.copy()

    # (output column, numerator column, denominator column)
    ratio_specs = (
        ('ratio_mean_land', 'mean_price_within_1km', 'log_land_price'),
        ('ratio_weighted_land', 'median_price_within_1km', 'log_weighted_land_price_3'),
    )
    for target, numerator, denominator in ratio_specs:
        usable = (out[denominator] > 0) & out[numerator].notna()
        out[target] = np.nan
        out.loc[usable, target] = out.loc[usable, numerator] / out.loc[usable, denominator]

    return out

# Add the land-price ratios to both splits
train_df_fe = add_land_price_ratios(train_df_fe)
test_df_fe  = add_land_price_ratios(test_df_fe)


## 未整理FE

In [29]:
def add_block1_2_features(train_df_fe: pd.DataFrame,
                          test_df_fe: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add (1) common-fee / repair-reserve features and (2) land-price gap
    features to copies of train and test.

    Each feature is only created when its source columns exist in the frame.
    """
    def _ratio_if_positive(numerator, denominator):
        # numerator / denominator where the denominator is finite and > 0, else NaN
        usable = (denominator > 0) & np.isfinite(denominator)
        return np.where(usable, numerator / denominator, np.nan)

    train = train_df_fe.copy()
    test = test_df_fe.copy()

    for frame in (train, test):
        # ----------------------------------
        # (1) Common fee / repair reserve
        # ----------------------------------
        private_area = frame.get('senyu_area_log')
        unit_count = frame.get('unit_count_log')
        common_fee = frame.get('money_kyoueki')
        repair_fee = frame.get('money_shuuzen')

        # Per area first, then per unit count; rows with an unusable
        # denominator become NaN.
        per_denominator = (
            (private_area, 'kyoueki_per_m2', 'shuuzen_per_m2'),
            (unit_count, 'kyoueki_per_unit', 'shuuzen_per_unit'),
        )
        for denominator, common_name, repair_name in per_denominator:
            if denominator is None:
                continue
            if common_fee is not None:
                frame[common_name] = _ratio_if_positive(common_fee, denominator)
            if repair_fee is not None:
                frame[repair_name] = _ratio_if_positive(repair_fee, denominator)

        # Presence flags (a missing fee compares as not > 0, i.e. flag 0)
        if common_fee is not None:
            frame['has_kyoueki'] = (common_fee > 0).astype('Int8')
        if repair_fee is not None:
            frame['has_shuuzen'] = (repair_fee > 0).astype('Int8')

        # ----------------------------------
        # (2) Land-price gap
        # ----------------------------------
        nearest_price = frame.get('log_land_price')
        weighted_price = frame.get('log_weighted_land_price_3')

        # Cheap / expensive flags (provisional thresholds 0.8 / 1.2)
        if 'ratio_mean_land' in frame.columns:
            land_ratio = frame['ratio_mean_land']
            frame['land_cheap_flag'] = (land_ratio < 0.8).astype('Int8')
            frame['land_expensive_flag'] = (land_ratio > 1.2).astype('Int8')

        # Theoretical land price = land price x lot area, NaN for an unusable lot area
        lot_area = frame.get('kukaku_area_log')
        if lot_area is not None:
            lot_usable = (lot_area > 0) & np.isfinite(lot_area)
            theoretical = (
                (nearest_price, 'land_theoretical_price'),
                (weighted_price, 'land_theoretical_price_weighted'),
            )
            for price, column in theoretical:
                if price is not None:
                    frame[column] = np.where(lot_usable, price * lot_area, np.nan)

    return train, test


In [30]:
# Add the fee and land-price gap features to both splits
train_df_fe, test_df_fe = add_block1_2_features(train_df_fe, test_df_fe)

In [31]:
def add_parking_features(train_df_fe: pd.DataFrame,
                                  test_df_fe: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add parking-related features (availability, distance, cost ratios) to
    train and test.

    Added columns:
      - parking_available, parking_on_site, parking_nearby_only
      - parking_distance_log
      - parking_cost_ratio
      - parking_cost_per_sqm

    The two frames are concatenated, processed together and split back by the
    train row count; both returned frames have a fresh 0..n-1 index.

    NOTE(review): parking_cost_ratio divides by money_room, which is the
    prediction target elsewhere in this notebook. On train rows the feature
    therefore encodes the target; on rows without money_room it is NaN.
    Confirm it is excluded from the model inputs.
    """
    combined = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

    # ---------------------------
    # 4) Parking features
    # ---------------------------
    # Falls back to an all-NaN series when parking_kubun is absent
    pk = combined.get('parking_kubun', pd.Series(np.nan, index=combined.index))

    # Codes 1 (vacancy available), 3 (nearby), 5 (available) -> parking exists
    combined['parking_available'] = pk.isin([1, 3, 5]).astype(int)

    # On site (1: vacancy available, 5: available)
    combined['parking_on_site'] = pk.isin([1, 5]).astype(int)

    # Nearby only (3)
    combined['parking_nearby_only'] = (pk == 3).astype(int)

    # Distance (log1p, negatives clipped to 0)
    if 'parking_distance' in combined.columns:
        dist = combined['parking_distance'].astype(float)
        dist = dist.clip(lower=0)
        combined['parking_distance_log'] = np.log1p(dist)
    else:
        combined['parking_distance_log'] = np.nan

    # Cost ratio: parking fee / property price
    parking_money = combined.get('parking_money', pd.Series(np.nan, index=combined.index)).astype(float)
    money_room    = combined.get('money_room',    pd.Series(np.nan, index=combined.index)).astype(float)

    # Only rows where both amounts are positive get a ratio
    valid_cost = (parking_money > 0) & (money_room > 0)
    parking_cost_ratio = np.where(valid_cost, parking_money / money_room, np.nan)
    parking_cost_ratio = np.clip(parking_cost_ratio, 0, 0.1)  # ratios above 10% are treated as outliers and clipped (tunable)

    combined['parking_cost_ratio']      = parking_cost_ratio

    # Cost / private area (senyu_area_log preferred, otherwise unit_area)
    if 'senyu_area_log' in combined.columns:
        area = combined['senyu_area_log'].astype(float)
    elif 'unit_area' in combined.columns:
        area = combined['unit_area'].astype(float)
    else:
        area = pd.Series(np.nan, index=combined.index)

    valid_area = (parking_money > 0) & (area > 0)
    parking_cost_per_sqm = np.where(valid_area, parking_money / area, np.nan)
    # Clip to [0, 99th percentile of the non-NaN values]; skipped when no row is valid
    parking_cost_per_sqm = np.clip(parking_cost_per_sqm, 0, np.nanpercentile(parking_cost_per_sqm[~np.isnan(parking_cost_per_sqm)], 99)
                                  ) if np.any(valid_area) else parking_cost_per_sqm

    combined['parking_cost_per_sqm'] = parking_cost_per_sqm

    # ---------------------------
    # 5) Split back into train / test
    # ---------------------------
    n_train = len(train_df_fe)
    train_out = combined.iloc[:n_train].reset_index(drop=True)
    test_out  = combined.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out


In [32]:
# Add the parking features to both splits
train_df_fe, test_df_fe = add_parking_features(train_df_fe, test_df_fe)

In [33]:
for col in ['parking_cost_ratio', 'parking_cost_per_sqm', 'parking_money']:
    x = train_df_fe[col]
    # Clip at the 99th percentile of train; the same cap is applied to test
    hi = x.quantile(0.99)
    train_df_fe[col + '_clip'] = x.clip(upper=hi)
    test_df_fe[col + '_clip']  = test_df_fe[col].clip(upper=hi)

# For ratio features, using only the log version explicitly is also an option
train_df_fe['parking_cost_ratio_clip_log'] = np.log1p(train_df_fe['parking_cost_ratio_clip'])
test_df_fe['parking_cost_ratio_clip_log']  = np.log1p(test_df_fe['parking_cost_ratio_clip'])

In [34]:
def add_life_convenience_features(
    train_df_fe: pd.DataFrame,
    test_df_fe: pd.DataFrame,
    amenity_cols: list[str] | None = None,
    thresholds: tuple[int, int] = (500, 1000),
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    生活利便距離の log & 最小距離 & 閾値フラグ を追加する。

    - 各 amenity 距離について log 変換列 xxx_log を追加
    - min_amenity_distance: 複数の生活利便距離の最小値
    - log_min_amenity_distance: 上記の log1p
    - amenity_within_500m, amenity_within_1000m: 閾値以下の便益フラグ
    """
    combined = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

    # デフォルトの生活利便距離カラム
    if amenity_cols is None:
        amenity_cols = [
            c for c in [
                'convenience_distance',
                'super_distance',
                'hospital_distance',
                'park_distance',
                'drugstore_distance',
                'bank_distance',
                'shopping_street_distance',
                'est_other_distance',
            ]
            if c in combined.columns
        ]

    # 何もなければそのまま返す
    if len(amenity_cols) == 0:
        print('[add_life_convenience_features] amenity_cols is empty. Skip.')
        return train_df_fe, test_df_fe

    # 各 amenity 距離の log 変換
    for col in amenity_cols:
        vals = combined[col].astype(float)
        vals = vals.clip(lower=0)  # マイナスは 0 に丸める
        combined[f'{col}_log'] = np.log1p(vals)

    # 最小距離
    amenity_dist = combined[amenity_cols].astype(float)
    min_dist = amenity_dist.min(axis=1)

    combined['min_amenity_distance'] = min_dist
    combined['log_min_amenity_distance'] = np.log1p(min_dist.clip(lower=0))

    # 閾値フラグ（例: 500m, 1000m）
    th1, th2 = thresholds
    combined[f'amenity_within_{th1}m'] = (min_dist <= th1).astype(int)
    combined[f'amenity_within_{th2}m'] = (min_dist <= th2).astype(int)

    # 分割して返す
    n_train = len(train_df_fe)
    train_out = combined.iloc[:n_train].reset_index(drop=True)
    test_out  = combined.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out


In [35]:
# Add the amenity-distance features with the default columns and thresholds
train_df_fe, test_df_fe = add_life_convenience_features(train_df_fe, test_df_fe)

In [36]:
def add_status_and_management_features(
    train_df_fe: pd.DataFrame,
    test_df_fe: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Derive flag and score features from the property-status, handover and
    building-management code columns.

    The two frames are stacked, every feature is computed once on the stacked
    frame, and the result is split back by row position. A source column that
    is absent is treated as all-NaN, so its flags become 0 and its scores 0.0.

    Source columns read: genkyo_code, usable_status, usable_date, target_ym,
    management_form, management_association_flg, house_kanrinin.

    Returns:
        (train_out, test_out): new frames with a fresh 0..n-1 index each.
        Because the inputs are concatenated, each output carries the union of
        the columns of both inputs.
    """
    stacked = pd.concat([train_df_fe, test_df_fe], ignore_index=True)

    def _float_col(name: str) -> pd.Series:
        # A missing column falls back to an all-NaN series aligned to `stacked`.
        placeholder = pd.Series(np.nan, index=stacked.index)
        return stacked.get(name, placeholder).astype(float)

    # ------------------------------------------------------------------
    # 1) Current status (genkyo_code)
    #    Code meanings below follow the original author's notes; verify
    #    against the data dictionary.
    # ------------------------------------------------------------------
    status_code = _float_col('genkyo_code')

    # Treated as "flexible" (vacant / cleared lot / can be handed over cleared).
    stacked['genkyo_is_vacant'] = status_code.isin([2, 1, 10]).astype(int)
    # Currently rented out.
    stacked['genkyo_has_tenant'] = status_code.eq(3).astype(int)
    # Not yet completed.
    stacked['genkyo_is_under_construction'] = status_code.eq(4).astype(int)
    # Assumed land-type statuses.
    # NOTE(review): same code set as genkyo_is_vacant, so the two flags are
    # identical columns.
    stacked['genkyo_is_land_only'] = status_code.isin([1, 2, 10]).astype(int)

    # Flexibility score; codes not listed (and NaN) score 0.0.
    flexibility_by_code = {
        1: 0.5,   # occupied or cleared lot (ambiguous, so kept modest)
        2: 1.5,   # vacant house
        3: 0.0,   # rented out -> low flexibility
        4: 1.0,   # not yet completed
        10: 1.5,  # old house present, can be handed over cleared
    }
    stacked['genkyo_flex_score'] = status_code.map(flexibility_by_code).fillna(0.0)

    # ------------------------------------------------------------------
    # 2) Handover (usable_status, usable_date)
    # ------------------------------------------------------------------
    handover_status = _float_col('usable_status')
    handover_ym = _float_col('usable_date')
    listing_ym = _float_col('target_ym')

    stacked['usable_immediate_flag'] = handover_status.eq(1).astype(int)
    stacked['usable_fixed_date_flag'] = handover_status.eq(3).astype(int)

    # Score ordering: immediate > fixed date > negotiable / undecided.
    handover_score_by_code = {
        1: 2.0,   # immediate
        3: 1.0,   # fixed date
        2: 0.5,   # negotiable
        4: 0.0,   # undecided
    }
    stacked['usable_status_score'] = handover_status.map(handover_score_by_code).fillna(0.0)

    # Month gap between two yyyymm-style numbers, e.g. target_ym=202201 and
    # usable_date=202204 -> 3. Only computed when both columns really exist;
    # otherwise the column is filled with NaN.
    if all(name in stacked.columns for name in ('usable_date', 'target_ym')):
        year_gap = (handover_ym // 100).astype('Int64') - (listing_ym // 100).astype('Int64')
        month_gap = (handover_ym % 100).astype('Int64') - (listing_ym % 100).astype('Int64')
        months_delay = year_gap * 12 + month_gap
    else:
        months_delay = np.nan
    stacked['usable_months_delay'] = months_delay

    # ------------------------------------------------------------------
    # 3) Building management
    # ------------------------------------------------------------------
    mgmt_form = _float_col('management_form')
    mgmt_association = _float_col('management_association_flg')
    caretaker = _float_col('house_kanrinin')

    # Management association exists.
    stacked['has_management_association'] = mgmt_association.eq(2).astype(int)
    # Professionally managed (partly or fully outsourced).
    stacked['has_professional_management'] = mgmt_form.isin([2, 3]).astype(int)
    # Some form of caretaker is present.
    stacked['has_manager'] = caretaker.isin([1, 2, 3, 5]).astype(int)

    form_score_by_code = {
        1: 1.0,   # self-managed
        2: 2.0,   # partly outsourced
        3: 3.0,   # fully outsourced
    }
    stacked['management_form_score'] = mgmt_form.map(form_score_by_code).fillna(0.0)

    caretaker_score_by_code = {
        4: 0.0,   # none
        5: 0.5,   # non-resident
        3: 1.0,   # patrol
        2: 1.5,   # daytime
        1: 2.0,   # resident
    }
    stacked['manager_presence_score'] = caretaker.map(caretaker_score_by_code).fillna(0.0)

    # Total = form score + caretaker score + a small bonus for an association.
    association_bonus = stacked['has_management_association'] * 0.5
    stacked['management_total_score'] = (
        stacked['management_form_score']
        + stacked['manager_presence_score']
        + association_bonus
    )

    # ------------------------------------------------------------------
    # 4) Split back into train / test by row position
    # ------------------------------------------------------------------
    cut = len(train_df_fe)
    train_out, test_out = (
        part.reset_index(drop=True)
        for part in (stacked.iloc[:cut], stacked.iloc[cut:])
    )
    return train_out, test_out


In [37]:
# Add the status / handover / management flags and scores, and rebind the
# notebook-level train/test frames to the returned copies.
train_df_fe, test_df_fe = add_status_and_management_features(train_df_fe, test_df_fe)

## 特徴量の追加・削除

In [38]:
# Register the engineered features on top of the columns already in fe_cols
# (fe_cols was initialised from the test frame's columns near the top of the
# notebook). Also picks up every train column whose name contains "pca"
# (case-insensitive) and the "<col>_te" columns for traffic_te_cols.
# NOTE: 'parking_money' is listed here but also appears in remove_cols below,
# so it does not survive into the final fe_cols.
fe_cols += [
    'Prefecture name_te', 'City/town/village name_te',
    'area_ratio', 'relative_floor',
    'area_per_room',
    'land_building_ratio',
    'senyu_area_x_built_diff', 'area_per_room_x_built_diff',
    'building_senyu_area_log_median', 'building_room_floor_max', 'building_unit_count',
    'mean_price_within_1km', 'median_price_within_1km', 'count_neighbors_1km', 'neighbor_price_std_1km', 'neighbor_price_iqr_1km',
    'city_lat', 'city_lon',
    'access_zone', 'walk_distance_bin', 'door_to_station_min_clip_log',
    'unit_area_range',
    'effective_age', 'reform_total_count', 'reform_newness_log', 'area_age_ratio_log',
    'unit_price_neighbor_log',
    'listing_months',
    'ratio_mean_land', 'ratio_weighted_land',
    "kyoueki_per_m2", "shuuzen_per_m2", "kyoueki_per_unit", "shuuzen_per_unit", "has_kyoueki", "has_shuuzen",
    "land_cheap_flag", "land_expensive_flag", "land_theoretical_price", "land_theoretical_price_weighted",
    "parking_available", "parking_on_site", "parking_nearby_only", "parking_distance_log", 'parking_money', "parking_cost_ratio_clip", "parking_cost_ratio_clip_log",  "parking_cost_per_sqm_clip",
    "min_amenity_distance", "log_min_amenity_distance",
    "convenience_distance_log", "super_distance_log", "drugstore_distance_log",
    "genkyo_is_vacant", "genkyo_has_tenant", "genkyo_is_under_construction", "genkyo_is_land_only", "genkyo_flex_score",
    "usable_immediate_flag", "usable_fixed_date_flag", "usable_status_score", "usable_months_delay",
    "has_management_association", "has_professional_management", "has_manager", "management_form_score", "manager_presence_score", "management_total_score",
] + [c for c in train_df_fe.columns if "pca" in c.lower()] + [c + '_te' for c in traffic_te_cols]

# Features to drop
remove_cols = [
    'lat', 'lon',
    'bus_stop1', 'bus_stop2',
    'building_tag_id', 'unit_tag_id', 'statuses',
    'reform_date', 'reform_interior_date', 'reform_exterior_date', 'reform_wet_area_date', 'renovation_date',
    'building_create_date',
    'unit_area_max', 'unit_area_min',
    'parking_money', 'parking_distance',
    'convenience_distance', 'super_distance', 'drugstore_distance'
]

# Drop the excluded columns and de-duplicate while keeping first-seen order.
# The `fe_cols +=` above appends the same names again each time this cell is
# re-run, and an added name may already be present in fe_cols; without this
# step the exported CSVs would contain repeated columns.
fe_cols = list(dict.fromkeys(c for c in fe_cols if c not in remove_cols))

## 出力

In [40]:
# Write the selected feature columns to CSV under intermediate_path, with the
# file name versioned by fe_ver and the index omitted. The train file carries
# the target column in addition to the features; the test file has features only.
train_df_fe[fe_cols + [target_col]].to_csv(f'{intermediate_path}train_df_fe_v{fe_ver}.csv', index=False)
test_df_fe[fe_cols].to_csv(f'{intermediate_path}test_df_fe_v{fe_ver}.csv', index=False)