# 前処理

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import japanize_matplotlib

import geopandas as gpd
from sklearn.neighbors import NearestNeighbors

import gc

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the input files (edit to match the local layout).
ROOT_DIR = '../../input/'

# Input files and folders, all located under ROOT_DIR.
train_file_path = f'{ROOT_DIR}train.csv'
test_file_path = f'{ROOT_DIR}test.csv'
data_definition_path = f'{ROOT_DIR}data_definition.xlsx'
gis_path = f'{ROOT_DIR}GISデータ/'

# Directory that receives the preprocessed CSV files.
intermediate_path = '../../output/intermediate_file/'

# Target column (selected for the training split only).
target_col = 'money_room'

# Version of this preprocessing script; embedded in the output file names.
preprocessing_ver = 2

## File Import

In [3]:
# Open the data-definition workbook; its sheet names are looked up by index.
data_definition = pd.ExcelFile(data_definition_path)

# The first sheet describes the columns of the dataset.
data_definition_df = pd.read_excel(
    data_definition_path,
    sheet_name=data_definition.sheet_names[0],
)

# Columns flagged with fe_cols == 1 are the ones used as features.
is_feature = data_definition_df['fe_cols'] == 1
fe_cols = list(data_definition_df.loc[is_feature, '本番データ特徴量名'])

In [4]:
# Load both splits, keeping only the selected feature columns
# (plus the target column for the training split).
train_columns = fe_cols + [target_col]
train_df = pd.read_csv(train_file_path)[train_columns]
test_df = pd.read_csv(test_file_path)[fe_cols]

## 建物種別の場合分け

In [5]:
import numpy as np
import pandas as pd

def assign_building_category(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with two labels derived from ``building_type``.

    * ``building_category`` - fine-grained label for each building_type code.
    * ``model_category`` - coarse grouping: ``residential_multi``, ``house``,
      ``land`` or ``non_residential``.

    Codes that are absent from the lookup tables (including missing values)
    become ``'unknown'`` in both columns. The input frame is not modified.
    """
    # Fine-grained label for each building_type code.
    fine_labels = {
        1:   'mansion',          # condominium
        2:   'townhouse',
        3:   'apartment',
        4:   'house',            # detached house
        5:   'terrace_house',
        6:   'land',
        7:   'parking',
        8:   'office_building',
        9:   'shop',
        10:  'warehouse',
        11:  'factory',
        12:  'dormitory',
        13:  'hotel',
        14:  'ryokan',
        15:  'other',
        901: 'unknown',
        902: 'other_unknown',
        999: 'other_unknown',
    }

    # Coarse grouping: label -> building_type codes that belong to it.
    coarse_groups = {
        'residential_multi': (1, 2, 3, 5, 12),  # condo, apartment, town/terrace house, dormitory
        'house':             (4,),              # detached house
        'land':              (6, 7),            # land, parking
        'non_residential':   (8, 9, 10, 11, 13, 14, 15, 901, 902, 999),
    }
    coarse_labels = {
        code: label
        for label, codes in coarse_groups.items()
        for code in codes
    }

    building_type = df['building_type']
    return df.assign(
        building_category=building_type.map(fine_labels).fillna('unknown'),
        model_category=building_type.map(coarse_labels).fillna('unknown'),
    )

In [6]:
# Derive the building categories for both splits.
train_df_preprocessed, test_df_preprocessed = (
    assign_building_category(frame) for frame in (train_df, test_df)
)

## データ更新日以降のリフォーム・リノベ情報の削除

In [7]:
def convert_yyyymm_to_datetime(s):
    """Convert a YYYYMM series (float or int) to datetimes (YYYY-MM-01).

    Missing values, and values that do not parse as ``%Y%m``, become ``NaT``.
    """
    return pd.to_datetime(s.astype('Int64').astype(str), format='%Y%m', errors='coerce')


def nullify_future_reform_info(df):
    """Blank out reform/renovation information dated after ``snapshot_modify_date``.

    For every date column that is present, values later than the row's
    ``snapshot_modify_date`` are replaced by ``NaT``. For the paired content
    columns (``reform_interior``, ``reform_exterior``, ``reform_wet_area``)
    the content is set to ``pd.NA`` on the rows whose paired date was in the
    future.

    ``df`` is modified in place (date columns are converted to datetime) and
    is also returned.
    """
    # Date columns stored as YYYYMM numbers.
    yyyymm_cols = [
        'reform_date',
        'reform_wet_area_date',
        'reform_interior_date',
        'reform_exterior_date'
    ]
    # Date columns stored as free-form date strings.
    free_date_cols = [
        'renovation_date'
    ]
    # Content column -> date column that tells when that work was done.
    paired_cols = {
        'reform_interior': 'reform_interior_date',
        'reform_exterior': 'reform_exterior_date',
        'reform_wet_area': 'reform_wet_area_date',
    }

    df['snapshot_modify_date'] = pd.to_datetime(df['snapshot_modify_date'], errors='coerce')
    snapshot = df['snapshot_modify_date']

    # Remember which rows were in the future for each date column. The mask
    # must be captured BEFORE the date is overwritten with NaT: comparing NaT
    # with a timestamp is always False, so a mask computed afterwards would
    # never select anything and the content columns would be left untouched.
    future_masks = {}

    for col in yyyymm_cols:
        if col in df.columns:
            df[col] = convert_yyyymm_to_datetime(df[col])
            future_masks[col] = df[col] > snapshot
            df.loc[future_masks[col], col] = pd.NaT

    for col in free_date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            future_masks[col] = df[col] > snapshot
            df.loc[future_masks[col], col] = pd.NaT

    # Drop the reform content wherever its date was in the future.
    for flag_col, date_col in paired_cols.items():
        if flag_col in df.columns and date_col in future_masks:
            df.loc[future_masks[date_col], flag_col] = pd.NA

    return df

In [8]:
# Remove reform/renovation information dated after the snapshot, for both splits.
train_df_preprocessed, test_df_preprocessed = (
    nullify_future_reform_info(frame)
    for frame in (train_df_preprocessed, test_df_preprocessed)
)

## 都道府県・市区町村情報の置換

In [9]:
# Lookup table (4th sheet of the data-definition workbook) from the
# (addr1_1, addr1_2) code pair to prefecture / municipality names.
address_keys = ['addr1_1', 'addr1_2']
codes = pd.read_excel(data_definition_path, sheet_name=data_definition.sheet_names[3])
codes.columns = ['No.', 'addr1_1', 'addr1_2', 'Prefecture name',
       'City/town/village name']
codes = codes[address_keys + ['Prefecture name', 'City/town/village name']]

# Keep one lookup row per key so the join below can never multiply rows.
codes = codes.drop_duplicates(subset=address_keys)

# Left join: every train/test row is kept. Rows whose codes are missing from
# the lookup get NaN names instead of being silently dropped, which an inner
# join would do (and which would shrink the test set).
train_df_preprocessed = pd.merge(train_df_preprocessed, codes, on=address_keys, how='left')
test_df_preprocessed = pd.merge(test_df_preprocessed, codes, on=address_keys, how='left')

In [10]:
# The raw address codes are dropped now that the names have been merged in.
address_code_cols = ['addr1_1', 'addr1_2']
train_df_preprocessed = train_df_preprocessed.drop(columns=address_code_cols)
test_df_preprocessed = test_df_preprocessed.drop(columns=address_code_cols)
del codes

## 面積関連の欠損値・異常値の処理

カラム名 | 意味 | 補足 | データの紐づけ | 欠損率
:- |:- |:- |:- |:-
total_floor_area | 延べ床面積 | 建物全体の床面積 | 棟情報 | 0.69
building_area | 建築面積 | 建築面積 | 棟情報 | 0.98
building_land_area | 土地面積 | | 棟情報 | 0.52
land_area_all | 敷地全体面積 | 敷地全体の面積 | 棟情報 | 0.82
unit_area_min | 専有面積 下限 | | 棟情報 | 0.65
unit_area_max | 専有面積 上限 | | 棟情報 | 0.65
land_kenpei | 建ぺい率(建築面積 ÷ 敷地面積) | 賃貸：土地、売買：土地で「都市計画」が 1:市街化区域 の場合に必須 単位：% | 棟情報 | 0.28
land_youseki | 容積率(延べ床面積 ÷ 敷地面積) | 賃貸：土地、売買：土地で「都市計画」が 1:市街化区域 の場合に必須 単位：% | 棟情報 | 0.28
unit_area | 専有面積 | | 棟情報 | 0.13
snapshot_land_area | 区画面積(代表) | 単位：平米 | 物件情報 | 0.52
house_area | 建物面積/専有面積(代表) | 単位：平米 | 物件情報 | 0

対応
- 専有面積（senyu）
    - 集合住宅：unit_area_corrected
    - 戸建：house_area
    - 土地・非住宅：NaN
- 延床（nobeyuka）
    - 戸建・非住宅：total_floor_area（なければ戸建は house_area も許容）
    - 集合住宅・土地：NaN
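- 区画（kukaku）/ 土地（tochi）/ 敷地（shikichi）
    - 集合住宅：基本 NaN（価格決定にはほぼ効かない）
    - 戸建・土地・非住宅：カラムごとに優先順位が異なる
        - kukaku：snapshot_land_area → なければ building_land_area
        - tochi：building_land_area → なければ snapshot_land_area
        - shikichi：land_area_all → なければ tochi → なければ kukaku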

In [11]:
import numpy as np
import pandas as pd

def assign_area_category(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with harmonised area columns added.

    Added columns (NaN wherever no rule applies):

    * ``unit_area_corrected`` - ``unit_area``, falling back to
      ``unit_area_min`` and then ``unit_area_max``; a value outside
      ``[unit_area_min, unit_area_max]`` is replaced by the range midpoint.
    * ``senyu_area`` - ``unit_area_corrected`` for ``residential_multi`` rows,
      ``house_area`` for ``house`` rows.
    * ``nobeyuka_area`` - ``total_floor_area`` (falling back to ``house_area``)
      for ``house`` rows, ``total_floor_area`` for ``non_residential`` rows.
    * ``kukaku_area`` - ``snapshot_land_area``, falling back to
      ``building_land_area``; house / land / non-residential rows only.
    * ``tochi_area`` - ``building_land_area``, falling back to
      ``snapshot_land_area``; house / land / non-residential rows only.
    * ``shikichi_area`` - ``land_area_all``, falling back to ``tochi_area``
      and then ``kukaku_area``; NaN for ``residential_multi`` rows.
    """
    out = df.copy()

    def _all_nan() -> pd.Series:
        # Fresh float column of NaN aligned with the frame.
        return pd.Series(np.nan, index=out.index)

    # Row masks per model category.
    category = out['model_category']
    is_multi = category == 'residential_multi'
    is_house = category == 'house'
    is_land = category == 'land'
    is_nonres = category == 'non_residential'
    uses_land = is_house | is_land | is_nonres

    # 1. unit_area_corrected: fill from min, then max; pull out-of-range
    #    values back to the midpoint of [min, max].
    lower = out['unit_area_min']
    upper = out['unit_area_max']
    corrected = out['unit_area'].fillna(lower).fillna(upper)
    out_of_range = (
        corrected.notna() & lower.notna() & upper.notna()
        & ((corrected < lower) | (corrected > upper))
    )
    midpoint = (lower + upper) / 2
    corrected.loc[out_of_range] = midpoint.loc[out_of_range]
    out['unit_area_corrected'] = corrected

    # 2. senyu_area: land and non-residential rows stay NaN.
    senyu = _all_nan()
    senyu.loc[is_multi] = corrected.loc[is_multi]
    senyu.loc[is_house] = out.loc[is_house, 'house_area']
    out['senyu_area'] = senyu

    # 3. nobeyuka_area: multi-unit and land rows stay NaN.
    floor_area = out['total_floor_area']
    house_floor = floor_area.fillna(out['house_area'])
    nobeyuka = _all_nan()
    nobeyuka.loc[is_house] = house_floor.loc[is_house]
    nobeyuka.loc[is_nonres] = floor_area.loc[is_nonres]
    out['nobeyuka_area'] = nobeyuka

    # 4. kukaku_area: snapshot_land_area first, then building_land_area.
    snapshot_land = out['snapshot_land_area']
    building_land = out['building_land_area']
    kukaku = _all_nan()
    kukaku.loc[uses_land] = snapshot_land.fillna(building_land).loc[uses_land]
    out['kukaku_area'] = kukaku

    # 5. tochi_area: building_land_area first, then snapshot_land_area.
    tochi = _all_nan()
    tochi.loc[uses_land] = building_land.fillna(snapshot_land).loc[uses_land]
    out['tochi_area'] = tochi

    # 6. shikichi_area: land_area_all -> tochi_area -> kukaku_area.
    out['shikichi_area'] = (
        out['land_area_all'].astype('float64').fillna(tochi).fillna(kukaku)
    )

    # Land-type areas are blanked for multi-unit housing.
    out.loc[is_multi, ['kukaku_area', 'tochi_area', 'shikichi_area']] = np.nan

    return out

In [12]:
# Add the harmonised area columns to both splits.
train_df_preprocessed, test_df_preprocessed = (
    assign_area_category(frame)
    for frame in (train_df_preprocessed, test_df_preprocessed)
)

## 欠損値・異常値の対応
このブロックも見直した方がいいかも

In [13]:
# Number of training rows; used to split the stacked frame again at the end of this cell.
n_train = train_df_preprocessed.shape[0]

# 1. Stack train on top of test with a fresh 0..N-1 index.
df = pd.concat(
    (train_df_preprocessed, test_df_preprocessed),
    axis=0,
    ignore_index=True,
)

# -------------------------------------------------
# 2. クリーニング（clean列を作る）
#   面積系は model_category ごとの分位点で外れ値を NaN にする
# -------------------------------------------------

def make_area_clean(df: pd.DataFrame,
                    col: str,
                    lower_q: float = 0.01,
                    upper_q: float = 0.99) -> pd.Series:
    """Return ``df[col]`` with per-category outliers replaced by NaN.

    For each ``model_category`` the ``lower_q`` and ``upper_q`` quantiles of
    the non-missing values of ``col`` are computed; values outside that
    closed interval become NaN. Rows of a category that has no valid value
    at all come back as NaN.
    """
    # Quantile bounds per category, computed on the non-missing values only.
    observed = df[df[col].notna()]
    bounds = (
        observed.groupby('model_category')[col]
        .quantile([lower_q, upper_q])
        .unstack()
    )
    bounds.columns = ['q_low', 'q_high']

    # Broadcast each row's category bounds onto the row. A category without
    # bounds maps to NaN, which makes `between` False and the value NaN.
    categories = df['model_category']
    low = categories.map(bounds['q_low'])
    high = categories.map(bounds['q_high'])

    values = df[col]
    return values.where(values.between(low, high))



# Room count: keep values in 1..7, everything else becomes NaN.
room_count = df['madori_number_all']
df['madori_number_clean'] = room_count.where(room_count.between(1, 7))

# Area columns: quantile-based cleaning per model_category.
for area_col in ('senyu_area', 'kukaku_area', 'nobeyuka_area', 'tochi_area', 'shikichi_area'):
    df[f'{area_col}_clean'] = make_area_clean(df, area_col)

# Layout kind: anything outside the list of valid codes becomes NaN.
valid_codes = [10, 20, 25, 30, 35, 40, 45, 50, 55]
layout_kind = df['madori_kind_all']
df['madori_kind_clean'] = layout_kind.where(layout_kind.isin(valid_codes))

# Floor count: drop only clear outliers, i.e. values above the overall
# 99th percentile (quantile() ignores missing values).
floor_upper = df['floor_count'].quantile(0.99)
df['floor_count_clean'] = df['floor_count'].where(df['floor_count'] <= floor_upper)

# -------------------------------------------------
# 3. building_id 単位で補完
# -------------------------------------------------

def mode_or_nan(s: pd.Series):
    """Return the most frequent non-missing value of ``s`` (NaN if there is none)."""
    m = s.mode()
    return m.iloc[0] if not m.empty else np.nan

def median_or_nan(s: pd.Series):
    """Return the median of ``s`` (NaN if every value is missing)."""
    return s.median() if s.notna().any() else np.nan

g = df.groupby('building_id')

# Imputation rule used below: a row keeps its own cleaned value; only values
# that are missing (or were removed as outliers) are filled with the statistic
# of the same building. Writing the group statistic to every row would
# overwrite valid per-unit values, and rows without a building_id - which
# groupby drops - would lose their value altogether.

# --- columns filled with the building-level mode ---
mode_fill_specs = {
    'madori_number_all': 'madori_number_clean',
    'madori_kind_all':   'madori_kind_clean',
    'floor_count':       'floor_count_clean',
}

for out_col, clean_col in mode_fill_specs.items():
    df[out_col] = df[clean_col].fillna(g[clean_col].transform(mode_or_nan))

# --- columns filled with the building-level median (areas) ---
median_fill_specs = {
    'senyu_area':     'senyu_area_clean',
    'kukaku_area':    'kukaku_area_clean',
    'nobeyuka_area':  'nobeyuka_area_clean',
    'tochi_area':     'tochi_area_clean',
    'shikichi_area':  'shikichi_area_clean',
}

for out_col, clean_col in median_fill_specs.items():
    df[out_col] = df[clean_col].fillna(g[clean_col].transform(median_or_nan))

# -------------------------------------------------
# 4. その他の補正・フラグ
# -------------------------------------------------

# A unit cannot sit above the top floor: cap room_floor at floor_count
# wherever floor_count is known.
top_floor = df['floor_count']
above_top = (df['room_floor'] > top_floor) & top_floor.notna()
df.loc[above_top, 'room_floor'] = top_floor[above_top]

# Missing-value indicator columns (1 = missing, 0 = present).
for source_col in ('floor_count', 'building_type'):
    df[f'{source_col}_missing'] = df[source_col].isna().astype(int)

# -------------------------------------------------
# 5. Split back into train / test
# -------------------------------------------------
train_df_preprocessed = df.iloc[:n_train].copy()
test_df_preprocessed = df.iloc[n_train:].copy()


## 分布が歪なカラムをlog変換

In [14]:
# Columns with skewed distributions that get a log-transformed companion feature.
area_kinds = ('senyu', 'nobeyuka', 'kukaku', 'tochi', 'shikichi')
log_cols = [f'{kind}_area' for kind in area_kinds] + ['unit_count']

In [15]:
# Name of each log feature, in the same order as log_cols.
log_fe_cols = [f'{col}_log' for col in log_cols]

for df in (train_df_preprocessed, test_df_preprocessed):
    for col, new_col in zip(log_cols, log_fe_cols):
        # Negative values are not expected, but clip at 0 to be safe before log1p.
        df[new_col] = np.log1p(df[col].clip(lower=0))

## リフォーム・リノベのタグ変換

In [16]:
def get_slashed_tags(df, cols_list):
    """Expand slash-separated columns into 0/1 ``int8`` flag columns.

    Each column in ``cols_list`` is split on ``'/'``; every distinct token
    becomes one flag column named ``'<column> <token>'``. The flag frames of
    all columns are concatenated side by side and returned.
    """
    flag_frames = [
        df[col].str.get_dummies(sep='/').add_prefix(f'{col} ').astype('int8')
        for col in cols_list
    ]
    return pd.concat(flag_frames, axis=1).astype('int8')

In [17]:
# Slash-separated reform columns to expand into 0/1 flags.
slashed_cols = ['reform_interior', 'reform_exterior', 'reform_wet_area']

# Stack train and test so both splits receive the same set of flag columns.
combined_df = pd.concat(
    (train_df_preprocessed, test_df_preprocessed),
    ignore_index=True,
)
slashed_df = get_slashed_tags(combined_df, slashed_cols)

In [18]:
# Row count of the training split, captured before the frames are rebound.
n_train_rows = len(train_df_preprocessed)

# Names of the generated flag columns.
reform_cols = list(slashed_df.columns)

# Attach the flags and drop the original slash-separated columns.
combined_df = (
    pd.concat([combined_df, slashed_df], axis=1)
    .drop(columns=slashed_cols)
)

# Split back into train / test.
train_df_preprocessed = combined_df.iloc[:n_train_rows].copy()
test_df_preprocessed = combined_df.iloc[n_train_rows:].copy()

In [19]:
# The flag frame has been merged into the splits; drop the name and run a
# garbage-collection pass.
del slashed_df
gc.collect()

70

## 築年数の算出

In [20]:
def parse_year(date_input):
    """Return the first four characters of ``date_input`` as an integer year.

    The value is converted with ``str()`` first, so numbers such as
    ``202301`` or ``1995.0`` work as well as strings. NaN is returned when the
    text is shorter than four characters (e.g. ``'nan'``) or when the first
    four characters are not an integer.
    """
    try:
        s = str(date_input)
        if len(s) < 4:
            return np.nan
        return int(s[:4])
    except (TypeError, ValueError):
        # Only conversion failures are treated as "no year"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return np.nan

def add_age_features(df):
    """Add ``built_year``, ``target_year`` and ``built_diff`` (building age) to ``df``.

    ``built_diff`` is ``target_year - built_year``; negative ages are
    replaced by NaN. ``df`` is modified in place and also returned.
    """
    # Extract only the year from year_built and target_ym.
    df['built_year']  = df['year_built'].apply(parse_year)
    df['target_year'] = df['target_ym'].apply(parse_year)

    # Building age = target year - year built.
    df['built_diff'] = df['target_year'] - df['built_year']

    # A negative age is not possible, so treat it as missing.
    df.loc[df['built_diff'] < 0, 'built_diff'] = np.nan

    return df

# Bind the result to train_df_preprocessed (not train_df) so the training
# split is handled exactly like the test split and does not depend on
# add_age_features mutating its argument in place.
train_df_preprocessed = add_age_features(train_df_preprocessed)
test_df_preprocessed  = add_age_features(test_df_preprocessed)

## 周期変換

In [21]:
# Latitude / longitude in radians plus their sine and cosine, for both splits.
for df in (train_df_preprocessed, test_df_preprocessed):
    df['lat_rad'] = np.radians(df['lat'].astype(float))
    df['lon_rad'] = np.radians(df['lon'].astype(float))

    for axis in ('lat', 'lon'):
        df[f'sin_{axis}'] = np.sin(df[f'{axis}_rad'])
        df[f'cos_{axis}'] = np.cos(df[f'{axis}_rad'])

In [22]:
# TODO: 風向きも周期変換したい

## タグ情報のOneHot

In [23]:
# Columns taken from each tag sheet: tag ID, tag text, tag class.
tag_sheet_cols = ['タグID', 'タグ内容', 'タグ分類']
definition_xlsx = f'{ROOT_DIR}/data_definition.xlsx'

# 3rd sheet of the workbook.
tag_info = pd.read_excel(
    definition_xlsx, sheet_name=data_definition.sheet_names[2]
)[tag_sheet_cols]

# 5th sheet of the workbook.
facilities_info = pd.read_excel(
    definition_xlsx, sheet_name=data_definition.sheet_names[4]
)[tag_sheet_cols]

In [24]:
# Merge both tag sheets, drop exact duplicates and the rows whose class is
# '不要', then build the feature name '<class>_<text>' for each tag.
tag_master = (
    pd.concat([tag_info, facilities_info], axis=0, ignore_index=True)
    .drop_duplicates()
    .loc[lambda t: t['タグ分類'] != '不要']
    .assign(**{'タグ情報': lambda t: t['タグ分類'] + '_' + t['タグ内容']})
)

In [25]:
# Turn the tag master into a plain dict {tag ID as str: feature name}.
tag_master = (
    tag_master
    .assign(**{'タグID': tag_master['タグID'].astype('str')})
    .set_index('タグID')['タグ情報']
    .to_dict()
)

In [26]:
# Stack train and test so both splits receive the same set of tag columns.
combined_df = pd.concat([train_df_preprocessed, test_df_preprocessed], ignore_index=True)
org_tag_cols = ['building_tag_id', 'unit_tag_id', 'statuses']

# tag_master is expected to be a dict {tag_id: tag_name}.
valid_tag_ids = set(tag_master.keys())      # tag IDs that can be renamed
valid_tag_names = set(tag_master.values())  # names after renaming

tag_dfs = []

for col in org_tag_cols:
    # One 0/1 column per tag ID found in the slash-separated column.
    temp_df = combined_df[col].str.get_dummies(sep='/')

    # Rename tag IDs to their readable feature names.
    temp_df = temp_df.rename(columns=tag_master)

    # Drop columns that were not renamed, i.e. IDs missing from tag_master.
    temp_df = temp_df.loc[:, temp_df.columns.isin(valid_tag_names)]

    tag_dfs.append(temp_df.astype('int8'))

# Side-by-side concatenation; the same tag name can now appear more than once.
tag_df = pd.concat(tag_dfs, axis=1).astype('int8')

# Collapse same-named columns, a 1 in any of them wins (max). Grouping is done
# on the transposed frame because groupby(..., axis=1) is deprecated in
# pandas 2.1 and removed in later versions; the result has one column per
# distinct name, sorted by name.
tag_df = tag_df.T.groupby(level=0).max().T

In [27]:
# Row count of the training split, captured before the frames are rebound.
n_train_rows = len(train_df_preprocessed)

# Names of the generated tag columns (each name once).
tag_cols = tag_df.columns.drop_duplicates().tolist()

# Attach the tag flags and drop the original slash-separated columns.
combined_df = (
    pd.concat([combined_df, tag_df], axis=1)
    .drop(columns=org_tag_cols)
)

# Split back into train / test.
train_df_preprocessed = combined_df.iloc[:n_train_rows].copy()
test_df_preprocessed = combined_df.iloc[n_train_rows:].copy()

## 地価

In [28]:
# Published land price points (GeoJSON under the GIS data folder).
land_price_path = f'{gis_path}公示地価/L01-23_GML/L01-23.geojson'
land_gdf = gpd.read_file(land_price_path)

# Show the CRS of the file. EPSG:4612 is JGD2000; EPSG:4326 would be WGS84.
print(land_gdf.crs)

EPSG:4612


In [29]:
# TODO: also use the year-over-year change rate as a feature.

# Price columns of earlier years: year -> source column 'L01_<year - 1921>'
# (2018 -> L01_097, ..., 2022 -> L01_101), renamed to 'land_price_<year>'.
year_to_col = {y: f'L01_{y - 1921:03d}' for y in range(2018, 2023)}
rename_price_cols = {
    col: f'land_price_{year}' for year, col in year_to_col.items()
}

land_gdf = land_gdf.rename(columns={
    'L01_005': 'land_year',            # survey year (e.g. 2023)
    'L01_006': 'land_price_2023',      # published land price [yen / m2]
    'L01_007': 'land_price_yoy_pct',   # year-over-year change [%]
    **rename_price_cols,
})

In [30]:
def fill_jgd_from_wgs(df, el_col='el', nl_col='nl',
                      lon_wgs_col='lon', lat_wgs_col='lat'):
    """Add ``lon_jgd`` / ``lat_jgd`` (degrees) to ``df`` and fill gaps from lon/lat.

    ``lon_jgd`` and ``lat_jgd`` are ``el`` / ``nl`` divided by 3,600,000.
    Rows where either of them is missing, but both ``lon_wgs_col`` and
    ``lat_wgs_col`` are present, are filled by reprojecting that lon/lat pair
    from EPSG:6668 to EPSG:4612 (JGD2000).

    NOTE(review): the parameter names suggest WGS84 (EPSG:4326) input while
    the source CRS is declared as EPSG:6668 (JGD2011) - confirm which one the
    ``lon`` / ``lat`` columns really use.

    ``df`` is modified in place and also returned.
    """
    df['lon_jgd'] = df[el_col] / 3_600_000
    df['lat_jgd'] = df[nl_col] / 3_600_000

    # Rows to fill: at least one JGD coordinate is missing and the fallback
    # pair is complete. Using "either missing" (not "both missing") avoids
    # leaving a row with only one usable coordinate; both values are then
    # replaced together so the pair stays consistent.
    mask = (
        (df['lon_jgd'].isna() | df['lat_jgd'].isna())
        & df[lon_wgs_col].notna()
        & df[lat_wgs_col].notna()
    )

    if not mask.any():
        return df  # nothing to fill

    df_sub = df.loc[mask].copy()

    # Points built from the fallback lon/lat pair, declared as EPSG:6668.
    gdf_src = gpd.GeoDataFrame(
        df_sub,
        geometry=gpd.points_from_xy(df_sub[lon_wgs_col], df_sub[lat_wgs_col]),
        crs='EPSG:6668'
    )

    # Reproject to JGD2000 (EPSG:4612).
    gdf_jgd = gdf_src.to_crs(epsg=4612)

    # Write the reprojected coordinates back.
    df.loc[mask, 'lon_jgd'] = gdf_jgd.geometry.x.values
    df.loc[mask, 'lat_jgd'] = gdf_jgd.geometry.y.values

    return df

In [31]:
# Compute the JGD coordinates separately for train and test.
train_df_preprocessed, test_df_preprocessed = (
    fill_jgd_from_wgs(frame)
    for frame in (train_df_preprocessed, test_df_preprocessed)
)

In [32]:
def add_land_price_features_haversine(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    land_gdf: gpd.GeoDataFrame,
    lat_col: str,
    lon_col: str,
    year_col: str,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Attach land-price features from the nearest land price points.

    For every property the three nearest points of ``land_gdf`` are found
    with the haversine (great-circle) distance, and the price is read from
    the ``land_price_<year>`` column matching the property's ``year_col``.

    Added columns:

    - ``nearest_land_price``: price at the nearest point
    - ``distance_to_landpoint_m``: distance to that point [m]
    - ``log_land_price``: ``log1p(nearest_land_price)``
    - ``weighted_land_price_3``: inverse-distance weighted mean price of the
      three nearest points (if some of them are at distance 0, the mean price
      of those points)
    - ``log_weighted_land_price_3``: ``log1p`` of the above

    Coordinates must be in degrees and use the same datum on both sides.
    Rows with missing or out-of-range coordinates get NaN in every added
    column; rows with a missing year get NaN prices but keep the distance.

    Raises:
        KeyError: if ``land_gdf`` has no ``land_price_<year>`` column for a
            year that occurs in the data.

    Returns:
        ``(train_out, test_out)`` with a fresh 0..N-1 index each.
    """
    earth_radius_m = 6_371_000.0

    # 1) Stack train and test; with ignore_index labels equal positions.
    combined_df = pd.concat([train_df, test_df], ignore_index=True)

    # 2) Only rows with usable coordinates take part in the search.
    lat = combined_df[lat_col]
    lon = combined_df[lon_col]
    valid_mask = (
        lat.notna() & lon.notna()
        & lat.between(-90, 90)
        & lon.between(-180, 180)
    )
    valid_pos = np.flatnonzero(valid_mask.to_numpy())

    n_all = len(combined_df)
    nearest_all  = np.full(n_all, np.nan, dtype=float)
    dist_all     = np.full(n_all, np.nan, dtype=float)
    weighted_all = np.full(n_all, np.nan, dtype=float)

    # 3) Land price points as (lat, lon) in radians, the order the haversine
    #    metric expects.
    land_X = np.deg2rad(np.column_stack([
        land_gdf.geometry.y.to_numpy(),
        land_gdf.geometry.x.to_numpy(),
    ]))

    # 4) 3-nearest-neighbour index on the land price points.
    nn = NearestNeighbors(n_neighbors=3, metric="haversine")
    nn.fit(land_X)

    if len(valid_pos) > 0:
        # 5) Property coordinates in radians.
        prop_X = np.deg2rad(np.column_stack([
            lat.to_numpy(dtype=float)[valid_pos],
            lon.to_numpy(dtype=float)[valid_pos],
        ]))

        # kneighbors returns angular distances in radians -> metres.
        distances_rad, indices = nn.kneighbors(prop_X)  # both (n_valid, 3)
        dist_m = distances_rad * earth_radius_m

        # Year per valid row; a missing year stays NaN instead of crashing.
        years = pd.to_numeric(combined_df[year_col], errors='coerce') \
                  .to_numpy(dtype=float)[valid_pos]

        nearest_prices  = np.full(len(valid_pos), np.nan, dtype=float)
        weighted_prices = np.full(len(valid_pos), np.nan, dtype=float)

        # One vectorised pass per distinct year instead of a Python loop over
        # every row.
        for year in np.unique(years[~np.isnan(years)]):
            col_name = f'land_price_{int(year)}'
            if col_name not in land_gdf.columns:
                raise KeyError(
                    f'{col_name} が land_gdf に存在しません。'
                    f'列名や対象年の範囲を確認してください。'
                )

            rows = years == year
            price_by_point = land_gdf[col_name].to_numpy().astype(float)
            prices = price_by_point[indices[rows]]  # (n_rows, 3)
            d = dist_m[rows]                        # (n_rows, 3)

            nearest_prices[rows] = prices[:, 0]

            at_point = d == 0
            has_exact = at_point.any(axis=1)
            # Divisions by zero only occur in entries that np.where discards.
            with np.errstate(divide='ignore', invalid='ignore'):
                # Inverse-distance weights, normalised per row.
                w = 1.0 / d
                w = w / w.sum(axis=1, keepdims=True)
                idw_mean = (w * prices).sum(axis=1)
                # Mean price of the points lying exactly on the property.
                exact_mean = (
                    np.where(at_point, prices, 0.0).sum(axis=1)
                    / at_point.sum(axis=1)
                )
            weighted_prices[rows] = np.where(has_exact, exact_mean, idw_mean)

        nearest_all[valid_pos]  = nearest_prices
        dist_all[valid_pos]     = dist_m[:, 0]
        weighted_all[valid_pos] = weighted_prices

    # 6) Attach the new columns.
    combined_df['nearest_land_price']        = nearest_all
    combined_df['distance_to_landpoint_m']   = dist_all
    combined_df['log_land_price']            = np.log1p(combined_df['nearest_land_price'])
    combined_df['weighted_land_price_3']     = weighted_all
    combined_df['log_weighted_land_price_3'] = np.log1p(combined_df['weighted_land_price_3'])

    # 7) Split back into train / test.
    n_train = len(train_df)
    train_out = combined_df.iloc[:n_train].reset_index(drop=True)
    test_out  = combined_df.iloc[n_train:].reset_index(drop=True)

    return train_out, test_out


In [34]:
# Land-price lookup settings: JGD coordinates and the listing's target year.
land_price_kwargs = dict(
    land_gdf=land_gdf,
    lat_col='lat_jgd',
    lon_col='lon_jgd',
    year_col='target_year',
)
train_df_preprocessed, test_df_preprocessed = add_land_price_features_haversine(
    train_df_preprocessed,
    test_df_preprocessed,
    **land_price_kwargs,
)

## 税部分の計算

In [35]:
# TODO: money_kyoueki_tax, parking_money_tax

## 比率算出の計算

In [36]:
# TODO: 私道、持分比率など

## 特徴量の追加・削除

In [37]:
# Features created in this notebook that are added to the output.
# NOTE(review): sin_lat / cos_lat / sin_lon / cos_lon, built_year and
# lat_jgd / lon_jgd are computed above but not listed here - confirm
# whether leaving them out of the output is intended.
fe_cols += [
    'building_category', 'model_category',
    'Prefecture name', 'City/town/village name',
    'floor_count_missing', 'building_type_missing',
    'built_diff', 'target_year',
    'senyu_area', 'nobeyuka_area', 'kukaku_area', 'tochi_area', 'shikichi_area',
    'nearest_land_price', 'weighted_land_price_3', 'distance_to_landpoint_m', 'log_land_price', 'log_weighted_land_price_3'
] + log_fe_cols + tag_cols + reform_cols

# Features removed from the output (raw columns replaced by derived ones).
remove_cols = [
    'bukken_type', 'building_type',
    'addr1_1', 'addr1_2',
    'snapshot_modify_date',
    'unit_area', 'house_area', 'total_floor_area', 'snapshot_land_area', 'building_land_area', 'land_area_all', 'building_area',
    'el', 'nl',
    'building_tag_id', 'unit_tag_id', 'statuses',
] + slashed_cols

# Drop the removed columns and de-duplicate while keeping the first
# occurrence order. Because `+=` appends on every execution, re-running this
# cell would otherwise repeat names and write duplicated columns to the CSV.
removed = set(remove_cols)
fe_cols = list(dict.fromkeys(c for c in fe_cols if c not in removed))

## 出力

In [38]:
# Output files carry the preprocessing version in their name.
train_out_path = f'{intermediate_path}train_df_preprocessed_v{preprocessing_ver}.csv'
test_out_path = f'{intermediate_path}test_df_preprocessed_v{preprocessing_ver}.csv'

# Train keeps the target column; test holds the features only.
train_df_preprocessed[fe_cols + [target_col]].to_csv(train_out_path, index=False)
test_df_preprocessed[fe_cols].to_csv(test_out_path, index=False)