# 国土数値情報の取得

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import japanize_matplotlib

import geopandas as gpd
from sklearn.neighbors import NearestNeighbors

import gc
from pathlib import Path
import re
from pyogrio import read_dataframe

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 200)

In [2]:
# Directory where the input files are stored (adjust to your own environment)
ROOT_DIR = '../input/'
train_file_path = ROOT_DIR + 'train.csv'
test_file_path = ROOT_DIR + 'test.csv'
intermediate_path = '../output/intermediate_file/'
# Root folder of the GIS datasets; later cells append dataset sub-paths to it
gis_path = ROOT_DIR + 'GISデータ/'

# Key columns and coordinate columns selected when train/test are loaded
pkey_cols = ['target_ym', 'building_id', 'unit_id']
geo_cols = ['lat', 'lon']

# NOTE(review): not referenced in the visible part of the notebook —
# presumably a version tag for the saved feature file; confirm.
geo_ver = 1

## File Import

In [3]:
# Load train/test and keep only the key columns and the coordinate columns
train_df_geo = pd.read_csv(train_file_path)[pkey_cols + geo_cols]
test_df_geo = pd.read_csv(test_file_path)[pkey_cols + geo_cols]

## データの変換

#### 対象年の抽出

In [4]:
def parse_year(date_input):
    """Extract the leading four-digit year from a year-month style value.

    The value is converted with ``str()`` and its first four characters are
    parsed as an integer (e.g. ``202301`` -> ``2023``).

    Args:
        date_input: Any value whose string form starts with the year.

    Returns:
        The year as ``int``, or ``np.nan`` when the string form is shorter
        than four characters or its first four characters are not an integer
        (for example ``None`` or ``NaN``).
    """
    s = str(date_input)
    if len(s) < 4:
        return np.nan
    try:
        return int(s[:4])
    except ValueError:
        # Non-numeric prefix (e.g. 'None'): treat as missing rather than
        # swallowing every exception with a bare ``except``.
        return np.nan

In [5]:
# Derive the target year from target_ym (first four characters, see parse_year)
train_df_geo['target_year'] = train_df_geo['target_ym'].apply(parse_year)
test_df_geo['target_year'] = test_df_geo['target_ym'].apply(parse_year)

#### 測地系の変換(EPSG:6668 → EPSG:4612)

In [6]:
def fill_jgd_from_wgs(df, lon_wgs_col='lon', lat_wgs_col='lat'):
    """
    Add JGD2000 (EPSG:4612) coordinates computed from the input lon/lat.

    The input columns are treated as EPSG:6668 (JGD2011) coordinates and
    reprojected to EPSG:4612. The results are written to the ``lon_jgd`` and
    ``lat_jgd`` columns of ``df``, which is then returned.
    """
    # Points in the source CRS (EPSG:6668)
    source_points = gpd.points_from_xy(df[lon_wgs_col], df[lat_wgs_col])
    source_gdf = gpd.GeoDataFrame(df, geometry=source_points, crs='EPSG:6668')

    # Reproject to JGD2000 (EPSG:4612) and read back the coordinates
    reprojected = source_gdf.to_crs(epsg=4612).geometry
    df['lon_jgd'] = reprojected.x.values  # longitude (EPSG:4612)
    df['lat_jgd'] = reprojected.y.values  # latitude (EPSG:4612)

    return df

In [7]:
# Add lon_jgd / lat_jgd (EPSG:4612) columns to both datasets
train_df_geo = fill_jgd_from_wgs(train_df_geo)
test_df_geo = fill_jgd_from_wgs(test_df_geo)

## 地価

In [8]:
# Load the published land price GeoJSON (the L01-23 file is used; the L01-22 line is kept for reference)
# land_gdf2022 = read_dataframe(f'{gis_path}公示地価/L01-22_GML/L01-22.geojson')
land_gdf = read_dataframe(f'{gis_path}公示地価/L01-23_GML/L01-23.geojson')
print(land_gdf.crs) # EPSG:4612 (Japanese Geodetic Datum 2000)

EPSG:4612


In [9]:
# Give the attribute-code columns readable names
land_gdf = land_gdf.rename(columns={
    'L01_005': 'land_year',            # year (e.g. 2023)
    'L01_006': 'land_price_2023',    # published land price [yen/m^2]
    'L01_007': 'land_price_yoy_pct',   # change rate versus the previous year [%]
})

# Map each year 2018..2022 to its column code 'L01_096'..'L01_100'
# (year - 1922, zero-padded to three digits), then rename those columns
# to land_price_<year>.
# NOTE(review): the year-to-code offset is taken as given — confirm against
# the dataset specification.
year_to_col = {y: f'L01_{y - 1922:03d}' for y in range(2018, 2023)}
rename_price_cols = {col: f'land_price_{year}' 
                     for year, col in year_to_col.items()}

land_gdf = land_gdf.rename(columns=rename_price_cols)

In [10]:
def add_land_price_features_haversine(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    land_gdf,
    lat_col: str,
    lon_col: str,
    year_col: str,
    n_neighbors: int = 3,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Add nearest / distance-weighted land price features and year-over-year
    changes to train and test.

    For every row with ``year_col == y`` the reference land points are first
    restricted to those with ``land_price_y > 0`` and ``land_price_{y-1} > 0``;
    the ``n_neighbors`` nearest of them (haversine distance) are then used.

    Purpose:
    - avoid picking up a price of 0 from points that have no previous-year
      value
    - compute yoy / dlog only from points where both years are available

    Assumptions:
    - ``year_col`` holds a calendar year such as 2019
    - ``land_gdf`` has ``land_price_YYYY`` columns and point geometries whose
      x / y are longitude / latitude in degrees
    - ``lat_col`` / ``lon_col`` are in degrees

    Rows with missing or out-of-range coordinates, with a missing year, or
    whose year has no matching pair of price columns keep NaN features.

    Args:
        train_df: Training rows.
        test_df: Test rows.
        land_gdf: Land price GeoDataFrame.
        lat_col: Latitude column name in train/test.
        lon_col: Longitude column name in train/test.
        year_col: Target year column name in train/test.
        n_neighbors: Number of neighbours for the weighted average. The
            output column names keep the ``_3`` suffix regardless of this
            value.

    Returns:
        ``(train_out, test_out)`` with the feature columns appended and the
        original indexes preserved.

    Raises:
        ValueError: If ``n_neighbors`` is not positive.
    """

    if n_neighbors <= 0:
        raise ValueError('n_neighbors must be positive')

    # --- 1) Stack train/test under a MultiIndex so equal index labels do not clash ---
    combined = pd.concat({'train': train_df, 'test': test_df}, axis=0, names=['__set__'])

    # --- 2) Keep only rows with usable coordinates and a usable year ---
    lat = combined[lat_col]
    lon = combined[lon_col]
    valid_mask = (
        lat.notna() & lon.notna()
        & lat.between(-90, 90)
        & lon.between(-180, 180)
        # A NaN year cannot be cast to int64 below; such rows stay NaN instead
        # of raising.
        & combined[year_col].notna()
    )
    df_valid = combined.loc[valid_mask]

    # --- 3) Output containers (indexed by the MultiIndex) ---
    nearest_all = pd.Series(np.nan, index=combined.index, dtype='float64')
    dist_all = pd.Series(np.nan, index=combined.index, dtype='float64')
    weighted_all = pd.Series(np.nan, index=combined.index, dtype='float64')
    nearest_prev_all = pd.Series(np.nan, index=combined.index, dtype='float64')
    weighted_prev_all = pd.Series(np.nan, index=combined.index, dtype='float64')

    if len(df_valid) > 0:
        # Property coordinates in radians (row order of df_valid)
        prop_lat_rad = np.deg2rad(df_valid[lat_col].to_numpy())
        prop_lon_rad = np.deg2rad(df_valid[lon_col].to_numpy())
        prop_X = np.vstack([prop_lat_rad, prop_lon_rad]).T
        finite_mask = np.isfinite(prop_lat_rad) & np.isfinite(prop_lon_rad)

        if finite_mask.any():
            prop_X_f = prop_X[finite_mask]
            idx_valid_f = df_valid.index.to_numpy()[finite_mask]  # MultiIndex labels
            years_valid = df_valid[year_col].to_numpy(dtype=np.int64)[finite_mask]

            # Radius used to convert haversine radians to metres
            R = 6_371_000.0

            # --- 4) Process year by year (the usable land points differ per year) ---
            unique_years = np.unique(years_valid)

            for y in unique_years:
                # Rows (properties) of this year
                rows_mask = (years_valid == y)
                if not rows_mask.any():
                    continue

                col_y = f'land_price_{int(y)}'
                col_y_1 = f'land_price_{int(y) - 1}'

                if col_y not in land_gdf.columns or col_y_1 not in land_gdf.columns:
                    # Without both price columns this year cannot be built; skip it
                    continue

                # --- 4-1) Restrict land points to "current > 0 and previous > 0" ---
                land_price_y = land_gdf[col_y].to_numpy(dtype=float)
                land_price_y1 = land_gdf[col_y_1].to_numpy(dtype=float)

                land_ok = (land_price_y > 0) & (land_price_y1 > 0)
                if land_ok.sum() < n_neighbors:
                    # Too few points for the neighbour search: rows stay NaN
                    continue

                land_sub = land_gdf.loc[land_ok]

                # Land point coordinates in radians, (lat, lon) order for haversine
                land_lon_rad = np.deg2rad(land_sub.geometry.x.to_numpy())
                land_lat_rad = np.deg2rad(land_sub.geometry.y.to_numpy())
                land_X = np.vstack([land_lat_rad, land_lon_rad]).T

                # Prices aligned with the subset
                land_price_y_sub = land_sub[col_y].to_numpy(dtype=float)
                land_price_y1_sub = land_sub[col_y_1].to_numpy(dtype=float)

                # --- 4-2) Build the neighbour index for this year ---
                nn = NearestNeighbors(n_neighbors=n_neighbors, metric='haversine')
                nn.fit(land_X)

                # --- 4-3) Query only the rows of this year ---
                Xq = prop_X_f[rows_mask]
                idx_q_labels = idx_valid_f[rows_mask]

                distances_rad, indices = nn.kneighbors(Xq)  # (m, k)
                distances_m = distances_rad * R

                # Prices of the neighbours, shape (m, k)
                prices = land_price_y_sub[indices]
                prices_prev = land_price_y1_sub[indices]

                # Nearest neighbour (first column)
                nearest_prices = prices[:, 0]
                nearest_prev_prices = prices_prev[:, 0]
                nearest_dists = distances_m[:, 0]

                # Inverse-distance weighted mean (with zero-distance handling)
                d_m = distances_m
                zero_mask = (d_m == 0)
                has_zero = zero_mask.any(axis=1)

                w = np.zeros_like(d_m, dtype=float)
                nz = ~zero_mask
                w[nz] = 1.0 / d_m[nz]

                w_sum = w.sum(axis=1, keepdims=True)
                # Guard the division when every neighbour is at distance 0
                w_sum = np.where(w_sum == 0, 1.0, w_sum)
                w = w / w_sum

                wp = (w * prices).sum(axis=1)
                wp_prev = (w * prices_prev).sum(axis=1)

                if has_zero.any():
                    # Rows with a coincident point use the mean of the
                    # zero-distance neighbours instead of the weighted mean
                    rows = np.where(has_zero)[0]
                    for r in rows:
                        z = zero_mask[r]
                        wp[r] = prices[r, z].mean()
                        wp_prev[r] = prices_prev[r, z].mean()

                # --- 4-4) Write back by MultiIndex label ---
                nearest_all.loc[idx_q_labels] = nearest_prices
                nearest_prev_all.loc[idx_q_labels] = nearest_prev_prices
                dist_all.loc[idx_q_labels] = nearest_dists
                weighted_all.loc[idx_q_labels] = wp
                weighted_prev_all.loc[idx_q_labels] = wp_prev

    # --- 5) Attach the columns ---
    out = combined.copy()
    out['nearest_land_price'] = nearest_all
    out['distance_to_landpoint_m'] = dist_all
    out['log_land_price'] = np.log1p(out['nearest_land_price'])

    out['weighted_land_price_3'] = weighted_all
    out['log_weighted_land_price_3'] = np.log1p(out['weighted_land_price_3'])

    out['nearest_land_price_prev'] = nearest_prev_all
    out['weighted_land_price_3_prev'] = weighted_prev_all

    # yoy / dlog (rows whose previous-year value is NaN become NaN)
    eps = 1.0
    denom_n = np.maximum(out['nearest_land_price_prev'].to_numpy(), eps)
    denom_w = np.maximum(out['weighted_land_price_3_prev'].to_numpy(), eps)

    out['land_price_yoy_nearest'] = (out['nearest_land_price'] - out['nearest_land_price_prev']) / denom_n
    out['land_price_yoy_w3'] = (out['weighted_land_price_3'] - out['weighted_land_price_3_prev']) / denom_w
    out['land_price_dlog_nearest'] = np.log1p(out['nearest_land_price']) - np.log1p(out['nearest_land_price_prev'])
    out['land_price_dlog_w3'] = np.log1p(out['weighted_land_price_3']) - np.log1p(out['weighted_land_price_3_prev'])

    # --- 6) Split back into train/test (original indexes preserved) ---
    train_out = out.xs('train', level='__set__')
    test_out = out.xs('test', level='__set__')

    assert train_out.index.equals(train_df.index)
    assert test_out.index.equals(test_df.index)

    return train_out, test_out


In [11]:
# Land price features, using the EPSG:4612 coordinates and the target year
train_df_geo, test_df_geo = add_land_price_features_haversine(
    train_df=train_df_geo,
    test_df=test_df_geo,
    land_gdf=land_gdf,
    lat_col='lat_jgd',
    lon_col='lon_jgd',
    year_col='target_year'
)

## 1kmメッシュ将来人口

In [12]:
# Parquet file with the combined 1 km mesh future-population data (read below)
fu_pop_path = gis_path + '人口メッシュ/1km_mesh_2024_GEOJSON/future_pop_1km.parquet'

#### データの結合（最初の1回のみなのでコメントアウト）

In [13]:
# in_dir = Path(gis_path + '人口メッシュ/1km_mesh_2024_GEOJSON')
# files = sorted(in_dir.glob('1km_mesh_2024_*_GEOJSON/*.geojson'))
# output_path = gis_path + '人口メッシュ/1km_mesh_2024_GEOJSON'

# gdfs = []
# base_crs = None

# keep_cols = [
#     'MESH_ID',
#     'SHICODE',
#     'PTN_2020', # 2020年の総数人口
#     'PTN_2025', 'PTN_2030', 'PTN_2035', 'PTN_2040', 'PTN_2045', 'PTN_2050', # 将来人口
#     'PTA_2025', 'RTA_2025', # 2025年男女計0～14歳人口/比率
#     'PTB_2025', 'RTB_2025', # 2025年男女計15～64歳人口/比率
#     'PTC_2025', 'RTC_2025', # 2025年男女計65歳以上人口/比率
#     'PTD_2025', 'RTD_2025', # 2025年男女計75歳以上人口/比率
#     'PTE_2025', 'RTE_2025', # 2025年男女計80歳以上人口/比率
#     'geometry',
# ]

In [14]:
# for fp in files:
#     gdf = read_dataframe(fp)
#     gdf_filtered = gdf[keep_cols]

#     # CRS を統一（最初のファイルの CRS に合わせる）
#     if base_crs is None:
#         base_crs = gdf.crs
#     else:
#         if gdf_filtered.crs != base_crs:
#             gdf_filtered = gdf_filtered.to_crs(base_crs)

#     gdfs.append(gdf_filtered)

# # 結合（index を振り直す）
# fu_pop_df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=base_crs)
# print(f"crs: {fu_pop_df.crs}")

# del gdfs
# gc.collect()

In [15]:
# # --- 1) 対象年の列を抽出（例：PTN_2025, PTN_2030, ..., PTN_2050）
# ptn_cols = sorted(
#     [c for c in fu_pop_df.columns if re.match(r'^PTN_20\d{2}$', c)]
# )

# years = np.array([int(c.split('_')[1]) for c in ptn_cols])

# # --- 2) 線形回帰で傾きを計算
# def _calc_slope(row):
#     y = row[ptn_cols].values.astype(float)
#     if np.isnan(y).sum() >= len(y) - 1:
#         return np.nan
#     slope, _ = np.polyfit(years, y, 1)
#     return slope

# fu_pop_df['pop_trend_slope'] = fu_pop_df.apply(_calc_slope, axis=1)

# # --- 3) 正規化（0 除算・欠損対策込み）
# fu_pop_df['pop_trend_rate'] = (
#     fu_pop_df['pop_trend_slope'] /
#     fu_pop_df['PTN_2020'].replace({0: np.nan})
# )

In [16]:
# # ① 平面系へ
# fu_pop_df_m = fu_pop_df.to_crs('EPSG:6677')

# # ② centroid で Point 化
# fu_pop_df_m['geometry'] = fu_pop_df_m.geometry.centroid

# # ③ 必要なら緯度経度へ戻す
# fu_pop_df_pt = fu_pop_df_m.to_crs('EPSG:4612')
# fu_pop_df_pt['lon_4612'] = fu_pop_df_pt.geometry.x
# fu_pop_df_pt['lat_4612'] = fu_pop_df_pt.geometry.y

In [17]:
# fu_pop_df_pt.to_parquet(output_path + '/future_pop_1km.parquet')

#### 学習データ・予測データに結合して特徴量作成

In [18]:
# Load the pre-built future-population mesh data
fu_pop_df = gpd.read_parquet(fu_pop_path)

In [19]:
# #NOTE: 歪みが気になるのであればclipを用いる
# display(fu_pop_df[['pop_trend_slope', 'pop_trend_rate']].describe())
# q_low, q_high = fu_pop_df['pop_trend_rate'].quantile([0.01, 0.99])

# fu_pop_df['pop_trend_rate_clip'] = (
#     fu_pop_df['pop_trend_rate']
#     .clip(lower=q_low, upper=q_high)
# )

# display(fu_pop_df[['pop_trend_slope', 'pop_trend_rate_clip']].describe())

In [20]:
def _ensure_point_gdf_from_lonlat(df: pd.DataFrame,
                                 lon_col: str,
                                 lat_col: str,
                                 crs: str) -> gpd.GeoDataFrame:
    """Return a copy of ``df`` as a GeoDataFrame of points built from the lon/lat columns, tagged with ``crs``."""
    points = gpd.points_from_xy(df[lon_col], df[lat_col])
    return gpd.GeoDataFrame(df.copy(), geometry=points, crs=crs)

def add_pop_knn_features(train_df_geo: pd.DataFrame,
                         test_df_geo: pd.DataFrame,
                         fu_pop_df: gpd.GeoDataFrame,
                         pop_cols: list[str],
                         lon_col: str = 'lon_jgd',
                         lat_col: str = 'lat_jgd',
                         in_crs: str = 'EPSG:4612',
                         work_crs: str = 'EPSG:6677',
                         ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Attach nearest population-mesh-point attributes to train/test rows.

    For every train/test point the closest population point is looked up
    (Euclidean distance in ``work_crs``) and the following columns are
    joined onto the input frames:
    - ``pop_dist_nn_m``: distance to the closest population point
    - ``<col>_nn`` for each ``col`` in ``pop_cols``: value at that point

    The search asks for three neighbours, but only the closest one is used.
    """

    # Train/test rows as point GeoDataFrames in the input CRS
    train_pts = _ensure_point_gdf_from_lonlat(train_df_geo, lon_col, lat_col, in_crs)
    test_pts = _ensure_point_gdf_from_lonlat(test_df_geo, lon_col, lat_col, in_crs)

    # Population data as points: any non-point geometry is replaced by its
    # centroid, computed in the working CRS
    mesh = fu_pop_df.copy()
    if (mesh.geometry.geom_type != 'Point').any():
        mesh = mesh.to_crs(work_crs)
        mesh['geometry'] = mesh.geometry.centroid
        mesh = mesh.to_crs(in_crs)

    # Everything into the working CRS for the distance search
    train_m = train_pts.to_crs(work_crs)
    test_m = test_pts.to_crs(work_crs)
    mesh_m = mesh.to_crs(work_crs)

    mesh_geom = mesh_m.geometry
    mesh_xy = np.column_stack([mesh_geom.x.values, mesh_geom.y.values])
    knn = NearestNeighbors(n_neighbors=3, algorithm='auto', metric='euclidean')
    knn.fit(mesh_xy)

    def _nearest_features(points_m: gpd.GeoDataFrame, prefix: str) -> pd.DataFrame:
        geom = points_m.geometry
        query_xy = np.column_stack([geom.x.values, geom.y.values])
        dists, idxs = knn.kneighbors(query_xy, return_distance=True)  # shapes: (n, k)

        # Distance to the closest point first, then one column per attribute
        columns = {f'{prefix}_dist_nn_m': dists[:, 0].astype(np.float32)}
        for col in pop_cols:
            columns[f'{col}_nn'] = mesh_m[col].values[idxs][:, 0]

        return pd.DataFrame(columns, index=points_m.index)

    train_feat = _nearest_features(train_m, 'pop')
    test_feat = _nearest_features(test_m, 'pop')

    return train_df_geo.join(train_feat), test_df_geo.join(test_feat)


In [21]:
# Population-mesh attributes to copy from the nearest mesh point
pop_cols = [
    'PTN_2020', # total population in 2020
    'RTA_2025', # 2025, both sexes, ages 0-14: population/ratio
    'RTB_2025', # 2025, both sexes, ages 15-64: population/ratio
    'RTC_2025', # 2025, both sexes, ages 65 and over: population/ratio
    'RTD_2025', # 2025, both sexes, ages 75 and over: population/ratio
    'RTE_2025', # 2025, both sexes, ages 80 and over: population/ratio
    'pop_trend_rate'
]

train_df_geo, test_df_geo = add_pop_knn_features(
    train_df_geo=train_df_geo,
    test_df_geo=test_df_geo,
    fu_pop_df=fu_pop_df,
    pop_cols=pop_cols
)

## 道路密度・道路延長メッシュ

In [22]:
# Pre-built road mesh feature file and the road line shapefile (both read below)
road_mesh_path = gis_path + '道路密度・道路延長メッシュ/road_mesh_fe.parquet'
road_path = gis_path + '道路/N01-07L-48-01.0a_GML/N01-07L-2K_Road.shp'

#### データの結合（最初の1回のみなのでコメントアウト）

In [23]:
# in_dir = Path(gis_path + '道路密度・道路延長メッシュ')
# files = sorted(in_dir.glob('N04-10_*-jgd_GML/*.shp'))
# output_path = gis_path + '道路密度・道路延長メッシュ'

# gdfs = []

In [24]:
# for fp in files:
#     gdf = read_dataframe(fp) # 高速化しないとめちゃ時間かかる
#     gdf = gdf.set_crs('EPSG:4612')
#     # gdf_filtered = gdf[keep_cols]
        
#     gdfs.append(gdf)

# # 結合（index を振り直す）
# road_df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdf.crs)
# print(f"crs: {road_df.crs}")

# del gdfs
# gc.collect()

In [25]:
# attr_cols = road_df.columns.drop('geometry')

# road_df[attr_cols] = (
#     road_df[attr_cols]
#         .replace('unknown', pd.NA)
#         .apply(pd.to_numeric, errors='coerce')
#         .astype('float32')
# )

In [26]:
# def build_road_base_and_gap_features(
#     gdf: gpd.GeoDataFrame,
#     nn_epsg: int = 6677,          # 距離計算用（投影）
#     k: int = 8,                   # 近傍メッシュ数
#     idw_p: float = 1.0,           # 重み 1/(d^p)
#     self_exclude: bool = True,    # 自メッシュを加重平均から除外
#     add_density: bool = True,     # road_len_total / area を作るか
#     area_unit_scale: float = 1e6, # 密度を 'm per km^2' にしたいなら 1e6
#     eps: float = 1e-6,
# ) -> gpd.GeoDataFrame:
#     """
#     3次メッシュ polygon の GeoDataFrame（road_df想定）に対して、
#     1) メッシュ内の道路延長集約（total/wide/narrow）と比率
#     2) 周辺 k 近傍メッシュのIDW加重平均との差分（gap）
#     3) 任意で道路延長密度（m/km^2 等）
#     を付与して返す。

#     前提:
#       - gdf は polygon geometry を持つ
#       - N04_*** 列は「当該メッシュ内の道路延長（m）」等の集計値として扱う（あなたの設計を踏襲）
#     """
#     if gdf is None or len(gdf) == 0:
#         raise ValueError('gdf is empty.')

#     out = gdf.copy()

#     # ---------
#     # 1) ベース特徴量（メッシュ内）
#     # ---------
#     total_cols = ['N04_051', 'N04_052', 'N04_053', 'N04_054', 'N04_055', 'N04_056']
#     wide_cols  = ['N04_003', 'N04_009', 'N04_015']  # ≒ 11m以上
#     narrow_cols= ['N04_027', 'N04_033', 'N04_039']  # ≒ 5.5m未満

#     # ない列があっても落ちないように 0 列を補う（データによって列が欠けることがあるため）
#     for c in set(total_cols + wide_cols + narrow_cols):
#         if c not in out.columns:
#             out[c] = 0.0

#     # 数値化（'unknown' 等が混ざるケースにも耐性）
#     use_cols = list(dict.fromkeys(total_cols + wide_cols + narrow_cols))
#     out[use_cols] = (
#         out[use_cols]
#             .replace('unknown', np.nan)
#             .apply(lambda s: np.nan_to_num(np.asarray(s, dtype='float64'), nan=0.0))
#     )

#     out['road_len_total'] = out[total_cols].sum(axis=1).astype('float32')
#     out['road_len_wide'] = out[wide_cols].sum(axis=1).astype('float32')
#     out['road_len_narrow'] = out[narrow_cols].sum(axis=1).astype('float32')

#     denom = (out['road_len_total'].to_numpy(dtype='float32') + eps)
#     out['road_wide_ratio'] = (out['road_len_wide'].to_numpy(dtype='float32') / denom).astype('float32')
#     out['road_narrow_ratio'] = (out['road_len_narrow'].to_numpy(dtype='float32') / denom).astype('float32')

#     # 密度（任意）
#     # 注意: EPSG:4612 のまま area を取るのはNG（度単位）なので、投影して計算して戻す
#     if add_density:
#         out_p = out.to_crs(epsg=nn_epsg)
#         area_m2 = out_p.geometry.area.to_numpy(dtype='float64')
#         area_m2 = np.maximum(area_m2, eps)
#         # m / m^2 = 1/m。見やすくするなら m/km^2 = (m / m^2) * 1e6
#         out['road_len_density'] = (out['road_len_total'].to_numpy(dtype='float64') / area_m2 * area_unit_scale).astype('float32')

#     # ---------
#     # 2) gap（周辺メッシュ平均との差）
#     # ---------
#     # centroid の点で kNN（polygon のままだと不安定/重い）
#     gdf_p = out.to_crs(epsg=nn_epsg).copy()
#     cent = gdf_p.geometry.centroid
#     xy = np.column_stack([cent.x.to_numpy(), cent.y.to_numpy()])

#     # scipy が無い環境対策（ただしあなたの環境は概ねOKのはず）
#     try:
#         from scipy.spatial import cKDTree
#     except Exception as e:
#         raise ImportError('scipy is required for kNN gap features. Please install scipy.') from e

#     tree = cKDTree(xy)

#     # k+1 で self を含めて後で落とす
#     qk = k + 1 if self_exclude else k
#     dists, idxs = tree.query(xy, k=qk, workers=-1)

#     # k=1 等で1次元になるケース対策
#     if qk == 1:
#         dists = dists.reshape(-1, 1)
#         idxs = idxs.reshape(-1, 1)

#     if self_exclude:
#         # 先頭が self（距離0）になる前提
#         dists = dists[:, 1:]
#         idxs = idxs[:, 1:]

#     # IDW 重み
#     w = 1.0 / np.power(np.maximum(dists, eps), idw_p)
#     w_sum = w.sum(axis=1, keepdims=True)
#     valid = (w_sum[:, 0] > 0)

#     def _wmean(values: np.ndarray) -> np.ndarray:
#         # values: (n,)
#         neigh = values[idxs]              # (n, k)
#         num = (w * neigh).sum(axis=1)     # (n,)
#         out_arr = np.full(len(values), np.nan, dtype='float32')
#         out_arr[valid] = (num[valid] / w_sum[valid, 0]).astype('float32')
#         return out_arr

#     # 近傍加重平均
#     v_total = out['road_len_total'].to_numpy(dtype='float32')
#     v_narrow_ratio = out['road_narrow_ratio'].to_numpy(dtype='float32')
#     wmean_total = _wmean(v_total)
#     wmean_narrow_ratio = _wmean(v_narrow_ratio)

#     # gap = self - wmean(neighbors)
#     out['road_len_total_gap'] = (v_total - wmean_total).astype('float32')
#     out['road_narrow_ratio_gap'] = (v_narrow_ratio - wmean_narrow_ratio).astype('float32')

#     # 任意: 密度のgapも作りたい場合
#     if add_density:
#         v_density = out['road_len_density'].to_numpy(dtype='float32')
#         wmean_density = _wmean(v_density)
#         out['road_len_density_gap'] = (v_density - wmean_density).astype('float32')

#     keep_cols = [
#         'geometry',
#         'road_len_total',
#         'road_len_wide',
#         'road_len_narrow',
#         'road_wide_ratio',
#         'road_narrow_ratio',
#         'road_len_total_gap',
#         'road_narrow_ratio_gap',
#     ]
#     if add_density:
#         keep_cols += ['road_len_density', 'road_len_density_gap']

#     return out[keep_cols].copy()


In [27]:
# road_df = build_road_base_and_gap_features(road_df)
# road_df.to_parquet(output_path + '/road_mesh_fe.parquet')

#### 学習データ・予測データに結合

In [28]:
# Load the road mesh features and the road lines
road_mesh_df = gpd.read_parquet(road_mesh_path)
road_df = read_dataframe(road_path)

# Tag the road lines as EPSG:4612.
# NOTE(review): assumes the shapefile coordinates really are EPSG:4612 — confirm.
road_df = road_df.set_crs('EPSG:4612')

In [29]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable, Sequence

@dataclass(frozen=True)
class RoadLineFeatureConfig:
    """Immutable settings for the road-line (LINESTRING) features."""

    # Projected CRS (metre units) used for distance and buffer calculations
    proj_epsg: int = 6677

    # Column of road_df that holds the road type code
    road_type_col: str = 'N01_001'

    # Codes treated as major roads (adjust to the requirements)
    # e.g. include 1=expressway, 2=national road, 3=principal local road,
    # 5=special metropolitan road
    major_codes: tuple[int, ...] = (1, 2, 3, 5)

    # Radii (m) for the neighbourhood count / length features
    radii_m: tuple[int, ...] = (100, 300, 500)

    # Whether to compute the road length (m) inside each radius (expensive)
    compute_length_in_buffer: bool = False

    # Column used to count unique values (e.g. unique route names) in the
    # sjoin aggregation; None when not used
    # e.g. 'N01_002' (route name)
    unique_key_col: str | None = None


def _to_points_gdf(
    df: pd.DataFrame,
    lon_col: str,
    lat_col: str,
    crs: str = 'EPSG:4612',
) -> gpd.GeoDataFrame:
    """Return a copy of ``df`` as a point GeoDataFrame built from its lon/lat columns, tagged with ``crs``."""
    point_geoms = gpd.points_from_xy(df[lon_col], df[lat_col])
    return gpd.GeoDataFrame(df.copy(), geometry=point_geoms, crs=crs)


def _safe_int_series(s: pd.Series) -> pd.Series:
    # 'unknown' 等が混ざっても落ちないようにする
    return (
        s.replace('unknown', pd.NA)
         .astype('Int64')
    )


def _nearest_distance(
    pts_p: gpd.GeoDataFrame,
    lines_p: gpd.GeoDataFrame,
    out_col: str,
) -> pd.Series:
    if lines_p is None or len(lines_p) == 0:
        return pd.Series(np.nan, index=pts_p.index, name=out_col)

    joined = gpd.sjoin_nearest(
        pts_p[['geometry']],
        lines_p[['geometry']],
        how='left',
        distance_col=out_col,
    )

    # 同距離などで点indexが重複するケースがあるので、点ごとに最小距離に潰す
    s = joined[out_col]
    if not s.index.is_unique:
        s = s.groupby(level=0).min()

    # pts_p の index に整列して返す（欠けは NaN）
    s = s.reindex(pts_p.index)

    return s.astype('float32')

def _count_lines_within_radius(
    pts_p: gpd.GeoDataFrame,
    lines_p: gpd.GeoDataFrame,
    radius_m: int,
    out_col: str,
    unique_key_col: str | None = None,
) -> pd.Series:
    """
    半径バッファ内に intersect する road セグメント数（または unique_key_col のユニーク数）
    """
    if lines_p is None or len(lines_p) == 0:
        return pd.Series(0, index=pts_p.index, name=out_col, dtype='int32')

    buf = pts_p[['geometry']].copy()
    buf['geometry'] = buf.geometry.buffer(radius_m)

    joined = gpd.sjoin(
        buf,
        lines_p,
        how='left',
        predicate='intersects',
    )

    if unique_key_col is not None and unique_key_col in joined.columns:
        cnt = joined.groupby(level=0)[unique_key_col].nunique(dropna=True)
    else:
        # セグメント（行）数。index_right のユニークで数えるのが安定
        if 'index_right' in joined.columns:
            cnt = joined.groupby(level=0)['index_right'].nunique(dropna=True)
        else:
            cnt = joined.groupby(level=0).size()

    out = pd.Series(0, index=pts_p.index, name=out_col, dtype='int32')
    out.loc[cnt.index] = cnt.astype('int32')
    return out


def _length_lines_within_radius(
    pts_p: gpd.GeoDataFrame,
    lines_p: gpd.GeoDataFrame,
    radius_m: int,
    out_col: str,
) -> pd.Series:
    """
    Total road length inside a buffer of ``radius_m`` around each point.

    Lengths are in the units of the geometries' CRS (metres according to the
    original note). Returns zeros when ``lines_p`` is None or empty.
    Note: this is expensive; it gets slow for large radii or many rows.
    """
    if lines_p is None or len(lines_p) == 0:
        return pd.Series(0.0, index=pts_p.index, name=out_col, dtype='float32')

    buf = pts_p[['geometry']].copy()
    buf['geometry'] = buf.geometry.buffer(radius_m)

    joined = gpd.sjoin(
        buf,
        lines_p[['geometry']],
        how='left',
        predicate='intersects',
    )

    # Keep the buffer shapes addressable by point index
    buf_geom = buf.geometry

    # Clip each intersecting road by the buffer and take its length
    # (hard to vectorise, so apply is used; slow)
    def _clip_len(row) -> float:
        # Points without any intersecting line contribute zero length
        if pd.isna(row.get('index_right')):
            return 0.0
        g_line = row['geometry_right']
        g_buf = buf_geom.loc[row.name]
        return float(g_line.intersection(g_buf).length)

    # The sjoin result may not carry the right-hand geometry, so attach it
    joined = joined.rename(columns={'geometry': 'geometry_left'})
    joined = joined.merge(
        lines_p[['geometry']].rename(columns={'geometry': 'geometry_right'}),
        left_on='index_right',
        right_index=True,
        how='left',
    )

    # NOTE(review): the per-point sum below relies on the merge keeping the
    # point index on `joined` — confirm with the pandas version in use.
    lens = joined.apply(_clip_len, axis=1)
    agg = lens.groupby(level=0).sum()

    out = pd.Series(0.0, index=pts_p.index, name=out_col, dtype='float32')
    out.loc[agg.index] = agg.astype('float32')
    return out


def add_road_features(
    df: pd.DataFrame,
    road_poly: gpd.GeoDataFrame,
    road_line: gpd.GeoDataFrame,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    poly_join_predicate: str = 'within',
    poly_keep_cols: Sequence[str] | None = None,
    line_cfg: RoadLineFeatureConfig = RoadLineFeatureConfig(),
) -> pd.DataFrame:
    """
    Add road features to ``df`` (train_df_geo / test_df_geo are expected):
      A) columns of the road mesh polygons, attached by spatial join
      B) road-line features: nearest distances and counts (optionally
         lengths) within the configured radii

    Args:
        df: Rows with longitude/latitude columns in EPSG:4612.
        road_poly: Polygon GeoDataFrame carrying the mesh feature columns.
        road_line: Line GeoDataFrame of roads; EPSG:4612 is assigned if it
            has no CRS.
        lon_col: Longitude column name in ``df``.
        lat_col: Latitude column name in ``df``.
        poly_join_predicate: Spatial predicate for the polygon join.
        poly_keep_cols: Polygon columns to attach; ``None`` attaches every
            joined column except ``index_right`` and ``geometry``.
        line_cfg: Settings for the line features.

    Returns:
        A DataFrame with the same index as ``df`` (no geometry column).
    """
    out = df.copy()

    # ----------------------------
    # A) Spatial join with the POLYGON (mesh) features
    # ----------------------------
    pts_4612 = _to_points_gdf(out, lon_col=lon_col, lat_col=lat_col, crs='EPSG:4612')
    poly_4612 = road_poly.to_crs('EPSG:4612')

    joined_poly = gpd.sjoin(
        pts_4612[['geometry']],
        poly_4612,
        how='left',
        predicate=poly_join_predicate,
    )

    # A point can match several polygons (overlapping or duplicated meshes).
    # Keep only the first match per point so the join below cannot multiply
    # rows and the result keeps the same index as ``df``.
    joined_poly = joined_poly[~joined_poly.index.duplicated(keep='first')]

    # Attach only the wanted columns (bookkeeping columns are dropped)
    if poly_keep_cols is None:
        drop_cols = [c for c in ['index_right', 'geometry'] if c in joined_poly.columns]
        poly_cols = [c for c in joined_poly.columns if c not in drop_cols]
    else:
        poly_cols = [c for c in poly_keep_cols if c in joined_poly.columns]

    out = out.join(joined_poly[poly_cols])

    # ----------------------------
    # B) LINESTRING (neighbourhood / distance) features
    # ----------------------------
    # Into the projected CRS
    pts_p = pts_4612.to_crs(epsg=line_cfg.proj_epsg)

    road_line_4612 = road_line
    if road_line_4612.crs is None:
        # Adjust to how the data was obtained if this default does not fit
        road_line_4612 = road_line_4612.set_crs('EPSG:4612')

    lines_p = road_line_4612.to_crs(epsg=line_cfg.proj_epsg)

    # Normalise the road type code
    has_type_col = line_cfg.road_type_col in lines_p.columns
    if has_type_col:
        lines_p = lines_p.copy()
        lines_p[line_cfg.road_type_col] = _safe_int_series(lines_p[line_cfg.road_type_col])

    # Subsets: major roads and code 1; both are empty when the type column
    # is missing
    if has_type_col:
        road_type = lines_p[line_cfg.road_type_col]
        major_mask = road_type.isin(list(line_cfg.major_codes))
        lines_highway = lines_p.loc[road_type.isin([1])]
    else:
        major_mask = pd.Series(False, index=lines_p.index)
        lines_highway = lines_p.iloc[0:0]

    lines_major = lines_p.loc[major_mask].copy()

    # Nearest distances
    out['dist_to_road_any_m'] = _nearest_distance(pts_p, lines_p, 'dist_to_road_any_m')
    out['dist_to_road_major_m'] = _nearest_distance(pts_p, lines_major, 'dist_to_road_major_m')
    out['dist_to_road_highway_m'] = _nearest_distance(pts_p, lines_highway, 'dist_to_road_highway_m')

    # Number of roads within each radius (segments, or unique key values)
    for r in line_cfg.radii_m:
        out[f'road_cnt_any_in_{r}m'] = _count_lines_within_radius(
            pts_p,
            lines_p if line_cfg.unique_key_col is None else lines_p[[line_cfg.unique_key_col, 'geometry']].copy(),
            radius_m=r,
            out_col=f'road_cnt_any_in_{r}m',
            unique_key_col=line_cfg.unique_key_col,
        )
        out[f'road_cnt_major_in_{r}m'] = _count_lines_within_radius(
            pts_p,
            lines_major if line_cfg.unique_key_col is None else lines_major[[line_cfg.unique_key_col, 'geometry']].copy(),
            radius_m=r,
            out_col=f'road_cnt_major_in_{r}m',
            unique_key_col=line_cfg.unique_key_col,
        )

        # (Optional) road length within the radius
        if line_cfg.compute_length_in_buffer:
            out[f'road_len_any_in_{r}m'] = _length_lines_within_radius(
                pts_p,
                lines_p,
                radius_m=r,
                out_col=f'road_len_any_in_{r}m',
            )
            out[f'road_len_major_in_{r}m'] = _length_lines_within_radius(
                pts_p,
                lines_major,
                radius_m=r,
                out_col=f'road_len_major_in_{r}m',
            )

    return out


In [30]:
# Settings for the road-line features, then apply them to train and test
cfg = RoadLineFeatureConfig(
    proj_epsg=6677,
    major_codes=(1, 2, 3, 5),
    radii_m=(100, 300, 500),
    compute_length_in_buffer=False,     # expensive, so keeping it OFF at first is recommended
    unique_key_col=None,               # set to 'N01_002' to count unique route names instead
)

train_df_geo = add_road_features(
    df=train_df_geo,
    road_poly=road_mesh_df,
    road_line=road_df,
    lon_col='lon_jgd',
    lat_col='lat_jgd',
    poly_keep_cols=None,   # pass e.g. ['road_len_total', ...] to attach only selected mesh columns
    line_cfg=cfg,
)

test_df_geo = add_road_features(
    df=test_df_geo,
    road_poly=road_mesh_df,
    road_line=road_df,
    lon_col='lon_jgd',
    lat_col='lat_jgd',
    poly_keep_cols=None,
    line_cfg=cfg,
)


## 用途地域

In [31]:
# Parquet file of the combined land-use zoning data
youto_path = gis_path + '用途地域/youto.parquet'

#### データの結合（最初の1回のみなのでコメントアウト）

In [32]:
# in_dir = Path(gis_path + '用途地域')
# files = sorted(in_dir.glob('A29-19*/01-03_GeoJSON形式/*.geojson'))
# output_path = gis_path + '用途地域'

# gdfs = []
# base_crs = None

In [33]:
# for fp in files:
#     gdf = read_dataframe(fp) # 高速化しないとめちゃ時間かかる
#     # gdf_filtered = gdf[keep_cols]

#     # CRS を統一（最初のファイルの CRS に合わせる）
#     if base_crs is None:
#         base_crs = gdf.crs
#     else:
#         if gdf.crs != base_crs:
#             gdf = gdf.to_crs(base_crs)

#     gdfs.append(gdf)

# # 結合（index を振り直す）
# youto_df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=base_crs)
# print(f"crs: {youto_df.crs}")

# del gdfs
# gc.collect()

In [34]:
# def _normalize_zone_name(s: pd.Series) -> pd.Series:
#     # 文字列化＋前後空白除去（None/NaNはそのまま）
#     return s.astype('string').str.strip()


# def add_zone_group_and_rank(
#     df: pd.DataFrame,
#     zone_name_col: str = 'A29_005',
#     *,
#     out_group_col: str = 'zone_group',
#     out_rank_col: str = 'zone_residential_rank',
#     non_res_rank_value: int | None = 0,  # 0にしたくなければ None にして pd.NA にする
# ) -> pd.DataFrame:
#     """
#     用途地域名から
#       - zone_group: residential / commercial / industrial
#       - zone_residential_rank: 住居系の階層（低層=1, 中高層=2, 一般住居=3, 準住居=4）
#     を付与して返す。
#     """
#     out = df.copy()
#     z = _normalize_zone_name(out[zone_name_col])
#     out['density_cat'] = z

#     # --- zone_group ---
#     out[out_group_col] = pd.NA
#     out.loc[z.str.contains('住居', na=False), out_group_col] = 'residential'
#     out.loc[z.str.contains('商業', na=False), out_group_col] = 'commercial'
#     out.loc[z.str.contains('工業', na=False), out_group_col] = 'industrial'

#     # --- zone_residential_rank ---
#     # 住居系以外は 0（or NA）
#     if non_res_rank_value is None:
#         out[out_rank_col] = pd.Series(pd.NA, index=out.index, dtype='Int64')
#     else:
#         out[out_rank_col] = pd.Series(non_res_rank_value, index=out.index, dtype='Int64')

#     # 低層住居専用（第一種/第二種）
#     lowrise = z.str.contains('低層住居専用', na=False)
#     out.loc[lowrise, out_rank_col] = 1

#     # 中高層住居専用（第一種/第二種）
#     midhigh = z.str.contains('中高層住居専用', na=False)
#     out.loc[midhigh, out_rank_col] = 2

#     # 一般住居系（第一種住居/第二種住居）
#     general = z.str.contains('第一種住居|第二種住居', na=False)
#     out.loc[general, out_rank_col] = 3

#     # 準住居
#     quasi = z.str.contains('準住居', na=False)
#     out.loc[quasi, out_rank_col] = 4

#     # もし住居系なのにランクが未設定なら、とりあえず3（一般）に寄せる等も可
#     # unknown_res = (out[out_group_col] == 'residential') & (out[out_rank_col].isna() | (out[out_rank_col] == non_res_rank_value))
#     # out.loc[unknown_res, out_rank_col] = 3

#     return out


In [35]:
# def add_youto_basic_features(
#     df: pd.DataFrame,
#     *,
#     zone_name_col: str = 'A29_005',
#     kenpei_col: str = 'A29_006',
#     youseki_col: str = 'A29_007',
#     unknown_value: int = 9999,
# ) -> pd.DataFrame:
#     """
#     用途地域（A29）から以下を作る:
#       - is_lowrise_residential
#       - kenpei, youseki（9999→NaN）
#       - kenpei_missing, youseki_missing
#       - kenpei_bin, youseki_bin
#     """
#     out = df.copy()

#     # --- normalize zone name ---
#     z = out[zone_name_col].astype('string').str.strip()

#     # --- is_lowrise_residential ---
#     # 「第一種低層住居専用地域」「第一種低層住居地域」「第二種低層住居専用地域」などを拾う
#     out['is_lowrise_residential'] = z.str.contains('低層', na=False).astype('int8')

#     # --- kenpei / youseki + missing flags ---
#     out['kenpei_missing'] = (out[kenpei_col] == unknown_value) | (out[kenpei_col].isna())
#     out['youseki_missing'] = (out[youseki_col] == unknown_value) | (out[youseki_col].isna())

#     out['kenpei'] = pd.to_numeric(out[kenpei_col], errors='coerce').astype('float64')
#     out['youseki'] = pd.to_numeric(out[youseki_col], errors='coerce').astype('float64')

#     out.loc[out['kenpei_missing'], 'kenpei'] = np.nan
#     out.loc[out['youseki_missing'], 'youseki'] = np.nan

#     # --- bins (category) ---
#     # 典型値（40,50,60,70...）に合わせた実務向けビン
#     def _bin_kenpei(x: pd.Series) -> pd.Categorical:
#         bins = [-np.inf, 40, 50, 60, 70, np.inf]
#         labels = ['<=40', '41-50', '51-60', '61-70', '>=71']
#         return pd.cut(x, bins=bins, labels=labels, right=True, include_lowest=True)

#     def _bin_youseki(x: pd.Series) -> pd.Categorical:
#         bins = [-np.inf, 80, 100, 150, 200, 300, np.inf]
#         labels = ['<=80', '81-100', '101-150', '151-200', '201-300', '>=301']
#         return pd.cut(x, bins=bins, labels=labels, right=True, include_lowest=True)

#     out['kenpei_bin'] = _bin_kenpei(out['kenpei']).astype('category')
#     out['youseki_bin'] = _bin_youseki(out['youseki']).astype('category')

#     # 欠損は明示カテゴリに寄せる（LightGBM/TEの安定化用）
#     out['kenpei_bin'] = out['kenpei_bin'].cat.add_categories(['missing']).fillna('missing')
#     out['youseki_bin'] = out['youseki_bin'].cat.add_categories(['missing']).fillna('missing')

#     return out


In [36]:
# youto_df = add_zone_group_and_rank(youto_df, zone_name_col='A29_005')
# youto_df = add_youto_basic_features(youto_df)

In [37]:
# keep_cols = [
#         'geometry',
#         'zone_group',
#         'zone_residential_rank',
#         'is_lowrise_residential',
#         'kenpei_missing',
#         'youseki_missing',
#         'kenpei',
#         'youseki',
#         'kenpei_bin',
#         'youseki_bin',
#         'density_cat',
#         'A29_005'
# ]

# youto_df[keep_cols].to_parquet(output_path + '/youto.parquet')

#### 学習データ・予測データに結合

In [38]:
# Load the zoning polygons (with their attribute columns) from parquet.
youto_df = gpd.read_parquet(youto_path)

In [39]:
def add_youto_features_sjoin(
    df: pd.DataFrame,
    youto_poly: gpd.GeoDataFrame,
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',   # coordinates in df are treated as EPSG:4612
    keep_cols: tuple[str, ...] = (
        'A29_004', 'A29_005', 'A29_006', 'A29_007',
        'zone_group', 'zone_residential_rank',
        'is_lowrise_residential',
        'kenpei', 'youseki', 'kenpei_missing', 'youseki_missing',
        'kenpei_bin', 'youseki_bin', 'density_cat'
    ),
    predicate_primary: str = 'intersects',
    predicate_fallback: str | None = None,
) -> pd.DataFrame:
    """Attach zoning (A29) attributes to each point of ``df`` via a spatial join.

    Args:
        df: Table holding the point coordinates in ``lon_col`` / ``lat_col``.
        youto_poly: Zoning polygons; must have a CRS and is reprojected to
            ``df_crs`` when its CRS string differs.
        lon_col, lat_col: Coordinate column names in ``df``.
        df_crs: CRS assigned to the points.
        keep_cols: Polygon attributes to carry over; names missing from
            ``youto_poly`` are skipped.
        predicate_primary: Predicate of the first ``sjoin``.
        predicate_fallback: Optional predicate for a second ``sjoin`` that is
            run only for points the first join left unmatched.

    Returns:
        A copy of ``df`` (original index kept) with the selected polygon
        attributes appended. A point matching several polygons keeps the one
        with the smallest ``kenpei``, then the smallest ``youseki``.

    Raises:
        ValueError: If ``youto_poly.crs`` is None.
    """
    base = df.copy()

    # --- points: a positional pt_id removes any dependence on df's index ---
    points = gpd.GeoDataFrame(
        base[[lon_col, lat_col]].copy(),
        geometry=gpd.points_from_xy(base[lon_col], base[lat_col]),
        crs=df_crs,
    )
    points['pt_id'] = np.arange(len(points), dtype=np.int64)

    # --- polygons ---
    zones = youto_poly.copy()
    if zones.crs is None:
        raise ValueError('youto_poly.crs が None です。CRS を設定してください。')
    if zones.crs.to_string() != df_crs:
        zones = zones.to_crs(df_crs)

    attr_cols = [name for name in keep_cols if name in zones.columns]
    zones_small = zones[attr_cols + ['geometry']].copy()

    # Derive kenpei / youseki from the raw A29 columns when they are not present.
    for derived, raw in (('kenpei', 'A29_006'), ('youseki', 'A29_007')):
        if derived not in zones_small.columns and raw in zones_small.columns:
            zones_small[derived] = pd.to_numeric(zones_small[raw], errors='coerce').astype('float64')

    # Ranking keys: smaller value wins (stricter regulation first); a missing
    # value becomes 1e9 so that it sorts after every real value.
    all_nan = pd.Series(np.nan, index=zones_small.index)
    for derived in ('kenpei', 'youseki'):
        zones_small[f'{derived}_key'] = zones_small.get(derived, all_nan).fillna(1e9)

    # --- primary join ---
    matched = gpd.sjoin(
        points[['pt_id', 'geometry']],
        zones_small,
        how='left',
        predicate=predicate_primary,
    )

    # --- fallback join, only for points the primary join missed ---
    if predicate_fallback is not None:
        if 'index_right' in matched.columns:
            unmatched = matched['index_right'].isna()
        else:
            unmatched = matched.isna().any(axis=1)
        retry_ids = matched.loc[unmatched, 'pt_id'].unique()

        if len(retry_ids) > 0:
            retry_pts = points.loc[points['pt_id'].isin(retry_ids), ['pt_id', 'geometry']]
            retried = gpd.sjoin(
                retry_pts,
                zones_small,
                how='left',
                predicate=predicate_fallback,
            )
            matched = pd.concat([matched, retried], axis=0, ignore_index=True)

    # --- reduce multiple matches to one row per pt_id ---
    ranked = matched.sort_values(['pt_id', 'kenpei_key', 'youseki_key'])
    best = ranked.drop_duplicates(subset='pt_id', keep='first').set_index('pt_id')

    helper_cols = [
        name for name in ('index_right', 'geometry', 'kenpei_key', 'youseki_key')
        if name in best.columns
    ]
    best = best.drop(columns=helper_cols, errors='ignore')

    # --- align by row position, then restore the caller's index ---
    result = base.reset_index(drop=True).join(best, how='left')
    result.index = base.index

    return result


In [40]:
train_df_geo = add_youto_features_sjoin(train_df_geo, youto_df)
test_df_geo = add_youto_features_sjoin(test_df_geo, youto_df)

# Share of rows without a zoning match, printed per column for train then test.
for _col in ('A29_005', 'density_cat'):
    for _frame in (train_df_geo, test_df_geo):
        print(_frame[_col].isna().mean())

0.1523642298941537
0.14870549730071062
0.1523642298941537
0.14870549730071062


## 都市計画

In [41]:
# Parquet files of the city-planning (都市計画) layers, one file per layer.
koudoti_path = gis_path + '都市計画/koudoti.parquet'
chikukei_path = gis_path + '都市計画/chikukei.parquet'
tochiku_path = gis_path + '都市計画/tochiku.parquet'
koudori_path = gis_path + '都市計画/koudori.parquet'
toshisaisei_path = gis_path + '都市計画/toshisaisei.parquet'
tokureiyouseki_path = gis_path + '都市計画/tokureiyouseki.parquet'
kousoujyukyo_path = gis_path + '都市計画/kousoujyukyo.parquet'
tokuteibousai_path = gis_path + '都市計画/tokuteibousai.parquet'
fukkousaiseikyoten_path = gis_path + '都市計画/fukkousaiseikyoten.parquet'
# Unlike the files above, these two file names carry a '_df' suffix.
senbiki_path = gis_path + '都市計画/senbiki_df.parquet'
bouka_path = gis_path + '都市計画/bouka_df.parquet'

#### データの結合（最初の1回のみなのでコメントアウト）

In [42]:
# in_dir = Path(gis_path + '都市計画')
# bouka_files = sorted(in_dir.glob('A55-24*/A55-24*/*_bouka.geojson'))
# chikukei_files = sorted(in_dir.glob('A55-24*/A55-24*/*chikukei.geojson'))
# fukkousaiseikyoten_files = sorted(in_dir.glob('A55-24*/A55-24*/*fukkousaiseikyoten.geojson'))
# koudori_files = sorted(in_dir.glob('A55-24*/A55-24*/*koudori.geojson'))
# koudoti_files = sorted(in_dir.glob('A55-24*/A55-24*/*koudoti.geojson'))
# senbiki_files = sorted(in_dir.glob('A55-24*/A55-24*/*senbiki.geojson'))
# tochiku_files = sorted(in_dir.glob('A55-24*/A55-24*/*tochiku.geojson'))
# toshisaisei_files = sorted(in_dir.glob('A55-24*/A55-24*/*toshisaisei.geojson'))
# tokuteibousai_files = sorted(in_dir.glob('A55-24*/A55-24*/*tokuteibouka.geojson'))
# tokureiyouseki_files = sorted(in_dir.glob('A55-24*/A55-24*/*tokureiyouseki.geojson'))
# kousoujyukyo_files = sorted(in_dir.glob('A55-24*/A55-24*/*_kousoujyukyo.geojson'))
# output_path = gis_path + '都市計画'

# gdfs = []
# base_crs = None

In [43]:
# def load_union_polygons(files: list[Path], *, target_crs: str = 'EPSG:4612') -> gpd.GeoDataFrame:
#     """
#     複数geojsonを読み、1つのGeoDataFrameに結合。
#     geometryだけ欲しい（＝存在フラグ用途）なら、属性は落としてOK。
#     """
#     gdfs = []
#     for fp in files:
#         g = gpd.read_file(fp)
#         if g.crs is None:
#             # ここは必要に応じて allow_override=True で固定してもよい
#             raise ValueError(f'CRS is None: {fp}')
#         g = g.to_crs(target_crs)
#         gdfs.append(g[['geometry']])
#     if not gdfs:
#         return gpd.GeoDataFrame({'geometry': []}, geometry='geometry', crs=target_crs)
#     return gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), geometry='geometry', crs=target_crs)


# def make_presence_flag_gdf(files: list[Path], flag_name: str) -> gpd.GeoDataFrame:
#     poly = load_union_polygons(files, target_crs='EPSG:4612')
#     if len(poly) == 0:
#         poly[flag_name] = pd.Series([], dtype='int8')
#         return poly
#     poly[flag_name] = 1
#     poly[flag_name] = poly[flag_name].astype('int8')
#     return poly[[flag_name, 'geometry']]


In [44]:
# def make_senbiki_flag_gdf(files: list[Path]) -> gpd.GeoDataFrame:
#     # まず全部読む（属性が必要）
#     gdfs = []
#     for fp in files:
#         g = gpd.read_file(fp)
#         if g.crs is None:
#             raise ValueError(f'CRS is None: {fp}')
#         g = g.to_crs('EPSG:4612')
#         gdfs.append(g)
#     poly = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), geometry='geometry', crs='EPSG:4612')

#     code = pd.to_numeric(poly['AreaCode'], errors='coerce')

#     # ここはあなたが前に示した通り（22=市街化区域, 23=市街化調整区域）を採用
#     poly['is_urbanized_area'] = (code == 22).astype('int8')
#     poly['is_urban_control_area'] = (code == 23).astype('int8')

#     return poly[['is_urbanized_area', 'is_urban_control_area', 'geometry']]

In [45]:
# def make_bouka_flag_gdf(files: list[Path]) -> gpd.GeoDataFrame:
#     gdfs = []
#     for fp in files:
#         g = gpd.read_file(fp)
#         if g.crs is None:
#             raise ValueError(f'CRS is None: {fp}')
#         g = g.to_crs('EPSG:4612')
#         gdfs.append(g)
#     poly = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), geometry='geometry', crs='EPSG:4612')

#     code = pd.to_numeric(poly['AreaCode'], errors='coerce')

#     # 24=防火地域, 25=準防火地域（想定）
#     poly['is_fireproof_area'] = (code == 24).astype('int8')
#     poly['is_quasi_fireproof_area'] = (code == 25).astype('int8')

#     return poly[['is_fireproof_area', 'is_quasi_fireproof_area', 'geometry']]


In [46]:
# koudoti_df = make_presence_flag_gdf(koudoti_files, 'has_height_limit')
# chikukei_df = make_presence_flag_gdf(chikukei_files, 'has_district_plan')
# tochiku_df = make_presence_flag_gdf(tochiku_files, 'is_land_readjustment_area')
# koudori_df = make_presence_flag_gdf(koudori_files, 'is_high_utilization_area')
# toshisaisei_df = make_presence_flag_gdf(toshisaisei_files, 'is_urban_renaissance_area')
# tokureiyouseki_df = make_presence_flag_gdf(tokureiyouseki_files, 'is_special_far_area')
# kousoujyukyo_df = make_presence_flag_gdf(kousoujyukyo_files, 'is_highrise_residential_area')
# tokuteibousai_df = make_presence_flag_gdf(tokuteibousai_files, 'is_disaster_prevention_block')
# fukkousaiseikyoten_df = make_presence_flag_gdf(fukkousaiseikyoten_files, 'is_redevelopment_core_area')
# senbiki_df = make_senbiki_flag_gdf(senbiki_files)
# bouka_df = make_bouka_flag_gdf(bouka_files)

In [47]:
# koudoti_df.to_parquet(output_path + '/koudoti.parquet')
# chikukei_df.to_parquet(output_path + '/chikukei.parquet')
# tochiku_df.to_parquet(output_path + '/tochiku.parquet')
# koudori_df.to_parquet(output_path + '/koudori.parquet')
# toshisaisei_df.to_parquet(output_path + '/toshisaisei.parquet')
# tokureiyouseki_df.to_parquet(output_path + '/tokureiyouseki.parquet')
# kousoujyukyo_df.to_parquet(output_path + '/kousoujyukyo.parquet')
# tokuteibousai_df.to_parquet(output_path + '/tokuteibousai.parquet')
# fukkousaiseikyoten_df.to_parquet(output_path + '/fukkousaiseikyoten.parquet')
# senbiki_df.to_parquet(output_path + '/senbiki_df.parquet')
# bouka_df.to_parquet(output_path + '/bouka_df.parquet')

#### 学習データ・予測データに結合

In [48]:
# Load every city-planning layer; each frame is later paired with its flag
# column(s) in the `layers` dict.
koudoti_df = gpd.read_parquet(koudoti_path)
chikukei_df = gpd.read_parquet(chikukei_path)
tochiku_df = gpd.read_parquet(tochiku_path)
koudori_df = gpd.read_parquet(koudori_path)
toshisaisei_df = gpd.read_parquet(toshisaisei_path)
tokureiyouseki_df = gpd.read_parquet(tokureiyouseki_path)
kousoujyukyo_df = gpd.read_parquet(kousoujyukyo_path)
tokuteibousai_df = gpd.read_parquet(tokuteibousai_path)
fukkousaiseikyoten_df = gpd.read_parquet(fukkousaiseikyoten_path)
senbiki_df = gpd.read_parquet(senbiki_path)
bouka_df = gpd.read_parquet(bouka_path)

In [49]:
def _make_points(df: pd.DataFrame, lon_col: str, lat_col: str, crs: str) -> gpd.GeoDataFrame:
    """Build a point GeoDataFrame from the lon/lat columns of ``df``.

    The result keeps the two coordinate columns and gains ``__pt_id__``, the
    0-based row position, so later joins can be aligned by position.
    """
    coords = df[[lon_col, lat_col]].copy()
    geometry = gpd.points_from_xy(df[lon_col], df[lat_col])
    points = gpd.GeoDataFrame(coords, geometry=geometry, crs=crs)
    points['__pt_id__'] = np.arange(len(points), dtype=np.int64)
    return points


def _sjoin_flags_one_layer(
    df: pd.DataFrame,
    poly: gpd.GeoDataFrame,
    flag_cols: list[str],
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',
    predicate_main: str = 'within',
    predicate_fallback: str = 'intersects',
) -> pd.DataFrame:
    """Spatially join one polygon layer onto the points of ``df``.

    ``poly`` holds the flag columns plus geometry. Every column in
    ``flag_cols`` ends up in the result as int8. A point matching several
    polygons gets the maximum of each flag (logical OR for 0/1 flags); an
    unmatched point gets 0. When ``poly`` is None or empty, the flags are
    simply created (or NaN-filled) as 0 without any join.

    Raises:
        ValueError: If ``poly`` has rows but no CRS.
    """
    no_polygons = poly is None or len(poly) == 0
    if no_polygons:
        filled = df.copy()
        for name in flag_cols:
            if name in filled.columns:
                current = filled[name]
            else:
                current = pd.Series(0, index=filled.index)
            filled[name] = current.fillna(0).astype('int8')
        return filled

    if poly.crs is None:
        raise ValueError('poly.crs is None. set_crs / to_crs を確認してください。')

    layer = poly.to_crs(df_crs).copy()
    points = _make_points(df, lon_col, lat_col, df_crs)

    # Main join.
    hits = gpd.sjoin(
        points[['__pt_id__', 'geometry']],
        layer[flag_cols + ['geometry']],
        how='left',
        predicate=predicate_main,
    )

    # Fallback join, restricted to the points the main join did not match.
    if predicate_fallback is not None:
        retry_ids = hits.loc[hits['index_right'].isna(), '__pt_id__'].unique()
        if len(retry_ids) > 0:
            retry_pts = points.loc[points['__pt_id__'].isin(retry_ids), ['__pt_id__', 'geometry']]
            retried = gpd.sjoin(
                retry_pts,
                layer[flag_cols + ['geometry']],
                how='left',
                predicate=predicate_fallback,
            )
            hits = pd.concat([hits, retried], ignore_index=True)

    # OR-aggregate per point: for flags, max is sufficient.
    for name in flag_cols:
        hits[name] = pd.to_numeric(hits[name], errors='coerce')
    per_point = hits.groupby('__pt_id__', sort=False)[flag_cols].max()

    # Align by row position, then hand back the caller's index.
    result = df.reset_index(drop=True).copy().join(per_point, how='left')
    result.index = df.index

    for name in flag_cols:
        result[name] = result[name].fillna(0).astype('int8')

    return result


In [50]:
def add_cityplan_layers(
    df: pd.DataFrame,
    layers: dict[str, tuple[gpd.GeoDataFrame, list[str]]],
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',
    predicate_main: str = 'within',
    predicate_fallback: str = 'intersects',
) -> pd.DataFrame:
    """Add the flag columns of every city-planning layer to a copy of ``df``.

    Args:
        df: Point table with ``lon_col`` / ``lat_col``.
        layers: Maps a layer name to ``(polygons, flag column names)``.
        lon_col, lat_col, df_crs, predicate_main, predicate_fallback:
            Forwarded unchanged to ``_sjoin_flags_one_layer`` for each layer.

    Returns:
        A copy of ``df`` with the flag columns of all layers appended.

    Raises:
        KeyError: If a non-empty layer lacks one of its declared flag columns.
    """
    join_options = dict(
        lon_col=lon_col,
        lat_col=lat_col,
        df_crs=df_crs,
        predicate_main=predicate_main,
        predicate_fallback=predicate_fallback,
    )

    result = df.copy()
    for layer_name, (polygons, flag_cols) in layers.items():
        # Sanity check: the declared columns must exist on a non-empty layer.
        has_rows = polygons is not None and len(polygons) > 0
        if has_rows:
            absent = [name for name in flag_cols if name not in polygons.columns]
            if absent:
                raise KeyError(f'layer={layer_name} missing cols={absent}')

        result = _sjoin_flags_one_layer(result, polygons, flag_cols, **join_options)

    return result


In [51]:
# Layer name -> (polygon frame, flag columns that the layer contributes).
layers = {
    'senbiki': (senbiki_df, ['is_urbanized_area', 'is_urban_control_area']),
    'bouka': (bouka_df, ['is_fireproof_area', 'is_quasi_fireproof_area']),
    'koudoti': (koudoti_df, ['has_height_limit']),
    'chikukei': (chikukei_df, ['has_district_plan']),
    'tochiku': (tochiku_df, ['is_land_readjustment_area']),
    'koudori': (koudori_df, ['is_high_utilization_area']),
    'toshisaisei': (toshisaisei_df, ['is_urban_renaissance_area']),
    'tokureiyouseki': (tokureiyouseki_df, ['is_special_far_area']),
    'kousoujyukyo': (kousoujyukyo_df, ['is_highrise_residential_area']),
    'tokuteibousai': (tokuteibousai_df, ['is_disaster_prevention_block']),
    'fukkousaiseikyoten': (fukkousaiseikyoten_df, ['is_redevelopment_core_area']),
}

# Apply all layers to train and test; the layers are joined in dict order.
train_df_geo = add_cityplan_layers(train_df_geo, layers, lon_col='lon_jgd', lat_col='lat_jgd')
test_df_geo  = add_cityplan_layers(test_df_geo,  layers, lon_col='lon_jgd', lat_col='lat_jgd')

## 標高

In [52]:
# Location of the merged elevation (標高) parquet file.
height_path = gis_path + '標高/height.parquet'

#### データの結合（最初の1回のみなのでコメントアウト）

In [None]:
# in_dir = Path(gis_path + '標高')
# files = sorted(in_dir.glob('G04-c-11_*-jgd_GML/*.shp'))
# output_path = gis_path + '標高'

# gdfs = []

In [None]:
# for fp in files:
#     gdf = read_dataframe(fp) # 高速化しないとめちゃ時間かかる
        
#     gdfs.append(gdf)

# # 結合（index を振り直す）
# height_df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdf.crs)
# print(f"crs: {height_df.crs}")

# del gdfs
# gc.collect()

# height_df.to_parquet(output_path + '/height.parquet')

crs: None


#### 学習データ・予測データに結合

In [55]:
# Load the elevation polygons; add_height_features assigns their CRS later.
height_df = gpd.read_parquet(height_path)

In [56]:
def add_height_features(
    df: pd.DataFrame,
    height_poly: gpd.GeoDataFrame,
    lon_col='lon_jgd',
    lat_col='lat_jgd',
):
    """Attach the attributes of the containing elevation polygon to each row.

    Args:
        df: Point table with coordinates in ``lon_col`` / ``lat_col``; the
            points are interpreted as EPSG:4612.
        height_poly: Elevation polygons. When they carry no CRS they are
            assumed to already be in EPSG:4612; when they carry one they are
            reprojected to EPSG:4612 (the previous unconditional ``set_crs``
            raised on a differing CRS).
        lon_col, lat_col: Coordinate column names in ``df``.

    Returns:
        A plain DataFrame with exactly one row per input row, in input order,
        holding the original columns plus the polygon attributes and the
        ``index_right`` column produced by ``sjoin``. Points inside no polygon
        get NaN attributes.
    """
    row_id_col = '__height_row_id__'

    points = gpd.GeoDataFrame(
        df.copy(),
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs='EPSG:4612',
    )
    # Positional id used to detect rows duplicated by the join.
    points[row_id_col] = np.arange(len(points), dtype=np.int64)

    if height_poly.crs is None:
        height_poly = height_poly.set_crs('EPSG:4612')
    else:
        height_poly = height_poly.to_crs('EPSG:4612')

    joined = gpd.sjoin(
        points,
        height_poly,
        how='left',
        predicate='within',
    )

    # A point lying within several (overlapping or duplicated) polygons would
    # otherwise appear several times; keep its first match so the row count
    # of the input is preserved.
    joined = joined.drop_duplicates(subset=row_id_col, keep='first')

    return pd.DataFrame(joined.drop(columns=['geometry', row_id_col]))


In [57]:
# Raw elevation-mesh columns that must be numeric before features are derived.
HEIGHT_NUM_COLS = [
    'G04c_002', 'G04c_003', 'G04c_004',
    'G04c_006', 'G04c_008', 'G04c_010'
]


def coerce_height_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the HEIGHT_NUM_COLS that exist made numeric.

    The string 'unknown' and any other non-numeric value become NaN; columns
    outside HEIGHT_NUM_COLS are left untouched.
    """
    converted = df.copy()
    present = [name for name in HEIGHT_NUM_COLS if name in converted.columns]

    for name in present:
        cleaned = converted[name].replace('unknown', np.nan)
        converted[name] = pd.to_numeric(cleaned, errors='coerce')

    return converted

In [58]:
# 1) + 2) Spatial join with the elevation polygons, then make the raw columns
#         numeric ('unknown' -> NaN, object -> float).
train_df_geo = coerce_height_numeric(add_height_features(train_df_geo, height_df))
test_df_geo = coerce_height_numeric(add_height_features(test_df_geo, height_df))

# 3) Derived features, written in place on both frames.
for df in (train_df_geo, test_df_geo):
    df['elev_mean'] = df['G04c_002']
    df['elev_range'] = df['G04c_003'] - df['G04c_004']

    df['slope_mean'] = df['G04c_010']
    df['slope_max'] = df['G04c_006']
    df['slope_range'] = df['G04c_006'] - df['G04c_008']


## 駅別乗降者数

In [59]:
# Station passenger-count data (S12), read and its CRS printed for reference.
file = Path(gis_path + '駅別乗降者数/S12-24_GML/S12-24_NumberOfPassengers.geojson')
eki_gdf = read_dataframe(file) # reading takes very long without this speed-up
print(eki_gdf.crs)

EPSG:6668


In [60]:
# Give the S12 columns used below readable names: station name, line name and
# the yearly passenger counts for 2018-2023.
eki_gdf = eki_gdf.rename(columns={
    'S12_001': '駅名',
    'S12_003': '路線名',
    'S12_037': '乗降客数_2018',
    'S12_041': '乗降客数_2019',
    'S12_045': '乗降客数_2020',
    'S12_049': '乗降客数_2021',
    'S12_053': '乗降客数_2022',
    'S12_057': '乗降客数_2023',
})

In [61]:
# Yearly passenger-count columns, 2018 through 2023.
year_cols = [f'乗降客数_{year}' for year in range(2018, 2024)]

# A count of 0 is treated as missing.
eki_gdf[year_cols] = eki_gdf[year_cols].replace(0, np.nan)

# Years used for the "normal" level; 2020 and 2021 are left out.
normal_cols = [f'乗降客数_{year}' for year in (2018, 2019, 2022, 2023)]

# Row-wise median over the available "normal" years.
eki_gdf['passenger_normal'] = eki_gdf[normal_cols].median(axis=1, skipna=True)

In [62]:
# Collapse the station records to one row per station name: passenger counts
# are summed and the first geometry is kept as the representative.
# NOTE(review): grouping uses the station name only, so different stations that
# share a name would be merged into one row — confirm this is acceptable.
station_agg = (
    eki_gdf
    .dropna(subset=['passenger_normal'])
    .groupby('駅名', as_index=False)
    .agg(
        passenger_normal=('passenger_normal', 'sum'),
        geometry=('geometry', 'first')  # nearly identical, so take a representative
    )
)

# log1p of the summed passenger count; used as the station weight later.
station_agg['passenger_normal_log'] = np.log1p(
    station_agg['passenger_normal']
)

# The aggregation result is a plain DataFrame; turn it back into a GeoDataFrame.
station_agg = gpd.GeoDataFrame(
    station_agg,
    geometry='geometry',
    crs='EPSG:6668'   # CRS of the station data
)

# Reduce each geometry to a single point.
# NOTE(review): the centroid is computed in a geographic CRS — presumably fine
# for station-sized geometries, but confirm.
station_agg['geometry'] = station_agg.geometry.centroid

# Point coordinates consumed by add_station_power_summax3.
station_agg['lon'] = station_agg.geometry.x
station_agg['lat'] = station_agg.geometry.y

In [63]:
from sklearn.neighbors import BallTree

def add_station_power_summax3(
    df: pd.DataFrame,
    station_agg: pd.DataFrame,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    station_lon_col: str = 'lon',
    station_lat_col: str = 'lat',
    passenger_col: str = 'passenger_normal_log',
    k: int = 3,
    tau_km: float = 1.0,
) -> pd.DataFrame:
    """Score each property by its ``k`` nearest stations with distance decay.

    For every neighbouring station ``i``:

        score_i = passenger_col_i * exp(-distance_km_i / tau_km)

    Args:
        df: Property table with coordinates in degrees.
        station_agg: One row per station, with coordinates in degrees and the
            weight column ``passenger_col``.
        k: Number of nearest stations per property.
        tau_km: Decay length in kilometres.

    Returns:
        A copy of ``df`` with ``station_power_sum3`` (sum of the k scores) and
        ``station_power_max3`` (largest of the k scores). The column names stay
        the same whatever ``k`` is.
    """
    result = df.copy()

    # The haversine metric expects (lat, lon) in radians.
    property_rad = np.deg2rad(result[[lat_col, lon_col]].to_numpy())
    station_rad = np.deg2rad(station_agg[[station_lat_col, station_lon_col]].to_numpy())

    # k-nearest-station search; distances come back in radians.
    tree = BallTree(station_rad, metric='haversine')
    nearest_rad, nearest_idx = tree.query(property_rad, k=k)

    # Radians -> kilometres (6371.0 is the radius factor in km).
    nearest_km = nearest_rad * 6371.0

    # Station weight looked up for every (property, neighbour) pair at once.
    passenger = station_agg[passenger_col].to_numpy()
    scores = np.zeros_like(nearest_km)
    scores[...] = passenger[nearest_idx] * np.exp(-nearest_km / tau_km)

    result['station_power_sum3'] = scores.sum(axis=1)
    result['station_power_max3'] = scores.max(axis=1)

    return result


In [64]:
# Same settings for train and test; tau_km=1.0 is the first value tried (1 km).
_station_power_kwargs = dict(
    lon_col='lon_jgd',
    lat_col='lat_jgd',
    tau_km=1.0,
)

train_df_geo = add_station_power_summax3(train_df_geo, station_agg, **_station_power_kwargs)
test_df_geo = add_station_power_summax3(test_df_geo, station_agg, **_station_power_kwargs)

## 災害危険区域

In [65]:
# Location of the merged disaster-hazard-area (災害危険区域) parquet file.
disaster_path = gis_path + '災害危険区域/disaster.parquet'

#### データの結合（最初の1回のみなのでコメントアウト）

In [66]:
# in_dir = Path(gis_path + '災害危険区域')
# files = sorted(in_dir.glob('A48-21_*/*.geojson'))
# output_path = gis_path + '災害危険区域'

# gdfs = []

In [67]:
# for fp in files:
#     gdf = read_dataframe(fp) # 高速化しないとめちゃ時間かかる
        
#     gdfs.append(gdf)

# # 結合（index を振り直す）
# disaster_df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdf.crs)
# print(f"crs: {disaster_df.crs}")

# del gdfs
# gc.collect()

In [68]:
# keep_cols = [
#     'geometry',
#     'A48_007'
# ]

# disaster_df[keep_cols].to_parquet(output_path + '/disaster.parquet')

#### 学習データ・予測データに結合

In [69]:
# Load the disaster-hazard polygons used by add_disaster_features_sjoin.
disaster_df = gpd.read_parquet(disaster_path)

In [70]:
def add_disaster_features_sjoin(
    df: pd.DataFrame,
    disaster_poly: gpd.GeoDataFrame,
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',
    reason_col: str = 'A48_007',
    predicate_primary: str = 'within',
    predicate_fallback: str | None = None,
    alpha_multi: float = 0.3,
    score_clip_max: float | None = 2.0,
    weight_map: dict[int, float] | None = None,
) -> pd.DataFrame:
    """Join disaster-hazard polygons (A48) onto points and derive risk scores.

    Each matched polygon contributes the weight of its designation reason
    (``reason_col``) taken from ``weight_map``.

    Args:
        df: Point table with coordinates in ``lon_col`` / ``lat_col``.
        disaster_poly: Hazard polygons; must have a CRS and contain
            ``reason_col``. Reprojected to ``df_crs`` when the CRS string differs.
        df_crs: CRS assigned to the points.
        predicate_primary: Predicate of the first ``sjoin``.
        predicate_fallback: Optional predicate for a second ``sjoin`` run only
            on points the first join left unmatched.
        alpha_multi: Bonus added per additional distinct reason code.
        score_clip_max: Upper clip for ``disaster_score``; None disables it.
        weight_map: Reason code -> weight. Defaults to the built-in table;
            codes absent from the map get no weight.

    Returns:
        A copy of ``df`` (original index kept) with these columns added:
          - disaster_hit (0/1): any polygon with a numeric reason code matched
          - disaster_n_types: number of distinct reason codes matched
          - disaster_score_sum: sum of weights over all matched polygons
          - disaster_score_max: largest matched weight
          - disaster_score: max + alpha_multi * (n_types - 1), optionally clipped

    Raises:
        ValueError: If ``disaster_poly`` has no CRS or lacks ``reason_col``.
    """
    out = df.copy()

    # --- point GeoDataFrame with a positional id ---
    gdf_pts = gpd.GeoDataFrame(
        out[[lon_col, lat_col]].copy(),
        geometry=gpd.points_from_xy(out[lon_col], out[lat_col]),
        crs=df_crs,
    )
    gdf_pts['pt_id'] = np.arange(len(gdf_pts), dtype=np.int64)

    # --- polygons ---
    dis = disaster_poly.copy()
    if dis.crs is None:
        raise ValueError('disaster_poly.crs が None です。CRS を設定してください。')

    if dis.crs.to_string() != df_crs:
        dis = dis.to_crs(df_crs)

    # The reason column is mandatory.
    if reason_col not in dis.columns:
        raise ValueError(f'reason_col={reason_col} が disaster_poly に存在しません。')

    dis_small = dis[[reason_col, 'geometry']].copy()

    # --- primary join ---
    j1 = gpd.sjoin(
        gdf_pts[['pt_id', 'geometry']],
        dis_small,
        how='left',
        predicate=predicate_primary,
    )

    # --- fallback join (only when requested) ---
    if predicate_fallback is not None:
        miss = j1['index_right'].isna()
        miss_pt_ids = j1.loc[miss, 'pt_id'].unique()

        if len(miss_pt_ids) > 0:
            pts_miss = gdf_pts.loc[gdf_pts['pt_id'].isin(miss_pt_ids), ['pt_id', 'geometry']]
            j2 = gpd.sjoin(
                pts_miss,
                dis_small,
                how='left',
                predicate=predicate_fallback,
            )
            j_all = pd.concat([j1, j2], axis=0, ignore_index=True)
        else:
            j_all = j1
    else:
        j_all = j1

    # --- weights ---
    if weight_map is None:
        weight_map = {
            1: 0.8,  # flood (river)
            2: 0.7,  # flood (sea)
            3: 1.0,  # flood (river and sea)
            4: 0.9,  # steep-slope collapse etc.
            5: 0.9,  # landslide etc.
            6: 0.6,  # volcanic disaster
            7: 0.4,  # other
        }

    j_all['reason_code'] = pd.to_numeric(j_all[reason_col], errors='coerce')
    j_all['reason_w'] = j_all['reason_code'].map(weight_map).astype('float64')
    j_all['_reason_hit'] = j_all['reason_code'].notna()

    # --- aggregate per pt_id (multiple matches are summarised, not dropped) ---
    # Built-in aggregations replace per-group Python lambdas; 'nunique' ignores
    # NaN, and the max of a boolean column is "any".
    agg = (
        j_all.groupby('pt_id', as_index=False)
             .agg(
                 disaster_hit=('_reason_hit', 'max'),
                 disaster_n_types=('reason_code', 'nunique'),
                 disaster_score_sum=('reason_w', 'sum'),
                 disaster_score_max=('reason_w', 'max'),
             )
    )
    agg['disaster_hit'] = agg['disaster_hit'].astype('int64')

    # --- restore the row order of df ---
    feat = pd.DataFrame({'pt_id': np.arange(len(out), dtype=np.int64)})
    feat = feat.merge(agg, on='pt_id', how='left')

    for c in ['disaster_hit', 'disaster_n_types', 'disaster_score_sum', 'disaster_score_max']:
        feat[c] = feat[c].fillna(0)

    # Emphasise compound risk: max + alpha * (n_types - 1).
    feat['disaster_score'] = (
        feat['disaster_score_max'] + alpha_multi * np.maximum(feat['disaster_n_types'] - 1, 0)
    )

    if score_clip_max is not None:
        feat['disaster_score'] = feat['disaster_score'].clip(upper=score_clip_max)

    feat = feat.drop(columns=['pt_id'])

    out2 = out.reset_index(drop=True).join(feat, how='left')
    out2.index = out.index

    return out2


In [71]:
# Weight per designation-reason code, passed explicitly to the join below.
weight_map = {
    1: 0.8,  # flood (river)
    2: 0.7,  # flood (sea)
    3: 1.0,  # flood (river and sea)
    4: 0.9,  # steep-slope collapse etc.
    5: 0.9,  # landslide etc.
    6: 0.6,  # volcanic disaster
    7: 0.4,  # other
}

# Add the disaster features to train and test with identical settings.
train_df_geo = add_disaster_features_sjoin(train_df_geo, disaster_df, predicate_primary='within', weight_map=weight_map)
test_df_geo  = add_disaster_features_sjoin(test_df_geo,  disaster_df, predicate_primary='within', weight_map=weight_map)


In [72]:
# Hit rate of the disaster flag on the test set. Every figure here comes from
# test_df_geo, so the label says "test" (it previously said "train").
rate_all = test_df_geo['disaster_hit'].mean()

print(f'test全体 disaster_hit率: {rate_all:.4%}')
print(f'該当件数: {test_df_geo["disaster_hit"].sum():,.0f} / {len(test_df_geo):,.0f}')


train全体 disaster_hit率: 0.5025%
該当件数: 565 / 112,437


## 学校

In [73]:
# School location data (P29), read with its attributes.
school_path = Path(gis_path + '学校/P29-23_GML/P29-23.geojson')
school_df = gpd.read_file(school_path)

In [74]:
# Drop schools whose P29_007 code is 2. The code is compared numerically so the
# filter still applies if the column was read as strings ('2'); a raw
# `!= 2` would then silently keep every row.
# NOTE(review): the meaning of code 2 is not visible here — confirm against the
# P29 specification.
_p29_007_code = pd.to_numeric(school_df['P29_007'], errors='coerce')
school_use = school_df.loc[_p29_007_code != 2].copy()

school_use['school_type_norm'] = school_use['P29_003']

# Fold secondary education schools (16003) into junior high schools (16002).
school_use.loc[
    school_use['school_type_norm'] == '16003',
    'school_type_norm'
] = '16002'

In [75]:
# Split the normalised school table by type code:
# '16001' -> school_elem, '16002' -> school_junior.
school_elem = school_use[school_use['school_type_norm'].eq('16001')].copy()
school_junior = school_use[school_use['school_type_norm'].eq('16002')].copy()

In [76]:
def _nearest_distance_per_point(points_proj, targets_proj, distance_col):
    """Return one nearest-target distance per ``pt_id`` as a Series.

    ``sjoin_nearest`` yields several rows for a point whose nearest targets are
    tied; those rows share the same distance, so taking the minimum per
    ``pt_id`` collapses them without changing the value.
    """
    joined = gpd.sjoin_nearest(
        points_proj[['pt_id', 'geometry']],
        targets_proj[['geometry']],
        how='left',
        distance_col=distance_col,
    )
    return joined.groupby('pt_id')[distance_col].min()


def add_school_features_from_gdfs(
    df: pd.DataFrame,
    school_elem: gpd.GeoDataFrame,
    school_junior: gpd.GeoDataFrame,
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',
    # projected CRS used for the distance computation
    proj_crs: str = 'EPSG:3857',
    # thresholds of the binary features
    elem_threshold_m: int = 500,
    junior_threshold_m: int = 1000,
    # safety net for missing distances (normally not needed)
    fill_missing_distance_m: float = 10_000.0,
    add_log_features: bool = True,
) -> pd.DataFrame:
    """Add nearest elementary / junior-high school distance features.

    Distances are measured after projecting points and schools to ``proj_crs``.
    NOTE(review): EPSG:3857 does not preserve distances, so the values and the
    500 m / 1 km thresholds are presumably inflated relative to true metres at
    Japanese latitudes — confirm whether this approximation is intended.

    Columns added:
      - dist_elem_school_m
      - dist_junior_school_m
      - elem_school_500m    (1 when dist_elem_school_m <= elem_threshold_m)
      - junior_school_1km   (1 when dist_junior_school_m <= junior_threshold_m)
      - dist_elem_school_log, dist_junior_school_log (log1p; only when
        ``add_log_features`` is True)

    Returns:
        A copy of ``df`` with the original index and exactly one row per input
        row, even when a point has several equally near schools.

    Raises:
        ValueError: If a coordinate column is missing, a school frame is None
            or empty, or a school frame has no CRS.
    """
    out = df.copy()

    # --- input checks ---
    if lon_col not in out.columns or lat_col not in out.columns:
        raise ValueError(f'df に {lon_col}/{lat_col} がありません。')

    if school_elem is None or len(school_elem) == 0:
        raise ValueError('school_elem が空です。')
    if school_junior is None or len(school_junior) == 0:
        raise ValueError('school_junior が空です。')

    if school_elem.crs is None or school_junior.crs is None:
        raise ValueError('school_elem / school_junior の crs が None です。')

    # --- property points with a positional id ---
    gdf_pts = gpd.GeoDataFrame(
        out[[lon_col, lat_col]].copy(),
        geometry=gpd.points_from_xy(out[lon_col], out[lat_col]),
        crs=df_crs,
    )
    gdf_pts['pt_id'] = np.arange(len(gdf_pts), dtype=np.int64)

    # --- project everything to the distance CRS ---
    pts_proj = gdf_pts.to_crs(proj_crs)
    elem_proj = school_elem.to_crs(proj_crs)
    junior_proj = school_junior.to_crs(proj_crs)

    # Nearest-neighbour distance, one value per pt_id (ties collapsed).
    dist_elem = _nearest_distance_per_point(pts_proj, elem_proj, 'dist_elem_school_m')
    dist_jun = _nearest_distance_per_point(pts_proj, junior_proj, 'dist_junior_school_m')

    feat = pd.DataFrame({'pt_id': np.arange(len(out), dtype=np.int64)})
    feat = feat.join(dist_elem, on='pt_id')
    feat = feat.join(dist_jun, on='pt_id')

    # Missing distances (normally none) are replaced by a large constant.
    feat['dist_elem_school_m'] = pd.to_numeric(feat['dist_elem_school_m'], errors='coerce').fillna(fill_missing_distance_m)
    feat['dist_junior_school_m'] = pd.to_numeric(feat['dist_junior_school_m'], errors='coerce').fillna(fill_missing_distance_m)

    # Binary "within threshold" features.
    feat['elem_school_500m'] = (feat['dist_elem_school_m'] <= elem_threshold_m).astype('int8')
    feat['junior_school_1km'] = (feat['dist_junior_school_m'] <= junior_threshold_m).astype('int8')

    # Log distances (to absorb non-linearity; whether to use them is decided later).
    if add_log_features:
        feat['dist_elem_school_log'] = np.log1p(feat['dist_elem_school_m'])
        feat['dist_junior_school_log'] = np.log1p(feat['dist_junior_school_m'])

    feat = feat.drop(columns=['pt_id'])

    # Align by row position, then restore the caller's index.
    out2 = out.reset_index(drop=True).join(feat, how='left')
    out2.index = out.index
    return out2


In [77]:
# Add the school-distance features to train and test with default settings.
train_df_geo = add_school_features_from_gdfs(train_df_geo, school_elem, school_junior)
test_df_geo  = add_school_features_from_gdfs(test_df_geo,  school_elem, school_junior)

## 病院

In [78]:
# Medical facility data (P04), read with its attributes.
hospital_path = Path(gis_path + '病院/P04-20_GML/P04-20.geojson')
hospital_df = gpd.read_file(hospital_path)

In [79]:
def add_medical_features_from_gdf(
    df: pd.DataFrame,
    medical_gdf: gpd.GeoDataFrame,
    *,
    lon_col: str = 'lon_jgd',
    lat_col: str = 'lat_jgd',
    df_crs: str = 'EPSG:4612',
    proj_crs: str = 'EPSG:3857',  # projected CRS for distances (one nationwide approximation)
    type_col: str = 'P04_001',     # facility class: 1=hospital, 2=clinic, 3=dental
    beds_col: str = 'P04_008',     # number of beds (expected for hospitals only)
    emergency_col: str = 'P04_009',# emergency designation: 1=yes, 9=no (assumed coding)
    disaster_col: str = 'P04_010', # disaster base: 1=core, 2=regional, 9=none (assumed coding)
    use_types: tuple[int, ...] = (1, 2),  # start with hospitals and clinics
    # thresholds for the binary proximity flags
    hospital_threshold_m: int = 1000,
    clinic_threshold_m: int = 500,
    # fallback distance when no nearest facility could be determined
    fill_missing_distance_m: float = 10_000.0,
    add_log_features: bool = True,
    add_nearest_attributes: bool = True,
) -> pd.DataFrame:
    """
    Add nearest-medical-facility features to every property point.

    A point is built per row of ``df`` from ``lon_col``/``lat_col`` (declared
    as ``df_crs``). Points and facilities are reprojected to ``proj_crs`` and
    the distance to the nearest facility of each requested class comes from
    ``gpd.sjoin_nearest``.

    Columns added (depending on ``use_types`` and the flags):
      hospital (type 1):
        dist_hospital_m, hospital_1km, dist_hospital_log
        hospital_beds_nearest, hospital_is_emergency_nearest,
        hospital_is_disaster_nearest
        (attributes only with ``add_nearest_attributes`` and when the source
        column exists in ``medical_gdf``)
      clinic (type 2):
        dist_clinic_m, clinic_500m, dist_clinic_log

    Args:
        df: Property table holding the coordinate columns.
        medical_gdf: Facility layer with a CRS and the ``type_col`` column.
        lon_col, lat_col: Coordinate columns of ``df``.
        df_crs: CRS the coordinates of ``df`` are expressed in.
        proj_crs: CRS in which distances are measured.
        type_col: Facility-class column; values are coerced to numbers.
        beds_col, emergency_col, disaster_col: Attribute columns copied from
            the nearest hospital.
        use_types: Facility classes kept; only 1 and 2 produce features.
        hospital_threshold_m, clinic_threshold_m: Upper bounds (inclusive) for
            the ``hospital_1km`` / ``clinic_500m`` flags.
        fill_missing_distance_m: Value used when a distance is missing or a
            requested class has no facility at all.
        add_log_features: Also add ``log1p`` of each distance.
        add_nearest_attributes: Also add the nearest hospital's attributes.

    Returns:
        A copy of ``df`` with the same index and row order plus the feature
        columns. Feature columns already present in ``df`` (for example when a
        notebook cell is re-run) are replaced instead of making the join fail.

    Raises:
        ValueError: If the coordinate columns are missing, ``medical_gdf`` is
            empty or has no CRS, or no facility remains after filtering by
            ``use_types``.

    NOTE(review): the default EPSG:3857 (Web Mercator) stretches lengths away
    from the equator, so the "metre" distances and thresholds are inflated at
    Japanese latitudes. Pass a local projected CRS via ``proj_crs`` if true
    metres matter -- confirm before changing, since downstream features were
    built with this default.
    """
    out = df.copy()

    # --- input validation ---
    if lon_col not in out.columns or lat_col not in out.columns:
        raise ValueError(f'df に {lon_col}/{lat_col} がありません。')
    if medical_gdf is None or len(medical_gdf) == 0:
        raise ValueError('medical_gdf が空です。')
    if medical_gdf.crs is None:
        raise ValueError('medical_gdf.crs が None です。')

    n_pts = len(out)

    # --- property points ---
    # Only a positional id travels with the geometry; it is the join key used
    # to bring results back, independent of whatever index ``df`` carries.
    gdf_pts = gpd.GeoDataFrame(
        {'pt_id': np.arange(n_pts, dtype=np.int64)},
        geometry=gpd.points_from_xy(out[lon_col], out[lat_col]),
        crs=df_crs,
    )

    # --- facility layer: keep the requested classes only ---
    med = medical_gdf.copy()
    med[type_col] = pd.to_numeric(med[type_col], errors='coerce')
    med = med.loc[med[type_col].isin(use_types)].copy()
    if len(med) == 0:
        raise ValueError(f'use_types={use_types} の条件で医療機関が0件になりました。')

    # --- reproject both layers so that distances come out in proj_crs units ---
    pts_proj = gdf_pts.to_crs(proj_crs)
    med_proj = med.to_crs(proj_crs)

    # Feature table indexed 0..n-1, i.e. by pt_id; Series indexed by pt_id
    # therefore align label-wise when assigned below.
    feat = pd.DataFrame(index=pd.RangeIndex(n_pts))
    attr_cols = (beds_col, emergency_col, disaster_col)

    def _nearest_for_type(type_code: int, dist_col_out: str, take_attrs: bool):
        """Write the nearest distance for one class into feat; return the joined rows or None."""
        m_sub = med_proj.loc[med_proj[type_col] == type_code]
        if len(m_sub) == 0:
            # No facility of this class at all: use the fallback distance.
            feat[dist_col_out] = fill_missing_distance_m
            return None

        # Carry only the geometry (plus requested attributes) through the join.
        keep = ['geometry']
        if take_attrs:
            keep.extend(c for c in attr_cols if c in m_sub.columns)

        joined = gpd.sjoin_nearest(
            pts_proj[['pt_id', 'geometry']],
            m_sub[keep].copy(),
            how='left',
            distance_col=dist_col_out,
        )
        # Equidistant facilities yield several rows per point; keep the first.
        joined = joined.drop_duplicates('pt_id').set_index('pt_id')

        feat[dist_col_out] = pd.to_numeric(joined[dist_col_out], errors='coerce')
        return joined

    def _add_threshold_and_log(dist_col: str, flag_col: str, log_col: str, threshold_m: int) -> None:
        """Derive the within-threshold flag and optional log1p column from a distance column."""
        if dist_col not in feat.columns:
            return
        feat[flag_col] = (feat[dist_col] <= threshold_m).astype('int8')
        if add_log_features:
            feat[log_col] = np.log1p(feat[dist_col])

    # hospitals (1): optionally keep the matched rows for attribute lookup
    j_hosp = None
    if 1 in use_types:
        j_hosp = _nearest_for_type(1, 'dist_hospital_m', add_nearest_attributes)

    # clinics (2): distance only
    if 2 in use_types:
        _nearest_for_type(2, 'dist_clinic_m', False)

    # Fill distances the join could not determine.
    for c in ('dist_hospital_m', 'dist_clinic_m'):
        if c in feat.columns:
            feat[c] = feat[c].fillna(fill_missing_distance_m)

    # --- threshold flags and log distances ---
    _add_threshold_and_log('dist_hospital_m', 'hospital_1km', 'dist_hospital_log', hospital_threshold_m)
    _add_threshold_and_log('dist_clinic_m', 'clinic_500m', 'dist_clinic_log', clinic_threshold_m)

    # --- attributes of the nearest hospital (optional) ---
    if add_nearest_attributes and j_hosp is not None:
        # bed count; unparsable or missing values become 0
        if beds_col in j_hosp.columns:
            feat['hospital_beds_nearest'] = pd.to_numeric(j_hosp[beds_col], errors='coerce').fillna(0.0)

        # emergency designation: code 1 -> 1, anything else -> 0
        if emergency_col in j_hosp.columns:
            e = pd.to_numeric(j_hosp[emergency_col], errors='coerce')
            feat['hospital_is_emergency_nearest'] = (e == 1).astype('int8')

        # disaster base hospital: codes 1 and 2 -> 1, anything else -> 0
        if disaster_col in j_hosp.columns:
            d = pd.to_numeric(j_hosp[disaster_col], errors='coerce')
            feat['hospital_is_disaster_nearest'] = (d.isin([1, 2])).astype('int8')

    base = out.reset_index(drop=True)
    # join() raises on overlapping column names, which happens as soon as this
    # function is applied twice to the same frame (e.g. a re-run notebook
    # cell). Drop the stale copies so the freshly computed values replace them.
    stale = [c for c in feat.columns if c in base.columns]
    if stale:
        base = base.drop(columns=stale)

    out2 = base.join(feat, how='left')
    out2.index = out.index
    return out2


In [80]:
# Append hospital (type 1) and clinic (type 2) proximity features to both splits.
# Each call returns a new frame, which is rebound to the same variable name.
train_df_geo = add_medical_features_from_gdf(train_df_geo, hospital_df, use_types=(1, 2))
test_df_geo  = add_medical_features_from_gdf(test_df_geo,  hospital_df, use_types=(1, 2))


## 特徴量の追加・削除

In [81]:
# List the columns currently present on the training frame.
train_df_geo.columns

Index(['target_ym', 'building_id', 'unit_id', 'lat', 'lon', 'target_year',
       'lon_jgd', 'lat_jgd', 'nearest_land_price', 'distance_to_landpoint_m',
       ...
       'dist_junior_school_log', 'dist_hospital_m', 'dist_clinic_m',
       'hospital_1km', 'dist_hospital_log', 'clinic_500m', 'dist_clinic_log',
       'hospital_beds_nearest', 'hospital_is_emergency_nearest',
       'hospital_is_disaster_nearest'],
      dtype='object', length=107)

In [82]:
# Feature columns written out together with the primary keys, grouped by theme.
add_cols = (
    # land-price columns
    [
        'nearest_land_price', 'weighted_land_price_3', 'distance_to_landpoint_m',
        'log_land_price', 'log_weighted_land_price_3',
        'land_price_yoy_nearest', 'land_price_yoy_w3',
        'land_price_dlog_nearest', 'land_price_dlog_w3',
    ]
    # population columns
    + [
        'PTN_2020_nn',
        'RTA_2025_nn', 'RTB_2025_nn', 'RTC_2025_nn', 'RTD_2025_nn', 'RTE_2025_nn',
        'pop_trend_rate_nn',
    ]
    # road columns
    + [
        'road_len_total', 'road_len_wide', 'road_len_narrow',
        'road_wide_ratio', 'road_narrow_ratio',
        'road_len_total_gap', 'road_narrow_ratio_gap',
        'road_len_density', 'road_len_density_gap',
        'dist_to_road_any_m', 'dist_to_road_major_m', 'dist_to_road_highway_m',
        'road_cnt_any_in_100m', 'road_cnt_major_in_100m',
        'road_cnt_any_in_300m', 'road_cnt_major_in_300m',
        'road_cnt_any_in_500m', 'road_cnt_major_in_500m',
    ]
    # zoning and urban-planning columns
    + [
        'zone_group', 'zone_residential_rank',
        'is_lowrise_residential', 'kenpei', 'youseki',
        'is_urbanized_area', 'is_urban_control_area', 'is_fireproof_area',
        'is_quasi_fireproof_area', 'has_height_limit', 'has_district_plan',
        'is_land_readjustment_area', 'is_high_utilization_area',
        'is_urban_renaissance_area', 'is_special_far_area',
        'is_highrise_residential_area', 'is_disaster_prevention_block',
        'is_redevelopment_core_area',
    ]
    # elevation and slope columns
    + ['elev_mean', 'elev_range', 'slope_mean', 'slope_max', 'slope_range']
    # station columns
    + ['station_power_sum3', 'station_power_max3']
    # disaster columns
    + [
        'disaster_hit', 'disaster_n_types',
        'disaster_score_sum', 'disaster_score_max', 'disaster_score',
    ]
    # school columns
    + [
        'elem_school_500m', 'junior_school_1km',
        'dist_elem_school_m', 'dist_junior_school_m',
    ]
    # medical-facility columns
    + [
        'dist_hospital_m', 'dist_clinic_m',
        'hospital_1km', 'dist_hospital_log', 'clinic_500m', 'dist_clinic_log',
        'hospital_beds_nearest', 'hospital_is_emergency_nearest',
        'hospital_is_disaster_nearest',
    ]
)

## 出力

In [83]:
# Persist the primary keys plus the selected GIS features of each split,
# with the feature-set version embedded in the file name.
train_df_geo.loc[:, pkey_cols + add_cols].to_parquet(
    f'{intermediate_path}train_df_geo_v{geo_ver}.parquet'
)
test_df_geo.loc[:, pkey_cols + add_cols].to_parquet(
    f'{intermediate_path}test_df_geo_v{geo_ver}.parquet'
)

In [84]:
# Display the training frame with all GIS features attached.
train_df_geo

Unnamed: 0,target_ym,building_id,unit_id,lat,lon,target_year,lon_jgd,lat_jgd,nearest_land_price,distance_to_landpoint_m,log_land_price,weighted_land_price_3,log_weighted_land_price_3,nearest_land_price_prev,weighted_land_price_3_prev,land_price_yoy_nearest,land_price_yoy_w3,land_price_dlog_nearest,land_price_dlog_w3,pop_dist_nn_m,PTN_2020_nn,RTA_2025_nn,RTB_2025_nn,RTC_2025_nn,RTD_2025_nn,RTE_2025_nn,pop_trend_rate_nn,road_len_total,road_len_wide,road_len_narrow,road_wide_ratio,road_narrow_ratio,road_len_total_gap,road_narrow_ratio_gap,road_len_density,road_len_density_gap,dist_to_road_any_m,dist_to_road_major_m,dist_to_road_highway_m,road_cnt_any_in_100m,road_cnt_major_in_100m,road_cnt_any_in_300m,road_cnt_major_in_300m,road_cnt_any_in_500m,road_cnt_major_in_500m,A29_005,zone_group,zone_residential_rank,is_lowrise_residential,kenpei,youseki,kenpei_missing,youseki_missing,kenpei_bin,youseki_bin,density_cat,is_urbanized_area,is_urban_control_area,is_fireproof_area,is_quasi_fireproof_area,has_height_limit,has_district_plan,is_land_readjustment_area,is_high_utilization_area,is_urban_renaissance_area,is_special_far_area,is_highrise_residential_area,is_disaster_prevention_block,is_redevelopment_core_area,index_right,G04c_001,G04c_002,G04c_003,G04c_004,G04c_005,G04c_006,G04c_007,G04c_008,G04c_009,G04c_010,elev_mean,elev_range,slope_mean,slope_max,slope_range,station_power_sum3,station_power_max3,disaster_hit,disaster_n_types,disaster_score_sum,disaster_score_max,disaster_score,dist_elem_school_m,dist_junior_school_m,elem_school_500m,junior_school_1km,dist_elem_school_log,dist_junior_school_log,dist_hospital_m,dist_clinic_m,hospital_1km,dist_hospital_log,clinic_500m,dist_clinic_log,hospital_beds_nearest,hospital_is_emergency_nearest,hospital_is_disaster_nearest
0,201901,206271,262186,35.047688,136.637467,2019,136.637467,35.047688,41600.0,565.866905,10.635879,35863.705402,10.487509,41700.0,35938.445496,-0.002398,-0.002080,-0.002401,-0.002082,603.955933,3650.2078,0.1077,0.6470,0.2453,0.1016,0.0442,-0.003442,36905.0,0.0,14.0,0.0,0.000379,18674.492188,0.000051,34935.277344,17677.650391,287.406830,287.406830,287.406830,0,0,2,2,8,8,第一種低層住居専用地域,residential,1,1.0,50.0,80.0,False,False,41-50,<=80,第一種低層住居専用地域,1,0,0,0,0,0,0,0,0,0,0,0,0,485223,523645504,45.7,83.5,29.4,0,3.7,2,2.9,1,3.3,45.7,54.1,3.3,3.7,0.8,3.687164,1.455842,0,0,0.0,0.0,0.0,1772.954394,1548.944805,0,0,7.480966,7.345975,5364.808315,910.930040,0,8.587802,0,6.815563,38,0,0
1,201901,83315,35726,35.074625,136.639936,2019,136.639936,35.074625,62900.0,282.588689,11.049317,63123.013167,11.052857,62700.0,62910.132474,0.003190,0.003384,0.003185,0.003378,546.340942,3052.2065,0.1630,0.6040,0.2330,0.1149,0.0671,-0.002776,30370.0,0.0,0.0,0.0,0.000000,-7195.054688,-0.000265,28758.222656,-6813.695312,535.973999,535.973999,535.973999,0,0,0,0,0,0,第一種低層住居専用地域,residential,1,1.0,50.0,80.0,False,False,41-50,<=80,第一種低層住居専用地域,1,0,0,0,0,0,0,0,0,0,0,0,0,486066,523645813,67.1,87.6,36.5,0,1.4,4,0.5,3,0.9,67.1,51.1,0.9,1.4,0.9,4.446455,2.402885,0,0,0.0,0.0,0.0,205.684645,921.704436,1,1,5.331194,6.827309,3630.268787,178.575020,0,8.197337,1,5.190593,242,0,0
2,201901,140201,116820,35.072248,136.644708,2019,136.644708,35.072248,62900.0,672.823701,11.049317,61908.416601,11.033428,62700.0,61775.041812,0.003190,0.002159,0.003185,0.002157,179.813858,3052.2065,0.1630,0.6040,0.2330,0.1149,0.0671,-0.002776,30370.0,0.0,0.0,0.0,0.000000,-7195.054688,-0.000265,28758.222656,-6813.695312,39.685833,39.685833,39.685833,2,2,2,2,2,2,第一種低層住居専用地域,residential,1,1.0,50.0,80.0,False,False,41-50,<=80,第一種低層住居専用地域,1,0,0,0,0,0,0,0,0,0,0,0,0,486067,523645814,63.7,92.8,39.2,0,2.8,5,2.7,4,2.7,63.7,53.6,2.7,2.8,0.1,5.016984,2.094186,0,0,0.0,0.0,0.0,538.462714,1234.676359,0,0,6.290574,7.119374,4247.761062,623.508314,0,8.354383,0,6.436965,242,0,0
3,201901,216551,281648,35.003174,136.875602,2019,136.875602,35.003174,72300.0,398.845274,11.188593,59278.598858,10.990020,73000.0,59733.422240,-0.009589,-0.007614,-0.009635,-0.007643,527.635315,4622.2464,0.1268,0.6414,0.2317,0.1321,0.0768,-0.002956,26214.0,0.0,11.0,0.0,0.000420,2859.398438,0.000187,24810.251953,2705.990234,96.763718,96.763718,8279.000000,2,2,8,8,10,10,第一種住居地域,residential,3,0.0,60.0,200.0,False,False,51-60,151-200,第一種住居地域,1,0,0,1,0,0,0,0,0,0,0,0,0,493840,523647001,3.2,4.1,2.5,0,0.2,1,0.2,2,0.2,3.2,1.6,0.2,0.2,0.0,9.859783,5.949211,0,0,0.0,0.0,0.0,1296.417895,1389.614159,0,0,7.168131,7.237501,2689.760126,402.497049,0,7.897579,1,6.000169,468,1,1
4,201901,134968,85898,35.002403,136.875754,2019,136.875754,35.002403,72300.0,334.041803,11.188593,61048.493783,11.019440,73000.0,61540.828800,-0.009589,-0.008000,-0.009635,-0.008032,538.909485,4622.2464,0.1268,0.6414,0.2317,0.1321,0.0768,-0.002956,26214.0,0.0,11.0,0.0,0.000420,2859.398438,0.000187,24810.251953,2705.990234,22.526247,22.526247,8339.544922,2,2,8,8,10,10,第一種住居地域,residential,3,0.0,60.0,200.0,False,False,51-60,151-200,第一種住居地域,1,0,0,1,0,0,0,0,0,0,0,0,0,493840,523647001,3.2,4.1,2.5,0,0.2,1,0.2,2,0.2,3.2,1.6,0.2,0.2,0.0,10.234758,6.473120,0,0,0.0,0.0,0.0,1195.125068,1297.280226,0,0,7.086843,7.168796,2653.722347,358.974671,0,7.884095,1,5.886034,468,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363919,202207,182260,153084,35.066899,136.688936,2022,136.688936,35.066899,94800.0,89.329206,11.459535,106974.654245,11.580357,95500.0,107382.633875,-0.007330,-0.003799,-0.007357,-0.003807,619.723694,3457.4671,0.0861,0.6020,0.3119,0.1711,0.1078,-0.007279,20800.0,0.0,7.0,0.0,0.000337,-451.107422,-0.000008,19697.392578,-426.435547,204.324402,204.324402,2929.297852,0,0,6,6,12,12,商業地域,commercial,0,0.0,80.0,400.0,False,False,>=71,>=301,商業地域,1,0,0,0,0,0,0,0,0,0,0,0,0,486080,523645851,2.5,4.9,-3.4,0,0.4,7,0.1,1,0.3,2.5,8.3,0.3,0.4,0.3,13.125707,6.354506,0,0,0.0,0.0,0.0,85.958991,898.916138,1,1,4.465437,6.802302,317.486636,59.226350,1,5.763581,1,4.098110,400,1,1
363920,202207,118825,175430,35.064552,136.685451,2022,136.685451,35.064552,147000.0,307.960530,11.898195,117161.684920,11.671319,147000.0,117208.660974,0.000000,-0.000401,0.000000,-0.000401,446.156494,5646.3125,0.1300,0.5795,0.2905,0.1643,0.0995,-0.005952,32753.0,0.0,12.0,0.0,0.000366,3245.701172,0.000111,31013.142578,3073.242188,43.656971,43.656971,3011.305176,2,2,4,4,10,10,商業地域,commercial,0,0.0,80.0,400.0,False,False,>=71,>=301,商業地域,1,0,0,0,0,0,0,0,0,0,0,0,0,485799,523645744,2.3,4.8,0.7,0,0.4,3,0.1,6,0.3,2.3,4.1,0.3,0.4,0.3,17.118138,7.585616,0,0,0.0,0.0,0.0,461.973808,431.565330,1,1,6.137670,6.069733,189.780051,103.787641,1,5.251121,1,4.651936,400,1,1
363921,202207,92766,314093,35.060995,136.686671,2022,136.686671,35.060995,78200.0,289.757745,11.267038,94946.390229,11.461078,78100.0,94948.090528,0.001280,-0.000018,0.001280,-0.000018,522.387695,5646.3125,0.1300,0.5795,0.2905,0.1643,0.0995,-0.005952,32753.0,0.0,12.0,0.0,0.000366,3245.701172,0.000111,31013.142578,3073.242188,150.160873,150.160873,3417.984375,0,0,4,4,10,10,工業地域,industrial,0,0.0,60.0,200.0,False,False,51-60,151-200,工業地域,1,0,0,0,0,0,0,0,0,0,0,0,0,485797,523645742,2.0,4.5,0.6,0,0.4,3,0.1,3,0.3,2.0,3.9,0.3,0.4,0.3,13.312467,5.082789,0,0,0.0,0.0,0.0,764.198934,286.377985,0,1,6.640136,5.660798,161.605356,159.717361,1,5.091326,1,5.079647,104,1,0
363922,202207,212417,170589,34.933236,136.851469,2022,136.851469,34.933236,56300.0,593.864723,10.938468,45064.048776,10.715862,56900.0,45522.661886,-0.010545,-0.010074,-0.010601,-0.010125,628.675049,1352.4896,0.1065,0.5600,0.3334,0.1912,0.1146,-0.003980,22235.0,0.0,6.0,0.0,0.000270,6144.881836,-0.000008,21024.705078,5810.166016,1530.549072,1530.549072,16081.683594,0,0,0,0,0,0,第一種低層住居専用地域,residential,1,1.0,60.0,100.0,False,False,51-60,81-100,第一種低層住居専用地域,1,0,0,0,0,0,0,0,0,0,0,0,0,481830,523636183,11.7,25.4,5.1,0,1.8,8,0.3,8,1.1,11.7,20.3,1.1,1.8,1.5,1.982090,0.850048,0,0,0.0,0.0,0.0,625.849414,1528.529692,0,0,6.440706,7.332716,2704.885717,315.055802,0,7.903185,1,5.755919,28,0,0
