## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib

import lightgbm as lgb

import gc

import warnings
warnings.filterwarnings('ignore')

In [2]:
# 自身がファイルを格納したディレクトリを指定
ROOT_DIR = '../input/'
train_file_path = ROOT_DIR + 'train.csv'
test_file_path = ROOT_DIR + 'test.csv'
data_definition_path = ROOT_DIR + 'data_definition.xlsx'
submit_file_path = ROOT_DIR + 'sample_submit.csv'
output_path = '../output/'

## File Import

In [3]:
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [4]:
data_definition = pd.ExcelFile(data_definition_path)
data_definition_df = pd.read_excel(data_definition_path, sheet_name=data_definition.sheet_names[0])

In [5]:
pk_cols = ['building_id', 'unit_id']
date_col = 'target_ym'
target_col = 'money_room'
fe_cols = list(data_definition_df[data_definition_df['fe_cols'] == 1]['本番データ特徴量名'])

In [6]:
print(fe_cols)

['building_type', 'lon', 'lat', 'building_structure', 'floor_count', 'year_built', 'reform_exterior', 'building_tag_id', 'room_floor', 'reform_wet_area', 'reform_interior', 'unit_tag_id', 'bukken_type', 'flg_investment', 'addr1_1', 'addr1_2', 'walk_distance1', 'house_area', 'flg_new', 'snapshot_window_angle', 'madori_number_all', 'madori_kind_all', 'statuses']


## EDA

#### データの分布

In [7]:
# 各データの概要を確認
display(train_df[fe_cols].head(3))
display(test_df[fe_cols].head(3))

Unnamed: 0,building_type,lon,lat,building_structure,floor_count,year_built,reform_exterior,building_tag_id,room_floor,reform_wet_area,...,flg_investment,addr1_1,addr1_2,walk_distance1,house_area,flg_new,snapshot_window_angle,madori_number_all,madori_kind_all,statuses
0,4,136.637467,35.047688,1.0,2.0,199204.0,,210202/210301/210101,,1/2/3/4,...,,24,205,1840.0,106.82,0.0,,4,50,210101/220701/220601/230401/310501/210301/210202
1,4,136.639936,35.074625,10.0,2.0,198108.0,2.0,330501/210301/210101/210201,,1/2/3/4,...,,24,205,1920.0,134.04,0.0,,4,50,210101/220701/220601/220801/230601/250201/2103...
2,4,136.644708,35.072248,1.0,2.0,199506.0,2.0,210201/330501/334101/210101/210301/340301,,1/2/3/4,...,,24,205,2000.0,114.59,0.0,,4,50,210101/220701/220601/230401/220801/310501/2306...


Unnamed: 0,building_type,lon,lat,building_structure,floor_count,year_built,reform_exterior,building_tag_id,room_floor,reform_wet_area,...,flg_investment,addr1_1,addr1_2,walk_distance1,house_area,flg_new,snapshot_window_angle,madori_number_all,madori_kind_all,statuses
0,1,136.688153,35.072193,5.0,14.0,199510.0,,210201/321001/210101/320101/210301,4.0,,...,0.0,24,205,887.0,70,0,5.0,3,50,253501/220301/210101/340102/290401/220701/2206...
1,4,136.673603,35.066061,1.0,2.0,199206.0,,210201/210101/210301,,,...,,24,205,880.0,171,0,,6,50,210101/290501/210201/210301/250201
2,4,136.854324,34.937964,10.0,2.0,197511.0,2/1,343401/323401/210101/210202/210301,,1/2/3/4/5,...,,23,224,2800.0,78,0,5.0,3,50,230203/210101/220401/220701/290601/220601/2901...


In [8]:
display(train_df[fe_cols].info())
display(test_df[fe_cols].info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363924 entries, 0 to 363923
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   building_type          363924 non-null  int64  
 1   lon                    363924 non-null  float64
 2   lat                    363924 non-null  float64
 3   building_structure     348475 non-null  float64
 4   floor_count            362627 non-null  float64
 5   year_built             349238 non-null  float64
 6   reform_exterior        16683 non-null   object 
 7   building_tag_id        335779 non-null  object 
 8   room_floor             198452 non-null  float64
 9   reform_wet_area        82921 non-null   object 
 10  reform_interior        86442 non-null   object 
 11  unit_tag_id            300676 non-null  object 
 12  bukken_type            363924 non-null  int64  
 13  flg_investment         186971 non-null  float64
 14  addr1_1                363924 non-nu

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112437 entries, 0 to 112436
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   building_type          112437 non-null  int64  
 1   lon                    112437 non-null  float64
 2   lat                    112437 non-null  float64
 3   building_structure     107428 non-null  float64
 4   floor_count            112075 non-null  float64
 5   year_built             107618 non-null  float64
 6   reform_exterior        6153 non-null    object 
 7   building_tag_id        104491 non-null  object 
 8   room_floor             59523 non-null   float64
 9   reform_wet_area        30953 non-null   object 
 10  reform_interior        31782 non-null   object 
 11  unit_tag_id            97067 non-null   object 
 12  bukken_type            112437 non-null  int64  
 13  flg_investment         59657 non-null   float64
 14  addr1_1                112437 non-nu

None

In [9]:
display(train_df[fe_cols].describe())
display(test_df[fe_cols].describe())

Unnamed: 0,building_type,lon,lat,building_structure,floor_count,year_built,room_floor,bukken_type,flg_investment,addr1_1,addr1_2,walk_distance1,house_area,flg_new,snapshot_window_angle,madori_number_all,madori_kind_all
count,363924.0,363924.0,363924.0,348475.0,362627.0,349238.0,198452.0,363924.0,186971.0,363924.0,363924.0,356199.0,363924.0,363922.0,245767.0,363924.0,363924.0
mean,41.944411,137.461207,35.300902,3.231256,6.117669,199347.544798,5.202754,1256.575681,0.02087,19.961885,168.618643,1017.114531,90.369974,0.003493,4.843738,3.215479,46.1934
std,194.578871,2.979387,1.625129,2.072058,6.205453,1440.23109,4.249281,49.79026,0.143322,10.046767,56.900927,1397.271492,1501.318709,0.058994,1.376582,1.326026,9.984417
min,1.0,127.656334,26.090097,0.0,0.0,150001.0,-2.0,1202.0,0.0,1.0,101.0,1.0,1.0,0.0,0.0,1.0,10.0
25%,1.0,135.519223,34.729234,1.0,2.0,198211.0,2.0,1202.0,0.0,13.0,111.0,400.0,63.73,0.0,4.0,2.0,50.0
50%,1.0,139.314683,35.416999,4.0,5.0,199403.0,4.0,1302.0,0.0,14.0,201.0,720.0,78.7,0.0,5.0,3.0,50.0
75%,4.0,139.699853,35.726763,4.0,9.0,200407.0,7.0,1302.0,0.0,27.0,208.0,1200.0,101.64,0.0,6.0,4.0,50.0
max,999.0,144.444086,43.856396,12.0,980.0,203407.0,58.0,1302.0,2.0,47.0,621.0,99999.0,880084.0,1.0,8.0,63.0,99.0


Unnamed: 0,building_type,lon,lat,building_structure,floor_count,year_built,room_floor,bukken_type,flg_investment,addr1_1,addr1_2,walk_distance1,house_area,flg_new,snapshot_window_angle,madori_number_all,madori_kind_all
count,112437.0,112437.0,112437.0,107428.0,112075.0,107618.0,59523.0,112437.0,59657.0,112437.0,112437.0,110112.0,112437.0,112437.0,76555.0,112437.0,112437.0
mean,44.950852,137.326992,35.292542,3.198803,6.043212,199373.707623,5.240932,1254.958546,0.030139,20.406379,168.62499,1037.820546,86.726923,0.002472,4.855699,3.214351,46.541975
std,201.439241,3.011793,1.681197,2.092509,5.532613,1493.254615,4.081418,49.912615,0.171461,10.176881,56.043161,1293.844805,72.777883,0.049663,1.360397,1.346485,9.697361
min,1.0,127.655876,26.091937,1.0,0.0,186801.0,-1.0,1202.0,0.0,1.0,101.0,1.0,9.0,0.0,0.0,1.0,10.0
25%,1.0,135.499895,34.714576,1.0,2.0,198209.0,2.0,1202.0,0.0,13.0,111.0,400.0,63.0,0.0,4.0,2.0,50.0
50%,1.0,139.01398,35.359866,4.0,4.0,199403.0,4.0,1302.0,0.0,15.0,201.0,720.0,78.0,0.0,5.0,3.0,50.0
75%,4.0,139.694098,35.721244,4.0,9.0,200411.0,7.0,1302.0,0.0,27.0,208.0,1280.0,102.0,0.0,6.0,4.0,50.0
max,999.0,144.432181,44.116951,12.0,62.0,220211.0,53.0,1302.0,2.0,47.0,621.0,99999.0,12105.0,1.0,8.0,44.0,99.0


#### 目的変数の分布

In [10]:
# plt.figure(figsize=(8,4))
# plt.hist(train_df[target_col], bins=30)
# plt.xlabel("money_room")
# plt.ylabel("count")
# plt.title("Distribution of money_room")
# plt.tight_layout()
# plt.show()


#### 説明変数の分布

In [11]:
# # 数値型はヒストグラムで可視化
# for col in fe_cols:
#     plt.figure(figsize=(5, 4))
#     sns.histplot(train_df[col], kde=True, bins=30)
#     plt.title(f"{col}のヒストグラム")
#     plt.tight_layout()
#     plt.show()

#### 説明変数と目的変数の関係

In [12]:
# # --- 相関行列 ---
# cols = fe_cols + [target_col]
# corr = train_df[cols].corr()

# # --- ヒートマップ ---
# plt.figure(figsize=(10, 8))
# sns.heatmap(
#     corr,
#     annot=True,        # セルに相関係数を表示（不要なら False）
#     fmt=".2f",
#     cmap="coolwarm",   # カラースケール
#     vmin=-1, vmax=1,
#     linewidths=0.5,
#     square=True
# )

# plt.title("Correlation Heatmap (including y)", fontsize=14)
# plt.tight_layout()
# plt.show()

## 前処理

#### アドレス情報の置換

In [13]:
codes = pd.read_excel(f"{ROOT_DIR}/data_definition.xlsx", sheet_name=data_definition.sheet_names[3])
codes.columns = ['No.', 'addr1_1', 'addr1_2', 'Prefecture name',
       'City/town/village name']
codes = codes[['addr1_1', 'addr1_2', 'Prefecture name',
       'City/town/village name']]

train_df = pd.merge(train_df, codes, on=['addr1_1', 'addr1_2'], how='inner')
test_df = pd.merge(test_df, codes, on=['addr1_1', 'addr1_2'], how='inner')

In [14]:
train_df.drop(['addr1_1', 'addr1_2'], axis=1, inplace=True)
test_df.drop(['addr1_1', 'addr1_2'], axis=1, inplace=True)
del codes

In [15]:
fe_cols += ['Prefecture name', 'City/town/village name']

# 削除する特徴量
remove_cols = ['addr1_1', 'addr1_2']
fe_cols = [c for c in fe_cols if c not in remove_cols]

#### 築年数の置換

In [16]:
def parse_year(date_input):
    try:
        s = str(date_input)
        if len(s) < 4:
            return np.nan
        return int(s[:4])
    except:
        return np.nan

def add_age_features(df):
    # 元の year_built と target_ym の年だけ抽出
    df['built_year']  = df['year_built'].apply(parse_year)
    df['target_year'] = df['target_ym'].apply(parse_year)

    # 築年数 = 対象年 − 建築年
    df['built_diff'] = df['target_year'] - df['built_year']

    # 築年数がマイナスになることはありえないので NaN に修正
    df.loc[df['built_diff'] < 0, 'built_diff'] = np.nan

    return df

In [17]:
train_df = add_age_features(train_df)
test_df  = add_age_features(test_df)

fe_cols += ['built_diff']

#### 異常値の置換

部屋数を８部屋未満を正常値として、NaNは同じ建物内の最頻値で補完する

In [18]:
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

df['madori_number_clean'] = df['madori_number_all'].where(
    (df['madori_number_all'] >= 1) &
    (df['madori_number_all'] <= 7)
)

df['madori_number_all'] = (
    df.groupby('building_id')['madori_number_clean']
      .transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

train_df = df.iloc[:len(train_df)]
test_df  = df.iloc[len(train_df):]

面積関係

In [19]:
# --- 結合 ---
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

df['house_area_clean'] = df['house_area'].where(
    (df['house_area'] >= 10) & (df['house_area'] <= 300)
)

df['snapshot_land_area_clean'] = df['snapshot_land_area'].where(
    (df['snapshot_land_area'] >= 10) &
    (df['snapshot_land_area'] <= 500)
)

# --- ② building_id ごとの中央値で補完 ---
df['house_area'] = (
    df.groupby('building_id')['house_area_clean']
      .transform(lambda x: x.median() if x.notna().sum() > 0 else np.nan)
)

df['snapshot_land_area'] = (
    df.groupby('building_id')['snapshot_land_area_clean']
      .transform(lambda x: x.median() if x.notna().sum() > 0 else np.nan)
)

# --- ③ train / test に分割 ---
train_df = df.iloc[:len(train_df)].copy()
test_df  = df.iloc[len(train_df):].copy()


部屋の階数が建物の階数より大きい場合は、建物の階数で置換

In [20]:
# --- 結合 ---
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# --- Step1: room_floor > floor_count の異常を検出 ---
mask = df['room_floor'] > df['floor_count']

# --- Step2: 置換（room_floor ← floor_count） ---
df.loc[mask, 'room_floor'] = df.loc[mask, 'floor_count']

# --- train / test に分割 ---
train_df = df.iloc[:len(train_df)].copy()
test_df  = df.iloc[len(train_df):].copy()


#### 欠損値補完

madori_kind_all

In [21]:
# --- 結合 ---
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# --- Step1: 有効コードのみ残す（その他は NaN） ---
valid_codes = [10, 20, 25, 30, 35, 40, 45, 50, 55]

df['madori_kind_clean'] = df['madori_kind_all'].where(
    df['madori_kind_all'].isin(valid_codes)
)

# --- Step2: building_id ごとの最頻値で補完 ---
df['madori_kind_all'] = (
    df.groupby('building_id')['madori_kind_clean']
      .transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

# --- Step3: train / test に分割 ---
train_df = df.iloc[:len(train_df)].copy()
test_df  = df.iloc[len(train_df):].copy()


floor_count

In [22]:
# --- 結合 ---
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# --- Step1: 異常値（62階超）を NaN にする ---
df['floor_count_clean'] = df['floor_count'].where(
    df['floor_count'] <= 62
)

# --- Step2: building_id ごとの最頻値（mode）で補完 ---
df['floor_count'] = (
    df.groupby('building_id')['floor_count_clean']
      .transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

# --- Step3: 欠損フラグ（めちゃ効く） ---
# df['floor_count_missing'] = df['floor_count'].isna().astype(int)

# --- Step4: train / test に分割 ---
train_df = df.iloc[:len(train_df)].copy()
test_df  = df.iloc[len(train_df):].copy()


building_type

In [23]:
# --- 結合 ---
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# --- Step1: building_type を 1→0, 3→1 に変換（それ以外は NaN） ---
df['building_type'] = df['building_type'].map({1: 0, 3: 1})

# --- Step2: building_id ごとの最頻値で補完 ---
df['building_type'] = (
    df.groupby('building_id')['building_type']
      .transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

# --- Step3: 欠損フラグ（効く） ---
# df['building_type_missing'] = df['building_type'].isna().astype(int)

# --- train / test に分割 ---
train_df = df.iloc[:len(train_df)].copy()
test_df  = df.iloc[len(train_df):].copy()


#### カテゴリ型の変換

bukken_type

In [24]:
train_df['bukken_type'] = train_df['bukken_type'].map({1202: 0, 1302: 1})
test_df['bukken_type'] = test_df['bukken_type'].map({1202: 0, 1302: 1})

アドレス情報

In [25]:
adress_cols = ['Prefecture name', 'City/town/village name']

global_mean = train_df[target_col].mean()

for col in adress_cols:

    # Step1: train でカテゴリごとの平均を計算
    mapping = train_df.groupby(col)[target_col].mean()

    # Step2: train に map を適用
    train_df[col + '_te'] = train_df[col].map(mapping)

    # Step3: test にも map を適用（未知カテゴリは global_mean）
    test_df[col + '_te'] = test_df[col].map(mapping).fillna(global_mean)

fe_cols += ['Prefecture name_te', 'City/town/village name_te']

In [26]:
for c in adress_cols:
    train_df[c] = train_df[c].astype('category')
    test_df[c]  = test_df[c].astype('category')

cat_cols = adress_cols

reform関連

In [27]:
def get_slashed_tags(df, cols_list):
    """スラッシュ区切り列を 0/1 の int8 フラグ列に分解する"""
    temp_dfs = []
    for col in cols_list:

        temp_df = df[col].str.get_dummies(sep="/")
        # if is_tag_master:
        #     temp_df.rename(columns=tag_master, inplace=True)
        temp_df = temp_df.add_prefix(f"{col} ")
        temp_df = temp_df.astype('int8')

        temp_dfs.append(temp_df)

    # すべて結合
    temp_dfs = pd.concat(temp_dfs, axis=1).astype('int8')
    return temp_dfs

In [28]:
# --- train + test を結合 ---
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# --- 新しいタグ列を生成 ---
slashed_cols = ["reform_interior", "reform_exterior", "reform_wet_area"]
slashed_df = get_slashed_tags(combined_df, slashed_cols)

In [29]:
reform_cols = slashed_df.columns.tolist()

# --- 元 DF に結合 ---
combined_df = pd.concat([combined_df, slashed_df], axis=1)

# --- スラッシュ区切り列を削除 ---
combined_df = combined_df.drop(columns=slashed_cols)

# --- 再分割 ---
train_df = combined_df.iloc[:len(train_df)].copy()
test_df  = combined_df.iloc[len(train_df):].copy()

In [30]:
del slashed_df
gc.collect()

70

In [31]:
fe_cols += reform_cols
cat_cols += list(reform_cols)

fe_cols = [c for c in fe_cols if c not in slashed_cols]

#### FE

面積比

In [32]:
train_df['area_ratio'] = train_df['house_area'] / train_df['snapshot_land_area']
test_df['area_ratio'] = test_df['house_area'] / test_df['snapshot_land_area']

fe_cols += ['area_ratio']

相対階数

In [33]:
train_df['relative_floor'] = train_df['room_floor'] / train_df['floor_count']
test_df['relative_floor']  = test_df['room_floor'] / test_df['floor_count']

fe_cols += ['relative_floor']

密度

In [34]:
for df in [train_df, test_df]:
    # 1) 階数密度: 建物階数 / 建物面積
    df['floor_area_density'] = df['floor_count'] / df['house_area']
    df.loc[df['house_area'] <= 0, 'floor_area_density'] = np.nan

    # 2) 敷地あたり専有面積密度: 専有面積 / 区画面積
    df['unit_land_density'] = df['unit_area'] / df['snapshot_land_area']
    df.loc[df['snapshot_land_area'] <= 0, 'unit_land_density'] = np.nan

    # 3) 面積 / 部屋数: 1部屋あたり専有面積
    df['area_per_room'] = df['unit_area'] / df['madori_number_all']
    df.loc[df['madori_number_all'] <= 0, 'area_per_room'] = np.nan

fe_cols += [
    'floor_area_density',
    'unit_land_density',
    'area_per_room',
]

豪邸検出

In [35]:
for df in [train_df, test_df]:
    df['land_building_ratio'] = df['snapshot_land_area'] / df['house_area']
    df.loc[df['house_area'] <= 0, 'land_building_ratio'] = np.nan

fe_cols += ['land_building_ratio']

面積と築年の交互作用

In [36]:
for df in [train_df, test_df]:
    # 1) 建物面積 × 築年数（築浅・築古で傾きが変わるのを捉える）
    df['house_area_x_built_diff'] = df['house_area'] * df['built_diff']

    # 2) 専有面積 × 築年数
    df['unit_area_x_built_diff'] = df['unit_area'] * df['built_diff']

    # 3) 1部屋あたり面積 × 築年数
    #   → 同じ築年数でも「広くてゆとりのある間取り」のプレミアムを表現
    df['area_per_room_x_built_diff'] = df['area_per_room'] * df['built_diff']

fe_cols += [
    'house_area_x_built_diff',
    'unit_area_x_built_diff'
]

building_idごとの統合特徴量

In [37]:
len_train = len(train_df)

# --- train + test を結合 ---
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# --- 1) building_id ごとの median(house_area) ---
building_house_area_median = (
    combined_df.groupby("building_id")["house_area"]
               .median()
               .rename("building_house_area_median")
)

# --- 2) building_id ごとの max(room_floor) ---
building_room_floor_max = (
    combined_df.groupby("building_id")["room_floor"]
               .max()
               .rename("building_room_floor_max")
)

# --- 3) building_id ごとの unit_count（件数） ---
building_unit_count = (
    combined_df.groupby("building_id")["unit_id"]  # unit_id がなければ建物内 index をカウントでもOK
               .count()
               .rename("building_unit_count")
)

# --- まとめて結合 ---
combined_df = combined_df.join(building_house_area_median, on="building_id")
combined_df = combined_df.join(building_room_floor_max,   on="building_id")
combined_df = combined_df.join(building_unit_count,       on="building_id")

# --- 再び train / test に分割 ---
train_df = combined_df.iloc[:len_train].copy()  # 元の train 行数を使う
test_df  = combined_df.iloc[len_train:].copy()


In [38]:
fe_cols += [
    'building_house_area_median',
    'building_room_floor_max',
    'building_unit_count'
]

近傍価格特徴量

In [39]:
import numpy as np
from sklearn.neighbors import BallTree

target_col = 'money_room'

# --- 1. 学習データ・テストデータの座標をラジアンに変換 ---
# 既に lat / lon は float 前提
train_coords_rad = np.radians(train_df[['lat', 'lon']].values)
test_coords_rad  = np.radians(test_df[['lat', 'lon']].values)

# --- 2. BallTree を train から構築（haversine 距離） ---
# haversine は「球面距離」なので地球の半径を掛けて km に変換する
tree = BallTree(train_coords_rad, metric='haversine')

# 半径 1km をラジアンに変換
R_earth_km = 6371.0
radius_km = 1.0
radius_rad = radius_km / R_earth_km

y_train = train_df[target_col].values
n_train = len(train_df)
n_test  = len(test_df)


# --- 3. train 用の 1km 近傍特徴量 ---
mean_price_train = np.full(n_train, np.nan, dtype=float)
median_price_train = np.full(n_train, np.nan, dtype=float)
count_neighbors_train = np.zeros(n_train, dtype=int)

# 半径検索（train 自身に対して）
indices_array = tree.query_radius(train_coords_rad, r=radius_rad)

for i, idx in enumerate(indices_array):
    # 自分自身を除外
    idx = idx[idx != i]
    if len(idx) == 0:
        continue
    prices = y_train[idx]
    mean_price_train[i] = prices.mean()
    median_price_train[i] = np.median(prices)
    count_neighbors_train[i] = len(idx)

train_df['mean_price_within_1km']   = mean_price_train
train_df['median_price_within_1km'] = median_price_train
train_df['count_neighbors_1km']     = count_neighbors_train


# --- 4. test 用の 1km 近傍特徴量（近傍は train だけ） ---
mean_price_test = np.full(n_test, np.nan, dtype=float)
median_price_test = np.full(n_test, np.nan, dtype=float)
count_neighbors_test = np.zeros(n_test, dtype=int)

indices_array_test = tree.query_radius(test_coords_rad, r=radius_rad)

for i, idx in enumerate(indices_array_test):
    if len(idx) == 0:
        continue
    prices = y_train[idx]
    mean_price_test[i] = prices.mean()
    median_price_test[i] = np.median(prices)
    count_neighbors_test[i] = len(idx)

test_df['mean_price_within_1km']   = mean_price_test
test_df['median_price_within_1km'] = median_price_test
test_df['count_neighbors_1km']     = count_neighbors_test


# --- 5. NaN の埋め方（近傍が見つからなかった物件向け） ---
# とりあえず全体の中央値 or 市区町村単位中央値などで埋める
global_mean_price  = np.nanmean(y_train)
global_median_price = np.nanmedian(y_train)

for df in [train_df, test_df]:
    df['mean_price_within_1km'] = df['mean_price_within_1km'].fillna(global_mean_price)
    df['median_price_within_1km'] = df['median_price_within_1km'].fillna(global_median_price)
    # count_neighbors_1km は 0 のままでOK（近くに売買事例なしという情報）

In [40]:
fe_cols += [
    'mean_price_within_1km',
    'median_price_within_1km',
    'count_neighbors_1km',
   ]


In [41]:
import numpy as np
from sklearn.neighbors import BallTree

target_col = 'money_room'

# --- 位置情報（ラジアン） ---
train_coords_rad = np.radians(train_df[['lat', 'lon']].values.astype(float))
test_coords_rad  = np.radians(test_df[['lat', 'lon']].values.astype(float))

# --- BallTree 構築 ---
tree = BallTree(train_coords_rad, metric='haversine')
R_earth_km = 6371.0
radius_rad = 1.0 / R_earth_km  # 1km

y_train = train_df[target_col].to_numpy()
n_train = len(train_df)
n_test  = len(test_df)

# ---- 結果を格納する配列 ----
std_train = np.full(n_train, np.nan, dtype=float)
iqr_train = np.full(n_train, np.nan, dtype=float)

std_test = np.full(n_test, np.nan, dtype=float)
iqr_test = np.full(n_test, np.nan, dtype=float)

# --- train の近傍（インデックスのみ取得） ---
ind_array = tree.query_radius(
    train_coords_rad,
    r=radius_rad,
    return_distance=False,
    sort_results=False
)

for i, idx in enumerate(ind_array):
    idx = np.asarray(idx, dtype=int)
    # 自分自身を除外
    idx = idx[idx != i]

    if len(idx) <= 1:
        continue

    neigh_prices = y_train[idx]
    std_train[i] = np.std(neigh_prices)
    q75, q25 = np.percentile(neigh_prices, [75, 25])
    iqr_train[i] = q75 - q25

train_df['neighbor_price_std_1km'] = std_train
train_df['neighbor_price_iqr_1km'] = iqr_train

# --- test の近傍 ---
ind_array_test = tree.query_radius(
    test_coords_rad,
    r=radius_rad,
    return_distance=False,
    sort_results=False
)

for i, idx in enumerate(ind_array_test):
    idx = np.asarray(idx, dtype=int)
    if len(idx) <= 1:
        continue

    neigh_prices = y_train[idx]
    std_test[i] = np.std(neigh_prices)
    q75, q25 = np.percentile(neigh_prices, [75, 25])
    iqr_test[i] = q75 - q25

test_df['neighbor_price_std_1km'] = std_test
test_df['neighbor_price_iqr_1km'] = iqr_test

# --- 欠損埋め ---
global_std = np.nanmean(std_train)
global_iqr = np.nanmean(iqr_train)

for df in [train_df, test_df]:
    df['neighbor_price_std_1km'] = df['neighbor_price_std_1km'].fillna(global_std)
    df['neighbor_price_iqr_1km'] = df['neighbor_price_iqr_1km'].fillna(global_iqr)

In [42]:
fe_cols += [
    'neighbor_price_std_1km',
    'neighbor_price_iqr_1km'
]

緯度・経度を周期変換

In [43]:
# --- 緯度・経度をラジアンに ---
for df in [train_df, test_df]:
    df['lat_rad'] = np.radians(df['lat'].astype(float))
    df['lon_rad'] = np.radians(df['lon'].astype(float))

    # sin / cos 変換
    df['sin_lat'] = np.sin(df['lat_rad'])
    df['cos_lat'] = np.cos(df['lat_rad'])
    df['sin_lon'] = np.sin(df['lon_rad'])
    df['cos_lon'] = np.cos(df['lon_rad'])

In [44]:
fe_cols += [
    'sin_lat', 'cos_lat', 'sin_lon', 'cos_lon'
]

区町村ごとの価格中央値エンコーディング

In [45]:
city_col = 'City/town/village name'

# --- ① train + test を結合 ---
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# 既に city_lat/city_lon がある場合はいったん削除（任意）
for col in ['city_lat', 'city_lon']:
    if col in combined_df.columns:
        combined_df = combined_df.drop(columns=[col])

combined_df['lat'] = combined_df['lat'].astype(float)
combined_df['lon'] = combined_df['lon'].astype(float)

# --- ② 市区町村ごとの lat / lon の中央値 ---
city_lat_median = combined_df.groupby(city_col)['lat'].median()
city_lon_median = combined_df.groupby(city_col)['lon'].median()

# --- ③ 各レコードに city_lat / city_lon を付与 ---
combined_df['city_lat'] = combined_df[city_col].map(city_lat_median)
combined_df['city_lon'] = combined_df[city_col].map(city_lon_median)

# 型を float に統一
combined_df['city_lat'] = combined_df['city_lat'].astype('float')
combined_df['city_lon'] = combined_df['city_lon'].astype('float')

# --- ④ NaN を全体の中央値で埋める ---
combined_df['city_lat'] = combined_df['city_lat'].fillna(combined_df['lat'].median())
combined_df['city_lon'] = combined_df['city_lon'].fillna(combined_df['lon'].median())

# --- ⑤ 再分割 ---
train_df = combined_df.iloc[:len(train_df)].copy()
test_df  = combined_df.iloc[len(train_df):].copy()

fe_cols += ['city_lat', 'city_lon']

タグ情報

In [46]:
tag_info = pd.read_excel(f"{ROOT_DIR}/data_definition.xlsx", sheet_name=data_definition.sheet_names[2])
tag_info = tag_info[['タグID', 'タグ内容', 'タグ分類']]

facilities_info = pd.read_excel(f"{ROOT_DIR}/data_definition.xlsx", sheet_name=data_definition.sheet_names[4])
facilities_info = facilities_info[['タグID', 'タグ内容', 'タグ分類']]

In [47]:
tag_master = pd.concat([tag_info, facilities_info], axis=0, ignore_index=True).drop_duplicates()
tag_master['タグ情報'] = tag_master['タグ分類'] + '_' + tag_master['タグ内容']

In [48]:
tag_master["タグID"] = tag_master["タグID"].astype("str")
tag_master.set_index('タグID', inplace=True)
tag_master = tag_master.to_dict()['タグ情報']

In [49]:
combined_df = pd.concat([train_df, test_df], ignore_index=True)
tag_cols = ["building_tag_id", "unit_tag_id", "statuses"]

tag_dfs = []
for col in tag_cols:
    temp_df = combined_df[col].str.get_dummies(sep="/")
    temp_df.rename(columns=tag_master, inplace=True)
    temp_df = temp_df.astype('int8')

    tag_dfs.append(temp_df)

# すべて結合
tag_df = pd.concat(tag_dfs, axis=1).astype('int8')
tag_df = tag_df.groupby(level=0, axis=1).max()

In [50]:
tag_columns = tag_df.columns.drop_duplicates().tolist()

# --- 元 DF に結合 ---
combined_df = pd.concat([combined_df, tag_df], axis=1)

# --- スラッシュ区切り列を削除 ---
combined_df = combined_df.drop(columns=tag_cols)

# --- 再分割 ---
train_df = combined_df.iloc[:len(train_df)].copy()
test_df  = combined_df.iloc[len(train_df):].copy()

In [51]:
fe_cols += tag_columns
cat_cols += list(tag_columns)

タグ情報のPCA

In [52]:
from sklearn.decomposition import PCA

# --- 1) combined_df を作成 ---
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# --- 2) タグ列グループ ---
tag_groups = {
    "building":  [c for c in combined_df.columns if "building_facilities_tag" in c],
    "unit":      [c for c in combined_df.columns if "unit_facilities_tag" in c],
    "condition": [c for c in combined_df.columns if "condition_tag" in c],
    "location":  [c for c in combined_df.columns if "location_tag" in c],
    "reform":    [c for c in combined_df.columns if "reform_or_cert_tag" in c]
}

# --- 2.5) グループごとの PCA 次元数 ---
pca_dims = {
    "building": 10,
    "unit": 5,
    "condition": 5,
    "location": 2,
    "reform": 5
}

# --- 3) PCA + 累積寄与率を計算する関数 ---
def add_pca_features_and_report(df, cols, prefix, n_components):
    if len(cols) == 0:
        print(f"[SKIP] {prefix}: No columns")
        return df

    # 列数より多い成分数は指定できないので調整しておく
    n_components = min(n_components, len(cols))

    X = df[cols].fillna(0)

    pca = PCA(n_components=n_components, random_state=42)
    pca_features = pca.fit_transform(X)

    # 新しい PCA 列を追加
    for i in range(n_components):
        df[f"{prefix}_pca_{i+1}"] = pca_features[:, i]

    # 累積寄与率を計算
    explained = pca.explained_variance_ratio_
    cum_explained = explained.cumsum()

    # 表示
    print(f"\n=== {prefix} PCA Explained Variance (n_components={n_components}) ===")
    for i, (e, c) in enumerate(zip(explained, cum_explained), start=1):
        print(f"PC{i}: {e:.4f},  Cumulative: {c:.4f}")
    print("========================================\n")

    return df


# --- 4) 各グループに対して PCA + 累積寄与率を表示 ---
for prefix, cols in tag_groups.items():
    n_comp = pca_dims.get(prefix, 0)
    if n_comp <= 0:
        print(f"[SKIP] {prefix}: n_components <= 0")
        continue

    combined_df = add_pca_features_and_report(
        combined_df,
        cols,
        prefix,
        n_components=n_comp
    )

# --- 5) train/test に戻す ---
train_len = len(train_df)
train_df = combined_df.iloc[:train_len].copy()
test_df  = combined_df.iloc[train_len:].copy()


=== building PCA Explained Variance (n_components=10) ===
PC1: 0.4176,  Cumulative: 0.4176
PC2: 0.1014,  Cumulative: 0.5189
PC3: 0.0646,  Cumulative: 0.5836
PC4: 0.0538,  Cumulative: 0.6374
PC5: 0.0460,  Cumulative: 0.6834
PC6: 0.0383,  Cumulative: 0.7217
PC7: 0.0358,  Cumulative: 0.7576
PC8: 0.0339,  Cumulative: 0.7915
PC9: 0.0302,  Cumulative: 0.8217
PC10: 0.0272,  Cumulative: 0.8489


=== unit PCA Explained Variance (n_components=5) ===
PC1: 0.2836,  Cumulative: 0.2836
PC2: 0.0709,  Cumulative: 0.3544
PC3: 0.0512,  Cumulative: 0.4056
PC4: 0.0343,  Cumulative: 0.4399
PC5: 0.0305,  Cumulative: 0.4705


=== condition PCA Explained Variance (n_components=5) ===
PC1: 0.2296,  Cumulative: 0.2296
PC2: 0.1876,  Cumulative: 0.4172
PC3: 0.1533,  Cumulative: 0.5705
PC4: 0.1132,  Cumulative: 0.6837
PC5: 0.0959,  Cumulative: 0.7795


=== location PCA Explained Variance (n_components=2) ===
PC1: 0.8595,  Cumulative: 0.8595
PC2: 0.0802,  Cumulative: 0.9397


=== reform PCA Explained Variance (n_c

In [53]:
pca_cols = [c for c in train_df.columns if "pca" in c.lower()]

for c in pca_cols:
    if c not in fe_cols:
        fe_cols.append(c)

remove_cols = ['building_tag_id', 'unit_tag_id', 'statuses']
fe_cols = [c for c in fe_cols if c not in remove_cols]

タグのカウント

In [54]:
# --- train + test を結合 ---
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# --- building_tag / unit_tag / statuses の列名を抽出 ---
building_facilities_tag_cols = [c for c in combined_df.columns if "building_facilities_tag" in c]
unit_facilities_tag_cols = [c for c in combined_df.columns if "unit_facilities_tag" in c]
condition_tag_cols = [c for c in combined_df.columns if "condition_tag" in c]
location_tag_cols = [c for c in combined_df.columns if "location_tag" in c]
reform_or_cert_tag_cols = [c for c in combined_df.columns if "reform_or_cert_tag" in c]

combined_df["building_facilities_tag_count"] = combined_df[building_facilities_tag_cols].sum(axis=1)
combined_df["unit_facilities_tag_count"] = combined_df[unit_facilities_tag_cols].sum(axis=1)
combined_df["condition_tag_count"] = combined_df[condition_tag_cols].sum(axis=1)
combined_df["location_tag_count"] = combined_df[location_tag_cols].sum(axis=1)
combined_df["reform_or_cert_tag_count"] = combined_df[reform_or_cert_tag_cols].sum(axis=1)

# --- train / test に戻す ---
train_df = combined_df.iloc[:len(train_df)].copy()
test_df  = combined_df.iloc[len(train_df):].copy()

fe_cols += [
    'building_facilities_tag_count',
    'unit_facilities_tag_count',
    'condition_tag_count',
    'location_tag_count',
    'reform_or_cert_tag_count'
]


## モデル学習

In [65]:
X_train = train_df[fe_cols]
y_train = np.log1p(train_df[target_col]) # 対数変換しておく
X_test = test_df[fe_cols]

In [66]:
model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    random_state=42
)

In [67]:
model.fit(
    X_train, y_train,
    categorical_feature=cat_cols  # ここでカテゴリ列を指定
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13166
[LightGBM] [Info] Number of data points in the train set: 363924, number of used features: 248
[LightGBM] [Info] Start training from score 16.872850


In [68]:
train_pred = np.expm1(model.predict(X_train))

In [69]:
from sklearn.metrics import mean_absolute_percentage_error
mape_error = mean_absolute_percentage_error(train_df[target_col], train_pred)
print("MAPE Error ", mape_error)

MAPE Error  0.15799898879735255


In [None]:
# MAPE Error  0.1780570510913692 # 緯度・経度とタグ情報を追加（採用） v1(19.83923950055616)
# MAPE Error  0.17795096907718125 # 都市ごとの緯度・経度の中央値を追加（採用）
# MAPE Error  0.17800759884305892　# 地理的クラスターの追加（不採用）
# MAPE Error  0.16469126823980784 # 1km以内周辺物件の情報（採用）
# MAPE Error  0.16461797867958786 # 緯度・経度の周期変換（採用） v2(18.866599710537425)
# MAPE Error  0.16011902764460195 # 密度特徴量の追加（採用）
# MAPE Error  0.159923931097982 # 豪邸検出の追加（採用）
# MAPE Error  0.15952363257094107　# 近傍価格の分散の追加（採用）
# MAPE Error  0.1585378265235103 # 面積x築年の相互作用の追加（採用）
# MAPE Error  0.15787655400041972 # building_idごとの統合特徴量の追加（採用） v3(18.19490499503534)
# MAPE Error  0.15755766347358469 # タグのカウント（採用）
# MAPE Error  0.15727924172432833 # 単純な20個のPCA＋特徴量重要度2以下を削除（採用） v4(18.16084588910298)
# MAPE Error  0.15746610763865648 # 単純な20個のPCA＋特徴量重要度5以下を削除（不採用） v5(18.187523682194712)

# MAPE Error  0.15901899972838118 # ハイブリッドPCA＋特徴量重要度2以下を削除（採用） v1(18.16084588910298)
# MAPE Error  0.158931750174125 # ハイブリッドPCA(分割数分ける)＋特徴量重要度2以下を削除（採用）
# MAPE Error  0.15799898879735255 # ハイブリッドPCA(分割数分ける、OneHot残す)＋特徴量重要度2以下を削除（採用） v2(18.29665009423758)

## 特徴量重要度

In [60]:
feature_importance = pd.DataFrame({
        'feature_name': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

In [62]:
feature_importance = feature_importance[feature_importance["importance"]>0]

In [63]:
feature_importance.to_csv(f'{output_path}feature_importance.csv', index=False)

#### 特徴量重要度=<2を削除

In [64]:
low_importance_feats = feature_importance.query("importance <= 2")["feature_name"].tolist()
print(low_importance_feats)
print(len(low_importance_feats))

fe_cols = [c for c in fe_cols if c not in low_importance_feats]
cat_cols = [c for c in cat_cols if c not in low_importance_feats]

['unit_facilities_tag_CATV', 'condition_tag_二世帯住宅向き', 'unit_facilities_tag_シャワー付洗面化粧台', 'unit_facilities_tag_クローゼット', 'condition_pca_4', 'Prefecture name_te', 'unit_facilities_tag_count', 'bukken_type', 'reform_pca_4', 'reform_pca_2', 'unit_facilities_tag_浄水器・活水器', 'unit_facilities_tag_メゾネット', 'unit_facilities_tag_バス・トイレ別', 'condition_tag_count', 'building_facilities_tag_セキュリティー充実', 'building_facilities_tag_都市ガス', 'unit_facilities_tag_2階以上', 'condition_tag_分譲賃貸', 'condition_tag_二人入居可', 'unit_facilities_tag_室内洗濯機置場', 'condition_tag_事務所不可', 'cos_lon', 'reform_exterior 1', 'condition_tag_ルームシェア可', 'building_facilities_tag_防犯カメラ', 'unit_facilities_tag_エアコン', 'building_facilities_tag_オール電化', 'unit_facilities_tag_オートバス', 'unit_facilities_tag_シャワー', 'building_facilities_tag_ごみ出し24時間OK', 'city_lon']
31


## 予測

In [70]:
y_pred_row = model.predict(X_test)
y_pred = np.expm1(y_pred_row)

## 提出

In [71]:
submit_df = pd.read_csv(submit_file_path, header=None)
submit_df.columns = ['id', 'pred']

In [72]:
submit_df['pred'] = y_pred

In [73]:
submit_df.to_csv(
    f'{output_path}submit_20251203_v2.csv',
    index=False,
    header=False
)

## 追加するアクション一覧

- 重要都市までの距離
- 市区町村の人口密度
- 交通情報の拡充

#### 今後
- 市区町村の人口密度
- 交通情報の拡充
- 駅の緯度・経度