# 予測処理

## Library Import

In [22]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [23]:
# Directories where the input files and generated artifacts are stored
# (relative paths; adjust to your own layout).
ROOT_DIR = '../input/'
submit_file_path = ROOT_DIR + 'sample_submit.csv'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
pred_path = '../output/pred/'

# Version numbers used to build the artifact file names read/written below:
# fe_ver -> test_df_fe_v{fe_ver}.parquet, training_ver -> *_models_v{training_ver}.pkl,
# submit_ver -> submit_{today}_v{submit_ver}.csv
fe_ver = 3
training_ver = 3
submit_ver = 2

# Run date as YYYYMMDD, embedded in the submission file name.
today = dt.datetime.today().strftime("%Y%m%d")

## File Import

In [24]:
test_df = pd.read_parquet(f'{intermediate_path}test_df_fe_v{fe_ver}.parquet')

## モデルの読み込み

In [25]:
import pickle

# Load the four pickled model bundles saved as
# '<model_path><bundle name>_v<training_ver>.pkl'.
# NOTE: pickle.load executes arbitrary code from the file; only load bundles
# produced by your own training run.
_loaded_bundles = []
for _bundle_name in (
    'all_base_models',
    'all_low_models',
    'house_base_models',
    'house_low_models',
):
    with open(f'{model_path}{_bundle_name}_v{training_ver}.pkl', "rb") as f:
        _loaded_bundles.append(pickle.load(f))

(
    all_base_models_dict,
    all_low_models_dict,
    house_base_models_dict,
    house_low_models_dict,
) = _loaded_bundles

In [26]:
# Pull the per-fold model lists out of the loaded bundles.
# NOTE(review): house_low_models_dict is loaded above but nothing is unpacked
# from it in this notebook.
all_base_models = all_base_models_dict['all_base_models']
all_low_models = all_low_models_dict['all_low_models']
house_base_models = house_base_models_dict['house_base_models']

# Feature column lists stored alongside each set of models.
all_base_cols = all_base_models_dict['all_base_cols']
all_low_cols = all_low_models_dict['all_low_cols']
house_base_cols = house_base_models_dict['house_base_cols']

## 予測

#### カテゴリ型へ変更

In [27]:
# Columns that are explicitly cast to pandas 'category' dtype before prediction.
cat_cols = ['building_category', 'land_area_kind',
            'building_land_chimoku', 'land_chisei','land_road_cond', 'access_zone', 'fireproof_x_structure'
]

test_df[cat_cols] = test_df[cat_cols].astype('category')

In [28]:
# List of all remaining object-dtype columns (not the category columns);
# they are converted to 'category' dtype as well.
obj_cols = test_df.select_dtypes(['object']).columns.tolist()

test_df[obj_cols] = test_df[obj_cols].astype('category')

#### 関数

In [29]:
def predict_all_with_sigma_switch(
    df: pd.DataFrame,
    base_models: list,
    low_models: list,
    base_cols: list,
    low_cols: list,
    low_price_th: float,
    sigma_q: float = 0.80,
    sigma_th_fixed: float | None = None,
):
    """
    all モデル推論:
      - foldごとの base 予測から mu/sigma を算出
      - sigma の上位 q（=不確実性が高い）かつ mu が低価格帯のものだけ low に切替
      - 出力は price（exp）スケール
    """
    X_base = df[base_cols]
    X_low  = df[low_cols]

    # foldごとの予測をstack
    base_stack = []
    low_stack  = []
    for bm, lm in zip(base_models, low_models):
        base_stack.append(bm.predict(X_base))
        low_stack.append(lm.predict(X_low))

    base_stack = np.column_stack(base_stack)  # (n, n_folds)
    low_stack  = np.column_stack(low_stack)

    base_mu = base_stack.mean(axis=1)
    base_sigma = base_stack.std(axis=1)

    # 閾値（test分布から作る：あなたのHO実装と同じ思想）
    sigma_th = sigma_th_fixed if sigma_th_fixed is not None else np.quantile(base_sigma, sigma_q)

    mask_low = (
        (base_mu <= np.log(low_price_th)) &
        (base_sigma >= sigma_th)
    )

    # base の fold平均を基本にして、mask_low だけ low の fold平均に差し替え
    base_mean = base_mu
    low_mean  = low_stack.mean(axis=1)

    final_log = base_mean.copy()
    final_log[mask_low] = low_mean[mask_low]

    pred = np.exp(final_log)

    return pred, mask_low, base_mu, base_sigma


In [30]:
def predict_house_base_only(
    df: pd.DataFrame,
    base_models: list,
    base_cols: list,
):
    """Average the base models' predictions and return them on the price scale.

    Each model in ``base_models`` is called as ``predict(df[base_cols])``; the
    per-row mean of those outputs across models is exponentiated and returned,
    i.e. the model outputs are treated as log-prices.
    """
    features = df[base_cols]
    fold_logs = [model.predict(features) for model in base_models]
    # (n_rows, n_models) -> per-row mean over models.
    averaged_log = np.column_stack(fold_logs).mean(axis=1)
    return np.exp(averaged_log)


#### データの分割

In [31]:
# Boolean row masks: rows whose building_category equals 'house' go to the
# house models, every other row goes to the "all" (non-house) models.
house_idx = test_df['building_category'] == 'house'
non_house_idx = ~house_idx

test_df_house = test_df[house_idx]
test_df_all = test_df[non_house_idx]

In [32]:
# Feature frames per model family.
# NOTE(review): these X_test_* frames are not referenced by the prediction
# calls visible below, which pass the DataFrame and column lists instead.
X_test_all_base = test_df_all[all_base_cols]
X_test_all_low  = test_df_all[all_low_cols]
X_test_house_base = test_df_house[house_base_cols]

# NOTE(review): LOW_PRICE_TH_ALL is only used by the commented-out sigma-switch
# call; LOW_PRICE_TH_HOUSE is not referenced in the visible code.
LOW_PRICE_TH_ALL = 10_000_000  # low-price threshold (10 million yen)
LOW_PRICE_TH_HOUSE = 20_000_000  # low-price threshold (20 million yen)

#### house以外モデルの予測

In [33]:
# # TODO: tune sigma_q and sigma_th_fixed when time permits
# #       (the original note literally read "when there is no time" - presumably a typo; confirm)
# pred_all, mask_low_all, mu_all, sigma_all = predict_all_with_sigma_switch(
#     df=test_df_all,
#     base_models=all_base_models,
#     low_models=all_low_models,
#     base_cols=all_base_cols,
#     low_cols=all_low_cols,
#     low_price_th=LOW_PRICE_TH_ALL,
#     sigma_q=0.70,
#     sigma_th_fixed = 0.06576116266522083 # apply the 70% line from HO (presumably the hold-out set; confirm)
# )

# print('low applied rate (test):', mask_low_all.mean())

In [34]:
# Non-house rows are scored with the base models only: despite its name,
# predict_house_base_only just averages whichever models it is given.
pred_all = predict_house_base_only(
    df=test_df_all,
    base_models=all_base_models,
    base_cols=all_base_cols,
)

In [13]:
# Diagnostics for the sigma-switch: share of rows with mu at or below the
# low-price threshold, and the share of those rows that were switched to low.
# NOTE(review): mu_all and mask_low_all are only produced by the commented-out
# predict_all_with_sigma_switch call; with that call disabled this cell raises
# NameError on a top-to-bottom run. The output shown below comes from an
# earlier execution (cell number 13).
low_mu_rate_test = (mu_all <= np.log(LOW_PRICE_TH_ALL)).mean()
cond_rate = mask_low_all.mean() / low_mu_rate_test
print('mu<=th rate (test):', low_mu_rate_test)
print('P(low | mu<=th):', cond_rate)

mu<=th rate (test): 0.06746236862302836
P(low | mu<=th): 0.14766658927327608


In [14]:
# Compare a fixed sigma threshold with the test-set sigma quantiles.
# NOTE(review): sigma_all is only defined by the commented-out sigma-switch
# call, so this cell fails on a top-to-bottom run. The literal printed here
# also differs from the sigma_th_fixed value in that commented-out call
# (0.06576116266522083) - confirm which value is current.
print('sigma_th_fixed:', 0.056239323401596014)
print('test sigma q0.70:', np.quantile(sigma_all, 0.70))
print('test sigma q0.80:', np.quantile(sigma_all, 0.80))

sigma_th_fixed: 0.056239323401596014
test sigma q0.70: 0.04541784009124794
test sigma q0.80: 0.05186065774654801


In [35]:
# Sanity check: distribution summary of the non-house predictions (price scale).
q = np.quantile(pred_all, [0, 0.25, 0.5, 0.75, 0.99, 1])
print("Min      :", q[0])
print("25% (Q1) :", q[1])
print("Median   :", q[2])
print("75% (Q3) :", q[3])
print("99%      :", q[4])
print("Max      :", q[5])

Min      : 4601499.567230255
25% (Q1) : 17073927.377272055
Median   : 26531446.23867051
75% (Q3) : 39665512.02557849
99%      : 102271594.95930675
Max      : 179544884.0688964


In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding pred_all
# quantile summary and prints the same values; it can likely be dropped.
q = np.quantile(pred_all, [0, 0.25, 0.5, 0.75, 0.99, 1])
print("Min      :", q[0])
print("25% (Q1) :", q[1])
print("Median   :", q[2])
print("75% (Q3) :", q[3])
print("99%      :", q[4])
print("Max      :", q[5])

Min      : 4601499.567230255
25% (Q1) : 17073927.377272055
Median   : 26531446.23867051
75% (Q3) : 39665512.02557849
99%      : 102271594.95930675
Max      : 179544884.0688964


#### houseモデルの予測

In [36]:
# House rows: average of the house base models, returned on the price scale.
pred_house = predict_house_base_only(
    df=test_df_house,
    base_models=house_base_models,
    base_cols=house_base_cols,
)

In [37]:
# Sanity check: distribution summary of the house predictions (price scale).
q = np.quantile(pred_house, [0, 0.25, 0.5, 0.75, 0.99, 1])
print("Min      :", q[0])
print("25% (Q1) :", q[1])
print("Median   :", q[2])
print("75% (Q3) :", q[3])
print("99%      :", q[4])
print("Max      :", q[5])

Min      : 4663092.306107346
25% (Q1) : 14175920.975113286
Median   : 21410496.332273364
75% (Q3) : 30689700.358534425
99%      : 93026033.52277295
Max      : 168667426.50239208


## 提出

In [38]:
# Reassemble one prediction per test row, indexed like test_df.
test_pred_full = pd.Series(index=test_df.index, dtype=float)

# The boolean masks select the same rows (in the same order) that were used to
# build test_df_all / test_df_house, so each array fills its own slots.
test_pred_full.loc[non_house_idx] = pred_all
test_pred_full.loc[house_idx] = pred_house

In [39]:
# The sample submission is read with header=None (no header row), so the two
# columns are named here.
submit_df = pd.read_csv(submit_file_path, header=None)
submit_df.columns = ['id', 'pred']

In [40]:
# Assigning a Series to a column aligns on index labels, not on position: any
# submit_df row whose label is missing from test_pred_full would silently
# become NaN. Fail fast instead of writing a broken submission.
if len(test_pred_full) != len(submit_df):
    raise ValueError(
        f"prediction count ({len(test_pred_full)}) does not match "
        f"submission rows ({len(submit_df)})"
    )
submit_df['pred'] = test_pred_full
if submit_df['pred'].isna().any():
    raise ValueError(
        "submission contains NaN predictions; check that test_df's index "
        "lines up with the rows of the sample submission file"
    )

In [41]:
# Write the submission as '<pred_path>submit_<YYYYMMDD>_v<submit_ver>.csv',
# without index or header row (the sample file was read with header=None).
submit_df.to_csv(
    f'{pred_path}submit_{today}_v{submit_ver}.csv',
    index=False,
    header=False
)