## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directories for the input files and for the artefacts this pipeline reads/writes
ROOT_DIR = '../input/'
submit_file_path = f'{ROOT_DIR}sample_submit.csv'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
pred_path = '../output/pred/'

# Version numbers of each pipeline stage; they are interpolated into file names below
fe_ver = 3
training_ver = 2
pred_ver = 2
submit_ver = 4

# Date stamp (YYYYMMDD) used in the submission file name
today = format(dt.datetime.today(), "%Y%m%d")

## File Import

In [3]:
# Load the test-set frame for feature version `fe_ver` from the intermediate directory
test_df_path = f'{intermediate_path}test_df_fe_v{fe_ver}.parquet'
test_df = pd.read_parquet(test_df_path)

## モデルの読み込み

In [4]:
import pickle


def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code -- only load files
    # that were produced by your own training run.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# One pickle per (segment, model kind); each is indexed by string keys in the next cell
all_base_models_dict = _load_pickle(f'{model_path}all_base_models_v{training_ver}.pkl')
all_low_models_dict = _load_pickle(f'{model_path}all_low_models_v{training_ver}.pkl')
house_base_models_dict = _load_pickle(f'{model_path}house_base_models_v{training_ver}.pkl')
house_low_models_dict = _load_pickle(f'{model_path}house_low_models_v{training_ver}.pkl')

In [5]:
# Pull the model collection and its column list out of each loaded dict,
# grouped per (segment, model kind) pair.

# non-house segment, base models
all_base_models = all_base_models_dict['all_base_models']
all_base_cols = all_base_models_dict['all_base_cols']

# non-house segment, low models
all_low_models = all_low_models_dict['all_low_models']
all_low_cols = all_low_models_dict['all_low_cols']

# house segment, base models
house_base_models = house_base_models_dict['house_base_models']
house_base_cols = house_base_models_dict['house_base_cols']

# house segment, low models
house_low_models = house_low_models_dict['house_low_models']
house_low_cols = house_low_models_dict['house_low_cols']

## 予測

#### カテゴリ型へ変更

In [6]:
# Columns explicitly cast to pandas' category dtype
cat_cols = [
    'building_category',
    'land_toshi',
    'land_area_kind',
    'land_youto',
    'building_land_chimoku',
    'land_road_cond',
    'building_area_kind',
    'access_zone',
]

cat_part = test_df[cat_cols].astype('category')
test_df[cat_cols] = cat_part

In [7]:
# Names of every column that is still object dtype; these are cast to category as well
obj_cols = list(test_df.select_dtypes(['object']).columns)

obj_part = test_df[obj_cols].astype('category')
test_df[obj_cols] = obj_part

#### 関数

In [None]:
# # NOTE: try applying the low model bit by bit on weekdays when there is little time
# def predict_base_low_hard_soft(
#     df, base_models, low_models,
#     base_cols, low_cols,
#     low_price_th=10_000_000,
#     delta_ratio=0.15,   # blend only within +/-15% of the threshold
#     w_max=0.3,
#     k=6.0
# ):
#     X_base = df[base_cols]
#     X_low  = df[low_cols]

#     base_log = np.column_stack([m.predict(X_base) for m in base_models]).mean(axis=1)
#     low_log  = np.column_stack([m.predict(X_low)  for m in low_models ]).mean(axis=1)

#     th = np.log(low_price_th)
#     d  = np.log(1 + delta_ratio)

#     # hard-switch decision (made on the base prediction)
#     mask_low = base_log <= th

#     # default is the hard switch
#     final_log = base_log.copy()
#     final_log[mask_low] = low_log[mask_low]

#     # soft blending only near the boundary ([th-d, th+d])
#     band = (base_log > (th - d)) & (base_log < (th + d))
#     if band.any():
#         # the further base is below th, the more weight goes to low (0 to w_max)
#         w = 1 / (1 + np.exp(k * (base_log[band] - th)))
#         w = np.clip(w, 0.0, 1.0) * w_max
#         final_log[band] = (1 - w) * base_log[band] + w * low_log[band]

#     return np.exp(final_log)


In [12]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The direct formula computes ``exp(-x)``, which overflows (RuntimeWarning)
    for large negative ``x``. Here ``exp`` is only applied to non-positive
    values, so the intermediate always lies in [0, 1].

    Parameters
    ----------
    x : scalar or array-like supporting NumPy ufuncs
        Input value(s).

    Returns
    -------
    Same shape as ``x``: values in [0, 1]; NaN inputs yield NaN.
    """
    # exp(-|x|) is in [0, 1] for every finite or infinite x -> no overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same value.
    numerator = np.where(np.asarray(x) >= 0, 1.0, decay)
    return numerator / (1 + decay)


def blend_base_low(
    base_pred_log,
    low_pred_log,
    low_price_th=10_000_000,
    r_low=0.5,
    r_base=1.5,
    k=3.0,
    w_max=0.3
):
    """Blend base and low predictions with a capped, two-gate sigmoid weight.

    Both inputs are combined element-wise; they are presumably natural-log
    price predictions, since the thresholds are built with ``np.log`` -- confirm
    against the training notebook.

    Parameters
    ----------
    base_pred_log, low_pred_log : array-like
        Predictions of the base model and of the low model.
    low_price_th : number
        Threshold; its logarithm is the reference point for both gates.
    r_low, r_base : float
        Multipliers on ``low_price_th`` giving the centre of the low gate and
        of the base gate respectively (applied as ``+ log(r)``).
    k : float
        Steepness of both sigmoid gates.
    w_max : float
        Upper bound of the weight given to the low prediction.

    Returns
    -------
    ``(1 - w) * base_pred_log + w * low_pred_log`` where ``w`` is the product
    of the two gates clipped to ``[0, w_max]``.
    """
    log_th = np.log(low_price_th)
    low_center = log_th + np.log(r_low)
    base_center = log_th + np.log(r_base)

    # Each gate approaches 1 as its prediction falls below its centre and 0 above it.
    gate_low = sigmoid(-k * (low_pred_log - low_center))
    gate_base = sigmoid(-k * (base_pred_log - base_center))

    # Both gates must be open for the low prediction to get weight; cap at w_max.
    weight = np.clip(gate_low * gate_base, 0.0, w_max)
    return (1 - weight) * base_pred_log + weight * low_pred_log

def predict_with_base_low_ensemble(
    df, base_models, low_models,
    base_cols, low_cols,
    low_price_th=10_000_000,
    r_low=0.5,
    r_base=1.5,
    k=3.0,
    w_max=0.3,
):
    """Predict with paired base/low models, blend each pair, and average.

    For every ``(base_model, low_model)`` pair the two predictions are blended
    with ``blend_base_low``; the blended predictions are averaged across pairs
    and the mean is passed through ``np.exp``.

    Parameters
    ----------
    df : DataFrame-like
        Rows to predict; indexed with ``base_cols`` and ``low_cols``.
    base_models, low_models : iterable
        Objects exposing ``predict(X)``. They are paired positionally, so both
        must contain the same, non-zero number of models.
    base_cols, low_cols : column selector
        Columns fed to the base models and to the low models.
    low_price_th, r_low, r_base, k, w_max
        Forwarded unchanged to ``blend_base_low``.

    Returns
    -------
    numpy.ndarray
        ``exp`` of the per-row mean of the blended predictions.

    Raises
    ------
    ValueError
        If the two model collections differ in length or are empty.
    """
    base_models = list(base_models)
    low_models = list(low_models)

    # zip() stops at the shorter input, which would silently drop unpaired models.
    if len(base_models) != len(low_models):
        raise ValueError(
            "base_models and low_models must have the same length, "
            f"got {len(base_models)} and {len(low_models)}"
        )
    if not base_models:
        raise ValueError("at least one (base, low) model pair is required")

    X_base = df[base_cols]
    X_low = df[low_cols]

    pred_logs = [
        blend_base_low(
            bm.predict(X_base), lm.predict(X_low),
            low_price_th=low_price_th,
            r_low=r_low,
            r_base=r_base,
            k=k,
            w_max=w_max,
        )
        for bm, lm in zip(base_models, low_models)
    ]

    # Average across model pairs first, then exponentiate the mean.
    mean_pred_log = np.column_stack(pred_logs).mean(axis=1)
    return np.exp(mean_pred_log)

#### データの分割

In [13]:
# Boolean row masks: rows whose building_category equals 'house', and all other rows
house_idx = test_df['building_category'] == 'house'
non_house_idx = ~house_idx

# The non-house subset is named "all" to match the all_* models applied to it below
test_df_house, test_df_all = test_df.loc[house_idx], test_df.loc[non_house_idx]

In [14]:
LOW_PRICE_TH_ALL = 10_000_000  # low-price threshold (10 million yen)
LOW_PRICE_TH_HOUSE = 20_000_000  # low-price threshold (20 million yen)

# Per-segment feature frames.
# NOTE(review): later cells visible in this notebook pass the full subset frames plus
# the column lists to the prediction function, so these four frames look unused.
X_test_all_base = test_df_all.loc[:, all_base_cols]
X_test_all_low = test_df_all.loc[:, all_low_cols]
X_test_house_base = test_df_house.loc[:, house_base_cols]
X_test_house_low = test_df_house.loc[:, house_low_cols]

#### house以外モデルの予測

In [15]:
# Blended ensemble prediction for the non-house rows
pred_all = predict_with_base_low_ensemble(
    df=test_df_all,
    base_models=all_base_models,
    low_models=all_low_models,
    base_cols=all_base_cols,
    low_cols=all_low_cols,
    low_price_th=LOW_PRICE_TH_ALL,
)

In [16]:
# Sanity check: print selected quantiles of the non-house predictions
q = np.quantile(pred_all, [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(
    ("Min      :", "25% (Q1) :", "Median   :", "75% (Q3) :", "99%      :", "Max      :"),
    q,
):
    print(label, value)

Min      : 4707585.756074974
25% (Q1) : 16684002.440612057
Median   : 26292398.242291734
75% (Q3) : 39596152.27357176
99%      : 104293239.73759371
Max      : 187453184.01933575


#### houseモデルの予測

In [17]:
# Blended ensemble prediction for the house rows
pred_house = predict_with_base_low_ensemble(
    df=test_df_house,
    base_models=house_base_models,
    low_models=house_low_models,
    base_cols=house_base_cols,
    low_cols=house_low_cols,
    low_price_th=LOW_PRICE_TH_HOUSE,
)

In [18]:
# Sanity check: print selected quantiles of the house predictions
q = np.quantile(pred_house, [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(
    ("Min      :", "25% (Q1) :", "Median   :", "75% (Q3) :", "99%      :", "Max      :"),
    q,
):
    print(label, value)

Min      : 4607648.735016667
25% (Q1) : 12370670.122703992
Median   : 16739878.442994537
75% (Q3) : 22664190.45240148
99%      : 92729473.07656433
Max      : 184500499.33983064


## 提出

In [19]:
# Float Series over test_df's index; each segment's predictions are then written
# into its rows through the matching boolean mask.
test_pred_full = pd.Series(index=test_df.index, dtype=float)

for row_mask, segment_pred in ((non_house_idx, pred_all), (house_idx, pred_house)):
    test_pred_full.loc[row_mask] = segment_pred

In [20]:
# The sample submission is read without a header row, so its two columns are named here
submit_columns = ['id', 'pred']
submit_df = pd.read_csv(submit_file_path, header=None)
submit_df.columns = submit_columns

In [21]:
# Assigning a Series to a column aligns on index labels, not on row position.
# If test_pred_full's labels do not cover submit_df's labels, the gaps become NaN
# and surplus predictions are dropped without any error -- fail loudly instead.
# NOTE(review): this still assumes test_df's index labels correspond to the rows of
# the sample submission in the intended order -- confirm against the `id` column.
if len(test_pred_full) != len(submit_df):
    raise ValueError(
        f"prediction count ({len(test_pred_full)}) does not match "
        f"submission row count ({len(submit_df)})"
    )

submit_df['pred'] = test_pred_full

n_missing_pred = int(submit_df['pred'].isna().sum())
if n_missing_pred:
    raise ValueError(
        f"{n_missing_pred} submission rows have no prediction after index alignment"
    )

In [22]:
# Write the submission without index or header; the file name carries the date stamp
# and the submit version.
submit_path = f'{pred_path}submit_{today}_v{submit_ver}.csv'
submit_df.to_csv(submit_path, index=False, header=False)