## Library Import

In [19]:
# Libraries for data handling
import numpy as np # fast numerical computation
import pandas as pd # tabular data handling

import datetime as dt

import warnings
# Suppress all warnings from this point on
warnings.filterwarnings('ignore')

In [20]:
# Point this at the directory where you stored the input files
ROOT_DIR = '../input/'
submit_file_path = f'{ROOT_DIR}sample_submit.csv'

# Directories for intermediate files, pickled models and prediction output
intermediate_path, model_path, pred_path = (
    '../output/intermediate_file/',
    '../output/model/',
    '../output/pred/',
)

# Script version of each pipeline stage
fe_ver = training_ver = pred_ver = submit_ver = 2

# Run date formatted as YYYYMMDD
today = f'{dt.datetime.today():%Y%m%d}'

## File Import

In [21]:
# Load the test feature table; the file name is tagged with `fe_ver`
test_df = pd.read_csv(f'{intermediate_path}test_df_fe_v{fe_ver}.csv')

## モデルの読み込み

In [22]:
import pickle


def _read_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


# One pickle per model family; every file name carries `training_ver`
all_base_models_dict = _read_pickle(f'{model_path}all_base_models_v{training_ver}.pkl')
all_low_models_dict = _read_pickle(f'{model_path}all_low_models_v{training_ver}.pkl')
house_base_models_dict = _read_pickle(f'{model_path}house_base_models_v{training_ver}.pkl')
house_low_models_dict = _read_pickle(f'{model_path}house_low_models_v{training_ver}.pkl')

In [23]:
# Each loaded dict provides a '<family>_models' entry and a '<family>_cols' entry;
# pull both out of every bundle.
all_base_models, all_base_cols = (
    all_base_models_dict['all_base_models'],
    all_base_models_dict['all_base_cols'],
)
all_low_models, all_low_cols = (
    all_low_models_dict['all_low_models'],
    all_low_models_dict['all_low_cols'],
)
house_base_models, house_base_cols = (
    house_base_models_dict['house_base_models'],
    house_base_models_dict['house_base_cols'],
)
house_low_models, house_low_cols = (
    house_low_models_dict['house_low_models'],
    house_low_models_dict['house_low_cols'],
)

## 予測

#### カテゴリ型へ変更

In [24]:
# Columns to be cast to pandas' category dtype
cat_cols = [
    'building_category',
    'land_toshi',
    'land_area_kind',
    'land_youto',
    'building_land_chimoku',
    'land_road_cond',
    'building_area_kind',
    'access_zone',
]

# Cast all listed columns in a single assignment
test_df[cat_cols] = test_df[cat_cols].astype('category')

In [25]:
# Names of every column that still has object dtype
obj_cols = list(test_df.select_dtypes(include=['object']).columns)

# Cast those remaining object columns to category as well
test_df[obj_cols] = test_df[obj_cols].astype('category')

#### 関数

In [26]:
def predict_with_base_low_ensemble(
    df, base_models, low_models,
    base_cols, low_cols,
    low_price_th
):
    """Predict on the original scale using averaged base and low models.

    Every model's ``predict`` output is treated as a log-scale value. The
    per-row mean over ``base_models`` is the default prediction; for rows
    whose base mean is at or below ``log(low_price_th)``, the per-row mean
    over ``low_models`` is used instead. The blended values are passed
    through ``exp`` before being returned.

    Args:
        df: Frame to predict on (e.g. ``test_df_all`` or ``test_df_house``).
        base_models: List of fitted base models (described in the original
            notes as a 5-fold list).
        low_models: List of fitted low models (likewise a 5-fold list).
        base_cols: Columns of ``df`` passed to the base models.
        low_cols: Columns of ``df`` passed to the low models.
        low_price_th: Switching threshold, given on the original scale.

    Returns:
        Array holding one prediction per row of ``df``.
    """
    base_features = df[base_cols]
    low_features = df[low_cols]

    def fold_mean(models, features):
        # One column per model, then average across models for each row.
        return np.column_stack(
            [model.predict(features) for model in models]
        ).mean(axis=1)

    base_log = fold_mean(base_models, base_features)
    low_log = fold_mean(low_models, low_features)

    # Start from the base prediction and overwrite only the rows that the
    # base ensemble itself places at or below the threshold.
    threshold_log = np.log(low_price_th)
    blended_log = base_log.copy()
    use_low = base_log <= threshold_log
    blended_log[use_low] = low_log[use_low]

    # Back to the original scale.
    return np.exp(blended_log)


#### データの分割

In [27]:
# Boolean row masks: rows whose building_category equals 'house', and the rest
house_idx = test_df['building_category'] == 'house'
non_house_idx = ~house_idx

# "all" is the name used for the non-house subset
test_df_all = test_df.loc[non_house_idx]
test_df_house = test_df.loc[house_idx]

In [28]:
# Threshold passed as `low_price_th` to the prediction calls
LOW_PRICE_TH = 10_000_000

# Per-family feature matrices for the non-house subset ...
X_test_all_base, X_test_all_low = (
    test_df_all[all_base_cols],
    test_df_all[all_low_cols],
)
# ... and for the house subset
X_test_house_base, X_test_house_low = (
    test_df_house[house_base_cols],
    test_df_house[house_low_cols],
)

#### house以外モデルの予測

In [29]:
# Predict the non-house rows with the "all" base/low model lists
pred_all = predict_with_base_low_ensemble(
    df=test_df_all,
    base_models=all_base_models,
    low_models=all_low_models,
    base_cols=all_base_cols,
    low_cols=all_low_cols,
    low_price_th=LOW_PRICE_TH,
)

In [30]:
# Print selected quantiles of the non-house predictions
q = np.quantile(pred_all, [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(
    ("Min      :", "25% (Q1) :", "Median   :", "75% (Q3) :", "99%      :", "Max      :"),
    q,
):
    print(label, value)

Min      : 5252399.183280813
25% (Q1) : 17377003.72222933
Median   : 26877353.102599707
75% (Q3) : 40018513.4972664
99%      : 104020555.25607428
Max      : 188973056.92000905


#### houseモデルの予測

In [31]:
# Predict the house rows with the house base/low model lists
pred_house = predict_with_base_low_ensemble(
    df=test_df_house,
    base_models=house_base_models,
    low_models=house_low_models,
    base_cols=house_base_cols,
    low_cols=house_low_cols,
    low_price_th=LOW_PRICE_TH,
)

In [32]:
# Print selected quantiles of the house predictions
q = np.quantile(pred_house, [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(
    ("Min      :", "25% (Q1) :", "Median   :", "75% (Q3) :", "99%      :", "Max      :"),
    q,
):
    print(label, value)

Min      : 5306863.086503071
25% (Q1) : 14309181.10240233
Median   : 21580167.769057266
75% (Q3) : 30577951.969338167
99%      : 94936557.28148049
Max      : 189904517.96887454


## 提出

In [33]:
# NaN-initialised float Series on test_df's index, filled subset by subset
test_pred_full = pd.Series(np.nan, index=test_df.index, dtype=float)

test_pred_full.loc[house_idx] = pred_house
test_pred_full.loc[non_house_idx] = pred_all

In [34]:
# The sample submission has no header row; label its two columns on load
submit_df = pd.read_csv(submit_file_path, header=None).set_axis(['id', 'pred'], axis=1)

In [35]:
# Attach the predictions to the submission frame.
# The column assignment aligns on index labels, so a row-count mismatch
# between the sample submission and the predictions would otherwise go
# unnoticed and leave NaN (or dropped predictions) in the output file.
if len(submit_df) != len(test_pred_full):
    raise ValueError(
        f'Row count mismatch: submission template has {len(submit_df)} rows, '
        f'predictions have {len(test_pred_full)}'
    )
submit_df['pred'] = test_pred_full
# Every row must have received a prediction before the file is written.
if submit_df['pred'].isna().any():
    raise ValueError('Submission contains rows without a prediction')

In [36]:
# Write the submission without index or header row; the file name carries
# the run date and `submit_ver`
submit_df.to_csv(path_or_buf=f'{pred_path}submit_{today}_v{submit_ver}.csv', header=False, index=False)