## Library Import

In [1]:
# Standard library
import datetime as dt
import gc
import os
import pickle
import warnings

# Data handling
import numpy as np # fast numerical computation
import pandas as pd # tabular data

# Visualization
import matplotlib.pyplot as plt
import japanize_matplotlib

# Modelling
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

warnings.filterwarnings('ignore')

In [22]:
# --- Paths -----------------------------------------------------------------
# Point ROOT_DIR at the directory where you stored the input files.
ROOT_DIR = '../../input/'
submit_file_path = f'{ROOT_DIR}sample_submit.csv'  # submission template
intermediate_path = '../../output/intermediate_file/'  # feature-engineered CSVs are read from here
model_path = '../../output/model/'  # pickled model bundles are read from here
pred_path = '../../output/pred/'  # the submission file is written here

# --- Script versions ---------------------------------------------------------
fe_ver = 1        # selects test_df_fe_v{fe_ver}.csv
training_ver = 1  # selects base_models_v{training_ver}.pkl / low_models_v{training_ver}.pkl
pred_ver = 1      # not referenced elsewhere in this notebook
submit_ver = 1    # suffix of the submission file name

# Run date as YYYYMMDD; it becomes part of the submission file name.
today = dt.datetime.now().strftime("%Y%m%d")

## File Import

In [3]:
# Load the test-set CSV for feature-engineering version `fe_ver` from the
# intermediate-file directory.
test_df = pd.read_csv(
    filepath_or_buffer=f'{intermediate_path}test_df_fe_v{fe_ver}.csv',
)

In [4]:
# Column names: the year-month column and the prediction target.
date_col, target_col = 'target_ym', 'money_room'

## モデルの読み込み

In [8]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserializing. Only load model files that you produced yourself; never
    point this at a file from an untrusted source.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Model bundles for training version `training_ver`. Each bundle is a dict
# that is unpacked in the next cell.
base_models_dict = _load_pickle(f'{model_path}base_models_v{training_ver}.pkl')
low_models_dict = _load_pickle(f'{model_path}low_models_v{training_ver}.pkl')

In [9]:
# Unpack each bundle into its list of fitted models and the feature columns
# that those models are fed at prediction time.
base_models, base_cols = base_models_dict['base_models'], base_models_dict['base_cols']
low_models, low_cols = low_models_dict['low_models'], low_models_dict['low_cols']

## 予測

In [12]:
# Names of every object-dtype column in the test set.
obj_cols = list(test_df.select_dtypes(include='object').columns)

# Cast those columns to pandas `category` dtype in a single pass.
# NOTE(review): the categories are inferred from the test data alone; confirm
# that the loaded models map them back to the training-time categories.
test_df = test_df.astype({name: 'category' for name in obj_cols})

In [13]:
# Feature matrices: one per model family, restricted to the columns stored in
# that family's bundle.
X_test_base, X_test_low = test_df[base_cols], test_df[low_cols]

In [14]:
# Price threshold at or below which the "low" ensemble replaces the "base" one.
# NOTE(review): presumably this must match the threshold used when the low
# models were trained - confirm against the training notebook.
LOW_PRICE_TH = 10_000_000


def _predict_matrix(models, features):
    """Return an (n_rows, n_models) array; column j holds models[j].predict(features)."""
    stacked = np.zeros((len(features), len(models)))
    for col_idx, model in enumerate(models):
        stacked[:, col_idx] = model.predict(features)
    return stacked


# 1) Base ensemble: per-model predictions (log scale) averaged across models.
base_pred_log_all = _predict_matrix(base_models, X_test_base)
mean_base_log = base_pred_log_all.mean(axis=1)

# 2) Low ensemble: same procedure with the low models and their feature set.
low_pred_log_all = _predict_matrix(low_models, X_test_low)
mean_low_log = low_pred_log_all.mean(axis=1)

# 3) Switch: rows whose base prediction is at or below the threshold take the
#    low ensemble's prediction; all other rows keep the base prediction.
low_th_log = np.log(LOW_PRICE_TH)
mask_low = mean_base_log <= low_th_log
pred_log = np.where(mask_low, mean_low_log, mean_base_log)

# Back-transform from log scale to the price scale.
# NOTE(review): assumes the models were trained on np.log(target) - TODO confirm.
test_pred = np.exp(pred_log)

In [15]:
# Print a quick summary of the prediction distribution (min, quartiles, 99th
# percentile, max) as an eyeball check before submitting.
quantile_labels = (
    "Min      :",
    "25% (Q1) :",
    "Median   :",
    "75% (Q3) :",
    "99%      :",
    "Max      :",
)
q = np.quantile(test_pred, [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(quantile_labels, q):
    print(label, value)

Min      : 4878512.231614241
25% (Q1) : 15620976.08586919
Median   : 23906312.318158228
75% (Q3) : 35169538.42377687
99%      : 98235071.36974789
Max      : 179850656.7600434


In [16]:
# Sanity checks on the final predictions before they are written out.
assert np.isfinite(test_pred).all(), "NaN/Inf detected in predictions!"
# test_pred comes from np.exp, which never returns a negative number, so this
# check can only fail on a prediction of exactly 0; the message says
# "non-positive" to describe what is actually tested.
assert (test_pred > 0).all(), "Non-positive price detected!"

## 提出

In [19]:
# Read the header-less submission template and name its two columns.
submit_df = (
    pd.read_csv(submit_file_path, header=None)
    .set_axis(['id', 'pred'], axis=1)
)

In [20]:
# Overwrite the template's `pred` column with the model predictions.
# NOTE(review): assignment is positional, so this assumes test_df rows are in
# the same order as the rows of the submission template - TODO confirm.
submit_df = submit_df.assign(pred=test_pred)

In [23]:
# Make sure the output directory exists so that a fresh checkout does not fail
# at the very last step, after all predictions have been computed.
os.makedirs(pred_path, exist_ok=True)

# Write the submission without index or header row, matching the header-less
# layout of the template it was read from.
submit_df.to_csv(
    f'{pred_path}submit_{today}_v{submit_ver}.csv',
    index=False,
    header=False
)

In [None]:
# OOF MAPE: 未記録 # 新しい特徴量をとりあえず追加(1207_v1 17.525144894983026)
# OOF MAPE: 0.152712 # 日付関連データの修正＋建物種別の更新＋特徴量重要度0の削除(1207_v2 16.429000600293868)
# OOF MAPE: 0.157793 # 日付関連データの修正＋建物種別の更新(1207_v3 16.481708647871166)
# OOF MAPE: 0.153923 # バス名とタグの元カラム削除＋特徴量重要度0の削除(1207_v4 16.531551562806776)
# OOF MAPE: 0.152916 # 作成した建蔽率と容積率を削除＋特徴量重要度0の削除(1207_v5 16.460013451271422)