# 予測処理

## Library Import

In [5]:
# Data-handling libraries
import numpy as np # fast numerical computation
import pandas as pd # tabular data handling

import datetime as dt

import warnings
# Install a blanket 'ignore' filter so warnings are not displayed from here on.
warnings.filterwarnings('ignore')

In [6]:
# Point these at the directories where you stored the files.
ROOT_DIR = '../input/'
submit_file_path = f'{ROOT_DIR}sample_submit.csv'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
pred_path = '../output/pred/'

# Version numbers of the scripts; they are embedded in the file names
# read and written by the cells below.
fe_ver = 3
training_ver = 4
submit_ver = 1

# Run date as YYYYMMDD, used in the submission file name.
today = dt.datetime.now().strftime("%Y%m%d")

## File Import

In [7]:
# Load the test set from the parquet file whose name carries the `fe_ver` suffix.
test_parquet_path = f'{intermediate_path}test_df_fe_v{fe_ver}.parquet'
test_df = pd.read_parquet(test_parquet_path)

## モデルの読み込み

In [None]:
import pickle

# Both model bundles are keyed by the training script version.
all_model_file = f'{model_path}all_base_model_v{training_ver}.pkl'
house_model_file = f'{model_path}house_base_model_v{training_ver}.pkl'

# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
with open(all_model_file, "rb") as model_fh:
    all_base_model_dict = pickle.load(model_fh)
with open(house_model_file, "rb") as model_fh:
    house_base_model_dict = pickle.load(model_fh)

In [None]:
# "all" bundle: the model object and the feature-column list stored next to it.
all_model = all_base_model_dict['all_base_model']
all_base_cols = all_base_model_dict['all_base_cols']

# "house" bundle: same layout, with house-specific keys.
house_model = house_base_model_dict['house_base_model']
house_base_cols = house_base_model_dict['house_base_cols']

## 予測

#### カテゴリ型へ変更

In [10]:
# Columns that are explicitly cast to pandas 'category' dtype before prediction.
cat_cols = [
    'building_category',
    'land_area_kind',
    'building_land_chimoku',
    'land_chisei',
    'land_road_cond',
    'access_zone',
    'fireproof_x_structure',
]

# Cast one column at a time; each column gets its own set of categories.
for cat_col in cat_cols:
    test_df[cat_col] = test_df[cat_col].astype('category')

In [11]:
# Names of every column that still has the generic 'object' dtype;
# those columns are cast to 'category' as well.
obj_cols = list(test_df.select_dtypes(include=['object']).columns)

test_df[obj_cols] = test_df[obj_cols].astype('category')

#### 関数

In [12]:
def predict(
    df: pd.DataFrame,
    model,
    base_cols: list[str],
) -> np.ndarray:
    """
    Predict with a single model and map the result back from log scale.

    The model's raw output is treated as a log-scale prediction, so it is
    passed through ``np.exp`` before being returned.

    Parameters
    ----------
    df : pd.DataFrame
        Data to predict on.
    model : fitted model
        Trained model exposing ``predict`` (a trained LightGBM model
        according to the original notes).
    base_cols : list[str]
        Feature columns to select from ``df`` and feed to the model.

    Returns
    -------
    np.ndarray
        Predictions on the original (non-log) scale.
    """
    # Only the requested feature columns are handed to the model.
    log_scale_pred = model.predict(df[base_cols])
    return np.exp(log_scale_pred)

#### データの分割

In [13]:
# Boolean row mask: True where building_category equals 'house'.
house_idx = test_df['building_category'].eq('house')
# Every remaining row goes to the non-house ("all") model.
non_house_idx = ~house_idx

test_df_all = test_df.loc[non_house_idx]
test_df_house = test_df.loc[house_idx]

In [14]:
# Feature matrices for each subset. The prediction cells below pass the full
# subset frames to `predict`, which selects the columns itself, so these are
# not referenced again in the cells shown here.
X_test_house_base = test_df_house.loc[:, house_base_cols]
X_test_all_base = test_df_all.loc[:, all_base_cols]

#### house以外モデルの予測

In [16]:
# Predictions for the non-house rows, already on the original scale
# (`predict` applies np.exp to the model output).
pred_all = predict(test_df_all, all_model, all_base_cols)

In [17]:
# Print min, quartiles, 99th percentile and max of the non-house predictions.
q = np.quantile(pred_all, [0, 0.25, 0.5, 0.75, 0.99, 1])
quantile_labels = (
    "Min      :",
    "25% (Q1) :",
    "Median   :",
    "75% (Q3) :",
    "99%      :",
    "Max      :",
)
for quantile_label, quantile_value in zip(quantile_labels, q):
    print(quantile_label, quantile_value)

Min      : 4519570.681590892
25% (Q1) : 17146144.498732857
Median   : 26636493.04243519
75% (Q3) : 39834040.44739038
99%      : 104680122.79925281
Max      : 180555945.71645087


#### houseモデルの予測

In [18]:
# Predictions for the house rows, already on the original scale
# (`predict` applies np.exp to the model output).
pred_house = predict(test_df_house, house_model, house_base_cols)

In [19]:
# Print min, quartiles, 99th percentile and max of the house predictions.
q = np.quantile(pred_house, [0, 0.25, 0.5, 0.75, 0.99, 1])
quantile_labels = (
    "Min      :",
    "25% (Q1) :",
    "Median   :",
    "75% (Q3) :",
    "99%      :",
    "Max      :",
)
for quantile_label, quantile_value in zip(quantile_labels, q):
    print(quantile_label, quantile_value)

Min      : 4658717.334982642
25% (Q1) : 14122213.084524624
Median   : 21459362.061144203
75% (Q3) : 30781256.246183068
99%      : 92643311.81370355
Max      : 177859848.05209205


## 提出

In [20]:
# NaN-filled float Series that shares test_df's index; each model's output is
# written into the rows selected by its boolean mask.
test_pred_full = pd.Series(index=test_df.index, dtype=float)

# The two masks are complements of each other, so the order of these writes
# does not matter.
test_pred_full[house_idx] = pred_house
test_pred_full[non_house_idx] = pred_all

In [21]:
# The sample submission is read without a header row; its columns are then
# labelled 'id' and 'pred' (raises if the file does not have exactly two columns).
submit_df = pd.read_csv(submit_file_path, header=None).set_axis(['id', 'pred'], axis=1)

In [22]:
# Assigning a Series to a DataFrame column aligns on index labels, not on row
# position. Labels of submit_df that are absent from test_pred_full become NaN,
# and labels present only in test_pred_full are dropped, both silently.
submit_df['pred'] = test_pred_full

# Fail loudly rather than write an unusable submission file.
if len(test_pred_full) != len(submit_df):
    raise ValueError(
        f"Row count mismatch: {len(test_pred_full)} predictions "
        f"for {len(submit_df)} submission rows."
    )
missing_pred_count = int(submit_df['pred'].isna().sum())
if missing_pred_count:
    raise ValueError(
        f"{missing_pred_count} submission rows have no prediction after index "
        "alignment; check that test_df's index matches the submission file's row index."
    )

In [23]:
# Write the submission without header or index; the file name carries the run
# date and the submission version.
submit_out_path = f'{pred_path}submit_{today}_v{submit_ver}.csv'
submit_df.to_csv(submit_out_path, header=False, index=False)