# 予測処理

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directories used by this notebook (point them at where the files are stored).
# Every path ends with '/' because file names are appended to these strings
# directly inside f-strings when reading and writing.
ROOT_DIR = '../input/'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
pred_path = '../output/pred/'

# Script version tags; they are embedded in the names of the files that are
# read (table, model) and written (predictions).
create_tbl_ver = 2
training_ver = 7
inference_ver = 7

# Run date formatted as YYYYMMDD.
today = dt.datetime.today().strftime("%Y%m%d")

In [3]:
# Model family to run inference for. Switch by (un)commenting exactly one line;
# the value is used in the input/model/output file names and decides how rows
# are split across models ('residential', 'house' or 'other').
target_model = 'residential'
# target_model = 'house'
# target_model = 'other'

## File Import

In [4]:
# Read the test table for the selected model and table version from the
# intermediate directory.
test_df = pd.read_parquet(f'{intermediate_path}test_df_{target_model}_v{create_tbl_ver}.parquet')

## モデルの読み込み

In [5]:
import pickle

# model_path already ends with '/', so the file name is appended directly
# (same convention as the other path f-strings in this notebook).
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# model files that were produced by a trusted training run.
with open(f'{model_path}{target_model}_model_v{training_ver}.pkl', "rb") as f:
    models_dict = pickle.load(f)

In [6]:
# Unpack the loaded bundle:
#   models    - mapping of split key -> fitted model (iterated with .items()
#               and looked up by 'all' at prediction time)
#   base_cols - feature columns passed to model.predict
#   cat_cols  - columns treated as categorical features
models = models_dict['models']
base_cols = models_dict['base_cols']
cat_cols = models_dict['cat_cols']

## 予測

#### カテゴリ型へ変更

In [7]:
# Keep only the categorical columns that actually exist in this table, then
# cast them to pandas 'category' dtype.
cat_cols_exist = [c for c in cat_cols if c in test_df.columns]
test_df[cat_cols_exist] = test_df[cat_cols_exist].astype('category')

In [8]:
# List every column that is still object dtype (not the category columns),
# and cast those to 'category' as well.
obj_cols = test_df.select_dtypes(['object']).columns.tolist()

test_df[obj_cols] = test_df[obj_cols].astype('category')

#### データの分割

In [9]:
# --- Tokyo's 23 special wards ---
TOKYO_23 = [
    '千代田区', '中央区', '港区', '新宿区', '文京区',
    '台東区', '墨田区', '江東区', '品川区', '目黒区',
    '大田区', '世田谷区', '渋谷区', '中野区', '杉並区',
    '豊島区', '北区', '荒川区', '板橋区', '練馬区',
    '足立区', '葛飾区', '江戸川区',
]

# --- Ordinance-designated cities ---
SEIREI_CITIES = [
    '札幌市', '仙台市', 'さいたま市', '千葉市', '横浜市',
    '川崎市', '相模原市', '新潟市', '静岡市', '浜松市',
    '名古屋市', '京都市', '大阪市', '堺市', '神戸市',
    '岡山市', '広島市', '北九州市', '福岡市', '熊本市',
]

# --- Greater Tokyo area (prefecture names) ---
CAPITAL_PREFS = [
    '東京都',
    '神奈川県',
    '埼玉県',
    '千葉県',
]

# --- Prefectural capitals (municipality name only, no prefecture) ---
# The entry for Tokyo is the ward '新宿区' rather than a city.
PREF_CAPITALS = [
    '札幌市', '青森市', '盛岡市', '仙台市', '秋田市', '山形市',
    '福島市', '水戸市', '宇都宮市', '前橋市', 'さいたま市', '千葉市',
    '新宿区', '横浜市', '新潟市', '富山市', '金沢市', '福井市',
    '甲府市', '長野市', '岐阜市', '静岡市', '名古屋市', '津市',
    '大津市', '京都市', '大阪市', '神戸市', '奈良市', '和歌山市',
    '鳥取市', '松江市', '岡山市', '広島市', '山口市', '徳島市',
    '高松市', '松山市', '高知市', '福岡市', '佐賀市', '長崎市',
    '熊本市', '大分市', '宮崎市', '鹿児島市', '那覇市',
]

In [10]:
def add_urban_class(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a categorical ``UrbanClass`` column.

    Labels, highest precedence first:

    - ``main_city``: a TOKYO_23 ward whose prefecture is '東京都', or a
      municipality named '大阪市' / '名古屋市'.
    - ``mid_city``: prefecture in CAPITAL_PREFS, or municipality in
      SEIREI_CITIES or PREF_CAPITALS.
    - ``other``: every remaining row.

    Apart from the Tokyo wards, municipality names are matched without
    looking at the prefecture. The input frame is not modified.
    """
    prefecture = df['Prefecture name']
    municipality = df['City/town/village name']

    is_main = (
        ((prefecture == '東京都') & municipality.isin(TOKYO_23))
        | municipality.isin(['大阪市', '名古屋市'])
    )
    is_mid = (
        prefecture.isin(CAPITAL_PREFS)
        | municipality.isin(SEIREI_CITIES)
        | municipality.isin(PREF_CAPITALS)
    )

    labelled = df.assign(UrbanClass='other')
    labelled.loc[is_mid, 'UrbanClass'] = 'mid_city'
    # main_city is written last so it overrides mid_city; this is why the
    # Tokyo wards need no explicit exclusion from the mid-city mask.
    labelled.loc[is_main, 'UrbanClass'] = 'main_city'

    labelled['UrbanClass'] = labelled['UrbanClass'].astype('category')
    return labelled


def add_density_class(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a categorical ``DensityClass`` column.

    The label is derived from ``zone_residential_rank``: 1 -> ``low``,
    2 -> ``mid``, and any other value (including missing) -> ``high``.
    The input frame is not modified.
    """
    # Start from the default label and overwrite only the ranks that differ.
    labelled = df.assign(DensityClass='high')
    for rank_value, label in ((1, 'low'), (2, 'mid')):
        matches = labelled['zone_residential_rank'] == rank_value
        labelled.loc[matches, 'DensityClass'] = label

    labelled['DensityClass'] = labelled['DensityClass'].astype('category')
    return labelled

In [11]:
def make_idx_dict_from_key(
    df: pd.DataFrame,
    key_col: str,
    expected_keys: list[str] | None = None,
) -> dict[str, pd.Index]:
    """
    df[key_col] の値ごとに df.index をまとめて idx_dict を作る。
    expected_keys を渡すと、その順序で揃え、欠けていても空Indexを作る。
    """
    s = df[key_col].astype('string')

    if expected_keys is None:
        keys = sorted([k for k in s.dropna().unique().tolist()])
    else:
        keys = expected_keys

    idx_dict: dict[str, pd.Index] = {}
    for k in keys:
        idx_dict[k] = df.index[s == k]

    return idx_dict

In [12]:
def build_split_idx_dict(
    df: pd.DataFrame,
    target_model: str,
) -> tuple[pd.DataFrame, dict[str, pd.Index] | None]:
    """
    target_model に応じて分割キー列を付与し、idx_dict を返す。
    - residential: UrbanClass を付与し main/mid/other に分割
    - house: DensityClass を付与し low/mid/high に分割
    - other: 分割しないので None
    """
    out = df.copy()

    if target_model == 'residential':
        out = add_urban_class(out)
        idx_dict = make_idx_dict_from_key(
            out,
            key_col='UrbanClass',
            expected_keys=['main_city', 'mid_city', 'other'],
        )
        return out, idx_dict

    if target_model == 'house':
        out = add_density_class(out)
        idx_dict = make_idx_dict_from_key(
            out,
            key_col='DensityClass',
            expected_keys=['low', 'mid', 'high'],
        )
        return out, idx_dict

    if target_model == 'other':
        return out, None

    raise ValueError(f'Unknown target_model: {target_model}')


In [13]:
# Add the split-key column (if any) for the selected model and build the
# key -> row-index mapping; idx_dict is None when the model is not split.
test_df, idx_dict = build_split_idx_dict(test_df, target_model)

#### 関数

In [14]:
def _is_catboost_model(model: object) -> bool:
    """
    Return True when ``model`` looks like a CatBoost model.

    The check is purely textual (class name or defining module contains
    'catboost', case-insensitively), so it works even when the catboost
    package has not been imported in this environment.
    """
    cls = model.__class__
    candidates = (
        cls.__name__.lower(),
        getattr(cls, '__module__', '').lower(),
    )
    return any('catboost' in text for text in candidates)


def _force_cat_cols_to_str(
    X: pd.DataFrame,
    cat_cols: list[str],
    na_token: str = 'NA',
) -> pd.DataFrame:
    """
    Make categorical feature columns plain strings with no missing values.

    Intended for CatBoost ``cat_features``: every value of each listed column
    becomes ``str(value)``, and missing values become ``na_token``.

    Args:
        X: Feature frame. It is not modified; a copy is returned.
        cat_cols: Columns to convert. Names absent from ``X`` are ignored.
        na_token: Replacement string for missing values.

    Returns:
        ``X`` itself when ``cat_cols`` is empty, otherwise a converted copy.
    """
    if not cat_cols:
        return X

    X = X.copy()
    for col in (c for c in cat_cols if c in X.columns):
        # Convert to object dtype before mapping. On a 'category' column
        # Series.map works on the categories, which can leave missing values
        # untouched and keep the categorical dtype; an object column is mapped
        # value by value, so NaN is always replaced and the result is str.
        values = X[col].astype(object)
        X[col] = values.map(lambda v: na_token if pd.isna(v) else str(v))

    return X


def predict_by_split_df(
    df: pd.DataFrame,
    models: dict[str, object],
    base_cols: list[str],
    idx_dict: dict[str, pd.Index] | None,
    cat_cols: list[str] | None = None,
    na_token: str = 'NA',
    pred_col: str = 'pred',
    pred_log_col: str = 'pred_log',
) -> pd.DataFrame:
    """
    Predict with the model that matches each split and return ``df`` plus
    two prediction columns.

    Args:
        df: Input rows. All of its columns are kept in the result.
        models: Mapping of split key -> model exposing ``predict``. Must
            contain ``'all'`` when ``idx_dict`` is None.
        base_cols: Feature columns; names missing from ``df`` are dropped.
        idx_dict: Mapping of split key -> row Index, or None to predict every
            row with ``models['all']``.
        cat_cols: Categorical feature columns. For CatBoost-like models they
            are converted to strings with missing values set to ``na_token``.
        na_token: Replacement for missing categorical values.
        pred_col: Output column receiving ``np.exp`` of the model output.
        pred_log_col: Output column receiving the raw model output.

    Returns:
        A copy of ``df`` with ``pred_log_col`` and ``pred_col`` added. Rows
        not covered by any model keep NaN in both columns.

    Raises:
        ValueError: None of ``base_cols`` exists in ``df``.
        KeyError: ``idx_dict`` is None and ``models`` has no ``'all'`` key.
    """
    result = df.copy()
    result[pred_log_col] = np.nan
    result[pred_col] = np.nan

    categorical = cat_cols if cat_cols else []

    feature_cols = [col for col in base_cols if col in df.columns]
    if not feature_cols:
        raise ValueError('No usable columns in base_cols (none exist in df).')

    def _predict_log(model, rows):
        """Run ``model`` on the selected rows and return its raw output."""
        features = df.loc[rows, feature_cols].copy()
        if _is_catboost_model(model) and categorical:
            features = _force_cat_cols_to_str(features, categorical, na_token=na_token)
        return model.predict(features)

    # --- Unsplit case: a single model scores every row ---
    if idx_dict is None:
        if 'all' not in models:
            raise KeyError("idx_dict is None, but models does not contain key 'all'.")

        log_pred = _predict_log(models['all'], slice(None))
        result.loc[:, pred_log_col] = log_pred
        result.loc[:, pred_col] = np.exp(log_pred)
        return result

    # --- Split case: each model scores only the rows of its own key ---
    for split_key, model in models.items():
        rows = idx_dict.get(split_key)
        if rows is None:
            # No row group registered for this model's key.
            continue

        # Guard against labels that are not (or no longer) present in df.
        rows = pd.Index(rows).intersection(df.index)
        if rows.empty:
            continue

        log_pred = _predict_log(model, rows)
        result.loc[rows, pred_log_col] = log_pred
        result.loc[rows, pred_col] = np.exp(log_pred)

    return result


#### house以外モデルの予測

In [15]:
# Run inference for the selected target_model: one model per split when
# idx_dict is a mapping, or the single 'all' model when idx_dict is None.
# Only the categorical columns that exist in test_df are passed on.
pred_df = predict_by_split_df(
    df=test_df,
    models=models,
    base_cols=base_cols,
    idx_dict=idx_dict,
    cat_cols=cat_cols_exist
)

In [16]:
# Show the prediction table as the cell output for a quick visual check.
pred_df

Unnamed: 0,Prefecture name,City/town/village name,zone_residential_rank,target_ym,target_year,building_id,unit_id,nearest_land_price,log_land_price,weighted_land_price_3,...,maint_x_station,maint_x_age,maint_x_kyoueki_std_log,tower_x_relative_floor,tower_x_high_floor,highgrade_x_maint,resort_x_elev,UrbanClass,pred_log,pred
0,三重県,桑名市,0,202301,2023,129053,149752,69600.0,11.150534,91373.093157,...,13.101418,17.701970,47.503067,0.0,0,0.0,0.0,other,16.558735,1.553701e+07
7,三重県,桑名市,3,202301,2023,179871,252158,41700.0,10.638280,36765.232086,...,0.000000,0.000000,0.000000,0.0,0,0.0,0.0,other,16.390916,1.313664e+07
17,滋賀県,米原市,2,202301,2023,123056,101084,55000.0,10.915107,56059.565882,...,13.652983,19.732985,54.438195,0.0,0,0.0,0.0,other,16.120752,1.002660e+07
18,滋賀県,米原市,0,202301,2023,200406,31043,55000.0,10.915107,55782.755542,...,13.020368,16.652307,46.003768,0.0,0,0.0,0.0,other,16.485820,1.444444e+07
40,茨城県,かすみがうら市,2,202301,2023,34018,303524,34000.0,10.434145,30850.081852,...,12.425957,16.920771,48.527213,0.0,0,0.0,0.0,other,16.285954,1.182769e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112428,千葉県,船橋市,0,202307,2023,107276,49776,82300.0,11.318139,82728.886295,...,16.685558,17.464001,51.014429,0.0,0,0.0,0.0,mid_city,16.918469,2.226372e+07
112429,千葉県,船橋市,0,202307,2023,107276,110461,82300.0,11.318139,82728.886295,...,17.715968,19.267521,54.468108,0.0,0,0.0,0.0,mid_city,16.842119,2.062716e+07
112430,千葉県,船橋市,0,202307,2023,125919,195552,82300.0,11.318139,80216.531786,...,15.068528,16.383877,50.145950,0.0,0,0.0,0.0,mid_city,16.965308,2.333133e+07
112435,三重県,桑名市,3,202307,2023,43310,113394,90900.0,11.417526,85659.539607,...,13.490701,15.985529,49.070449,0.0,0,0.0,0.0,other,16.941068,2.277259e+07


In [17]:
# Print the distribution of the predicted values at fixed quantile levels.
# Keys are the exact labels to print; values are the quantile levels.
summary_levels = {
    "Min      :": 0,
    "25% (Q1) :": 0.25,
    "Median   :": 0.5,
    "75% (Q3) :": 0.75,
    "99%      :": 0.99,
    "Max      :": 1,
}
q = np.quantile(pred_df['pred'], list(summary_levels.values()))
for label, value in zip(summary_levels, q):
    print(label, value)

Min      : 4716672.716560311
25% (Q1) : 16632144.664054187
Median   : 25332865.705682375
75% (Q3) : 37509977.6438956
99%      : 99439446.5012415
Max      : 183576638.3023337


## 出力

In [18]:
# Save the predictions (input columns plus the prediction columns) as parquet,
# tagged with the model name and inference version.
# NOTE(review): this writes to intermediate_path, while pred_path is defined in
# the config cell and not used here — confirm the intended output directory.
pred_df.to_parquet(f'{intermediate_path}pred_df_{target_model}_v{inference_ver}.parquet')