# 予測処理

## Library Import

In [1]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directories used by this notebook; point them at wherever your files are stored.
# File names are appended directly to these strings, so each one ends with '/'.
ROOT_DIR = '../input/'
intermediate_path = '../output/intermediate_file/'
model_path = '../output/model/'
pred_path = '../output/pred/'

# Script version numbers; they are embedded in the names of the files that this
# notebook reads (test table, model) and writes (predictions).
create_tbl_ver = 2
training_ver = 8
inference_ver = 8

# Run date formatted as YYYYMMDD.
today = dt.datetime.today().strftime("%Y%m%d")

In [3]:
# Model family to run inference for. The value selects the input table, the
# model file and the row-splitting rule applied later in this notebook.
target_model = 'residential'
# target_model = 'house'
# target_model = 'other'

# Algorithm tag that appears in the model file name.
alg = 'lgb'
# alg = 'cat'

## File Import

In [4]:
# Load the test table for the selected target_model (file name is keyed by
# target_model and create_tbl_ver).
test_df = pd.read_parquet(f'{intermediate_path}test_df_{target_model}_v{create_tbl_ver}.parquet')

## モデルの読み込み

In [5]:
import pickle

# NOTE: pickle.load can execute arbitrary code stored in the file, so only load
# model files that were produced by your own training run.
# model_path already ends with '/', so the file name is appended directly, the
# same way the other paths in this notebook are built (no doubled slash).
with open(f'{model_path}model_{target_model}_{alg}_v{training_ver}.pkl', "rb") as f:
    models_dict = pickle.load(f)

In [6]:
# Unpack the loaded bundle: the models plus the feature-column and
# categorical-column definitions stored with them.
models, fe_cols, cat_cols = (
    models_dict[key] for key in ('models', 'fe_cols', 'cat_cols')
)

## 予測

#### データの分割

In [7]:
# --- The 23 special wards of Tokyo ---
TOKYO_23 = [
    '千代田区', '中央区', '港区', '新宿区', '文京区', '台東区',
    '墨田区', '江東区', '品川区', '目黒区', '大田区', '世田谷区',
    '渋谷区', '中野区', '杉並区', '豊島区', '北区', '荒川区',
    '板橋区', '練馬区', '足立区', '葛飾区', '江戸川区'
]

# --- Government-ordinance-designated cities ---
SEIREI_CITIES = [
    '札幌市', '仙台市', 'さいたま市', '千葉市', '横浜市', '川崎市', '相模原市',
    '新潟市', '静岡市', '浜松市', '名古屋市',
    '京都市', '大阪市', '堺市', '神戸市',
    '岡山市', '広島市', '北九州市', '福岡市', '熊本市'
]

# --- Greater Tokyo area (prefecture names) ---
CAPITAL_PREFS = ['東京都', '神奈川県', '埼玉県', '千葉県']

# --- Prefectural capitals (municipality name only, no prefecture) ---
# Tokyo is represented by '新宿区'. Entries are matched by name alone in
# add_urban_class, so they are not qualified by prefecture.
PREF_CAPITALS = [
    '札幌市','青森市','盛岡市','仙台市','秋田市','山形市','福島市',
    '水戸市','宇都宮市','前橋市','さいたま市','千葉市','新宿区',
    '横浜市','新潟市','富山市','金沢市','福井市','甲府市','長野市',
    '岐阜市','静岡市','名古屋市','津市','大津市','京都市','大阪市',
    '神戸市','奈良市','和歌山市','鳥取市','松江市','岡山市','広島市',
    '山口市','徳島市','高松市','松山市','高知市','福岡市','佐賀市',
    '長崎市','熊本市','大分市','宮崎市','鹿児島市','那覇市'
]

In [8]:
def add_urban_class(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a categorical ``UrbanClass`` column.

    Labels, in decreasing priority:
    - ``main_city``: a TOKYO_23 ward in 東京都, or 大阪市 / 名古屋市
    - ``mid_city``: any other row in a CAPITAL_PREFS prefecture, or a
      municipality listed in SEIREI_CITIES or PREF_CAPITALS
    - ``other``: every remaining row

    The input frame is not modified.
    """
    labelled = df.copy()
    pref = labelled['Prefecture_name']
    city = labelled['City/town/village_name']

    # Ward names are only treated as Tokyo wards when the prefecture is Tokyo.
    in_tokyo_wards = (pref == '東京都') & city.isin(TOKYO_23)

    is_main = in_tokyo_wards | city.isin(['大阪市', '名古屋市'])
    is_mid = (
        (pref.isin(CAPITAL_PREFS) & ~in_tokyo_wards)
        | city.isin(SEIREI_CITIES)
        | city.isin(PREF_CAPITALS)
    )

    labelled['UrbanClass'] = 'other'
    # Written in increasing priority: a row matching both masks ends as main_city.
    for label, mask in (('mid_city', is_mid), ('main_city', is_main)):
        labelled.loc[mask, 'UrbanClass'] = label

    labelled['UrbanClass'] = labelled['UrbanClass'].astype('category')
    return labelled


def add_density_class(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a categorical ``DensityClass`` column.

    ``zone_residential_rank`` 1 maps to ``low`` and 2 maps to ``mid``; every
    other value (including missing ones) keeps the default ``high``.
    The input frame is not modified.
    """
    labelled = df.copy()
    rank = labelled['zone_residential_rank']

    # Start from the default and overwrite only the ranks that have their own label.
    labelled['DensityClass'] = 'high'
    for rank_value, label in ((1, 'low'), (2, 'mid')):
        labelled.loc[rank == rank_value, 'DensityClass'] = label

    labelled['DensityClass'] = labelled['DensityClass'].astype('category')
    return labelled

In [9]:
def make_idx_dict_from_key(
    df: pd.DataFrame,
    key_col: str,
    expected_keys: list[str] | None = None,
) -> dict[str, pd.Index]:
    """
    df[key_col] の値ごとに df.index をまとめて idx_dict を作る。
    expected_keys を渡すと、その順序で揃え、欠けていても空Indexを作る。
    """
    s = df[key_col].astype('string')

    if expected_keys is None:
        keys = sorted([k for k in s.dropna().unique().tolist()])
    else:
        keys = expected_keys

    idx_dict: dict[str, pd.Index] = {}
    for k in keys:
        idx_dict[k] = df.index[s == k]

    return idx_dict

In [10]:
def build_split_idx_dict(
    df: pd.DataFrame,
    target_model: str,
) -> tuple[pd.DataFrame, dict[str, pd.Index] | None]:
    """
    Add the split-key column for ``target_model`` and return the row index per split.

    - ``residential``: adds ``UrbanClass``; splits main_city / mid_city / other
    - ``house``: adds ``DensityClass``; splits low / mid / high
    - ``other``: no split, so the second element of the result is ``None``

    Returns the (copied, possibly extended) frame and the split dictionary.
    Raises ValueError for any other ``target_model``.
    """
    frame = df.copy()

    # 'other' is not split, so no key column is added.
    if target_model == 'other':
        return frame, None

    if target_model == 'residential':
        add_key_column, key_col, labels = (
            add_urban_class, 'UrbanClass', ['main_city', 'mid_city', 'other'],
        )
    elif target_model == 'house':
        add_key_column, key_col, labels = (
            add_density_class, 'DensityClass', ['low', 'mid', 'high'],
        )
    else:
        raise ValueError(f'Unknown target_model: {target_model}')

    frame = add_key_column(frame)
    return frame, make_idx_dict_from_key(frame, key_col=key_col, expected_keys=labels)


In [11]:
# Attach the split-key column for target_model and collect the row index of
# each split (idx_dict is None when target_model is 'other').
test_df, idx_dict = build_split_idx_dict(test_df, target_model)

#### 関数

In [12]:
def _is_lightgbm_model(model: object) -> bool:
    """
    Tell whether ``model`` looks like a LightGBM model, without importing lightgbm.

    True when the class's module path contains 'lightgbm' or the class name
    starts with 'lgbm' (both compared case-insensitively).
    """
    cls = model.__class__
    cls_name = cls.__name__.lower()
    module_name = getattr(cls, '__module__', '').lower()

    if 'lightgbm' in module_name:
        return True
    return cls_name.startswith('lgbm')

def _force_lgbm_train_categories(X: pd.DataFrame, cat_cols_use: list[str], na_token: str = 'NA') -> pd.DataFrame:
    """
    Return a copy of ``X`` whose listed columns are converted to category dtype.

    Each listed column that exists in ``X`` is cast to string, its missing
    values are replaced by ``na_token``, and the result is cast to category.
    Names that are not columns of ``X`` are ignored.
    """
    converted = X.copy()
    for col in cat_cols_use:
        if col in converted.columns:
            as_text = converted[col].astype('string').fillna(na_token)
            converted[col] = as_text.astype('category')
    return converted

In [13]:
def _is_catboost_model(model: object) -> bool:
    """
    Tell whether ``model`` looks like a CatBoost model.

    The check inspects only the class's module path and name
    (case-insensitively), so it works even when catboost is not importable.
    """
    cls = model.__class__
    cls_name = cls.__name__.lower()
    module_name = getattr(cls, '__module__', '').lower()
    return any('catboost' in text for text in (module_name, cls_name))


def _force_cat_cols_to_str(
    X: pd.DataFrame,
    cat_cols: list[str],
    na_token: str = 'NA',
) -> pd.DataFrame:
    """
    Make the categorical columns plain strings for CatBoost's cat_features.

    Every column of ``cat_cols`` present in ``X`` has its values converted with
    ``str``; missing values become ``na_token``. When ``cat_cols`` is empty the
    input frame itself is returned, otherwise a modified copy.
    """
    if not cat_cols:
        return X

    def _to_text(value):
        # Missing values get the token instead of the text 'nan' / 'None'.
        return na_token if pd.isna(value) else str(value)

    converted = X.copy()
    for col in cat_cols:
        if col in converted.columns:
            converted[col] = converted[col].map(_to_text)

    return converted

In [14]:
def predict_by_split_df(
    df: pd.DataFrame,
    models: dict[str, object],
    fe_cols: dict[str, list[str]] | list[str],
    cat_cols: dict[str, list[str]] | list[str],
    idx_dict: dict[str, pd.Index] | None,
    na_token: str = 'NA',
    pred_col: str = 'pred',
    pred_log_col: str = 'pred_log',
) -> pd.DataFrame:
    out_df = df.copy()
    out_df[pred_log_col] = np.nan
    out_df[pred_col] = np.nan

    def _get_cols(cols_obj, split_key: str) -> list[str]:
        if cols_obj is None:
            return []
        if isinstance(cols_obj, dict):
            return cols_obj.get(split_key) or cols_obj.get('all') or []
        return cols_obj

    # idx_dict が None → all 1本
    if idx_dict is None:
        if 'all' not in models:
            raise KeyError("idx_dict is None, but models does not contain key 'all'.")

        model = models['all']
        fe_cols_all = _get_cols(fe_cols, 'all')
        X = df.loc[:, fe_cols_all].copy()

        cat_cols_all = _get_cols(cat_cols, 'all')

        # cat列を category dtype に統一（学習・推論で同じ前処理にする）
        if cat_cols_all:
            X = _force_lgbm_train_categories(X, cat_cols_all, na_token='NA')

        # CatBoost
        if _is_catboost_model(model) and cat_cols_all:
            X = _force_cat_cols_to_str(X, cat_cols_all, na_token=na_token)

        pred_log = model.predict(X)
        out_df.loc[:, pred_log_col] = pred_log
        out_df.loc[:, pred_col] = np.exp(pred_log)
        return out_df

    # splitごと
    for split_key, model in models.items():
        idx = idx_dict.get(split_key)
        if idx is None:
            continue

        idx_use = pd.Index(idx).intersection(df.index)
        if len(idx_use) == 0:
            continue

        fe_cols_split = _get_cols(fe_cols, split_key)
        X = df.loc[idx_use, fe_cols_split].copy()

        cat_cols_split = _get_cols(cat_cols, split_key)

        if cat_cols_split:
            X = _force_lgbm_train_categories(X, cat_cols_split, na_token='NA')

        if _is_catboost_model(model) and cat_cols_split:
            X = _force_cat_cols_to_str(X, cat_cols_split, na_token=na_token)

        pred_log = model.predict(X)
        out_df.loc[idx_use, pred_log_col] = pred_log
        out_df.loc[idx_use, pred_col] = np.exp(pred_log)

    return out_df


#### モデルの予測

In [15]:
# Predict each test row with the model of its split; the result is a copy of
# test_df with the 'pred_log' and 'pred' columns added.
pred_df = predict_by_split_df(
    df=test_df,
    models=models,
    fe_cols=fe_cols,
    idx_dict=idx_dict,
    cat_cols=cat_cols
)

In [16]:
# Display the prediction table as the cell output.
pred_df

Unnamed: 0,Prefecture_name,City/town/village_name,zone_residential_rank,target_ym,target_year,building_id,unit_id,nearest_land_price,log_land_price,weighted_land_price_3,...,infra_penalty_x_log_land_price,hard_penalty_x_log_land_price,has_shidou_x_log_land_price,senyu_area_ratio_to_median,area_per_room_x_senyu_diff,low_price_proxy,log_dist_x_livability,UrbanClass,pred_log,pred
0,三重県,桑名市,0,202301,2023,129053,149752,69600.0,11.150534,91373.093157,...,0.000000,0.000000,0.0,16.455847,1542.417114,0,451.719221,other,16.510113,1.479964e+07
7,三重県,桑名市,3,202301,2023,179871,252158,41700.0,10.638280,36765.232086,...,0.000000,0.000000,0.0,,,1,526.676792,other,16.415292,1.346079e+07
17,滋賀県,米原市,2,202301,2023,123056,101084,55000.0,10.915107,56059.565882,...,21.830213,10.915107,0.0,16.470265,1546.022403,1,506.693905,other,16.111643,9.935683e+06
18,滋賀県,米原市,0,202301,2023,200406,31043,55000.0,10.915107,55782.755542,...,10.915107,0.000000,0.0,16.041933,1441.004280,1,514.019139,other,16.463414,1.412439e+07
40,茨城県,かすみがうら市,2,202301,2023,34018,303524,34000.0,10.434145,30850.081852,...,0.000000,0.000000,0.0,16.142863,2275.958976,1,530.151963,other,16.181585,1.065548e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112428,千葉県,船橋市,0,202307,2023,107276,49776,82300.0,11.318139,82728.886295,...,0.000000,0.000000,0.0,17.704750,1882.183071,0,458.320489,mid_city,16.981129,2.370339e+07
112429,千葉県,船橋市,0,202307,2023,107276,110461,82300.0,11.318139,82728.886295,...,0.000000,0.000000,0.0,17.027319,1736.757765,0,456.552764,mid_city,16.838652,2.055577e+07
112430,千葉県,船橋市,0,202307,2023,125919,195552,82300.0,11.318139,80216.531786,...,0.000000,0.000000,0.0,,,0,500.689821,mid_city,16.951004,2.299998e+07
112435,三重県,桑名市,3,202307,2023,43310,113394,90900.0,11.417526,85659.539607,...,0.000000,0.000000,0.0,,,0,481.169359,other,16.820119,2.017832e+07


In [17]:
# Print selected quantiles of the 'pred' column as a quick sanity check.
q = np.quantile(pred_df['pred'], [0, 0.25, 0.5, 0.75, 0.99, 1])
for label, value in zip(
    ("Min      :", "25% (Q1) :", "Median   :", "75% (Q3) :", "99%      :", "Max      :"),
    q,
):
    print(label, value)

Min      : 4220218.517615446
25% (Q1) : 16519003.59889548
Median   : 25335810.153580964
75% (Q3) : 37545293.942653075
99%      : 100800721.58461785
Max      : 181092470.6383987


## 出力

In [18]:
# Save the predictions (file name is keyed by target_model and inference_ver).
# NOTE(review): the file is written under intermediate_path although pred_path
# is also defined in the settings cell; confirm which directory is intended.
pred_df.to_parquet(f'{intermediate_path}pred_df_{target_model}_v{inference_ver}.parquet')