# 後補正・提出

## Library Import

In [17]:
# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [18]:
# Directories: point these at wherever the input files and outputs are stored.
ROOT_DIR = '../input/'
intermediate_path = '../output/intermediate_file/'
submit_file_path = f'{ROOT_DIR}sample_submit.csv'
model_path = '../output/model/'
oof_path = '../output/oof/'
pred_path = '../output/pred/'

# Script versions; they are interpolated into the input/output file names below.
training_ver, inference_ver = 8, 8
post_revision_ver, submit_ver = 1, 2

# Run date as YYYYMMDD, used in the submission file name.
today = dt.datetime.now().strftime('%Y%m%d')

## File Import

In [19]:
# Load the per-segment prediction frames saved under `inference_ver`
# (same read order as before: residential, house, other).
test_df_residential, test_df_house, test_df_other = [
    pd.read_parquet(f'{intermediate_path}pred_df_{segment}_v{inference_ver}.parquet')
    for segment in ('residential', 'house', 'other')
]

## 後補正

In [20]:
# Load the residential CSV written under `training_ver`; the evaluation cell below reads
# its `money_room`, `residential_lgb_oof_pred` and `residential_lgb_ho_pred` columns.
oof_df_residential = pd.read_csv(f'{oof_path}train_with_pred_residential_lgb_v{training_ver}.csv')
# The house / other counterparts are currently disabled:
# oof_df_house = pd.read_csv(f'{oof_path}train_with_pred_house_lgb_v{training_ver}.csv')
# # oof_df_other = pd.read_csv(f'{oof_path}train_with_pred_other_lgb_v{training_ver}.csv')

#### Residential

In [21]:
import numpy as np
import pandas as pd

def mape(y, p):
    """Return the mean absolute percentage error of predictions `p` against targets `y`.

    Only positions where both values are finite and strictly positive are
    scored; every other position is silently dropped. The result is NaN when
    no position qualifies.
    """
    actual = np.asarray(y)
    predicted = np.asarray(p)
    usable = (
        np.isfinite(actual)
        & np.isfinite(predicted)
        & (actual > 0)
        & (predicted > 0)
    )
    actual, predicted = actual[usable], predicted[usable]
    relative_error = np.abs(actual - predicted) / actual
    return float(np.mean(relative_error))

def fit_global_scale(df, y_col='money_room', pred_col='pred'):
    """Return the median of target / prediction as a single multiplicative scale.

    Rows with a missing target or prediction, or where either value is not
    strictly positive, are excluded before the median is taken.
    """
    pairs = df[[y_col, pred_col]].dropna()
    both_positive = (pairs[y_col] > 0) & (pairs[pred_col] > 0)
    pairs = pairs[both_positive]
    ratios = pairs[y_col] / pairs[pred_col]
    return float(np.median(ratios))

def fit_group_scale_shrink(df, group_col, y_col='money_room', pred_col='pred', k=1000):
    """Fit per-group multiplicative scales shrunk towards the global scale.

    For each value of `group_col` the raw scale is the median of
    target / prediction. It is blended with the global median ratio using the
    weight n / (n + k), so small groups stay close to the global scale.

    Rows with a missing group, target or prediction, or with a non-positive
    target or prediction, are ignored.

    Returns a tuple of:
      - dict mapping group value -> shrunk scale,
      - the global median ratio (float),
      - a DataFrame with columns [group_col, 'n', 'raw_scale', 'w', 'final_scale'].
    """
    usable = df[[group_col, y_col, pred_col]].dropna()
    usable = usable[(usable[y_col] > 0) & (usable[pred_col] > 0)]
    usable = usable.assign(ratio=usable[y_col] / usable[pred_col])

    global_scale = float(np.median(usable['ratio']))

    ratios_by_group = usable.groupby(group_col)['ratio']
    table = pd.DataFrame({
        'n': ratios_by_group.size(),
        'raw_scale': ratios_by_group.median(),
    }).reset_index()

    # Shrinkage weight: approaches 1 for large groups, 0 for tiny ones.
    weight = table['n'] / (table['n'] + k)
    table['w'] = weight
    table['final_scale'] = weight * table['raw_scale'] + (1 - weight) * global_scale

    scale_map = dict(zip(table[group_col], table['final_scale']))
    return scale_map, global_scale, table

def apply_scale(df, pred_col, scale_map, group_col=None, default_scale=1.0):
    """Return the predictions in `df[pred_col]` multiplied by a scale factor.

    Parameters
    ----------
    df : DataFrame holding `pred_col` (and `group_col` when given).
    pred_col : name of the prediction column.
    scale_map : mapping of group value -> scale; ignored when `group_col` is None.
    group_col : column whose values select the scale; None applies
        `default_scale` to every row.
    default_scale : scale used for every row when `group_col` is None, and for
        rows whose group is missing from `scale_map` otherwise.

    Returns
    -------
    A new NumPy array of scaled predictions, in the row order of `df`.
    """
    preds = df[pred_col].to_numpy()
    if group_col is None:
        return preds * default_scale
    # Cast to float before filling: mapping a categorical column can produce a
    # categorical result, and its fillna rejects any fill value that is not
    # already one of the categories.
    scale = (
        df[group_col]
        .map(scale_map)
        .astype(float)
        .fillna(default_scale)
        .to_numpy()
    )
    return preds * scale

def prepare_long(df, y_col='money_room', oof_col='oof_pred', ho_col='ho_pred'):
    """Stack out-of-fold and hold-out predictions into one long frame.

    Rows with a non-null `oof_col` are emitted with split == 'oof', followed by
    rows with a non-null `ho_col` with split == 'ho'. A row that has both
    predictions appears twice. The chosen prediction is copied into a `pred`
    column, all original columns are kept, and the index is reset.

    `y_col` is accepted for interface symmetry but is not used here.
    """
    parts = []
    for label, source_col in (('oof', oof_col), ('ho', ho_col)):
        rows = df.loc[df[source_col].notna()]
        parts.append(rows.assign(pred=rows[source_col], split=label))
    return pd.concat(parts, axis=0, ignore_index=True)

def evaluate_all(df_long, group_col='structure_group', y_col='money_room'):
    """Compare post-hoc scale corrections by MAPE on the 'oof' and 'ho' splits.

    Methods scored, each on both splits:
      - baseline: raw `pred`;
      - global_allfit: one median-ratio scale fitted on every row of `df_long`
        (oof and ho mixed together);
      - structure_k{k}_fitOOF: per-`group_col` shrunk scales fitted on the
        'oof' rows only, for several shrinkage strengths k.

    Returns a DataFrame with columns ['method', 'mape'] sorted by MAPE ascending.
    """
    split_names = ('oof', 'ho')
    by_split = {name: df_long[df_long['split'] == name] for name in split_names}
    scores = []

    # Baseline: predictions as they are.
    for name in split_names:
        part = by_split[name]
        scores.append((f'baseline_{name}', mape(part[y_col], part['pred'])))

    # A: a single global scale fitted on oof + ho together.
    gs_all = fit_global_scale(df_long, y_col=y_col, pred_col='pred')
    for name in split_names:
        part = by_split[name]
        scaled = part['pred'].to_numpy() * gs_all
        scores.append((f'global_allfit_{name}', mape(part[y_col], scaled)))

    # B: per-group shrunk scales, searching over k.
    # Scales are fitted on the oof rows only, so the ho score is out-of-sample.
    df_fit = by_split['oof']
    for k in (50, 100, 200, 500, 1000, 2000, 5000):
        scale_map, gs_fit, _ = fit_group_scale_shrink(
            df_fit, group_col=group_col, y_col=y_col, pred_col='pred', k=k
        )
        for name in split_names:
            part = by_split[name]
            scaled = apply_scale(
                part, 'pred', scale_map, group_col=group_col, default_scale=gs_fit
            )
            scores.append((f'structure_k{k}_fitOOF_{name}', mape(part[y_col], scaled)))

    return pd.DataFrame(scores, columns=['method', 'mape']).sort_values('mape')

In [23]:
# Stack the residential OOF and hold-out predictions into one long frame, then score
# the baseline against the global and per-`structure_group` scale corrections.
df_long = prepare_long(oof_df_residential, y_col='money_room', oof_col='residential_lgb_oof_pred', ho_col='residential_lgb_ho_pred')
result = evaluate_all(df_long, group_col='structure_group', y_col='money_room')
# `result` is sorted by MAPE ascending (best first); show up to 30 rows.
print(result.head(30))


                        method      mape
3             global_allfit_ho  0.127060
5      structure_k50_fitOOF_ho  0.129303
7     structure_k100_fitOOF_ho  0.129303
9     structure_k200_fitOOF_ho  0.129305
11    structure_k500_fitOOF_ho  0.129306
13   structure_k1000_fitOOF_ho  0.129306
15   structure_k2000_fitOOF_ho  0.129306
17   structure_k5000_fitOOF_ho  0.129306
1                  baseline_ho  0.129844
0                 baseline_oof  0.138244
4     structure_k50_fitOOF_oof  0.138571
6    structure_k100_fitOOF_oof  0.138577
8    structure_k200_fitOOF_oof  0.138581
10   structure_k500_fitOOF_oof  0.138586
12  structure_k1000_fitOOF_oof  0.138589
14  structure_k2000_fitOOF_oof  0.138592
16  structure_k5000_fitOOF_oof  0.138594
2            global_allfit_oof  0.141198


In [5]:
# cols = ['money_room', 'residential_lgb_oof_pred']
# oof_part = oof_df_residential[cols].rename(columns={'residential_lgb_oof_pred':'pred'})

# ho_part = oof_df_residential[['money_room', 'residential_lgb_ho_pred']] \
#     .dropna(subset=['residential_lgb_ho_pred']) \
#     .rename(columns={'residential_lgb_ho_pred':'pred'})

# df_all = pd.concat([oof_part, ho_part], axis=0)

# mask = (df_all['pred'] > 0) & df_all['pred'].notna()
# global_scale = np.median(df_all.loc[mask, 'money_room'] / df_all.loc[mask, 'pred'])

In [6]:
# def compute_structure_scale(oof_df, k=1000):
#     d = (
#         oof_df
#         .dropna(subset=['oof_pred'])
#         .assign(ratio=lambda x: x['money_room']/x['oof_pred'])
#         .groupby('structure_group')
#         .agg(
#             n=('ratio','size'),
#             raw_scale=('ratio','median')
#         )
#     )

#     global_scale = np.median(oof_df['money_room'] / oof_df['oof_pred'])

#     d['w'] = d['n'] / (d['n'] + k)
#     d['final_scale'] = d['w'] * d['raw_scale'] + (1 - d['w']) * global_scale
#     return d[['n','raw_scale','final_scale']]


In [7]:
# scale_df_residential = compute_structure_scale(oof_df_residential)

# def apply_structure_scale(pred_df, scale_df, default_scale):
#     pred_df = pred_df.copy()
#     scale_map = scale_df['final_scale'].to_dict()
#     pred_df['scale'] = pred_df['structure_group'].map(scale_map).fillna(default_scale)
#     pred_df['pred_adj'] = pred_df['pred'] * pred_df['scale']
#     return pred_df

# test_df_residential = apply_structure_scale(test_df_residential, scale_df_residential, global_scale)
# test_df_residential

#### House

#### Other

## 提出

In [8]:
# # pred と pred_log の整合確認（全行）
# err = (test_df_residential['pred_log'] - np.log(test_df_residential['pred'])).abs()
# print(err.describe())

In [9]:
# The sample submission file is read without a header row, so name its two columns here.
submit_df = pd.read_csv(submit_file_path, header=None)
submit_df.columns = ['id', 'pred']

In [10]:
# Reset `pred` to NaN so that any row left unfilled is detectable afterwards.
submit_df['pred'] = np.nan

# Copy each segment's predictions into the submission frame.
for df_part in [
    test_df_residential,
    test_df_house,
    test_df_other,
]:
    # Rows are matched on index labels, not on the `id` column.
    # NOTE(review): assumes each part's index uses the same labels as submit_df's index — confirm.
    idx = submit_df.index.intersection(df_part.index)
    submit_df.loc[idx, 'pred'] = df_part.loc[idx, 'pred']

In [11]:
# Sanity check: count submission rows that received no prediction from any segment.
n_nan = submit_df['pred'].isna().sum()
print('NaN preds:', n_nan, '/', len(submit_df))

NaN preds: 0 / 112437


In [12]:
# Sanity check: for each segment, compare the number of its index labels found in
# submit_df with its row count; equal numbers mean every row was matched.
for name, df_part in [
    ('residential', test_df_residential),
    ('house', test_df_house),
    ('other', test_df_other),
]:
    n_hit = len(submit_df.index.intersection(df_part.index))
    print(name, 'hit:', n_hit, 'part_rows:', len(df_part))


residential hit: 58834 part_rows: 58834
house hit: 48594 part_rows: 48594
other hit: 5009 part_rows: 5009


In [13]:
# Sanity check: the three segments should not share index labels; otherwise a later
# segment would overwrite an earlier one's prediction in the fill loop above.
idx_r = test_df_residential.index
idx_h = test_df_house.index
idx_o = test_df_other.index

print('overlap r-h:', len(idx_r.intersection(idx_h)))
print('overlap r-o:', len(idx_r.intersection(idx_o)))
print('overlap h-o:', len(idx_h.intersection(idx_o)))


overlap r-h: 0
overlap r-o: 0
overlap h-o: 0


In [14]:
# Write the submission without index or header row, mirroring how the sample file was read.
# The file name encodes the run date and the post-revision / submit versions.
submit_df.to_csv(
    f'{pred_path}submit_{today}_atohosei-v{post_revision_ver}_v{submit_ver}.csv',
    index=False,
    header=False
)