In [35]:
import itertools
import os
import re
import warnings

import numpy as np, json, math
import optuna
import pandas as pd

warnings.filterwarnings("ignore")

import joblib
import matplotlib.pyplot as plt
import shap
import sklearn
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (average_precision_score, brier_score_loss,
                             roc_auc_score)
from sklearn.model_selection import (GroupKFold, StratifiedGroupKFold,
                                     StratifiedKFold)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

In [None]:
plt.rcParams['font.family'] = 'MS Gothic'
results_df = pd.read_pickle('02_horse_results_df.pickle')

In [None]:
# 予測に無関係なカラムの削除
results_df = results_df.drop([
    "馬名","horse_id","着順","タイム","着差","人気","date",
    
    "HighPayoutRace","jockey_id"
], axis=1, errors="ignore")

In [27]:
# 重複カラムの削除
results_df = results_df.drop(["距離区分","年齢_bin","斤量区分","馬体重_bin","体重_bin","性齢","馬番"], axis=1, errors='ignore')

# LightGBM

In [None]:
LABEL_COL = "HighPayoutHorse"
RACE_COL  = "race_id"
DROP_COLS = [RACE_COL, "着順_num", "複勝フラグ", "単勝"]

# 学習でカテゴリ扱いする列（存在チェックは都度する）
BASE_CAT_COLS = [
    "騎手","調教師","weather","race_type","ground_state",
    "競馬場","所属","月","性","枠番","最内枠","大外枠"
]

NUMERIC_FORCE = ["月", "枠番", "最内枠", "大外枠"]

def lump_rare(s: pd.Series, min_count=50, other="__OTHER__"):
    vc = s.value_counts(dropna=False)
    rares = set(vc[vc < min_count].index)
    return s.astype(str).where(~s.astype(str).isin(rares), other)

def normalize_text(s: pd.Series) -> pd.Series:
    return (s.astype("string")
              .str.normalize("NFKC")
              .str.strip()
              .str.replace(r"\s+", " ", regex=True))

In [None]:
def build_core_features(df: pd.DataFrame,
                        drop_cols=DROP_COLS,
                        label_col=LABEL_COL,
                        base_cat_cols=BASE_CAT_COLS,
                        min_count_for_lump=50):
    df = df.copy()

    feat_cols = [c for c in df.columns if c not in drop_cols + [label_col]]
    X = df[feat_cols].copy()
    y = df[label_col].astype(int).to_numpy().ravel()
    groups = df[RACE_COL].astype(str).to_numpy().ravel()

    # 存在するカテゴリ候補だけ採用
    cat_cols = [c for c in base_cat_cols if c in X.columns]
    # 数値強制のものはカテゴリ対象から外す
    cat_cols = [c for c in cat_cols if c not in NUMERIC_FORCE]

    # 低頻度カテゴリまとめ + 軽い正規化
    for c in cat_cols:
        X[c] = normalize_text(X[c])
        X[c] = lump_rare(X[c], min_count=min_count_for_lump, other="__OTHER__")

    # 数値化（カテゴリ以外はなるべく float に）
    for c in X.columns:
        if c not in cat_cols:
            X[c] = pd.to_numeric(X[c], errors="coerce")

    return X, y, groups, cat_cols

## 推奨設定にて学習

In [110]:
def prepare_for_lgbm(X_core: pd.DataFrame, cat_cols: list[str], levels_json="03_lgbm_cat_levels.json"):
    X_lgbm = X_core.copy()

    # JSON が存在すれば学習語彙を読み込み（推論時はこちらを必ず使う）
    if os.path.exists(levels_json):
        obj = json.loads(Path(levels_json).read_text(encoding="utf-8"))
        levels = obj["levels"]  # {col: [cat1, cat2, ...]}

        # 学習時の語彙で dtype 固定（未知は __OTHER__ 扱い→カテゴリに __OTHER__ が含まれている前提）
        for c in cat_cols:
            if c in X_lgbm.columns and c in levels:
                vals = normalize_text(X_lgbm[c]).fillna("__OTHER__")
                cats = levels[c]
                X_lgbm[c] = vals.where(vals.isin(cats), "__OTHER__")\
                                .astype(pd.CategoricalDtype(categories=cats, ordered=False))
    else:
        # （学習時のみ実行される分岐）語彙を作って保存
        levels = {}
        for c in cat_cols:
            vals = normalize_text(X_lgbm[c]).fillna("__OTHER__")
            uniq = ["__OTHER__"] + sorted([u for u in vals.unique().tolist() if u != "__OTHER__"])
            X_lgbm[c] = vals.where(vals.isin(uniq), "__OTHER__")\
                            .astype(pd.CategoricalDtype(categories=uniq, ordered=False))
            levels[c] = uniq
        Path(levels_json).write_text(json.dumps({"levels": levels}, ensure_ascii=False, indent=2), encoding="utf-8")

    # 数値強制
    for c in NUMERIC_FORCE:
        if c in X_lgbm.columns:
            X_lgbm[c] = pd.to_numeric(X_lgbm[c], errors="coerce")

    return X_lgbm

In [111]:
X_core, y, groups, cat_cols = build_core_features(results_df)

X_lgbm = prepare_for_lgbm(X_core, cat_cols, levels_json="03_lgbm_cat_levels.json")

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

oof_pred = np.zeros(len(X_lgbm))
pr_list, roc_list, br_list = [], [], []

for fold, (tr_idx, va_idx) in enumerate(cv.split(X_lgbm, y, groups=groups), 1):
    X_tr, X_va = X_lgbm.iloc[tr_idx].copy(), X_lgbm.iloc[va_idx].copy()
    y_tr, y_va = y[tr_idx], y[va_idx]

    clf = LGBMClassifier(
        objective="binary",
        boosting_type="gbdt",
        learning_rate=0.05,
        n_estimators=4000,
        num_leaves=255,
        min_child_samples=10,
        min_data_per_group=1,
        cat_smooth=5,
        subsample=0.8, colsample_bytree=0.8, max_bin=511,
        is_unbalance=False,
        scale_pos_weight=base_spw,
        random_state=42, n_jobs=-1
    )

    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)], eval_metric="aucpr",
        categorical_feature="auto",
        callbacks=[early_stopping(300), log_evaluation(0)]
    )

    p = clf.predict_proba(X_va)[:, 1]
    oof_pred[va_idx] = p

    pr  = average_precision_score(y_va, p)
    roc = roc_auc_score(y_va, p)
    br  = brier_score_loss(y_va, p)
    pr_list.append(pr); roc_list.append(roc); br_list.append(br)

    print(f"[Fold {fold}] PR-AUC={pr:.4f}  ROC-AUC={roc:.4f}  Brier={br:.4f}")

print("\n=== OOF ===")
print(f"ROC-AUC: {np.mean(roc_list):.4f} (±{np.std(roc_list):.4f})")
print(f"PR-AUC : {np.mean(pr_list):.4f} (±{np.std(pr_list):.4f})")
print(f"Brier  : {np.mean(br_list):.4f} (±{np.std(br_list):.4f})")


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.109995
[Fold 1] PR-AUC=0.0432  ROC-AUC=0.6506  Brier=0.0229
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.105814
[Fold 2] PR-AUC=0.0451  ROC-AUC=0.6906  Brier=0.0218
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.109301
[Fold 3] PR-AUC=0.0375  ROC-AUC=0.6318  Brier=0.0224
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.106359
[Fold 4] PR-AUC=0.0426  ROC-AUC=0.6569  Brier=0.0219
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.113797
[Fold 5] PR-AUC=0.0445  ROC-AUC=0.6546  Brier=0.0240

=== OOF ===
ROC-AUC: 0.6569 (±0.0190)
PR-AUC : 0.0426 

ROC-AUC : 分類性能(最大１)  
PR-AUC  : 陽性クラスの見分け性能（0.2~0.3あたりを目指す）  
Brier   : 二乗誤差

## ハイパーパラメータの探索

In [31]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [115]:
def objective(trial):
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "n_estimators": 5000,  # early stopping で制御
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", base_spw*0.5, base_spw*2.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "verbosity": -1,
    }

    pr_list = []

    for tr_idx, va_idx in cv.split(X_core, y, groups=groups):
        X_tr_core = X_core.iloc[tr_idx].copy()
        X_va_core = X_core.iloc[va_idx].copy()
        y_tr, y_va = y[tr_idx], y[va_idx]

        for c in cat_cols:
            vals_tr = normalize_text(X_tr_core[c]).fillna("__OTHER__")
            cats = ["__OTHER__"] + sorted([u for u in pd.Series(vals_tr).unique().tolist() if u != "__OTHER__"])
            X_tr_core[c] = vals_tr.where(vals_tr.isin(cats), "__OTHER__").astype(
                pd.CategoricalDtype(categories=cats, ordered=False)
            )
            vals_va = normalize_text(X_va_core[c]).fillna("__OTHER__")
            X_va_core[c] = vals_va.where(vals_va.isin(cats), "__OTHER__").astype(
                pd.CategoricalDtype(categories=cats, ordered=False)
            )
        for c in NUMERIC_FORCE:
            if c in X_tr_core.columns:
                X_tr_core[c] = pd.to_numeric(X_tr_core[c], errors="coerce")
            if c in X_va_core.columns:
                X_va_core[c] = pd.to_numeric(X_va_core[c], errors="coerce")

        clf = LGBMClassifier(**params)
        clf.fit(
            X_tr_core, y_tr,
            eval_set=[(X_va_core, y_va)],
            eval_metric="aucpr",
            categorical_feature="auto",
            callbacks=[early_stopping(200), log_evaluation(-1)]
        )

        p = clf.predict_proba(X_va_core)[:, 1]
        pr_list.append(average_precision_score(y_va, p))

    return float(np.mean(pr_list))

In [116]:
study = optuna.create_study(direction="maximize", study_name="lgbm_pr_auc")
study.optimize(objective, n_trials=50, show_progress_bar=False)

print("Best PR-AUC:", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.103413
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.0997373
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.100894
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.100025
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.106854
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.114511
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.112423
Training until validation scores don't improve 

In [117]:
best_params = study.best_params | {
    "objective": "binary",
    "boosting_type": "gbdt",
    "n_estimators": 5000,  # early stoppingで実質決まる
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1
}

# JSON 保存（最適パラメタ）
with open("03_lgbm_best_params.json", "w", encoding="utf-8") as f:
    json.dump(best_params, f, ensure_ascii=False, indent=2)
print("✅ saved best_params -> 03_lgbm_best_params.json")

✅ saved best_params -> 03_lgbm_best_params.json


## best_paramsで再学習

In [118]:
with open("03_lgbm_best_params.json", "r", encoding="utf-8") as f:
    best_params = json.load(f)

oof_pred = np.zeros(len(X_core))
pr_list, roc_list, br_list, best_iters = [], [], [], []

for fold, (tr_idx, va_idx) in enumerate(cv.split(X_core, y, groups=groups), 1):
    X_tr_core = X_core.iloc[tr_idx].copy()
    X_va_core = X_core.iloc[va_idx].copy()
    y_tr, y_va = y[tr_idx], y[va_idx]

    # fold内で学習側から語彙→学習/検証へ適用
    for c in cat_cols:
        vals_tr = normalize_text(X_tr_core[c]).fillna("__OTHER__")
        cats = ["__OTHER__"] + sorted([u for u in pd.Series(vals_tr).unique().tolist() if u != "__OTHER__"])
        X_tr_core[c] = vals_tr.where(vals_tr.isin(cats), "__OTHER__").astype(
            pd.CategoricalDtype(categories=cats, ordered=False)
        )
        vals_va = normalize_text(X_va_core[c]).fillna("__OTHER__")
        X_va_core[c] = vals_va.where(vals_va.isin(cats), "__OTHER__").astype(
            pd.CategoricalDtype(categories=cats, ordered=False)
        )

    for c in NUMERIC_FORCE:
        if c in X_tr_core.columns:
            X_tr_core[c] = pd.to_numeric(X_tr_core[c], errors="coerce")
        if c in X_va_core.columns:
            X_va_core[c] = pd.to_numeric(X_va_core[c], errors="coerce")

    clf = LGBMClassifier(**best_params)
    clf.fit(
        X_tr_core, y_tr,
        eval_metric="aucpr",
        eval_set=[(X_va_core, y_va)],
        categorical_feature="auto",
        callbacks=[early_stopping(300), log_evaluation(0)]
    )

    p = clf.predict_proba(X_va_core)[:, 1]
    oof_pred[va_idx] = p

    pr  = average_precision_score(y_va, p)
    roc = roc_auc_score(y_va, p)
    br  = brier_score_loss(y_va, p)
    pr_list.append(pr); roc_list.append(roc); br_list.append(br)
    best_iters.append(getattr(clf, "best_iteration_", clf.n_estimators))
    print(f"[Fold {fold}] PR-AUC={pr:.4f}  ROC-AUC={roc:.4f}  Brier={br:.4f}  best_iter={best_iters[-1]}")

print("\n=== OOF ===")
print(f"PR-AUC : {np.mean(pr_list):.4f} (±{np.std(pr_list):.4f})")
print(f"ROC-AUC: {np.mean(roc_list):.4f} (±{np.std(roc_list):.4f})")
print(f"Brier  : {np.mean(br_list):.4f} (±{np.std(br_list):.4f})")

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.1025
[Fold 1] PR-AUC=0.0562  ROC-AUC=0.7738  Brier=0.0217  best_iter=5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.098366
[Fold 2] PR-AUC=0.0600  ROC-AUC=0.7990  Brier=0.0205  best_iter=5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.0997643
[Fold 3] PR-AUC=0.0601  ROC-AUC=0.7895  Brier=0.0209  best_iter=5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.0987816
[Fold 4] PR-AUC=0.0590  ROC-AUC=0.7980  Brier=0.0207  best_iter=5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[6]	valid_0's binary_logloss: 0.105536
[Fold 5] PR-AUC=0.0642  ROC-AUC=0.7889  Brier=0.0227  be

最終学習

In [122]:
final_n_estimators = int(np.median(best_iters) * 1.1)
final_params = best_params.copy()
final_params["n_estimators"] = max(final_n_estimators, 100)

# 最終学習では「専用関数」で語彙を固定＆保存（→推論時に再現）
X_full_lgbm = prepare_for_lgbm(X_core, cat_cols, levels_json="03_lgbm_cat_levels.json")

model_lgbm = LGBMClassifier(**final_params)
model_lgbm.fit(
    X_full_lgbm, y,
    categorical_feature="auto"
)

model_path = "03_lgbm_nativecat_odds_tuned.joblib"
joblib.dump(model_lgbm, model_path)
print(f"✅ saved -> {model_path}")

with open("03_lgbm_best_params_final.json", "w", encoding="utf-8") as f:
    json.dump(final_params, f, ensure_ascii=False, indent=2)
print("✅ saved -> 03_lgbm_best_params_final.json")

# 特徴量メタ保存（列名は最終学習に使ったフレームで確定）
FEATURES_JSON = Path("03_lgbm_feature_cols.json")
FEATURES_JSON.write_text(
    json.dumps({
        "features": list(X_full_lgbm.columns),
        "categorical_features": cat_cols,   # 実際にカテゴリ扱いした列集合
        "numeric_force": NUMERIC_FORCE
    }, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print("✅ saved -> 03_lgbm_feature_cols.json")

✅ saved -> 03_lgbm_nativecat_odds_tuned.joblib
✅ saved -> 03_lgbm_best_params_final.json
✅ saved -> 03_lgbm_feature_cols.json


## 特徴量重要度

In [123]:
imp = pd.DataFrame({"feature": X_full_lgbm.columns, "gain": model_lgbm.booster_.feature_importance("gain")})
imp["gain%"] = imp["gain"] / imp["gain"].sum() * 100
print("\n=== Feature Importance (top20 by gain) ===")
print(imp.sort_values("gain", ascending=False).head(20))


=== Feature Importance (top20 by gain) ===
          feature          gain      gain%
31        単勝_最人気差  1.528871e+06  55.125890
4             調教師  5.589224e+05  20.152838
2              騎手  3.729306e+05  13.446607
28          log単勝  1.707084e+05   6.155164
32       単勝_2番人気差  5.867800e+04   2.115729
29          単勝ランク  4.803012e+04   1.731802
30          単勝pct  6.266634e+03   0.225953
25    馬_コース適性_複勝率  3.435685e+03   0.123879
22    馬_直近5走_平均着順  3.408327e+03   0.122893
3             馬体重  3.139814e+03   0.113211
27  コンビ_直近50走_複勝率  2.962522e+03   0.106818
14             年齢  2.187922e+03   0.078889
21    馬_直近10走_複勝率  2.177227e+03   0.078503
23   騎手_直近30走_複勝率  2.130293e+03   0.076811
16          相対枠位置  1.952841e+03   0.070413
9             競馬場  1.439497e+03   0.051903
26     馬_距離適性_複勝率  1.257945e+03   0.045357
12         馬体重_増減  1.113839e+03   0.040161
15           出走頭数  1.049968e+03   0.037858
1              斤量  6.250570e+02   0.022537


現状人気のなさが大きく特徴量として寄与してしまっている

In [124]:
lgbm_feat_cols = list(X_tr.columns)
with open("03_lgbm_cat_levels.json","w",encoding="utf-8") as f:
    json.dump({"levels": levels}, f, ensure_ascii=False, indent=2)

# CatBoost

In [125]:
def prepare_for_catboost(X_core: pd.DataFrame, cat_cols: list[str], catcols_json="03_catboost_cat_cols.json"):
    X_cat = X_core.copy()

    # CatBoost は str + cat_features=index が基本
    for c in cat_cols:
        X_cat[c] = normalize_text(X_cat[c]).fillna("NA").astype(str)

    cat_idx = [X_cat.columns.get_loc(c) for c in cat_cols]

    # 推論側で再現できるよう列名保存（※cat_idxは列順依存なので列名を保存）
    Path(catcols_json).write_text(json.dumps({"categorical_features": cat_cols}, ensure_ascii=False, indent=2), encoding="utf-8")

    return X_cat, cat_idx

## Optunaでハイパーパラメータ探索

In [126]:
X_core, y, groups, cat_cols = build_core_features(results_df)

X_cat, cat_idx = prepare_for_catboost(X_core, cat_cols, catcols_json="03_catboost_cat_cols.json")

In [127]:
def objective(trial):
    params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 5.0),
        "border_count": trial.suggest_int("border_count", 128, 254),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "rsm": trial.suggest_float("rsm", 0.6, 1.0),
        "iterations": 5000,
        "class_weights": [
            1.0,
            trial.suggest_float("class_weight_pos", base_w1*0.5, base_w1*2.0, log=True)
        ],
        "random_state": 42,
        "verbose": False,
        "allow_writing_files": False,
        "task_type": "CPU",
    }

    pr_list = []
    for tr_idx, va_idx in cv.split(X_cat, y, groups=groups):
        X_tr, X_va = X_cat.iloc[tr_idx], X_cat.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
        valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=300, verbose=False)

        p = model.predict_proba(valid_pool)[:, 1]
        pr_list.append(average_precision_score(y_va, p))

    return float(np.mean(pr_list))

In [89]:
study = optuna.create_study(direction="maximize", study_name="catboost_pr_auc")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best PR-AUC:", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

Path("03_cat_best_params_search.json").write_text(
    json.dumps(study.best_params, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print("✅ saved -> 03_cat_best_params_search.json")

  0%|          | 0/50 [00:05<?, ?it/s]

[W 2025-09-16 00:26:46,539] Trial 0 failed with parameters: {'learning_rate': 0.13917639709769958, 'depth': 8, 'l2_leaf_reg': 2.328211025539983, 'bagging_temperature': 0.6259699097796922, 'random_strength': 4.1822576950455295, 'border_count': 130, 'subsample': 0.9194065132481472, 'rsm': 0.7034879611916857, 'class_weight_pos': 76.0743984362791} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\project\horse_ana\.venv4\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\soich\AppData\Local\Temp\ipykernel_26348\2491112914.py", line 34, in objective
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=300, verbose=False)
  File "c:\project\horse_ana\.venv4\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None,




KeyboardInterrupt: 

## 探索済みパラメータで学習

In [128]:
SEARCH_JSON = Path("03_cat_best_params_search.json")
best_search = json.loads(SEARCH_JSON.read_text(encoding="utf-8"))

class_weight_pos = float(best_search.pop("class_weight_pos"))

best_params_cat = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "learning_rate": best_search["learning_rate"],
    "depth": int(best_search["depth"]),
    "l2_leaf_reg": float(best_search["l2_leaf_reg"]),
    "bagging_temperature": float(best_search["bagging_temperature"]),
    "random_strength": float(best_search["random_strength"]),
    "border_count": int(best_search["border_count"]),
    "subsample": float(best_search["subsample"]),
    "rsm": float(best_search["rsm"]),
    "iterations": 5000,
    "class_weights": [1.0, class_weight_pos],
    "random_state": 42,
    "verbose": False,
    "allow_writing_files": False,
    "task_type": "CPU",
}
print("再学習ベース params:", best_params_cat)

再学習ベース params: {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'learning_rate': 0.03076622576045712, 'depth': 4, 'l2_leaf_reg': 2.692607180834445, 'bagging_temperature': 1.6362128720538305, 'random_strength': 3.197491890827852, 'border_count': 192, 'subsample': 0.7688116926449186, 'rsm': 0.716035050229886, 'iterations': 5000, 'class_weights': [1.0, 24.31116790970121], 'random_state': 42, 'verbose': False, 'allow_writing_files': False, 'task_type': 'CPU'}


In [129]:
X_core, y, groups, cat_cols = build_core_features(results_df)

X_cb = X_core.copy()
for c in cat_cols:
    if c in X_cb.columns:
        X_cb[c] = X_cb[c].astype(str)

cat_idx = [X_cb.columns.get_loc(c) for c in cat_cols]

print("cat_idx (by X_core):", cat_idx[:10], "...", "count=", len(cat_idx))

cat_idx (by X_core): [2, 4, 6, 7, 8, 9, 10, 13] ... count= 8


In [None]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

oof_pred = np.zeros(len(X_cat))
pr_list, roc_list, br_list, best_iters = [], [], [], []

for fold, (tr_idx, va_idx) in enumerate(cv.split(X_cat, y, groups=groups), 1):
    X_tr, X_va = X_cat.iloc[tr_idx].copy(), X_cat.iloc[va_idx].copy()
    y_tr, y_va = y[tr_idx], y[va_idx]

    # cat_features はインデックスで指定（推論時と一致する）
    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

    model_cb = CatBoostClassifier(**best_params_cat)
    model_cb.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=300, verbose=False)

    p = model_cb.predict_proba(valid_pool)[:, 1]
    oof_pred[va_idx] = p

    pr  = average_precision_score(y_va, p)
    roc = roc_auc_score(y_va, p)
    br  = brier_score_loss(y_va, p)
    pr_list.append(pr); roc_list.append(roc); br_list.append(br)

    # CatBoost は best_iteration_ で最良イテレーションを持ってる
    biter = getattr(model_cb, "best_iteration_", None)
    if biter is None or biter <= 0:
        biter = best_params_cat["iterations"]
    best_iters.append(biter)

    print(f"[Fold {fold}] PR-AUC={pr:.4f}  ROC-AUC={roc:.4f}  Brier={br:.4f}  best_iter={biter}")

print("\n=== OOF (CatBoost) ===")
print(f"ROC-AUC: {np.mean(roc_list):.4f} (±{np.std(roc_list):.4f})")
print(f"PR-AUC : {np.mean(pr_list):.4f} (±{np.std(pr_list):.4f})")
print(f"Brier  : {np.mean(br_list):.4f} (±{np.std(br_list):.4f})")

[Fold 1] PR-AUC=0.0676  ROC-AUC=0.7962  Brier=0.1180  best_iter=797
[Fold 2] PR-AUC=0.0702  ROC-AUC=0.8092  Brier=0.1250  best_iter=245
[Fold 3] PR-AUC=0.0679  ROC-AUC=0.8092  Brier=0.1233  best_iter=84
[Fold 4] PR-AUC=0.0649  ROC-AUC=0.8206  Brier=0.1247  best_iter=456
[Fold 5] PR-AUC=0.0757  ROC-AUC=0.7996  Brier=0.1210  best_iter=528

=== OOF (CatBoost) ===
ROC-AUC: 0.8070 (±0.0086)
PR-AUC : 0.0693 (±0.0036)
Brier  : 0.1224 (±0.0026)


In [92]:
final_iterations = int(np.median(best_iters) * 1.1)
final_iterations = max(final_iterations, 300)
print("final_iterations:", final_iterations)

final_params_cat = best_params_cat.copy()
final_params_cat["iterations"] = final_iterations

full_pool = Pool(X_cat, y, cat_features=cat_idx)

final_cat = CatBoostClassifier(**final_params_cat)
final_cat.fit(full_pool, verbose=False)

MODEL_CAT_PATH = "03_catboost_odds_features.cbm"
final_cat.save_model(MODEL_CAT_PATH)
print(f"✅ saved CatBoost model -> {MODEL_CAT_PATH}")

final_iterations: 300
✅ saved CatBoost model -> 03_catboost_odds_features.cbm


## 特徴量重要度

In [130]:
imp_type = "PredictionValuesChange"  # or "LossFunctionChange"
imp = model_cb.get_feature_importance(type=imp_type, data=Pool(X_cat, y, cat_features=cat_idx))

feat_imp = pd.DataFrame({"feature": X_cat.columns, "importance": imp})
feat_imp = feat_imp.sort_values("importance", ascending=False)
feat_imp["importance%"] = 100 * feat_imp["importance"] / feat_imp["importance"].sum()

print("\n=== Feature Importance (CatBoost) ===")
print(feat_imp.head(30))


=== Feature Importance (CatBoost) ===
          feature  importance  importance%
31        単勝_最人気差   49.497445    49.497445
28          log単勝   24.237612    24.237612
30          単勝pct    8.413642     8.413642
32       単勝_2番人気差    6.053179     6.053179
29          単勝ランク    5.186599     5.186599
19     馬_直近3走_複勝率    1.792135     1.792135
20     馬_直近5走_複勝率    0.567343     0.567343
26     馬_距離適性_複勝率    0.471470     0.471470
25    馬_コース適性_複勝率    0.393779     0.393779
22    馬_直近5走_平均着順    0.349495     0.349495
14             年齢    0.338315     0.338315
13              性    0.310886     0.310886
23   騎手_直近30走_複勝率    0.271885     0.271885
27  コンビ_直近50走_複勝率    0.216442     0.216442
21    馬_直近10走_複勝率    0.191641     0.191641
24  調教師_直近50走_複勝率    0.184585     0.184585
7       race_type    0.174759     0.174759
3             馬体重    0.160660     0.160660
9             競馬場    0.145329     0.145329
0              枠番    0.135269     0.135269
12         馬体重_増減    0.115395     0.115395
11             

CatBoostにおいても単勝オッズが強く寄与してしまっている

In [68]:
# --- CatBoost 用特徴量リスト保存 ---
cat_feat_cols = list(X_tr.columns)
with open("03_catboost_feature_cols.json", "w", encoding="utf-8") as f:
    json.dump(cat_feat_cols, f, ensure_ascii=False, indent=2)

print("特徴量リストを保存しました -> 03_lgbm_feature_cols.json / 03_catboost_feature_cols.json")

特徴量リストを保存しました -> 03_lgbm_feature_cols.json / 03_catboost_feature_cols.json


In [None]:
# LGBMと混ぜるときに使う推論関数
def predict_catboost(df_new: pd.DataFrame, model_path="03_catboost_odds_features.cbm",
                     cat_cols_hint_path="03_catboost_cat_cols.json"):
    model_cb = CatBoostClassifier()
    model_cb.load_model(model_path)
    if os.path.exists(cat_cols_hint_path):
        with open(cat_cols_hint_path, "r", encoding="utf-8") as f:
            cat_cols_hint = json.load(f)
        cat_cols_use = [c for c in cat_cols_hint if c in df_new.columns]
    else:
        cat_cols_use = df_new.select_dtypes(include=["object"]).columns.tolist()
    cat_idx_use = [df_new.columns.get_loc(c) for c in cat_cols_use]
    pool = Pool(df_new, cat_features=cat_idx_use)
    return model_cb.predict_proba(pool)[:, 1]

# アンサンブル

学習用前処理

In [131]:
LGBM_PATH = "03_lgbm_nativecat_odds_tuned.joblib"
CAT_PATH  = "03_catboost_odds_features.cbm" 
CAT_COLS_JSON = "03_catboost_cat_cols.json"
LGBM_LEVELS_JSON = "03_lgbm_cat_levels.json"

LABEL_COL = "HighPayoutHorse"
RACE_COL  = "race_id"
DROP_COLS = [RACE_COL, "着順_num", "複勝フラグ", "単勝"]

TOP_K_PER_RACE = 2
MIN_PROB = 0.165

In [132]:
X_core, y, groups, cat_cols = build_core_features(
    results_df,
    drop_cols=DROP_COLS,
    label_col=LABEL_COL,
    base_cat_cols=[
        "騎手","調教師","weather","race_type","ground_state",
        "競馬場","所属","月","性","枠番","最内枠","大外枠"
    ],
    min_count_for_lump=50
)

In [133]:
# LGBM：語彙を JSON から“読むだけ”で固定（学習時に保存済み想定）
X_lgbm = prepare_for_lgbm(X_core, cat_cols, levels_json=LGBM_LEVELS_JSON)

# CatBoost：文字列カテゴリ＋ cat_idx（列順に依存）を用意
X_cat, cat_idx = prepare_for_catboost(X_core, cat_cols, catcols_json=CAT_COLS_JSON)

In [134]:
model_lgbm = joblib.load(LGBM_PATH)
model_cb   = CatBoostClassifier()
model_cb.load_model(CAT_PATH)

<catboost.core.CatBoostClassifier at 0x15fb7c65310>

In [135]:
def align_to_model_features(df: pd.DataFrame, model) -> pd.DataFrame:
    cols = getattr(model, "feature_name_", None) or getattr(model, "feature_names_", None)
    return df.reindex(columns=list(cols)) if cols is not None else df

In [141]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof_lgbm = np.zeros(len(X_core))
oof_cat  = np.zeros(len(X_core))

for tr_idx, va_idx in cv.split(X_core, y, groups=groups):
    # --- LGBM ---
    X_va_lgbm = X_lgbm.iloc[va_idx]
    X_va_lgbm = align_to_model_features(X_va_lgbm, model_lgbm)
    oof_lgbm[va_idx] = model_lgbm.predict_proba(X_va_lgbm)[:, 1]

    # --- CatBoost ---
    X_va_cat = X_cat.iloc[va_idx]
    # 列順が学習時とズレていたら整える（CatBoost でも一応合わせておく）
    X_va_cat = align_to_model_features(X_va_cat, model_cb)
    pool = Pool(X_va_cat, cat_features=cat_idx)
    oof_cat[va_idx] = model_cb.predict_proba(pool)[:, 1]

In [144]:
# 最良の重みをグリッドサーチで探す
grid = np.linspace(0.0, 1.0, 101) # LGBM0.0~1.0を101分割
scores = [average_precision_score(y, w*oof_lgbm + (1-w)*oof_cat) for w in grid]
W_LGBM = grid[int(np.argmax(scores))]
W_CAT  = 1.0 - W_LGBM
best_pr = max(scores)
print(f"[Blend search] best PR-AUC={best_pr:.4f} -> LGBM={W_LGBM:.2f}, Cat={W_CAT:.2f}")

[Blend search] best PR-AUC=0.1547 -> LGBM=1.00, Cat=0.00


In [145]:
p_blend_oof = W_LGBM * oof_lgbm + W_CAT * oof_cat

print("\n=== OOF評価（CV外予測での公正評価）===")
print(f"ROC-AUC: {roc_auc_score(y, p_blend_oof):.4f}")
print(f"PR-AUC : {average_precision_score(y, p_blend_oof):.4f}")
print(f"Brier  : {brier_score_loss(y, p_blend_oof):.4f}")


=== OOF評価（CV外予測での公正評価）===
ROC-AUC: 0.9174
PR-AUC : 0.1547
Brier  : 0.0473


## 買い目評価シミュレーション

In [None]:
def make_picks(df_pred: pd.DataFrame, score_col: str, 
               race_col: str = RACE_COL, top_k: int = 2, 
               min_prob: float = MIN_PROB, 
               odds_min: float | None = None, odds_max: float | None = None):
    tmp = df_pred[[race_col, score_col, "単勝"]].copy()
    tmp["row_id"] = df_pred.index

    tmp = tmp.sort_values([race_col, score_col], ascending=[True, False])
    topk = tmp.groupby(race_col, as_index=False).head(top_k)

    ok_races = topk.groupby(race_col)[score_col].max()
    ok_races = ok_races[ok_races >= min_prob].index
    picks = topk[topk[race_col].isin(ok_races)].copy()

    if odds_min is not None:
        picks = picks[picks["単勝"] >= odds_min]
    if odds_max is not None:
        picks = picks[picks["単勝"] <= odds_max]

    return picks[["row_id", race_col, score_col, "単勝"]]

In [None]:
def evaluate_picks_counts(df_all: pd.DataFrame, picks: pd.DataFrame,
                          race_col=RACE_COL, label_col=LABEL_COL):
    pick_idx = picks["row_id"].to_numpy()
    bet_mask = df_all.index.isin(pick_idx)
    n_bets = int(bet_mask.sum())
    total_races = df_all[race_col].nunique()
    bet_races = df_all.loc[bet_mask, race_col].nunique()
    skipped = total_races - bet_races

    # 馬単位の的中率
    hit_rate_horse = float(df_all.loc[bet_mask, label_col].mean() or 0.0)

    # レース単位の的中率（買った中に1頭でも当たり）
    hit_rate_race = float(
        df_all.loc[bet_mask, [race_col, label_col]]
              .groupby(race_col)[label_col].max().mean() or 0.0
    )

    return dict(
        total_races=int(total_races),
        bet_races=int(bet_races),
        skipped=int(skipped),
        n_bets=n_bets,
        hit_rate_horse=hit_rate_horse,
        hit_rate_race=hit_rate_race,
    )

In [None]:
pred_df = results_df.copy()
pred_df["p_blend"] = p_blend_oof

picks = make_picks(pred_df, score_col="p_blend",
                   top_k=TOP_K_PER_RACE, min_prob=MIN_PROB)
res = evaluate_picks_counts(pred_df, picks)

total   = res["total_races"]
bet     = res["bet_races"]
skipped = res["skipped"]
n_bets  = res["n_bets"]

bet_rate         = (bet / total) if total else 0.0
avg_bets_per_race = (n_bets / bet) if bet else 0.0

print("\n=== 買い目の評価（オッズ不使用） ===")
print(f"設定              : 上位{TOP_K_PER_RACE}頭 / 確率しきい値 {MIN_PROB:.3f}")
print(f"総レース数        : {total}")
print(f"参加レース数      : {bet}（参加率 {bet_rate:.1%}）")
print(f"見送りレース数    : {skipped}")
print(f"買い目点数        : {n_bets}（1レース平均 {avg_bets_per_race:.2f} 点）")
print(f"的中率（馬単位）  : {res['hit_rate_horse']:.3f}")
print(f"的中率（レース）  : {res['hit_rate_race']:.3f} ＊そのレースで1頭でも当たりがいれば的中")


=== 買い目の評価（オッズ不使用） ===
設定              : 上位2頭 / 確率しきい値 0.165
総レース数        : 6788
参加レース数      : 6723（参加率 99.0%）
見送りレース数    : 65
買い目点数        : 13446（1レース平均 2.00 点）
的中率（馬単位）  : 0.101
的中率（レース）  : 0.188 ＊そのレースで1頭でも当たりがいれば的中


オッズ系特徴（log単勝/単勝ランク 等）の寄与が高く、現在のしきい値(0.165)設定では参加率が99%と高止まり。  
しきい値調整やオッズ寄与の抑制で選択性と精度の改善余地あり。

## 妙味スコアと注目度の集計

買い目判断材料として、馬ごとの妙味スコアとレースの注目度を出力できるようにする。

In [161]:
df_oof = results_df.copy()
df_oof["p_blend_oof"] = oof_pred

def _market_prob(s_odds: pd.Series) -> pd.Series:
    if s_odds.notna().any():
        inv = s_odds.apply(lambda x: 1/x if pd.notna(x) and x > 0 else np.nan)
        denom = np.nansum(inv)
        return inv / denom if denom and denom > 0 else pd.Series(np.nan, index=s_odds.index)
    return pd.Series(np.nan, index=s_odds.index)

if "単勝" in df_oof.columns:
    df_oof["market_p"] = df_oof.groupby(RACE_COL, group_keys=False)["単勝"].transform(_market_prob)
else:
    df_oof["market_p"] = np.nan

eps = 1e-12
has_market_any = df_oof["market_p"].notna().any()

if has_market_any:
    df_oof["妙味スコア_oof"] = df_oof["p_blend_oof"] / (df_oof["market_p"] + eps)
else:
    def _fallback_value(group: pd.DataFrame) -> pd.Series:
        p = group["p_blend_oof"].values
        pos = np.clip(p - np.nanmean(p), 0, None)
        vmax = np.nanmax(pos) if np.isfinite(pos).any() else np.nan
        hv = (pos / vmax) if (vmax and vmax > 0) else np.zeros_like(pos)
        return pd.Series(1.0 + 9.0 * hv, index=group.index)
    df_oof["妙味スコア_oof"] = df_oof.groupby(RACE_COL, group_keys=False).apply(_fallback_value)

def _race_attention(g: pd.DataFrame) -> float:
    top_p = float(np.nanmax(g["p_blend_oof"].values)) if len(g) else np.nan

    _eps = 1e-12
    if g["market_p"].notna().any():
        pm = g["market_p"].fillna(0).values
        ent = -np.nansum(pm * np.log(pm + _eps))
        ent_max = math.log(len(g)) if len(g) > 1 else 1.0
        ent_norm = (ent / ent_max) if ent_max > 0 else 0.0
    else:
        ent_norm = 0.0

    race_skip = 0.6 * (1 - (top_p if top_p == top_p else 0.0)) + 0.4 * ent_norm
    return float(1.0 - race_skip) 

race_attention_by_race = df_oof.groupby(RACE_COL, group_keys=True).apply(_race_attention)
race_attention_by_race.name = "race_attention"

avg_race_attention    = float(np.nanmean(race_attention_by_race.values)) if len(race_attention_by_race) else float("nan")
median_race_attention = float(np.nanmedian(race_attention_by_race.values)) if len(race_attention_by_race) else float("nan")

avg_myoumi_score    = float(np.nanmean(df_oof["妙味スコア_oof"].values))
median_myoumi_score = float(np.nanmedian(df_oof["妙味スコア_oof"].values))

n_races  = int(race_attention_by_race.shape[0])
n_horses = int(df_oof.shape[0])

BASELINE_JSON = Path("03_baselines.json")
payload = {
    "avg_myoumi_score_oof": avg_myoumi_score,
    "median_myoumi_score_oof": median_myoumi_score,
    "avg_race_attention_oof": avg_race_attention,
    "median_race_attention_oof": median_race_attention,
    "n_races": n_races,
    "n_horses": n_horses,
}
BASELINE_JSON.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✅ saved baselines -> {BASELINE_JSON} : {payload}")


✅ saved baselines -> 03_baselines.json : {'avg_myoumi_score_oof': 3.07846041491536, 'median_myoumi_score_oof': 1.1965446799067012, 'avg_race_attention_oof': 0.13285795279676615, 'median_race_attention_oof': 0.12906446699097196, 'n_races': 6788, 'n_horses': 92870}


# Keras

実装コストが低く試行が容易なため、Kerasでニューラルネットのベースラインを作成し、ツリー系（LGBM/CatBoost）と比較評価する。

In [164]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## 推奨設定で学習

In [172]:
import random

In [210]:
SEED = 42
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

LABEL_COL = "HighPayoutHorse"
RACE_COL  = "race_id"

train_df = results_df.iloc[tr_idx].copy()
eval_df  = results_df.iloc[va_idx].copy()

cat_cols = [c for c in train_df.select_dtypes(include=["object"]).columns if c != RACE_COL]
num_cols = [c for c in train_df.columns if c not in cat_cols + [LABEL_COL, RACE_COL]]

print("cat_cols:", cat_cols)
print("num_cols:", num_cols)

def df_to_ds(df: pd.DataFrame, batch=1024, shuffle=True):
    X = {}
    for c in num_cols: X[c] = df[c].astype("float32").values
    for c in cat_cols: X[c] = df[c].astype(str).values
    y = df[LABEL_COL].astype("float32").values
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle: ds = ds.shuffle(len(df), seed=SEED)
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

train_ds = df_to_ds(train_df, shuffle=True)
valid_ds = df_to_ds(eval_df,  shuffle=False)

pos = int(train_df[LABEL_COL].sum()); neg = len(train_df) - pos
base_w_pos = neg / max(pos, 1)
print("base_w_pos (class_weight[1]):", base_w_pos)

cat_cols: ['騎手', '調教師', 'weather', 'race_type', 'ground_state', '競馬場', '所属', '月', '性']
num_cols: ['枠番', '斤量', '単勝', '馬体重', 'course_len', '馬体重_増減', '年齢', '出走頭数', '相対枠位置', '大外枠', '最内枠', '着順_num', '複勝フラグ', '馬_直近3走_複勝率', '馬_直近5走_複勝率', '馬_直近10走_複勝率', '馬_直近5走_平均着順', '騎手_直近30走_複勝率', '調教師_直近50走_複勝率', '馬_コース適性_複勝率', '馬_距離適性_複勝率', 'コンビ_直近50走_複勝率', 'log単勝', '単勝ランク', '単勝pct', '単勝_最人気差', '単勝_2番人気差']
base_w_pos (class_weight[1]): 45.042157470551764


In [211]:
VOCAB_JSON = "03_keras_vocabs.json"

def _norm(s: pd.Series):
    return normalize_text(s).fillna("NA").astype(str)

if not os.path.exists(VOCAB_JSON):
    vocabs = {}
    for c in results_df.select_dtypes(include="object").columns:
        if c in [LABEL_COL, RACE_COL]:
            continue
        vocabs[c] = sorted(_norm(results_df[c]).unique().tolist())
    Path(VOCAB_JSON).write_text(json.dumps(vocabs, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"✅ saved vocab -> {VOCAB_JSON} (cols={len(vocabs)})")

VOCAB = json.loads(Path(VOCAB_JSON).read_text(encoding="utf-8"))

In [184]:
def focal_loss(alpha=0.25, gamma=2.0):
    def _loss(y_true, y_pred):
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        p_t = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        w   = tf.where(tf.equal(y_true, 1), alpha, 1 - alpha)
        return -tf.reduce_mean(w * tf.pow(1.0 - p_t, gamma) * tf.math.log(p_t))
    return _loss

In [None]:
norm_layer = layers.Normalization()
if num_cols:
    norm_layer.adapt(train_df[num_cols].astype("float32").values)

lookup = {}
vocab_sizes = {}
for col in cat_cols:
    v = VOCAB.get(col)
    if v is None:
        v = sorted(_norm(train_df[col]).unique().tolist())
    lut = layers.StringLookup(vocabulary=v, output_mode="int", num_oov_indices=1)
    lookup[col] = lut
    vocab_sizes[col] = lut.vocabulary_size()

In [213]:
def safe_name(s: str) -> str:
    return re.sub(r'[^A-Za-z0-9_.\\/>\-]+', '_', str(s))

NAME_MAP = {c: safe_name(c) for c in (num_cols + cat_cols)}

In [214]:
def build_safe_name_map(cols):
    used = set()
    name_map = {}
    for i, c in enumerate(cols):
        base = re.sub(r'[^A-Za-z0-9_.\\/>\-]+', '_', str(c))
        base = re.sub(r'_+', '_', base).strip('_')
        if not base:
            base = f"col{i}"
        name = base
        k = 2
        while name in used:
            name = f"{base}_{k}"
            k += 1
        used.add(name)
        name_map[c] = name
    return name_map

NAME_MAP = build_safe_name_map(num_cols + cat_cols)

assert len(set(NAME_MAP.values())) == len(NAME_MAP), "NAME_MAP に重複があります"


In [None]:
def df_to_ds(df: pd.DataFrame, batch=1024, shuffle=True):
    X = {}
    for c in num_cols:
        X[NAME_MAP[c]] = df[c].astype("float32").values
    for c in cat_cols:
        X[NAME_MAP[c]] = normalize_text(df[c]).fillna("NA").astype(str).values
    y = df[LABEL_COL].astype("float32").values
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        ds = ds.shuffle(len(df), seed=SEED)
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

In [216]:
def build_model(hparams):
    inputs = {}
    for c in num_cols:
        inputs[NAME_MAP[c]] = keras.Input(shape=(1,), name=NAME_MAP[c], dtype=tf.float32)
    for c in cat_cols:
        inputs[NAME_MAP[c]] = keras.Input(shape=(1,), name=NAME_MAP[c], dtype=tf.string)

    feats = []
    if num_cols:
        x_num = layers.Concatenate(name="num_concat")([inputs[NAME_MAP[c]] for c in num_cols])
        x_num = norm_layer(x_num)
        if hparams.get("num_bn", False):
            x_num = layers.BatchNormalization()(x_num)
        feats.append(x_num)

    emb_coef   = hparams.get("emb_coef", 1.0)
    emb_cap    = hparams.get("emb_cap", 50)
    emb_floor  = hparams.get("emb_floor", 4)
    emb_drop   = hparams.get("emb_dropout", 0.0)

    for c in cat_cols:
        idx = lookup[c](inputs[NAME_MAP[c]])
        dim = int(min(emb_cap, max(emb_floor, round(math.sqrt(vocab_sizes[c]) * emb_coef))))
        emb = layers.Embedding(input_dim=vocab_sizes[c], output_dim=dim,
                               name=f"emb_{NAME_MAP[c]}")(idx)
        emb = layers.Reshape((dim,))(emb)
        if emb_drop > 0:
            emb = layers.Dropout(emb_drop)(emb)
        feats.append(emb)

    if not feats:
        raise ValueError("特徴量がありません")
    x = feats[0] if len(feats)==1 else layers.Concatenate(name="all_concat")(feats)

    reg = keras.regularizers.l2(hparams.get("l2", 1e-6))
    act = hparams.get("activation", "relu")
    for width in hparams.get("hidden", [256,128,64]):
        x = layers.Dense(width, activation=act, kernel_regularizer=reg)(x)
        if hparams.get("bn", True):
            x = layers.BatchNormalization()(x)
        dp = hparams.get("dropout", 0.2)
        if dp > 0:
            x = layers.Dropout(dp)(x)

    out = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=out)

    if hparams.get("use_focal", True):
        loss_fn = focal_loss(alpha=hparams.get("focal_alpha", 0.25),
                             gamma=hparams.get("focal_gamma", 2.0))
    else:
        loss_fn = "binary_crossentropy"

    opt = keras.optimizers.Adam(learning_rate=hparams.get("lr", 1e-3))
    model.compile(optimizer=opt, loss=loss_fn,
                  metrics=[keras.metrics.AUC(name="roc_auc"),
                           keras.metrics.AUC(curve="PR", name="pr_auc")])
    return model


In [217]:
def race_topk_precision(
    df: pd.DataFrame,
    prob: np.ndarray | list,
    k: int = 1,
    ycol: str = LABEL_COL,
    race_col: str = RACE_COL,
    min_prob: float | None = None, 
) -> float:
    """各レースで予測確率の上位k頭の中に、少なくとも1頭ラベル=1が含まれる割合（レース単位Hit@K）。"""
    prob = np.asarray(prob)
    if len(prob) != len(df):
        raise ValueError(f"prob長さ({len(prob)})とdf長さ({len(df)})が一致していません。")

    t = df[[race_col, ycol]].copy()
    t["prob"] = prob

    if min_prob is not None:
        ok_races = (t.groupby(race_col)["prob"].max() >= min_prob)
        ok_races = ok_races[ok_races].index
        t = t[t[race_col].isin(ok_races)]

    if len(t) == 0:
        return 0.0

    topk = (t.sort_values([race_col, "prob"], ascending=[True, False])
              .groupby(race_col, as_index=False)
              .head(k))

    return float(topk.groupby(race_col)[ycol].max().mean())

In [None]:
keras.backend.clear_session()

hp0 = dict(
    emb_coef=1.0, emb_cap=32, emb_floor=4, emb_dropout=0.0,
    hidden=[256,128,64], dropout=0.2, bn=True, num_bn=False,
    activation="relu", l2=1e-5,
    use_focal=True, focal_alpha=0.25, focal_gamma=2.0, lr=1e-3,
    batch_size=1024, epochs=10, patience=3,
)

model_keras = build_model(hp0)

cw1 = min(base_w_pos, base_w_pos*0.3) if hp0["use_focal"] else base_w_pos
class_weight = {0: 1.0, 1: cw1}

train_ds = df_to_ds(train_df, batch=hp0["batch_size"], shuffle=True)
valid_ds = df_to_ds(eval_df,  batch=hp0["batch_size"], shuffle=False)

cb = [
    keras.callbacks.EarlyStopping(monitor="val_pr_auc", mode="max",
                                  patience=hp0["patience"], restore_best_weights=True)
]

# 学習
hist = model_keras.fit(
    train_ds, validation_data=valid_ds,
    epochs=hp0["epochs"], class_weight=class_weight,
    callbacks=cb, verbose=1
)

# 予測＆指標
p_valid = model_keras.predict(valid_ds, verbose=0).ravel()
y_true  = eval_df[LABEL_COL].to_numpy().astype(int)

roc  = roc_auc_score(y_true, p_valid)
pr   = average_precision_score(y_true, p_valid)
br   = brier_score_loss(y_true, p_valid)

top1 = top2 = None
if "race_topk_precision" in globals():
    top1 = float(race_topk_precision(eval_df, p_valid, k=1))
    top2 = float(race_topk_precision(eval_df, p_valid, k=2))

thr  = np.quantile(p_valid, 0.9)
lift = float((y_true[p_valid >= thr].mean()) / y_true.mean())

val_hist = np.array(hist.history.get("val_pr_auc", []), dtype=float)
best_ep  = int(np.nanargmax(val_hist) + 1) if len(val_hist) else None
best_val = float(np.nanmax(val_hist)) if len(val_hist) else float("nan")
trained_epochs = len(hist.history.get("loss", []))

n = len(y_true); pos = int(y_true.sum())
print("\n=== Keras NN 総合評価（検証） ===")
print(f"サンプル数         : {n}（陽性 {pos}, 陽性率 {pos/n:.2%}）")
print(f"損失               : {'Focal' if hp0.get('use_focal', True) else 'BinaryCE'}")
print(f"ROC-AUC            : {roc:.4f}")
print(f"PR-AUC             : {pr:.4f}")
print(f"Brier              : {br:.4f}")
if top1 is not None:
    print(f"Race Hit@1 / Hit@2 : {top1:.3f} / {top2:.3f}")
print(f"Lift@10%           : {lift:.2f}x")
if best_ep is not None:
    print(f"Best val_pr_auc    : {best_val:.4f} @ epoch {best_ep}（実行 {trained_epochs}epochs）")

Epoch 1/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - loss: nan - pr_auc: 0.0210 - roc_auc: 0.4886 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000
Epoch 2/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - loss: nan - pr_auc: 0.0219 - roc_auc: 0.5037 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000
Epoch 3/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - loss: nan - pr_auc: 0.0221 - roc_auc: 0.5114 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000
Epoch 4/10
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - loss: nan - pr_auc: 0.0230 - roc_auc: 0.5039 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000

=== Keras NN 総合評価（検証） ===
サンプル数         : 18604（陽性 440, 陽性率 2.37%）
損失               : Focal
ROC-AUC            : 0.5000
PR-AUC             : 0.0237
Brier              : 0.0885
Race Hit@1 / Hit@2 : 0.032 / 0.058
Lift@10%           : 1.0

ランダムと同値のスコアとなった。

## Optunaでチューニング

In [None]:
def objective(trial: optuna.Trial):
    keras.backend.clear_session()

    h = dict(
        emb_coef    = trial.suggest_float("emb_coef", 0.7, 1.5),
        emb_cap     = trial.suggest_int("emb_cap", 16, 64),
        emb_floor   = 4,
        emb_dropout = trial.suggest_float("emb_dropout", 0.0, 0.2),

        hidden      = [trial.suggest_int("h1",128,512,log=True),
                       trial.suggest_int("h2",64,256,log=True),
                       trial.suggest_int("h3",32,128,log=True)],
        dropout     = trial.suggest_float("dropout", 0.1, 0.5),
        bn          = trial.suggest_categorical("bn",[True, False]),
        num_bn      = trial.suggest_categorical("num_bn",[False, True]),
        activation  = trial.suggest_categorical("activation", ["relu", "gelu", "selu"]),
        l2          = trial.suggest_float("l2", 1e-7, 1e-3, log=True),

        use_focal   = trial.suggest_categorical("use_focal", [True, False]),
        focal_alpha = trial.suggest_float("focal_alpha", 0.1, 0.5),
        focal_gamma = trial.suggest_float("focal_gamma", 1.0, 3.0),
        lr          = trial.suggest_float("lr", 5e-4, 3e-3, log=True),

        batch_size  = trial.suggest_categorical("batch_size", [512, 1024, 2048]),
        epochs      = 30,
        patience    = 6,
    )

    model = build_model(h)

    cw1 = min(base_w_pos, base_w_pos*0.3) if h["use_focal"] else base_w_pos
    class_weight = {0:1.0, 1:cw1}

    train_ds_trial = df_to_ds(train_df, batch=h["batch_size"], shuffle=True)
    valid_ds_trial = df_to_ds(eval_df,  batch=h["batch_size"], shuffle=False)

    cb = [
        keras.callbacks.EarlyStopping(monitor="val_pr_auc", mode="max",
                                      patience=h["patience"], restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(monitor="val_pr_auc", mode="max",
                                          factor=0.5, patience=2, min_lr=1e-5),
    ]

    hist = model.fit(
        train_ds_trial, validation_data=valid_ds_trial,
        epochs=h["epochs"], class_weight=class_weight,
        verbose=0, callbacks=cb
    )

    pr_hist = hist.history.get("val_pr_auc", [])
    return float(np.max(pr_hist)) if len(pr_hist) else 0.0


In [220]:
study_nn = optuna.create_study(direction="maximize", study_name="keras_nn_pr_auc")
study_nn.optimize(objective, n_trials=100, show_progress_bar=True)

print("Best PR-AUC (NN):", study_nn.best_value)
print("Best params (NN):")
for k,v in study_nn.best_params.items():
    print(f"  {k}: {v}")

Best trial: 0. Best value: 0.0236508: 100%|██████████| 100/100 [44:42<00:00, 26.82s/it]

Best PR-AUC (NN): 0.023650826886296272
Best params (NN):
  emb_coef: 0.8133115677197165
  emb_cap: 20
  emb_dropout: 0.04809503919384766
  h1: 448
  h2: 234
  h3: 59
  dropout: 0.11760539695088079
  bn: True
  num_bn: True
  activation: relu
  l2: 1.389407409422714e-05
  use_focal: False
  focal_alpha: 0.4009645431058436
  focal_gamma: 1.8200654975432533
  lr: 0.0013329271527386614
  batch_size: 1024





## ベストパラメータで再学習

In [221]:
keras.backend.clear_session()

best_hp = study_nn.best_params
best_hp["epochs"] = 100
best_hp["patience"] = 8

model_best = build_model(best_hp)

cw1 = min(base_w_pos, base_w_pos*0.3) if best_hp.get("use_focal", True) else base_w_pos
class_weight = {0:1.0, 1:cw1}

train_ds_best = df_to_ds(train_df, batch=best_hp.get("batch_size", 1024), shuffle=True)
valid_ds_best = df_to_ds(eval_df,  batch=best_hp.get("batch_size", 1024), shuffle=False)

cb = [
    keras.callbacks.EarlyStopping(monitor="val_pr_auc", mode="max",
                                  patience=best_hp["patience"], restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_pr_auc", mode="max",
                                      factor=0.5, patience=2, min_lr=1e-5),
]
history = model_best.fit(
    train_ds_best, validation_data=valid_ds_best,
    epochs=best_hp["epochs"], class_weight=class_weight,
    verbose=1, callbacks=cb
)

p_valid = model_best.predict(valid_ds_best, verbose=0).ravel()
y_true  = eval_df[LABEL_COL].to_numpy().astype(int)

roc  = roc_auc_score(y_true, p_valid)
pr   = average_precision_score(y_true, p_valid)
br   = brier_score_loss(y_true, p_valid)
print(f"[Keras NN tuned] ROC-AUC={roc:.4f}  PR-AUC={pr:.4f}  Brier={br:.4f}")

def race_topk_precision(df, prob, k=1, ycol=LABEL_COL, race_col=RACE_COL):
    t = df[[race_col, ycol]].copy()
    t["prob"] = prob
    return t.groupby(race_col).apply(
        lambda g: g.sort_values("prob", ascending=False).head(k)[ycol].max()
    ).mean()

top1 = race_topk_precision(eval_df, p_valid, k=1)
top2 = race_topk_precision(eval_df, p_valid, k=2)
thr  = np.quantile(p_valid, 0.9)
lift = (y_true[p_valid >= thr].mean()) / y_true.mean()
print(f"Top1={top1:.4f}  Top2={top2:.4f}  Lift@10%={lift:.2f}x")

# 保存
nn_path = "03_keras_nn_best.keras"
model_best.save(nn_path)
print(f"✅ Saved tuned NN -> {nn_path}")

# Optunaの履歴保存
joblib.dump(study_nn, "03_optuna_study_keras_nn.pkl")
study_nn.trials_dataframe().to_csv("03_optuna_study_keras_nn_trials.csv", index=False)
print("Saved Optuna study results.")


Epoch 1/100
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 45ms/step - loss: nan - pr_auc: 0.0217 - roc_auc: 0.4966 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000 - learning_rate: 0.0013
Epoch 2/100
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - loss: nan - pr_auc: 0.0211 - roc_auc: 0.4898 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000 - learning_rate: 0.0013
Epoch 3/100
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - loss: nan - pr_auc: 0.0208 - roc_auc: 0.4858 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000 - learning_rate: 0.0013
Epoch 4/100
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - loss: nan - pr_auc: 0.0214 - roc_auc: 0.4970 - val_loss: nan - val_pr_auc: 0.0237 - val_roc_auc: 0.5000 - learning_rate: 6.6646e-04
Epoch 5/100
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - loss: nan - pr_auc: 0.0215 - roc_auc: 0.4945

OOV(=Out Of Vocabulary)を確認。

In [None]:
print("pred stats:", np.min(p_valid), np.median(p_valid), np.max(p_valid), np.std(p_valid))

for c, lut in lookup.items():
    ids = lut(tf.constant(eval_df[c].astype(str).values))
    ids = ids.numpy().ravel()
    oov_id = 1
    oov_ratio = (ids == oov_id).mean()
    print(f"{c}: vocab={vocab_sizes[c]}, OOV={oov_ratio:.1%}")

print("race_id in cat_cols?", RACE_COL in cat_cols)


pred stats: 0.51211 0.51211 0.51211 1.1920929e-07
騎手: vocab=236, OOV=0.0%
調教師: vocab=263, OOV=0.0%
weather: vocab=7, OOV=5.3%
race_type: vocab=4, OOV=48.9%
ground_state: vocab=5, OOV=4.2%
競馬場: vocab=11, OOV=10.5%
所属: vocab=5, OOV=0.1%
月: vocab=22, OOV=4.7%
性: vocab=4, OOV=4.5%
race_id in cat_cols? False


race_typeなどでOOVが残り、検証ではPR-AUC≒陽性率・ROC-AUC≈0.5とツリー系に劣後。  
OOV処理や追加学習は打ち切り、最終は LGBM＋CatBoost を採用。