In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' display limits so wide tables are not truncated in cell output.
for _option, _value in (("display.max_columns", 200), ("display.width", 200)):
    pd.set_option(_option, _value)

# Output locations for this notebook; both sub-directories are created up front.
OUT_DIR = Path("../outputs/02_feature_association")
FIG_DIR, TAB_DIR = OUT_DIR / "figures", OUT_DIR / "tables"
for _out_dir in (FIG_DIR, TAB_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
from pathlib import Path
import sys

# Make the project root importable. The notebook is run from notebooks/, so the
# root is the parent of the current working directory (the original author's
# note gives ~/work/keiba-ai as an example).
ROOT = Path.cwd().resolve().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.data_collection.loaders.result_loader import load_results
from src.data_collection.loaders.horse_loader import load_horse_results
from src.data_collection.loaders.race_info_loader import load_race_info

# Load the three raw tables through the project loaders. Each loader is given
# a glob pattern relative to the notebook directory.
# NOTE(review): the loaders are project code not shown here; presumably each
# reads and concatenates the matching CSV files — confirm against src/.
df_result = load_results("../data/rawdf/result/result_*.csv")
df_horse = load_horse_results("../data/rawdf/horse/*.csv")
df_race_info = load_race_info("../data/rawdf/race_info/*.csv")

from src.data_collection.pipelines.build_train_table import build_train_table

# Combine results, race info and horse history into the table analysed below.
# Later cells read the columns "rank", "race_id" and "horse_id" from it.
df = build_train_table(df_result, df_race_info, df_horse)

  raw = pd.read_csv(p, sep=",")


In [5]:
# Market-derived columns that are kept out of the analysis.
MARKET_COLS = ["popularity", "tansho_odds"]

# Targets (dependent variables).
# rank: 1 is a win, 1-3 is a top-three finish.
df["y_win"] = df["rank"].eq(1).astype(int)
df["y_top3"] = df["rank"].le(3).astype(int)

# Exclude market features from the analysis (dropping them from the frame or
# merely from the feature list is a matter of taste). Absent columns are skipped.
df_ = df.drop(columns=MARKET_COLS, errors="ignore")

# Base rate of each target.
df_.loc[:, ["y_win", "y_top3"]].mean()


y_win     0.073377
y_top3    0.219926
dtype: float64

In [7]:
# Unused / suspected-leak / ID-like columns (a univariate association with the
# target is of little interest for these).
ID_COLS = [
    "race_id", "horse_id", "jockey_id", "trainer_id", "owner_id",
    "place", "race_type", "around", "course_len",
    "weather", "ground_state", "race_class",
]

TARGETS = ["y_win", "y_top3"]
DROP_ALWAYS = [*TARGETS, "rank", *ID_COLS]

# Single pass over the columns: keep the non-excluded ones and split them into
# numeric and non-numeric lists, preserving the frame's column order.
feature_cols, num_cols, cat_cols = [], [], []
for col in df_.columns:
    if col in DROP_ALWAYS:
        continue
    feature_cols.append(col)
    if pd.api.types.is_numeric_dtype(df_[col]):
        num_cols.append(col)
    else:
        cat_cols.append(col)

len(feature_cols), len(num_cols), len(cat_cols), num_cols[:10], cat_cols[:10]


(9,
 8,
 1,
 ['wakuban',
  'umaban',
  'sex',
  'age',
  'impost',
  'weight',
  'weight_diff',
  'hr_rank_mean_3'],
 ['race_date'])

In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from scipy.special import expit  # 安定シグモイド

def univariate_numeric_metrics_stable(df, x_cols, y_col, non_null_th=0.7):
    """Score each numeric column against a binary target, one column at a time.

    For every column in ``x_cols`` whose share of non-null values is at least
    ``non_null_th``, two association measures are computed:

    * ``spearman``: Spearman correlation with ``y_col`` on the rows where both
      values are present.
    * ``auc``: ROC AUC of the median-imputed, standardized column over all
      rows, folded to ``max(auc, 1 - auc)`` so the direction of the
      relationship does not matter. NaN if the AUC cannot be computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    x_cols : iterable of str
        Names of the columns to score.
    y_col : str
        Name of the target column.
    non_null_th : float, default 0.7
        Minimum fraction of non-null values a column needs in order to be scored.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``non_null_rate``, ``spearman``, ``auc``, sorted
        by ``auc`` then ``spearman`` (both descending). The frame is empty, but
        keeps these columns, when no column qualifies.

    Raises
    ------
    ValueError
        If ``y_col`` contains fewer than two distinct values.
    """
    result_cols = ["feature", "non_null_rate", "spearman", "auc"]
    y = df[y_col].values

    # AUC is undefined when the target has a single class, so fail early.
    if len(np.unique(y)) < 2:
        raise ValueError(f"{y_col} has only one class in data.")

    rows = []
    for c in x_cols:
        x = df[c]
        non_null_rate = float(x.notna().mean())
        if non_null_rate < non_null_th:
            continue

        # Spearman on complete pairs only.
        pair = df[[c, y_col]].dropna()
        sp = pair[c].corr(pair[y_col], method="spearman")

        # Median imputation, then standardization.
        x_filled = x.fillna(x.median())
        x_std = StandardScaler().fit_transform(x_filled.values.reshape(-1, 1)).ravel()

        # Direction-agnostic AUC. Only the metric's own input errors are
        # turned into NaN; anything else is allowed to surface.
        try:
            auc_raw = roc_auc_score(y, x_std)
            auc = max(auc_raw, 1 - auc_raw)
        except ValueError:
            auc = np.nan

        rows.append({
            "feature": c,
            "non_null_rate": non_null_rate,
            "spearman": float(sp),
            "auc": float(auc),
        })

    # Explicit columns keep the sort valid when no feature passed the filter
    # (a frame built from an empty list has no "auc" column to sort on).
    res = pd.DataFrame(rows, columns=result_cols)
    return res.sort_values(["auc", "spearman"], ascending=[False, False])

num_win = univariate_numeric_metrics_stable(df_, num_cols, "y_win")
num_top3 = univariate_numeric_metrics_stable(df_, num_cols, "y_top3")

# Display the tables computed just above. The names previously used here
# (num_win2 / num_top32) are not defined in any visible cell and would raise
# NameError on a fresh kernel.
num_win.head(10), num_top3.head(10)


(          feature  non_null_rate  spearman       auc
 7  hr_rank_mean_3       0.899825 -0.191510  0.700327
 5          weight       1.000000  0.052073  0.557638
 3             age       1.000000 -0.038076  0.540251
 2             sex       1.000000 -0.036313  0.535415
 4          impost       1.000000  0.019838  0.521649
 1          umaban       1.000000 -0.012875  0.514223
 6     weight_diff       1.000000  0.008988  0.509856
 0         wakuban       1.000000  0.007563  0.508304,
           feature  non_null_rate  spearman       auc
 7  hr_rank_mean_3       0.899825 -0.290414  0.691777
 5          weight       1.000000  0.063828  0.544476
 2             sex       1.000000 -0.054308  0.533344
 3             age       1.000000 -0.047982  0.531932
 4          impost       1.000000  0.043638  0.529980
 1          umaban       1.000000 -0.028776  0.520012
 6     weight_diff       1.000000  0.012168  0.508400
 0         wakuban       1.000000  0.009212  0.506368)

In [13]:
# Persist both univariate score tables under the notebook's table directory.
for _table, _filename in (
    (num_win, "univariate_numeric_win.csv"),
    (num_top3, "univariate_numeric_top3.csv"),
):
    _table.to_csv(TAB_DIR / _filename, index=False)


In [14]:
def id_win_rates(df, id_col, min_count=50, k=200):
    """Summarise win and top-3 rates for each value of ``id_col``.

    The returned frame has one row per ``id_col`` value with:
      - n: number of rows in the group
      - wins / top3: sums of ``y_win`` / ``y_top3``
      - win_rate / top3_rate: raw rates
      - win_rate_s / top3_rate_s: smoothed rates, (count + k * overall) / (n + k)
      - overall_win / overall_top3: the frame-wide rates used for smoothing

    Groups with fewer than ``min_count`` rows are dropped. Rows are ordered by
    ``win_rate_s`` and then ``n``, both descending.
    """
    base_win = df["y_win"].mean()
    base_top3 = df["y_top3"].mean()

    by_id = df.groupby(id_col)
    stats = by_id[["y_win", "y_top3"]].sum().rename(
        columns={"y_win": "wins", "y_top3": "top3"}
    )
    stats.insert(0, "n", by_id.size())
    stats = stats.reset_index()

    stats = stats.assign(
        win_rate=stats["wins"] / stats["n"],
        top3_rate=stats["top3"] / stats["n"],
        # Smoothing in the spirit of empirical Bayes: shrink towards the overall rate.
        win_rate_s=(stats["wins"] + k * base_win) / (stats["n"] + k),
        top3_rate_s=(stats["top3"] + k * base_top3) / (stats["n"] + k),
    )

    # Minimum-sample cut-off, then attach the overall rates as convenience columns.
    frequent = stats.loc[stats["n"] >= min_count].assign(
        overall_win=base_win,
        overall_top3=base_top3,
    )

    return frequent.sort_values(["win_rate_s", "n"], ascending=[False, False])

# Rough first pass (min_count and k are to be tuned later).
_ID_RATE_KWARGS = dict(min_count=50, k=200)

jockey_stats = id_win_rates(df_, "jockey_id", **_ID_RATE_KWARGS)
trainer_stats = id_win_rates(df_, "trainer_id", **_ID_RATE_KWARGS)
owner_stats = id_win_rates(df_, "owner_id", **_ID_RATE_KWARGS)

jockey_stats.head(20), trainer_stats.head(20), owner_stats.head(20)


(     jockey_id     n  wins  top3  win_rate  top3_rate  win_rate_s  top3_rate_s  overall_win  overall_top3
 51        1088  2507   676  1428  0.269645   0.569605    0.255144     0.543770     0.073377      0.219926
 183       5339  3143   789  1721  0.251034   0.547566    0.240405     0.527964     0.073377      0.219926
 213       5509   410   113   240  0.275610   0.585366    0.209304     0.465550     0.073377      0.219926
 203       5473   519   117   259  0.225434   0.499037    0.183137     0.421398     0.073377      0.219926
 232       5585   477   107   235  0.224319   0.492662    0.179727     0.412091     0.073377      0.219926
 19        1014  1399   242   604  0.172981   0.431737    0.160522     0.405244     0.073377      0.219926
 188       5386  3830   601  1497  0.156919   0.390862    0.152773     0.382378     0.073377      0.219926
 33        1046   219    46    90  0.210046   0.410959    0.144810     0.319774     0.073377      0.219926
 115       1170  3862   569  1463  0.

In [15]:
# Persist the per-ID rate tables under the notebook's table directory.
for _table, _filename in (
    (jockey_stats, "jockey_winrates.csv"),
    (trainer_stats, "trainer_winrates.csv"),
    (owner_stats, "owner_winrates.csv"),
):
    _table.to_csv(TAB_DIR / _filename, index=False)

In [17]:
# df_ is the frame with the market columns dropped (carried over from the cells above).
# Field size: the number of non-null horse_id values within each race.
_horses_by_race = df_.groupby("race_id")["horse_id"]
df_["n_horses"] = _horses_by_race.transform("count")

In [22]:
import numpy as np
import pandas as pd

# Numeric columns to turn into within-race relative features; only those that
# actually exist in df_ are kept.
_REL_CANDIDATES = (
    "hr_rank_mean_3",
    "impost",
    "weight",
    "weight_diff",
    "age",
    "umaban",
    "wakuban",
)
REL_NUM_COLS = [name for name in _REL_CANDIDATES if name in df_.columns]

def add_race_relative_features(df, group_col="race_id", cols=None):
    """Return a copy of ``df`` with group-relative versions of ``cols`` added.

    For each column ``c`` in ``cols``, six columns are appended, all computed
    within the groups defined by ``group_col``:

    * ``c_diff_mean``: value minus the group mean
    * ``c_z``: that difference divided by the group standard deviation
      (a standard deviation of 0 is treated as missing)
    * ``c_rank_asc`` / ``c_rank_desc``: average rank, ascending / descending
    * ``c_pct_asc`` / ``c_pct_desc``: (rank - 1) / (non-null count - 1);
      missing when the group has a single non-null value

    The input frame is not modified. ``cols=None`` adds nothing.
    """
    out = df.copy()
    by_group = out.groupby(group_col)

    for col in cols or []:
        per_group = by_group[col]

        group_mean = per_group.transform("mean")
        group_std = per_group.transform("std").replace(0, np.nan)
        out[f"{col}_diff_mean"] = out[col] - group_mean
        out[f"{col}_z"] = (out[col] - group_mean) / group_std

        # The "good" direction is not known for every column, so rank both ways.
        out[f"{col}_rank_asc"] = per_group.rank(method="average", ascending=True)    # smaller value -> rank 1
        out[f"{col}_rank_desc"] = per_group.rank(method="average", ascending=False)  # larger value -> rank 1

        # Rescale ranks to [0, 1]; a lone value would give 0 / 0, so mask the span.
        span = (per_group.transform("count") - 1).replace(0, np.nan)
        out[f"{col}_pct_asc"] = (out[f"{col}_rank_asc"] - 1) / span
        out[f"{col}_pct_desc"] = (out[f"{col}_rank_desc"] - 1) / span

    return out

# Attach the within-race relative features and check the resulting size.
df_rel = add_race_relative_features(df=df_, group_col="race_id", cols=REL_NUM_COLS)
df_rel.shape



(235838, 67)

In [23]:
# Distance bands.
# The top band is labelled "2601+", so its upper edge is left open (np.inf);
# with a finite edge, any course_len above it would silently become NaN.
if "course_len" in df_rel.columns:
    df_rel["dist_bin"] = pd.cut(
        df_rel["course_len"],
        bins=[0, 1400, 1800, 2200, 2600, np.inf],
        labels=["<=1400", "1401-1800", "1801-2200", "2201-2600", "2601+"],
        include_lowest=True,
    )

# Field-size bands; the "19+" band is open-ended for the same reason.
df_rel["field_bin"] = pd.cut(
    df_rel["n_horses"],
    bins=[0, 10, 14, 18, np.inf],
    labels=["<=10", "11-14", "15-18", "19+"],
    include_lowest=True,
)


In [24]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.special import expit

def univariate_auc(df, x_cols, y_col, non_null_th=0.7):
    """Score each column against a binary target with Spearman and ROC AUC.

    Same procedure as ``univariate_numeric_metrics_stable`` defined earlier in
    this notebook, except that the result is sorted by ``auc`` only.

    For every column in ``x_cols`` whose share of non-null values is at least
    ``non_null_th``:

    * ``spearman``: Spearman correlation with ``y_col`` on complete pairs.
    * ``auc``: ROC AUC of the median-imputed, standardized column over all
      rows, folded to ``max(auc, 1 - auc)``. NaN if it cannot be computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    x_cols : iterable of str
        Names of the columns to score.
    y_col : str
        Name of the target column.
    non_null_th : float, default 0.7
        Minimum fraction of non-null values a column needs in order to be scored.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``non_null_rate``, ``spearman``, ``auc``, sorted
        by ``auc`` descending. The frame is empty, but keeps these columns,
        when no column qualifies.

    Raises
    ------
    ValueError
        If ``y_col`` contains fewer than two distinct values.
    """
    result_cols = ["feature", "non_null_rate", "spearman", "auc"]
    y = df[y_col].values
    if len(np.unique(y)) < 2:
        raise ValueError(f"{y_col} has only one class.")

    rows = []
    for c in x_cols:
        x = df[c]
        non_null_rate = float(x.notna().mean())
        if non_null_rate < non_null_th:
            continue

        # Spearman on complete pairs only.
        pair = df[[c, y_col]].dropna()
        sp = pair[c].corr(pair[y_col], method="spearman")

        # Median imputation, then standardization.
        x_filled = x.fillna(x.median())
        x_std = StandardScaler().fit_transform(x_filled.values.reshape(-1, 1)).ravel()

        # Direction-agnostic AUC. Only the metric's own input errors are
        # turned into NaN; anything else is allowed to surface.
        try:
            auc_raw = roc_auc_score(y, x_std)
            auc = max(auc_raw, 1 - auc_raw)
        except ValueError:
            auc = np.nan

        rows.append({"feature": c, "non_null_rate": non_null_rate, "spearman": sp, "auc": auc})

    # Explicit columns keep the sort valid when no feature passed the filter
    # (a frame built from an empty list has no "auc" column to sort on).
    return pd.DataFrame(rows, columns=result_cols).sort_values("auc", ascending=False)

# Collect only the relative-feature columns, recognised by their suffix.
_REL_SUFFIXES = ("_diff_mean", "_z", "_rank_asc", "_rank_desc", "_pct_asc", "_pct_desc")
rel_cols = [col for col in df_rel.columns if col.endswith(_REL_SUFFIXES)]
# Important: include n_horses as well.
if "n_horses" in df_rel.columns:
    rel_cols = [*rel_cols, "n_horses"]

rel_win = univariate_auc(df_rel, rel_cols, "y_win")
rel_top3 = univariate_auc(df_rel, rel_cols, "y_top3")

rel_win.head(30), rel_top3.head(30)


(                     feature  non_null_rate  spearman       auc
 2    hr_rank_mean_3_rank_asc       0.899825 -0.202022  0.710585
 1           hr_rank_mean_3_z       0.899825 -0.199851  0.708278
 0   hr_rank_mean_3_diff_mean       0.899825 -0.199344  0.707751
 4     hr_rank_mean_3_pct_asc       0.899825 -0.196450  0.704560
 5    hr_rank_mean_3_pct_desc       0.899825  0.196450  0.704560
 3   hr_rank_mean_3_rank_desc       0.899825  0.157819  0.665288
 25              age_rank_asc       1.000000 -0.102469  0.612940
 28              age_pct_desc       1.000000  0.085594  0.588308
 27               age_pct_asc       1.000000 -0.085594  0.588308
 24             age_diff_mean       1.000000 -0.082559  0.585250
 15          weight_rank_desc       1.000000 -0.070032  0.577444
 12          weight_diff_mean       1.000000  0.058781  0.565075
 13                  weight_z       1.000000  0.057264  0.563396
 41                  n_horses       1.000000 -0.058125  0.562926
 17           weight_pct_

In [25]:
# Persist the relative-feature score tables under the notebook's table directory.
for _table, _filename in (
    (rel_win, "rel_univariate_auc_win.csv"),
    (rel_top3, "rel_univariate_auc_top3.csv"),
):
    _table.to_csv(TAB_DIR / _filename, index=False)
