In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

# Raise both pandas display limits to 200 so wide tables print with fewer
# truncated columns and less line wrapping.
for _display_option in ('display.max_columns', 'display.width'):
    pd.set_option(_display_option, 200)

# Artifact locations for this notebook. OUT_DIR is a relative path, so it is
# resolved against the working directory the kernel was started in.
OUT_DIR = Path('../outputs/03_train')
FIG_DIR = OUT_DIR.joinpath('figures')
TAB_DIR = OUT_DIR.joinpath('tables')
MODEL_DIR = OUT_DIR.joinpath('models')

# Create every artifact folder up front; folders that already exist are left as-is.
for _artifact_dir in (FIG_DIR, TAB_DIR, MODEL_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)


In [None]:
from pathlib import Path
import sys

# Put the parent of the current working directory on sys.path so the `src`
# package imported below can be found.
# NOTE(review): this assumes the kernel's working directory sits directly
# under the repository root -- confirm for non-standard launch locations.
ROOT = Path.cwd().resolve().parents[0]
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.data.loaders.result_loader import load_results
from src.data.loaders.horse_loader import load_horse_results
from src.data.loaders.race_info_loader import load_race_info
from src.data.pipelines.build_train_table import build_train_table

# Raw inputs. The path patterns are relative to the kernel's working directory.
# NOTE(review): the `*` patterns are presumably expanded as globs by the loaders -- confirm.
df_result = load_results('../data/rawdf/result/result_*.csv')
df_horse = load_horse_results('../data/rawdf/horse/*.csv')
df_race_info = load_race_info('../data/rawdf/race_info/*.csv')

# Build the modelling table, discard rows with a missing rank or race date,
# then parse the race date into a pandas datetime column.
df = (
    build_train_table(df_result, df_race_info, df_horse)
    .dropna(subset=['rank', 'race_date'])
    .assign(race_date=lambda frame: pd.to_datetime(frame['race_date']))
)


In [None]:
# Column holding the race date; used for the time-based train/validation split.
DATE_COL = 'race_date'

# Categorical feature candidates, keeping only those actually present in `df`.
CAT_COLS = [
    col
    for col in (
        'place', 'race_type', 'around', 'dist_bin', 'weather', 'ground_state', 'race_class',
        'sex', 'rest_bin', 'jockey_id', 'trainer_id', 'owner_id',
    )
    if col in df.columns
]

# Columns make_Xy always removes from the feature matrix; the list includes
# the finishing rank and both label columns derived from it.
DROP_ALWAYS = ['rank', 'race_id', 'horse_id', 'race_date', 'y_win', 'y_top3']
# Columns make_Xy removes unless it is called with use_market=True.
DROP_MARKET = ['popularity', 'tansho_odds']

def make_Xy(df, target_col, use_market=False):
    """Split a table into a feature matrix and a target vector.

    Args:
        df: DataFrame that contains ``target_col`` plus the feature columns.
        target_col: Name of the label column to predict.
        use_market: When False (default), the columns in ``DROP_MARKET`` are
            removed in addition to those in ``DROP_ALWAYS``.

    Returns:
        A tuple ``(X, y, cat_cols)``: ``X`` is a copy of ``df`` without the
        dropped columns and without ``target_col``, with every surviving
        ``CAT_COLS`` column cast to pandas ``category`` dtype; ``y`` is a copy
        of ``df[target_col]``; ``cat_cols`` lists the categorical column names
        present in ``X``.
    """
    excluded = list(DROP_ALWAYS) if use_market else DROP_ALWAYS + DROP_MARKET
    excluded = [name for name in excluded if name in df.columns]

    # errors='ignore' tolerates target_col already being part of `excluded`.
    features = df.drop(columns=excluded + [target_col], errors='ignore').copy()
    target = df[target_col].copy()

    cat_cols = [name for name in CAT_COLS if name in features.columns]
    for name in cat_cols:
        features[name] = features[name].astype('category')

    return features, target, cat_cols
# Time-based holdout: races through 2024 train the model, races in 2025 validate it.
_race_year = df[DATE_COL].dt.year
train_df = df[_race_year <= 2024].copy()
valid_df = df[_race_year == 2025].copy()

# Binary labels derived from the finishing rank: rank 1, and rank 3 or better.
for _split in (train_df, valid_df):
    _split['y_win'] = (_split['rank'] == 1).astype(int)
    _split['y_top3'] = (_split['rank'] <= 3).astype(int)

print('train:', train_df.shape, 'valid:', valid_df.shape)
print('train range:', train_df[DATE_COL].min(), '->', train_df[DATE_COL].max())


In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score
# LightGBM settings shared by the training cells below.
params = dict(
    objective='binary',
    # Both metrics are evaluated on every entry of valid_sets.
    metric=['auc', 'average_precision'],
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    # Column and row subsampling, with rows re-sampled on every iteration.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    seed=42,
    verbosity=-1,
)


In [None]:
# Tag reused in the file names of every artifact this run writes.
MODEL_TAG = 'y_win__split_2025'

# Feature matrices for the win label, with the market columns excluded.
X_tr, y_tr, cat_cols = make_Xy(train_df, 'y_win', use_market=False)
X_va, y_va, _ = make_Xy(valid_df, 'y_win', use_market=False)

# Both datasets share the same categorical feature list and keep their raw data.
dtr, dva = (
    lgb.Dataset(features, label=labels, categorical_feature=cat_cols, free_raw_data=False)
    for features, labels in ((X_tr, y_tr), (X_va, y_va))
)

model = lgb.train(
    params=params,
    train_set=dtr,
    num_boost_round=2000,
    valid_sets=[dtr, dva],
    valid_names=['train', 'valid'],
    callbacks=[
        # Stop after 200 rounds without improvement; log metrics every 200 rounds.
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=200),
    ],
)

# Score the validation split at the best iteration found by early stopping.
pred = model.predict(X_va, num_iteration=model.best_iteration)
print('AUC:', roc_auc_score(y_va, pred))
print('AP :', average_precision_score(y_va, pred))

# Persist the booster in LightGBM's text format.
model_path = MODEL_DIR.joinpath(f'lgb_{MODEL_TAG}.txt')
model.save_model(str(model_path))
print('saved:', model_path)


In [None]:
# Per-feature importance of the trained booster, sorted by total gain (highest first).
imp = (
    pd.DataFrame({
        'feature': model.feature_name(),
        'gain': model.feature_importance(importance_type='gain'),
        'split': model.feature_importance(importance_type='split'),
    })
    .sort_values('gain', ascending=False)
)

# Save the full table next to the other outputs of this run.
imp_path = TAB_DIR / f'importance_{MODEL_TAG}.csv'
imp.to_csv(imp_path, index=False)
print('saved:', imp_path)

# The preview must be the final expression of the cell: Jupyter only renders
# the value of a cell's last expression, so a bare expression placed mid-cell
# is evaluated and then discarded without being shown.
imp.head(30)
