In [47]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from functools import wraps
from sklearn.metrics import ndcg_score
from glob import glob
from os import path as osp

In [48]:
BASE_DIR = 'race_data/'
USE_COL = ['着順', '馬名', '性齢', '斤量', '騎手', 'タイム', '着差', '単勝', '人気', '馬体重', '調教師']# + ['horse_id', 'jockey_id']

In [49]:
p_race_csvs = glob(osp.join(BASE_DIR, 'horse_info_*.csv'))
p_race_csvs

['race_data\\horse_info_202408010101.csv',
 'race_data\\horse_info_202408010102.csv',
 'race_data\\horse_info_202408010103.csv',
 'race_data\\horse_info_202408010104.csv',
 'race_data\\horse_info_202408010106.csv',
 'race_data\\horse_info_202408010107.csv',
 'race_data\\horse_info_202408010108.csv',
 'race_data\\horse_info_202408010109.csv',
 'race_data\\horse_info_202408010110.csv',
 'race_data\\horse_info_202408010111.csv',
 'race_data\\horse_info_202408010112.csv',
 'race_data\\horse_info_202408010201.csv',
 'race_data\\horse_info_202408010202.csv',
 'race_data\\horse_info_202408010205.csv',
 'race_data\\horse_info_202408010206.csv',
 'race_data\\horse_info_202408010207.csv',
 'race_data\\horse_info_202408010208.csv',
 'race_data\\horse_info_202408010209.csv',
 'race_data\\horse_info_202408010210.csv',
 'race_data\\horse_info_202408010211.csv',
 'race_data\\horse_info_202408010212.csv',
 'race_data\\horse_info_202408010301.csv',
 'race_data\\horse_info_202408010302.csv',
 'race_data

In [62]:
# デコレータの定義
def track_calls(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        wrapper.call_count += 1
        return func(*args, call_count=wrapper.call_count, **kwargs)
    wrapper.call_count = 0
    return wrapper

@track_calls
def load_processed_df(p_csv, call_count):
    print(f'loading {p_csv}...')
    df = pd.read_csv(p_csv)
    df.columns = [c.replace(' ', '') for c in df.columns]
    df = df[USE_COL]
    df["レースID"] = call_count  # 各DataFrameにレースIDを付与
    df['性'] = df['性齢'].map(lambda x: x[0])
    df['齢'] = df['性齢'].map(lambda x: int(x[1:]))
    df.loc[df['着順'] == 1, '着差'] = 0
    df = df.drop(['性齢'], axis=1)
    df = df.loc[~df['着順'].isin(['取', '中', '除', '失'])]
    return df

In [63]:
# 複数のDataFrameをリストとして保持
dfs = [load_processed_df(p_race_csv) for p_race_csv in p_race_csvs]

# レースIDを追加して1つのデータフレームに統合
for i, df in enumerate(dfs):
    df["レースID"] = i  # 各DataFrameにレースIDを付与

# 単一のデータフレームに統合
merged_df = pd.concat(dfs, ignore_index=True)

merged_df

loading race_data\horse_info_202408010101.csv...
loading race_data\horse_info_202408010102.csv...
loading race_data\horse_info_202408010103.csv...
loading race_data\horse_info_202408010104.csv...
loading race_data\horse_info_202408010106.csv...
loading race_data\horse_info_202408010107.csv...
loading race_data\horse_info_202408010108.csv...
loading race_data\horse_info_202408010109.csv...
loading race_data\horse_info_202408010110.csv...
loading race_data\horse_info_202408010111.csv...
loading race_data\horse_info_202408010112.csv...
loading race_data\horse_info_202408010201.csv...
loading race_data\horse_info_202408010202.csv...
loading race_data\horse_info_202408010205.csv...
loading race_data\horse_info_202408010206.csv...
loading race_data\horse_info_202408010207.csv...
loading race_data\horse_info_202408010208.csv...
loading race_data\horse_info_202408010209.csv...
loading race_data\horse_info_202408010210.csv...
loading race_data\horse_info_202408010211.csv...
loading race_data\ho

Unnamed: 0,着順,馬名,斤量,騎手,タイム,着差,単勝,人気,馬体重,調教師,レースID,性,齢
0,1,セレブレイトエール,57.0,坂井瑠星,1:55.4,0,2.6,1.0,522(-4),[西] 大久保龍,0,牡,3
1,2,カフジテルビウム,57.0,長岡禎仁,1:55.9,3,14.7,5.0,464(+4),[西] 杉山佳明,0,牡,3
2,3,ルージュシュエット,55.0,鮫島克駿,1:55.9,アタマ,32.3,9.0,442(-2),[西] 矢作芳人,0,牝,3
3,4,ヤマニンアラクリア,57.0,田中健,1:56.2,1.3/4,19.3,7.0,480(+10),[西] 中村直也,0,牡,3
4,5,アメリカンチーフ,57.0,松若風馬,1:56.2,クビ,5.0,2.0,468(+2),[西] 音無秀孝,0,牡,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4436,9,ヤマニンサンパ,58.0,団野大成,2:13.3,2,165.0,13.0,474(-10),[西] 斉藤崇史,328,牡,6
4437,10,ジャスティンパレス,58.0,ルメール,2:13.6,1.3/4,3.7,2.0,472(0),[西] 杉山晴紀,328,牡,5
4438,11,シュトルーヴェ,58.0,レーン,2:14.5,5,25.4,9.0,470(-8),[東] 堀宣行,328,セ,5
4439,12,ヒートオンビート,58.0,坂井瑠星,2:15.5,6,162.3,12.0,476(+4),[西] 友道康夫,328,牡,7


In [64]:
# レースIDを取得
race_ids = merged_df["レースID"].unique()
train_ids, test_ids = train_test_split(race_ids, test_size=0.2, random_state=42)

# トレーニングデータとテストデータに分割
train_df = merged_df[merged_df["レースID"].isin(train_ids)]
test_df = merged_df[merged_df["レースID"].isin(test_ids)]

# グループ情報の再計算
train_group = train_df.groupby("レースID").size().tolist()
test_group = test_df.groupby("レースID").size().tolist()

# 特徴量とターゲットの抽出
features = ["斤量"]# , "人気", "馬体重", "タイム"]
X_train = train_df[features]
X_test = test_df[features]
# 着順を逆符号にしてスコア化
y_train = train_df["着順"].astype(float)
y_test = test_df["着順"].astype(float)

In [67]:
# LightGBM データセットの作成
train_data = lgb.Dataset(X_train, label=y_train, group=train_group)
test_data = lgb.Dataset(X_test, label=y_test, group=test_group, reference=train_data)

# モデルのハイパーパラメータ設定
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "verbose": -1
}

# モデルの学習
model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=100,)

# テストデータでの予測
y_pred = model.predict(X_test)

# NDCGスコアの計算
true_relevance = [list(y_test)]  # 着順をリストに変換（符号は逆転済み）
predicted_scores = [y_pred]
ndcg = ndcg_score(true_relevance, predicted_scores)
print(f"NDCGスコア: {ndcg}")

NDCGスコア: 0.9079656156901998
