In [18]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from functools import wraps
from sklearn.metrics import ndcg_score
from glob import glob
from os import path as osp

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
BASE_DIR = 'race_data/'
USE_COL = ['着順', '馬名', '性齢', '斤量', '騎手', 'タイム', '着差', '単勝', '人気', '馬体重', '調教師', 'horse_id', 'jockey_id']

In [21]:
p_race_csvs = sorted(glob(osp.join(BASE_DIR, 'horse_info_*.csv')))
p_race_csvs

['race_data\\horse_info_202408010101.csv',
 'race_data\\horse_info_202408010102.csv',
 'race_data\\horse_info_202408010103.csv',
 'race_data\\horse_info_202408010104.csv',
 'race_data\\horse_info_202408010106.csv',
 'race_data\\horse_info_202408010107.csv',
 'race_data\\horse_info_202408010108.csv',
 'race_data\\horse_info_202408010109.csv',
 'race_data\\horse_info_202408010110.csv',
 'race_data\\horse_info_202408010111.csv',
 'race_data\\horse_info_202408010112.csv',
 'race_data\\horse_info_202408010201.csv',
 'race_data\\horse_info_202408010202.csv',
 'race_data\\horse_info_202408010205.csv',
 'race_data\\horse_info_202408010206.csv',
 'race_data\\horse_info_202408010207.csv',
 'race_data\\horse_info_202408010208.csv',
 'race_data\\horse_info_202408010209.csv',
 'race_data\\horse_info_202408010210.csv',
 'race_data\\horse_info_202408010211.csv',
 'race_data\\horse_info_202408010212.csv',
 'race_data\\horse_info_202408010301.csv',
 'race_data\\horse_info_202408010302.csv',
 'race_data

In [22]:
def load_processed_df(p_csv, use_col=USE_COL):
    df = pd.read_csv(p_csv)
    df.columns = [c.replace(' ', '') for c in df.columns]
    df = df[use_col]
    df["レースID"] = int(osp.basename(p_csv).split('_')[2].split('.')[0])  # 各DataFrameにレースIDを付与
    df['性'] = df['性齢'].map(lambda x: x[0])
    df['齢'] = df['性齢'].map(lambda x: int(x[1:]))
    df['馬体重'] = df['馬体重'].map(lambda s: s if s != '計不' else None)
    df['当日体重'] = df['馬体重'].map(lambda s: int(s.split('(')[0]) if s is not None else None)
    df['増減'] = df['馬体重'].map(lambda s: int(s.split('(')[1].replace(')', '')) if s is not None else None)
    df.loc[df['着順'] == 1, '着差'] = 0
    df = df.drop(['性齢', '馬体重'], axis=1)
    df = df.loc[~df['着順'].isin(['取', '中', '除', '失'])]
    df['着順'] = (1 + len(df) - df['着順'].astype(int))
    return df

In [23]:
# 複数のDataFrameをリストとして保持
dfs = [load_processed_df(p_race_csv) for p_race_csv in p_race_csvs]

# 単一のデータフレームに統合
merged_df = pd.concat(dfs, ignore_index=True)

merged_df

Unnamed: 0,着順,馬名,斤量,騎手,タイム,着差,単勝,人気,調教師,horse_id,jockey_id,レースID,性,齢,当日体重,増減
0,16,セレブレイトエール,57.0,坂井瑠星,1:55.4,0,2.6,1.0,[西] 大久保龍,2021104109,1163,202408010101,牡,3,522.0,-4.0
1,15,カフジテルビウム,57.0,長岡禎仁,1:55.9,3,14.7,5.0,[西] 杉山佳明,2021102150,1142,202408010101,牡,3,464.0,4.0
2,14,ルージュシュエット,55.0,鮫島克駿,1:55.9,アタマ,32.3,9.0,[西] 矢作芳人,2021105640,1157,202408010101,牝,3,442.0,-2.0
3,13,ヤマニンアラクリア,57.0,田中健,1:56.2,1.3/4,19.3,7.0,[西] 中村直也,2021100284,1114,202408010101,牡,3,480.0,10.0
4,12,アメリカンチーフ,57.0,松若風馬,1:56.2,クビ,5.0,2.0,[西] 音無秀孝,2021110123,1154,202408010101,牡,3,468.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7489,5,ロゼフレア,53.0,岩田望来,1:22.3,クビ,20.9,7.0,[西] 中村直也,2021103203,1174,202408070812,牝,3,452.0,6.0
7490,4,アサギリ,52.0,永島まな,1:22.3,ハナ,68.7,14.0,[東] 尾形和幸,2021100610,1187,202408070812,牝,3,460.0,-4.0
7491,3,エイムフォーエース,55.0,高杉吏麒,1:22.3,アタマ,21.7,8.0,[東] 武井亮,2021104167,1213,202408070812,牡,3,444.0,0.0
7492,2,クイックバイオ,54.0,佐々木大,1:22.4,3/4,8.4,5.0,[西] 須貝尚介,2021105374,1197,202408070812,牝,3,490.0,-4.0


In [24]:
# レースIDを取得
race_ids = merged_df["レースID"].unique()
train_valid_ids, test_ids = train_test_split(race_ids, test_size=0.2, shuffle=False)
train_ids, valid_ids = train_test_split(train_valid_ids, test_size=0.2, shuffle=False)
print(f'train size: {len(train_ids)}, valid size: {len(valid_ids)}, test size: {len(test_ids)}')

# トレーニングデータとテストデータに分割
train_df = merged_df[merged_df["レースID"].isin(train_ids)]
valid_df = merged_df[merged_df["レースID"].isin(valid_ids)]
test_df = merged_df[merged_df["レースID"].isin(test_ids)]

# グループ情報の再計算
train_group = train_df.groupby("レースID").size().tolist()
valid_group = valid_df.groupby("レースID").size().tolist()
test_group = test_df.groupby("レースID").size().tolist()

# 特徴量とターゲットの抽出
features = ["斤量", "horse_id", "jockey_id", "人気", "当日体重", "増減"]
X_train = train_df[features]
X_valid = valid_df[features]
X_test = test_df[features]
y_train = train_df["着順"]
y_valid = valid_df["着順"]
y_test = test_df["着順"]

train size: 361, valid size: 91, test size: 113


In [25]:
# LightGBM データセットの作成
train_data = lgb.Dataset(X_train, label=y_train, group=train_group)
valid_data = lgb.Dataset(X_valid, label=y_valid, group=valid_group, reference=train_data)
test_data = lgb.Dataset(X_test, label=y_test, group=test_group, reference=train_data)

# モデルのハイパーパラメータ設定
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "verbose": -1
}

# モデルの学習
model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=1000,
)

In [26]:
# テストデータでの予測
y_pred = model.predict(X_test)

# NDCGスコアの計算
true_relevance = [list(y_test)]
predicted_scores = [y_pred]
ndcg = ndcg_score(true_relevance, predicted_scores)
print(f"NDCGスコア: {ndcg}")

NDCGスコア: 0.926851234062691


In [27]:
test_df.loc[:, '予測重み'] = y_pred
race_ids = test_df.loc[:, 'レースID'].unique()
for race_id in race_ids:
    test_df.loc[test_df.loc[:, 'レースID'] == race_id, '予測'] = test_df.loc[test_df.loc[:, 'レースID'] == race_id, '予測重み'].rank(ascending=True, method='min')
    print(test_df.loc[test_df['レースID'] == race_id, ['レースID', '予測', '着順', '馬名', '予測重み']].sort_values('予測', ascending=True))
    print('---')

             レースID    予測  着順         馬名      予測重み
5966  202408060504   1.0   5    ホリゾンブルー -6.497894
5968  202408060504   2.0   3  キッシングセーラー -5.669372
5969  202408060504   3.0   2  ウインドラブハーツ -5.637220
5970  202408060504   4.0   1   インナーリソース -5.461516
5961  202408060504   5.0  10    ジェットエアー -4.282516
5962  202408060504   6.0   9  マーゴットレジーナ -3.605737
5959  202408060504   7.0  12      パスコード -3.540067
5967  202408060504   8.0   4    ダノンカゼルタ -1.204029
5963  202408060504   9.0   8    セイフウサツキ -0.673853
5965  202408060504  10.0   6       ディニテ -0.649662
5964  202408060504  11.0   7       ツインギ -0.468419
5957  202408060504  12.0  14  フォーキャンドルズ -0.088931
5960  202408060504  13.0  11  ランフォースマイル -0.047725
5958  202408060504  14.0  13  ルージュシークエル  0.763315
---
             レースID    予測  着順         馬名      予測重み
5971  202408060505   1.0  13  メイショウタマユラ -6.255754
5982  202408060505   2.0   2     インピッシュ -5.913496
5978  202408060505   3.0   6  ウォーターカルエル -5.280800
5981  202408060505   4.0   3     シエルルビー -4.653

In [28]:
def calc_mean_rank(test_df, race_id):
    test_df = test_df.copy()
    sub_df = test_df.loc[test_df.loc[:, 'レースID'] == race_id, :].sort_values(
        '予測', ascending=True
    ).reset_index()
    return sub_df.loc[sub_df['着順'] == 1].index.tolist()[0] + 1

mean_rank = np.nanmean([calc_mean_rank(test_df, race_id) for race_id in test_df.loc[:, 'レースID'].unique()])
print(f'平均順位誤差: {mean_rank}')

平均順位誤差: 5.814159292035399
