In [11]:
import sqlite3
import pandas as pd
from pathlib import Path

# Notebook の位置から data/keiba.db を参照
DB_PATH = Path.cwd().parents[0] / "data" / "keiba.db"
print("Using DB:", DB_PATH)

conn = sqlite3.connect(DB_PATH)
df = pd.read_sql_query("SELECT * FROM v_race_features", conn)
conn.close()

df


Using DB: c:\Users\r_bas\keiba-model\data\keiba.db


Unnamed: 0,race_id,date,course_id,venue_id,course_name,course_surface,course_track_type,straight_len,slope_max,race_no,...,finish_rank,finish_status,finish_time_sec,odds,popularity,weight,weight_diff,last_3f,margin_sec,prize
0,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,1,OK,147.6,44.0,1,479,0,33.0,0.0,1000000.0
1,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,2,OK,147.9,34.9,2,478,3,33.2,0.2,1000000.0
2,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,3,OK,148.2,27.9,3,474,-3,33.4,0.4,1000000.0
3,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,4,OK,148.5,43.9,4,475,5,33.6,0.6,0.0
4,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,5,OK,148.8,7.4,5,483,-1,33.8,0.8,0.0
5,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,6,OK,149.1,28.0,6,472,-4,34.0,1.0,0.0
6,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,7,OK,149.4,6.2,7,481,2,34.2,1.2,0.0
7,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,8,OK,149.7,14.9,8,476,1,34.4,1.4,0.0
8,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,9,OK,150.0,40.6,9,486,-5,34.6,1.6,0.0
9,R2024TOK11,2024-10-20,TOK_T,TOK,東京 芝2400m,turf,standard,525,2.0,11,...,10,OK,150.3,16.9,10,481,4,34.8,1.8,0.0


In [12]:
from sklearn.preprocessing import LabelEncoder

data = df.copy()

# 3着以内なら1、それ以外は0
data["in_top3"] = (data["finish_rank"] <= 3).astype(int)

# 騎手・調教師のエンコード（今まで通り）
le_jockey = LabelEncoder()
data["jockey_encoded"] = le_jockey.fit_transform(data["jockey_name"])

le_trainer = LabelEncoder()
data["trainer_encoded"] = le_trainer.fit_transform(data["trainer_name"])

feature_cols = [
    "distance",
    "straight_len",
    "last_3f",
    "odds",
    "jockey_encoded",
    "trainer_encoded",
]

X = data[feature_cols]
y = data["in_top3"]   # ← ここがポイント
y_binary = data["in_top3"]
X, y


(   distance  straight_len  last_3f  odds  jockey_encoded  trainer_encoded
 0      2400           525     33.0  44.0               3                1
 1      2400           525     33.2  34.9               2                1
 2      2400           525     33.4  27.9               3                0
 3      2400           525     33.6  43.9               0                2
 4      2400           525     33.8   7.4               3                1
 5      2400           525     34.0  28.0               1                1
 6      2400           525     34.2   6.2               0                1
 7      2400           525     34.4  14.9               3                2
 8      2400           525     34.6  40.6               0                0
 9      2400           525     34.8  16.9               0                0,
 0    1
 1    1
 2    1
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 Name: in_top3, dtype: int64)

In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.3, random_state=42
)

model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.1,
    objective="binary:logistic",
    eval_metric="logloss"
)

model.fit(X_train, y_train)

print("Training complete!")


Training complete!


In [14]:
# 3着以内に入る確率を予測（テストデータ）
proba = model.predict_proba(X_test)[:, 1]
pred_class = (proba >= 0.5).astype(int)

# X_test の行番号を取得（どの馬がテストに回ったか）
test_idx = X_test.index

# テストデータだけの結果表を作成
result_test = pd.DataFrame({
    "horse":       data.loc[test_idx, "horse_name"],
    "finish_rank": data.loc[test_idx, "finish_rank"],
    "in_top3":     y_test,
    "P_top3":      proba,
    "pred_class":  pred_class,
}, index=test_idx)

result_test


Unnamed: 0,horse,finish_rank,in_top3,P_top3,pred_class
8,ダミーホース9,9,0,0.285714,0
1,ダミーホース2,2,1,0.285714,0
5,ダミーホース6,6,0,0.285714,0


In [15]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

auc = roc_auc_score(y_test, proba)
# モデルのランキング性能
print("ROC AUC:", roc_auc_score(y_test, proba))
# 上位2頭の軸の正確性
print("Precision:", precision_score(y_test, pred_class))
# 来る馬を見逃していないか
print("Recall:", recall_score(y_test, pred_class))
# 無駄な候補を出していないか
print("F1-score:", f1_score(y_test, pred_class))


ROC AUC: 0.5
Precision: 0.0
Recall: 0.0
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
