# XGBoost Model Preparation



In [1]:
# Setup Environment

import os
import logging
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import set_config
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
import pyspark.sql.functions as F
import xgboost as xgb
from sklearn import set_config
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_spark, 
                identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Set global references to None
spark = None
master_results_df = None
race_df = None
df = None

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [3]:

spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


Spark session created successfully.


In [4]:
# This dataset has already been cleaned up in the LGB notebook and saved as a starting point
# It now just need to be converted to Panadas and run in the GBDT variant model (LGB, XGB, CatBoost)
race_df = spark.read.parquet(os.path.join(parquet_dir, "race_df_p2.parquet"))


In [5]:
race_df.count()

324041

# Switching to Pandas

In [6]:
race_df = race_df.toPandas()
# Quick info about the DataFrame
#print(df.info())
#print(df.head(5))

                                                                                

## Set the race_id

In [7]:
race_df["race_id"] = (
    race_df["course_cd"].astype(str) + "_" +
    race_df["race_date"].astype(str) + "_" +
    race_df["race_number"].astype(str)
)

In [8]:
group_array = race_df.groupby("race_id").size().values  # array of group sizes
print(group_array)

[9 8 9 ... 9 9 6]


In [9]:
# If the largest official_fin is 20 (some races can have 20 horses),
# then label = (21 - official_fin).
# So official_fin=1 => label=20, official_fin=2 =>19, etc.
# If your max is 14, you can do (15 - official_fin).  Just ensure "best" horse has largest label.
race_df["rank"] = 21 - race_df["official_fin"]

In [10]:
if "official_fin" in race_df.columns:
    race_df.drop(columns=["official_fin"], inplace=True)

In [11]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ["course_cd", "sex", "equip", "surface", "trk_cond", "weather", "med", 
            "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat"]
for c in cat_cols:
    lbl = LabelEncoder()
    race_df[c] = lbl.fit_transform(race_df[c].astype(str))

In [12]:
race_df = race_df.sort_values("race_id", ascending=True)

In [13]:
features = [
    # Basic numeric columns
    "purse",
    "wps_pool",
    "weight",
    "claimprice",
    "power",
    "morn_odds",
    "distance_meters",
    "avgspd",
    "class_rating",
    "todays_cls",
    "net_sentiment",
    "avg_spd_sd",
    "ave_cl_sd",
    "hi_spd_sd",
    "pstyerl",

    # Cumulative performance stats
    "all_starts",
    "all_win",
    "all_place",
    "all_show",
    "all_fourth",
    "all_earnings",
    "cond_starts",
    "cond_win",
    "cond_place",
    "cond_show",
    "cond_fourth",
    "cond_earnings",

    # Recent form metrics
    "avg_fin_3",
    "avg_beaten_3",
    "avg_speed_3",
    "avg_fin_5",
    "avg_beaten_5",
    "avg_speed_5",
    "speed_improvement",
    "days_off",

    # Sectionals / GPS
    "avgtime_gate1",
    "avgtime_gate2",
    "avgtime_gate3",
    "avgtime_gate4",
    "total_distance_ran",
    "running_time",
    "speed_q1",
    "speed_q2",
    "speed_q3",
    "speed_q4",
    "total_dist_covered",
    "avg_acceleration",
    "net_progress_gain",
    "gps_avg_stride_length",

    # Jockey/Trainer stats
    "jock_win_percent",
    "jock_itm_percent",
    "trainer_win_percent",
    "trainer_itm_percent",
    "jt_win_percent",
    "jt_itm_percent",
    "jock_win_track",
    "jock_itm_track",
    "trainer_win_track",
    "trainer_itm_track",
    "jt_win_track",
    "jt_itm_track",

    # Other
    "age_at_race_day",
    "is_first_race",
]


X_all = race_df[features].values
y_all = race_df['rank'].values
race_ids = race_df['race_id'].values

### Split the data

In [14]:
from sklearn.utils import shuffle

unique_races = race_df['race_id'].unique()
unique_races = shuffle(unique_races, random_state=42)

train_ratio = 0.8
cut = int(len(unique_races) * train_ratio)
train_races = set(unique_races[:cut])
valid_races = set(unique_races[cut:])

# Create a boolean mask
train_mask = race_df['race_id'].isin(train_races)
valid_mask  = race_df['race_id'].isin(valid_races)

# Now slice
X_train = X_all[train_mask]
y_train = y_all[train_mask]
race_id_train = race_ids[train_mask]

X_valid = X_all[valid_mask]
y_valid = y_all[valid_mask]
race_id_valid = race_ids[valid_mask]

In [15]:
import numpy as np

def make_group_array(race_id_array):
    """
    Returns an array of group sizes in the order of race_id_array’s actual row order.
    Expects that race_id_array is sorted or lumps each race contiguously.
    """
    # Approach 1: Rely on the data already being grouped in contiguous rows
    # If your data is not guaranteed to be sorted by race_id, you can sort it first.
    # But let's assume from the train_mask/valid_mask approach that the relative order
    # is consistent. We can just accumulate counts.

    # A simpler approach: group the data by unique race_id in the order they appear
    # and store the size for each chunk.
    # We'll do a loop approach:

    groups = []
    current_race = None
    current_count = 0

    group_sequence = []

    for rid in race_id_array:
        if rid != current_race:
            # if we have an existing group, push it
            if current_race is not None:
                groups.append(current_count)
            current_race = rid
            current_count = 1
        else:
            current_count += 1
    # push the last group
    if current_race is not None and current_count > 0:
        groups.append(current_count)

    return np.array(groups, dtype=np.int32)

group_train = make_group_array(race_id_train)
group_valid  = make_group_array(race_id_valid)

# XGBoost

In [22]:
import xgboost as xgb
import numpy as np
import itertools

# 1) Create DMatrices for train/valid with group data
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

dvalid = xgb.DMatrix(X_valid, label=y_valid)
dvalid.set_group(group_valid)

# 2) A search space (grid) for hyperparameters
param_grid = {
    "eta": [0.01, 0.1],
    "max_depth": [4, 6],
    "min_child_weight": [10, 30],
    "subsample": [0.8, 1.0],
}

# 3) Base parameters for GPU-based ranking with ndcg@5
base_params = {
    "booster": "gbtree",
    "objective": "rank:pairwise",    # or "rank:ndcg"
    "eval_metric": ["ndcg@5"],       # focusing on ndcg@5
    "tree_method": "hist",           # recommended if using GPU
    "device": "cuda",                # specify GPU device
    "eta": 0.1,                      # will be overwritten by param_grid iteration
    "max_depth": 6,                  # will be overwritten
    "verbosity": 1
}



In [25]:
# 4) Prepare a watchlist for training/early stopping
watchlist = [(dtrain, "train"), (dvalid, "valid")]

best_score = float("-inf")
best_params = None
best_iteration = None

# 5) Iterate over all combinations in param_grid
for eta in param_grid["eta"]:
    for max_depth in param_grid["max_depth"]:
        for min_child_weight in param_grid["min_child_weight"]:
            for subsample in param_grid["subsample"]:

                # Copy base_params, then update with current hyperparams
                trial_params = base_params.copy()
                trial_params.update({
                    "eta": eta,
                    "max_depth": max_depth,
                    "min_child_weight": min_child_weight,
                    "subsample": subsample,
                })

                # 6) Train with early stopping
                model = xgb.train(
                    params=trial_params,
                    dtrain=dtrain,
                    num_boost_round=1000,
                    evals=watchlist,
                    early_stopping_rounds=10,
                    verbose_eval=50  # or False/True/50 if you want logs
                )

                # 7) model.best_score typically corresponds to the "valid-ndcg@5" 
                #    if "ndcg@5" is your only eval_metric. 
                this_score = model.best_score  # Usually the highest valid-ndcg@5

                # Update if better
                if this_score > best_score:
                    best_score = this_score
                    best_params = trial_params
                    best_iteration = model.best_iteration

# 8) Print final results
print("Best Score (valid-ndcg@5):", best_score)
print("Best Params:", best_params)
print("Best Iteration:", best_iteration)

[0]	train-ndcg@5:0.78572	valid-ndcg@5:0.78787
[50]	train-ndcg@5:0.87393	valid-ndcg@5:0.87438
[100]	train-ndcg@5:0.89574	valid-ndcg@5:0.89498
[150]	train-ndcg@5:0.90743	valid-ndcg@5:0.90583
[200]	train-ndcg@5:0.91485	valid-ndcg@5:0.91375
[250]	train-ndcg@5:0.92086	valid-ndcg@5:0.91887
[300]	train-ndcg@5:0.92516	valid-ndcg@5:0.92345
[350]	train-ndcg@5:0.92872	valid-ndcg@5:0.92701
[400]	train-ndcg@5:0.93205	valid-ndcg@5:0.93047
[450]	train-ndcg@5:0.93494	valid-ndcg@5:0.93305
[500]	train-ndcg@5:0.93724	valid-ndcg@5:0.93509
[550]	train-ndcg@5:0.93913	valid-ndcg@5:0.93735
[600]	train-ndcg@5:0.94105	valid-ndcg@5:0.93891
[650]	train-ndcg@5:0.94266	valid-ndcg@5:0.94080
[700]	train-ndcg@5:0.94449	valid-ndcg@5:0.94227
[750]	train-ndcg@5:0.94582	valid-ndcg@5:0.94352
[800]	train-ndcg@5:0.94708	valid-ndcg@5:0.94468
[850]	train-ndcg@5:0.94828	valid-ndcg@5:0.94590
[900]	train-ndcg@5:0.94941	valid-ndcg@5:0.94699
[950]	train-ndcg@5:0.95038	valid-ndcg@5:0.94790
[999]	train-ndcg@5:0.95135	valid-ndcg@5:0.9

In [26]:
# 2) Combine train + valid arrays into a single full set
X_full = np.concatenate([X_train, X_valid], axis=0)
y_full = np.concatenate([y_train, y_valid], axis=0)
group_full = np.concatenate([group_train, group_valid], axis=0)

# 3) Create an XGBoost DMatrix for the *full* data
dfull = xgb.DMatrix(X_full, label=y_full)
dfull.set_group(group_full)

In [27]:
# 4) Re-train a final model on the best parameters (train+valid, or just train)
#    We'll do a final train on (dtrain + dvalid) if you want. Or just on dtrain.

# Example: final train on just the same training set for demonstration
# If you want to combine train+valid => you'd DMatrix(np.concatenate(...)).
# But let's keep it simple:

# Best Score (valid-ndcg@5): 0.9700206338995528
# Best Params: {'booster': 'gbtree', 'objective': 'rank:pairwise', 'eval_metric': ['ndcg@5'], 
# 'tree_method': 'hist', 'device': 'cuda', 'eta': 0.1, 'max_depth': 4, 'verbosity': 1, 
# 'min_child_weight': 10, 'subsample': 0.8}
# Best Iteration: 495

best_params = {
    "booster": "gbtree",
    "objective": "rank:pairwise",    # or "rank:ndcg"
    "eval_metric": ["ndcg@5"],       # focusing on ndcg@5
    "tree_method": "hist",           # or "hist" with "device":"cuda"
    "device": "cuda",
    "eta": 0.1,
    "max_depth": 4,
    "min_child_weight": 10,      # <--- newly added
    "subsample": 0.8,            # <--- newly added
    "verbosity": 1
}

# Then train with those parameters:
final_model = xgb.train(
    params=best_params,
    dtrain=dfull,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=50
)

print("Final model best iteration:", final_model.best_iteration)
print("Final model best score:", final_model.best_score)

# 5) Save the final model to disk, if desired:
final_model.save_model("/home/exx/myCode/horse-racing/FoxRiverAIRacing/src/models/XGB_ranking_best_model.json")

# 6) Predict on your validation set or a new test set
preds = final_model.predict(dvalid)
# 'preds' are ranking scores. Sort each group (race) descending to get top picks.

[0]	train-ndcg@5:0.78428	valid-ndcg@5:0.78697
[50]	train-ndcg@5:0.92983	valid-ndcg@5:0.93018
[100]	train-ndcg@5:0.94600	valid-ndcg@5:0.94570
[150]	train-ndcg@5:0.95381	valid-ndcg@5:0.95399
[200]	train-ndcg@5:0.95899	valid-ndcg@5:0.95963
[250]	train-ndcg@5:0.96314	valid-ndcg@5:0.96375
[300]	train-ndcg@5:0.96579	valid-ndcg@5:0.96653
[350]	train-ndcg@5:0.96820	valid-ndcg@5:0.96912
[400]	train-ndcg@5:0.97037	valid-ndcg@5:0.97117
[450]	train-ndcg@5:0.97203	valid-ndcg@5:0.97273
[500]	train-ndcg@5:0.97364	valid-ndcg@5:0.97457
[550]	train-ndcg@5:0.97499	valid-ndcg@5:0.97576
[600]	train-ndcg@5:0.97615	valid-ndcg@5:0.97675
[642]	train-ndcg@5:0.97705	valid-ndcg@5:0.97744
Final model best iteration: 632
Final model best score: 0.9774410556976562
