# XGBoost Model Predictions



In [1]:
# Setup Environment

import os
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
import optuna
import itertools
from sklearn.preprocessing import LabelEncoder
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment 
# Set global references to None
spark = None
master_results_df = None
df = None
training_data = None
train_df = None

In [3]:

spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


Spark session created successfully.


In [4]:
upcoming_races = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/upcoming_races.parquet")
training_data = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/train_df")

In [5]:
training_data.count()

394322

In [6]:
upcoming_races.count()

1078

# Switching to Pandas

In [7]:
# Convert Spark DataFrame -> Pandas DataFrame
upcoming_races = upcoming_races.toPandas()
training_data = training_data.toPandas()
# Quick info about the DataFrame
#print(df.info())
#print(df.head(5))

                                                                                

## Set the race_id

In [8]:
upcoming_races["race_id"] = (
    upcoming_races["course_cd"].astype(str) + "_" +
    upcoming_races["race_date"].astype(str) + "_" +
    upcoming_races["race_number"].astype(str)
)

## Group and sort data by race_id and group_id

In [9]:
# Generate unique numeric group_id from race_id
upcoming_races["group_id"] = upcoming_races["race_id"].astype("category").cat.codes

In [10]:
# Sort by race_id for consistency
upcoming_races = upcoming_races.sort_values("group_id", ascending=True)
upcoming_races.reset_index(drop=True, inplace=True)

## Drop Non-numeric Features

In [11]:
unused_columns = [
    # columns you do NOT use in features or group_id
    "race_date", "date_of_birth"
]
cols_to_drop = [col for col in unused_columns if col in upcoming_races.columns]

upcoming_races.drop(columns=cols_to_drop, inplace=True)
print("After dropping unused cols, shape:", upcoming_races.shape)

After dropping unused cols, shape: (1078, 77)


# Convert DataTime columns to Numerical Values

In [12]:
# Convert datetime columns to numerical
upcoming_races["first_race_date_5"] = pd.to_datetime(upcoming_races["first_race_date_5"])
upcoming_races["most_recent_race_5"] = pd.to_datetime(upcoming_races["most_recent_race_5"])
upcoming_races["prev_race_date"] = pd.to_datetime(upcoming_races["prev_race_date"])

# Calculate numeric date features
upcoming_races["first_race_date_5_numeric"] = (upcoming_races["first_race_date_5"] - pd.Timestamp("1970-01-01")).dt.days
upcoming_races["most_recent_race_5_numeric"] = (upcoming_races["most_recent_race_5"] - pd.Timestamp("1970-01-01")).dt.days
upcoming_races["prev_race_date_numeric"] = (upcoming_races["prev_race_date"] - pd.Timestamp("1970-01-01")).dt.days

# Drop original datetime columns
upcoming_races.drop(columns=["first_race_date_5", "most_recent_race_5", "prev_race_date"], inplace=True)


## Set Rank/Label - Calculate Frequency Encoding from Historical Data:

Use historical data to calculate the frequency of each horse_id.

In [13]:
# If the largest official_fin is 20 (some races can have 20 horses),
# then label = (21 - official_fin).
# So official_fin=1 => label=20, official_fin=2 =>19, etc.
# If your max is 14, you can do (15 - official_fin).  Just ensure "best" horse has largest label.
if "official_fin" in training_data.columns:
    # Calculate 'rank' and add it to the DataFrame
    training_data["rank"] = 21 - training_data["official_fin"]
    # Drop the 'official_fin' column
    training_data.drop(columns=["official_fin"], inplace=True)

## Simple Target Encoding for (XGBoost/LightGBM)

In [14]:
# Ensure original horse_id is kept
upcoming_races["horse_id_original"] = upcoming_races["horse_id"]

def encode_horse_id_with_training_mean(upcoming_df, training_df, horse_col, target_col):
    """
    Encode horse_id in the upcoming data using the historical mean target
    from the training data.

    Parameters:
    - upcoming_df: DataFrame for races to predict
    - training_df: DataFrame with historical data
    - horse_col: Column containing horse IDs
    - target_col: Target column (e.g., rank, performance metric)

    Returns:
    - DataFrame with encoded horse_id
    """
    # Compute historical mean for each horse_id from the training data
    horse_means = training_df.groupby(horse_col)[target_col].mean()

    # Map the historical means to the upcoming races
    upcoming_df["horse_id_encoded"] = upcoming_df[horse_col].map(horse_means)

    # If a horse_id in upcoming races doesn't exist in training data, fill with the global mean
    global_mean = training_df[target_col].mean()
    upcoming_df["horse_id_encoded"] = upcoming_df["horse_id_encoded"].fillna(global_mean)

    return upcoming_df

# Example usage
upcoming_races = encode_horse_id_with_training_mean(
    upcoming_df=upcoming_races,
    training_df=training_data,  # Historical data
    horse_col="horse_id",
    target_col="rank"  # Column used for encoding (from historical data)
)

# Drop the original horse_id if it's not needed
upcoming_races.drop(columns=["horse_id"], inplace=True)

# Assigned Numerical Features

In [15]:

features = ['race_number','horse_id_encoded','purse','weight','claimprice','power','morn_odds','avgspd','class_rating',
    'net_sentiment','avg_spd_sd','ave_cl_sd','hi_spd_sd','pstyerl','all_starts','all_win','all_place',
    'all_show','all_fourth','all_earnings','cond_starts','cond_win','cond_place','cond_show','cond_fourth',
    'cond_earnings','avg_speed_5','best_speed','avg_beaten_len_5','avg_dist_bk_gate1_5','avg_dist_bk_gate2_5',
    'avg_dist_bk_gate3_5','avg_dist_bk_gate4_5','avg_speed_fullrace_5','avg_stride_length_5','avg_strfreq_q1_5',
    'avg_strfreq_q2_5','avg_strfreq_q3_5','avg_strfreq_q4_5','prev_speed','speed_improvement','days_off',
    'avg_workout_rank_3','jock_win_percent','jock_itm_percent','trainer_win_percent','trainer_itm_percent',
    'jt_win_percent','jt_itm_percent','jock_win_track','jock_itm_track','trainer_win_track','trainer_itm_track',
    'jt_win_track','jt_itm_track','age_at_race_day','distance_meters', 'count_workouts_3']

In [17]:
# upcoming_races.columns

# Set the Category Columns with Label Encoder

In [18]:
cat_cols = [ "course_cd", "sex", "equip", "surface", "med",  
            "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat"]
for c in cat_cols:
    lbl = LabelEncoder()
    upcoming_races[c] = lbl.fit_transform(upcoming_races[c].astype(str))

# Specify categorical feature indices
cat_cols_indices = [training_data.columns.get_loc(col) for col in cat_cols]

In [27]:
X_predict[col]

KeyError: 'layoff_cat'

# Make DMatrix Object

In [25]:
X_predict.columns

Index(['race_number', 'horse_id_encoded', 'purse', 'weight', 'claimprice',
       'power', 'morn_odds', 'avgspd', 'class_rating', 'net_sentiment',
       'avg_spd_sd', 'ave_cl_sd', 'hi_spd_sd', 'pstyerl', 'all_starts',
       'all_win', 'all_place', 'all_show', 'all_fourth', 'all_earnings',
       'cond_starts', 'cond_win', 'cond_place', 'cond_show', 'cond_fourth',
       'cond_earnings', 'avg_speed_5', 'best_speed', 'avg_beaten_len_5',
       'avg_dist_bk_gate1_5', 'avg_dist_bk_gate2_5', 'avg_dist_bk_gate3_5',
       'avg_dist_bk_gate4_5', 'avg_speed_fullrace_5', 'avg_stride_length_5',
       'avg_strfreq_q1_5', 'avg_strfreq_q2_5', 'avg_strfreq_q3_5',
       'avg_strfreq_q4_5', 'prev_speed', 'speed_improvement', 'days_off',
       'avg_workout_rank_3', 'jock_win_percent', 'jock_itm_percent',
       'trainer_win_percent', 'trainer_itm_percent', 'jt_win_percent',
       'jt_itm_percent', 'jock_win_track', 'jock_itm_track',
       'trainer_win_track', 'trainer_itm_track', 'jt_win_track',

In [22]:
# Retain horse_name and saddle_cloth_number
extra_cols = ["horse_name"]  # Columns to retain for output

# Create a DataFrame with only the necessary columns
X_predict = upcoming_races[features].copy()

# Convert categorical columns if needed
cat_cols = ["course_cd", "sex", "equip", "surface", "med", "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat"]
for col in cat_cols:
    if col in X_predict.columns:
        X_predict[col] = X_predict[col].astype("category")

# Handle missing values (adjust fill strategy as needed)
X_predict.fillna(0, inplace=True)

# Create DMatrix for prediction
dpredict = xgb.DMatrix(X_predict, feature_names=features)

# Define the path to the saved model
model_path = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/src/models/XGB_914054_optuna_ranking_model-2025-01-11.json"

# Load the model
xgb_model = xgb.Booster()
xgb_model.load_model(model_path)

# Print the model to verify it has been loaded
print(xgb_model)

# Step 3: Predict scores
predicted_scores = xgb_model.predict(dpredict)


<xgboost.core.Booster object at 0x7f4eb9049d80>


# Load the XGB Model

In [None]:
# Define the path to the saved model
model_path = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/src/models/XGB_914054_optuna_ranking_model-2025-01-11.json"

# Load the model
xgb_model = xgb.Booster()
xgb_model.load_model(model_path)

# Print the model to verify it has been loaded
print(xgb_model)

In [None]:


# Step 3: Predict scores
X_predict_values = X_predict.values
predicted_scores = xgb_model.predict(X_predict_values)