# CATBoost Model Predictions


In [None]:
#!conda install catboost -y

In [None]:
#spark.stop()

In [1]:
# Setup Environment
import os
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import optuna
from catboost import CatBoostRanker, CatBoostRegressor, CatBoostClassifier, Pool
import numpy as np
import itertools
from sklearn.preprocessing import LabelEncoder
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment 
# Set global references to None
spark = None
master_results_df = None
race_df = None
df = None
pred_data= None
train_df = None

In [3]:

spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


Spark session created successfully.


In [4]:
# This dataset has already been cleaned up in the LGB notebook and saved as a starting point
# It now just need to be converted to Panadas and run in the GBDT variant model (LGB, XGB, CatBoost)
pred_data = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/predict.parquet")



In [5]:
pred_data.count()

1871

In [6]:
#pred_data.printSchema()

# Switching to Pandas

In [7]:
# Convert Spark DataFrame -> Pandas DataFrame

pred_data = pred_data.toPandas()
# Quick info about the DataFrame
#print(pred_data.info())
#print(pred_data.head(5))

# Set race_id

In [8]:
# Create race_id for grouping
pred_data["race_id"] = (
    pred_data["course_cd"].astype(str) + "_" +
    pred_data["race_date"].astype(str) + "_" +
    pred_data["race_number"].astype(str)
)

### Group and sort data by race_id and group_id

In [9]:
# Generate unique numeric group_id from race_id
pred_data["group_id"] = pred_data["race_id"].astype("category").cat.codes

In [10]:
# Sort by race_id for consistency
pred_data = pred_data.sort_values("group_id", ascending=True)
pred_data.reset_index(drop=True, inplace=True)

### Drop Non-numeric Features

In [11]:
unused_columns = [
    # columns you do NOT use in features or group_id
    "race_date", "horse_name", "date_of_birth", "saddle_cloth_number"
]
cols_to_drop = [col for col in unused_columns if col in pred_data.columns]

pred_data.drop(columns=cols_to_drop, inplace=True)
print("After dropping unused cols, shape:", pred_data.shape)

After dropping unused cols, shape: (1871, 75)


# Convert DataTime columns to Numerical Values

In [12]:
# Convert datetime columns to numerical
pred_data["first_race_date_5"] = pd.to_datetime(pred_data["first_race_date_5"])
pred_data["most_recent_race_5"] = pd.to_datetime(pred_data["most_recent_race_5"])
pred_data["prev_race_date"] = pd.to_datetime(pred_data["prev_race_date"])

# Calculate numeric date features
pred_data["first_race_date_5_numeric"] = (pred_data["first_race_date_5"] - pd.Timestamp("1970-01-01")).dt.days
pred_data["most_recent_race_5_numeric"] = (pred_data["most_recent_race_5"] - pd.Timestamp("1970-01-01")).dt.days
pred_data["prev_race_date_numeric"] = (pred_data["prev_race_date"] - pd.Timestamp("1970-01-01")).dt.days

# Drop original datetime columns
pred_data.drop(columns=["first_race_date_5", "most_recent_race_5", "prev_race_date"], inplace=True)


# Assigned Numerical Features

In [13]:
# Define features and target
features = [
    'horse_id', 'course_cd', 'sex', 'equip', 'surface', 'med',
    'race_type', 'stk_clm_md', 'turf_mud_mark', 'layoff_cat',
    'race_number', 'purse', 'weight', 'claimprice', 'power', 'morn_odds',
    'avgspd', 'class_rating', 'net_sentiment', 'avg_spd_sd', 'ave_cl_sd',
    'hi_spd_sd', 'pstyerl', 'all_starts', 'all_win', 'all_place',
    'all_show', 'all_fourth', 'all_earnings', 'cond_starts', 'cond_win',
    'cond_place', 'cond_show', 'cond_fourth', 'cond_earnings',
    'avg_speed_5', 'best_speed', 'avg_beaten_len_5', 'avg_dist_bk_gate1_5',
    'avg_dist_bk_gate2_5', 'avg_dist_bk_gate3_5', 'avg_dist_bk_gate4_5',
    'avg_speed_fullrace_5', 'avg_stride_length_5', 'avg_strfreq_q1_5',
    'avg_strfreq_q2_5', 'avg_strfreq_q3_5', 'avg_strfreq_q4_5',
    'prev_speed', 'speed_improvement', 'days_off', 'avg_workout_rank_3',
    'jock_win_percent', 'jock_itm_percent', 'trainer_win_percent',
    'trainer_itm_percent', 'jt_win_percent', 'jt_itm_percent',
    'jock_win_track', 'jock_itm_track', 'trainer_win_track',
    'trainer_itm_track', 'jt_win_track', 'jt_itm_track', 'age_at_race_day',
    'distance_meters', 'count_workouts_3'
]

# Set the Category Columns with Label Encoder

In [14]:
# Keep original horse_id for identification
pred_data["horse_id_original"] = pred_data["horse_id"]

# Encode categorical columns
cat_cols = [
    "horse_id", "course_cd", "sex", "equip", "surface", "med",  
    "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat"
]
for c in cat_cols:
    lbl = LabelEncoder()
    pred_data[c] = lbl.fit_transform(pred_data[c].astype(str))


In [15]:
#pred_data.columns

In [16]:
# non_numeric_columns = pred_data.select_dtypes(include=["object", "category"]).columns.tolist()
# print("Non-Numeric Columns in Entire DataFrame:", non_numeric_columns)

# Create the Prediction Pool

In [17]:
# Create the prediction pool
prediction_pool = Pool(
    data=pred_data[features],
    group_id=pred_data["group_id"],  # Ensure group_id is present for ranking
    cat_features=cat_cols  # Include categorical feature names
)

In [18]:
print("Prediction Pool has rows:", prediction_pool.num_row())

Prediction Pool has rows: 1871


In [19]:
print(f"Prediction Pool Size: {len(prediction_pool.get_label()) if prediction_pool.get_label() else 'N/A'}")

Prediction Pool Size: N/A


In [20]:
print("Non-Numeric Columns:")
print(pred_data.select_dtypes(include=["object", "category"]).columns.tolist())

Non-Numeric Columns:
['race_id']


In [108]:
# Remove column and row limits for DataFrame display
import pandas as pd
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_rows", None)     # Show all rows

# Example: Print full DataFrame info
#print(pred_data[features].info())
#print(pred_data[features].dtypes)

In [109]:
# Ensure cat_features are valid
print("Categorical Features:", cat_cols)
print("Features List:", features)

# Check for mismatches
invalid_cat_features = [col for col in cat_cols if col not in features]
#print("Invalid Categorical Features:", invalid_cat_features)


Categorical Features: ['horse_id', 'course_cd', 'sex', 'equip', 'surface', 'med', 'race_type', 'stk_clm_md', 'turf_mud_mark', 'layoff_cat']
Features List: ['horse_id', 'course_cd', 'sex', 'equip', 'surface', 'med', 'race_type', 'stk_clm_md', 'turf_mud_mark', 'layoff_cat', 'race_number', 'purse', 'weight', 'claimprice', 'power', 'morn_odds', 'avgspd', 'class_rating', 'net_sentiment', 'avg_spd_sd', 'ave_cl_sd', 'hi_spd_sd', 'pstyerl', 'all_starts', 'all_win', 'all_place', 'all_show', 'all_fourth', 'all_earnings', 'cond_starts', 'cond_win', 'cond_place', 'cond_show', 'cond_fourth', 'cond_earnings', 'avg_speed_5', 'best_speed', 'avg_beaten_len_5', 'avg_dist_bk_gate1_5', 'avg_dist_bk_gate2_5', 'avg_dist_bk_gate3_5', 'avg_dist_bk_gate4_5', 'avg_speed_fullrace_5', 'avg_stride_length_5', 'avg_strfreq_q1_5', 'avg_strfreq_q2_5', 'avg_strfreq_q3_5', 'avg_strfreq_q4_5', 'prev_speed', 'speed_improvement', 'days_off', 'avg_workout_rank_3', 'jock_win_percent', 'jock_itm_percent', 'trainer_win_percent

In [110]:
#print(pred_data.groupby("group_id").size().head(10))  # Group sizes by group_id

# Load the Saved Model

In [115]:
from catboost import CatBoostRanker

# Load the saved CatBoost model
cat_model = CatBoostRanker()
cat_model.load_model(
    "/home/exx/myCode/horse-racing/FoxRiverAIRacing/src/models/catboost_984316_2025-01-12_final.cbm",
    format="cbm"
)

<catboost.core.CatBoostRanker at 0x7fcab09a3760>

# Feature Importance

In [117]:
# Retrieve feature importance using PredictionValuesChange
feature_importance = cat_model.get_feature_importance(type="PredictionValuesChange")

# Retrieve feature names
feature_names = cat_model.feature_names_

# Pair feature names with their importance
importance_dict = dict(zip(feature_names, feature_importance))

# Sort by importance
sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print sorted feature importance
for feature, importance in sorted_importance:
    print(f"{feature}: {importance}")

prev_speed: 17.115107208174365
speed_improvement: 16.93909660442474
class_rating: 7.067856629289685
best_speed: 4.892519917841825
course_cd: 4.309212272001578
avg_beaten_len_5: 4.154901547689146
horse_id: 3.9430072434537897
avg_speed_5: 3.369067676950059
morn_odds: 3.267383400572228
avg_dist_bk_gate4_5: 2.9516203480286007
purse: 1.436664140490428
all_earnings: 1.4079492478775524
power: 1.3008053285064956
net_sentiment: 1.2990937365850777
all_starts: 1.271951283661648
cond_earnings: 1.266172468733301
age_at_race_day: 1.0722257745588755
avgspd: 0.8356976854925623
trainer_itm_track: 0.7731046698846702
weight: 0.7657788425598777
jock_itm_percent: 0.7536795972613893
cond_starts: 0.7513864939224291
jock_win_track: 0.7497219063945099
pstyerl: 0.7268763620597681
days_off: 0.7253952335984634
cond_win: 0.6964474505762684
layoff_cat: 0.6696095993835933
jock_win_percent: 0.667825096953167
trainer_win_track: 0.6674268776551587
avg_speed_fullrace_5: 0.6625414226029679
trainer_itm_percent: 0.64562591

# Run Predictions

In [27]:
# Generate predictions
predictions = final_model.predict(prediction_pool)

# Attach predictions to the original dataset
pred_data["predicted_score"] = predictions

# Rank horses within each race based on their predicted scores
pred_data["predicted_rank"] = pred_data.groupby("race_id")["predicted_score"].rank(
    method="first", ascending=False
)

In [91]:
winners = pred_data[pred_data["predicted_rank"] == 1].head(30)

# Create a filtered subset
subset_df = winners.loc[
    pred_data["race_id"] == target_race,
    ["race_id", "predicted_score", "predicted_rank", "horse_id_original"]]

print(subset_df)

                race_id  predicted_score  predicted_rank  horse_id_original
134  AQU_2025-01-11_9.0         5.062589             1.0          1089648.0


In [None]:
sorted_preds = pred_data.sort_values(by=["race_id", "predicted_rank"])
sorted_preds.head(20)

In [29]:
top_horses = pred_data[pred_data["predicted_rank"] <= 4]

In [37]:
print(top_horses) # race_id, predicted_score, predicted_rank 

      course_cd  race_number  horse_id     purse  weight  sex  equip  \
0             0          1.0      1631   44000.0   122.0    1      0   
1             0          1.0      1682   44000.0   117.0    1      0   
3             0          1.0      1760   44000.0   122.0    1      0   
4             0          1.0      1498   44000.0   122.0    1      0   
7             0          2.0       199   32000.0   120.0    0      0   
9             0          2.0      1840   32000.0   115.0    2      0   
10            0          2.0       209   32000.0   120.0    2      0   
12            0          2.0       183   32000.0   120.0    2      0   
14            0          3.0       771   50000.0   123.0    4      0   
16            0          3.0      1713   50000.0   123.0    4      0   
20            0          3.0       462   50000.0   123.0    1      0   
21            0          3.0       403   50000.0   121.0    4      0   
22            0          4.0      1337   37000.0   123.0    1   

In [93]:
# Choose the race_id you're interested in
target_race = "AQU_2025-01-11_9.0"

# Create a filtered subset
subset_df = pred_data.loc[
    pred_data["race_id"] == target_race,
    ["race_id", "predicted_score", "predicted_rank", "horse_id_original"]
]

# Sort by predicted_rank ascending (rank=1 is top)
subset_df = subset_df.sort_values(by="predicted_rank")

# Show the result
print(subset_df)

                race_id  predicted_score  predicted_rank  horse_id_original
134  AQU_2025-01-11_9.0         5.062589             1.0          1089648.0
143  AQU_2025-01-11_9.0         1.072100             2.0          1099289.0
135  AQU_2025-01-11_9.0        -0.569080             3.0          1140749.0
139  AQU_2025-01-11_9.0        -0.716339             4.0          1109612.0
142  AQU_2025-01-11_9.0        -1.660384             5.0          1089650.0
133  AQU_2025-01-11_9.0        -2.018650             6.0          2174461.0
144  AQU_2025-01-11_9.0        -2.953976             7.0          1099296.0
137  AQU_2025-01-11_9.0        -3.423146             8.0          2174465.0
138  AQU_2025-01-11_9.0        -3.482755             9.0          2174459.0
136  AQU_2025-01-11_9.0        -3.862698            10.0          2174464.0
141  AQU_2025-01-11_9.0        -4.796038            11.0          2174469.0
140  AQU_2025-01-11_9.0        -4.879486            12.0           981714.0


In [105]:
# Create a filtered subset
subset_df = pred_data[
    ["race_id", "horse_id_original", "predicted_rank", "predicted_score"]
]

# Sort by predicted_rank ascending (rank=1 is top)
subset_df = subset_df.sort_values(by=["race_id", "predicted_rank"])

# Show the result
#print(subset_df[["race_id", "horse_id_original", "predicted_score"]])