# XGBoost Model Preparation



In [1]:
# Setup Environment

import os
import logging
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import set_config
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
import pyspark.sql.functions as F
import xgboost as xgb
from sklearn import set_config
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_spark, 
                identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Set global references to None
spark = None
master_results_df = None
race_df = None
df = None

In [3]:

spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


2025-01-01 10:33:49,648 - INFO - Environment setup initialized.


Spark session created successfully.


In [4]:
# This dataset has already been cleaned up in the LGB notebook and saved as a starting point
# It now just need to be converted to Panadas and run in the GBDT variant model (LGB, XGB, CatBoost)
race_df = spark.read.parquet(os.path.join(parquet_dir, "race_df_p2.parquet"))


In [5]:
race_df.count()

777100

# Switching to Pandas

In [6]:
race_df = race_df.toPandas()
# Quick info about the DataFrame
#print(df.info())
#print(df.head(5))

                                                                                

## Set the race_id

In [None]:
race_df["race_id"] = (
    race_df["course_cd"].astype(str) + "_" +
    race_df["race_date"].astype(str) + "_" +
    race_df["race_number"].astype(str)
)

In [None]:
group_array = race_df.groupby("race_id").size().values  # array of group sizes
print(group_array)

In [None]:
# If the largest official_fin is 20 (some races can have 20 horses),
# then label = (21 - official_fin).
# So official_fin=1 => label=20, official_fin=2 =>19, etc.
# If your max is 14, you can do (15 - official_fin).  Just ensure "best" horse has largest label.
race_df["rank"] = 21 - race_df["official_fin"]

In [None]:
if "official_fin" in race_df.columns:
    race_df.drop(columns=["official_fin"], inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ["course_cd", "sex", "equip", "surface", "trk_cond", "weather", "med", 
            "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat"]
for c in cat_cols:
    lbl = LabelEncoder()
    race_df[c] = lbl.fit_transform(race_df[c].astype(str))

In [None]:
race_df = race_df.sort_values("race_id", ascending=True)

In [None]:
features = [
    # Basic numeric columns
    "purse",
    "wps_pool",
    "weight",
    "claimprice",
    "power",
    "morn_odds",
    "distance_meters",
    "avgspd",
    "class_rating",
    "todays_cls",
    "net_sentiment",
    "avg_spd_sd",
    "ave_cl_sd",
    "hi_spd_sd",
    "pstyerl",

    # Cumulative performance stats
    "all_starts",
    "all_win",
    "all_place",
    "all_show",
    "all_fourth",
    "all_earnings",
    "cond_starts",
    "cond_win",
    "cond_place",
    "cond_show",
    "cond_fourth",
    "cond_earnings",

    # Recent form metrics
    "avg_fin_3",
    "avg_beaten_3",
    "avg_speed_3",
    "avg_fin_5",
    "avg_beaten_5",
    "avg_speed_5",
    "speed_improvement",
    "days_off",

    # Sectionals / GPS
    "avgtime_gate1",
    "avgtime_gate2",
    "avgtime_gate3",
    "avgtime_gate4",
    "total_distance_ran",
    "running_time",
    "speed_q1",
    "speed_q2",
    "speed_q3",
    "speed_q4",
    "total_dist_covered",
    "avg_acceleration",
    "net_progress_gain",
    "gps_avg_stride_length",

    # Jockey/Trainer stats
    "jock_win_percent",
    "jock_itm_percent",
    "trainer_win_percent",
    "trainer_itm_percent",
    "jt_win_percent",
    "jt_itm_percent",
    "jock_win_track",
    "jock_itm_track",
    "trainer_win_track",
    "trainer_itm_track",
    "jt_win_track",
    "jt_itm_track",

    # Other
    "age_at_race_day",
    "is_first_race",
]


X_all = race_df[features].values
y_all = race_df['rank'].values
race_ids = race_df['race_id'].values

### Split the data

In [None]:
from sklearn.utils import shuffle

unique_races = race_df['race_id'].unique()
unique_races = shuffle(unique_races, random_state=42)

train_ratio = 0.8
cut = int(len(unique_races) * train_ratio)
train_races = set(unique_races[:cut])
valid_races = set(unique_races[cut:])

# Create a boolean mask
train_mask = race_df['race_id'].isin(train_races)
valid_mask  = race_df['race_id'].isin(valid_races)

# Now slice
X_train = X_all[train_mask]
y_train = y_all[train_mask]
race_id_train = race_ids[train_mask]

X_valid = X_all[valid_mask]
y_valid = y_all[valid_mask]
race_id_valid = race_ids[valid_mask]

In [None]:
import numpy as np

def make_group_array(race_id_array):
    """
    Returns an array of group sizes in the order of race_id_array’s actual row order.
    Expects that race_id_array is sorted or lumps each race contiguously.
    """
    # Approach 1: Rely on the data already being grouped in contiguous rows
    # If your data is not guaranteed to be sorted by race_id, you can sort it first.
    # But let's assume from the train_mask/valid_mask approach that the relative order
    # is consistent. We can just accumulate counts.

    # A simpler approach: group the data by unique race_id in the order they appear
    # and store the size for each chunk.
    # We'll do a loop approach:

    groups = []
    current_race = None
    current_count = 0

    group_sequence = []

    for rid in race_id_array:
        if rid != current_race:
            # if we have an existing group, push it
            if current_race is not None:
                groups.append(current_count)
            current_race = rid
            current_count = 1
        else:
            current_count += 1
    # push the last group
    if current_race is not None and current_count > 0:
        groups.append(current_count)

    return np.array(groups, dtype=np.int32)

group_train = make_group_array(race_id_train)
group_valid  = make_group_array(race_id_valid)

# XGBoost

In [17]:
set_config(display="text")  # Switch to text-based display

xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",  # Multi-class classification
    predictor='gpu_predictor', # GPU acceleration
    num_class=8,               # Number of classes (0 to 7)
    max_depth=6,               # Tree depth
    learning_rate=0.1,         # Learning rate
    n_estimators=100,          # Number of trees
    eval_metric="mlogloss",    # Log loss for multi-class
    early_stopping_rounds=10   # Specify early stopping rounds here
)

In [18]:
# Train the model
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True  # Use verbose for training progress
)


[0]	validation_0-mlogloss:2.06199	validation_1-mlogloss:2.06233
[1]	validation_0-mlogloss:2.04695	validation_1-mlogloss:2.04759
[2]	validation_0-mlogloss:2.03382	validation_1-mlogloss:2.03479
[3]	validation_0-mlogloss:2.02233	validation_1-mlogloss:2.02360
[4]	validation_0-mlogloss:2.01206	validation_1-mlogloss:2.01363
[5]	validation_0-mlogloss:2.00295	validation_1-mlogloss:2.00482
[6]	validation_0-mlogloss:1.99479	validation_1-mlogloss:1.99699
[7]	validation_0-mlogloss:1.98747	validation_1-mlogloss:1.98996
[8]	validation_0-mlogloss:1.98084	validation_1-mlogloss:1.98360
[9]	validation_0-mlogloss:1.97486	validation_1-mlogloss:1.97792
[10]	validation_0-mlogloss:1.96942	validation_1-mlogloss:1.97277
[11]	validation_0-mlogloss:1.96444	validation_1-mlogloss:1.96807
[12]	validation_0-mlogloss:1.95992	validation_1-mlogloss:1.96384
[13]	validation_0-mlogloss:1.95579	validation_1-mlogloss:1.95998
[14]	validation_0-mlogloss:1.95200	validation_1-mlogloss:1.95647
[15]	validation_0-mlogloss:1.94855	

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=10,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_class=8, num_parallel_tree=None, objective='multi:softmax', ...)

In [19]:
# from sklearn.utils.class_weight import compute_class_weight

# # Assuming y_train contains the labels
# class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Use 'balanced' which automatically computes the class weights based on training data.

In [20]:
# Predict on test set
y_pred = xgb_model.predict(X_test)

# Evaluate
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[10995  3068  2111  1205  1207   547     8  2014]
 [ 8152  3238  2674  1656  1836   836    16  2720]
 [ 6184  2977  2877  2054  2455  1184    21  3353]
 [ 4604  2544  2670  2280  3121  1688    28  4086]
 [ 3433  2061  2374  2265  3347  1966    44  4933]
 [ 2353  1499  1779  1730  2906  2162    45  5525]
 [ 1422   897  1034   975  1745  1621    50  5669]
 [ 1345   831   972   780  1409  1474    29 12336]]

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.52      0.37     21155
           1       0.19      0.15      0.17     21128
           2       0.17      0.14      0.15     21105
           3       0.18      0.11      0.13     21021
           4       0.19      0.16      0.17     20423
           5       0.19      0.12      0.15     17999
           6       0.21      0.00      0.01     13413
           7       0.30      0.64      0.41     19176

    accuracy                           0.24    155420
   macr

In [22]:
importances = xgb_model.feature_importances_
for col, imp in zip(feature_cols, importances):
    print(f"{col}: {imp:.4f}")

morn_odds: 0.4517
net_sentiment: 0.0453
power: 0.0157
avg_spd_sd: 0.0172
hi_spd_sd: 0.0184
avgspd: 0.0175
ave_cl_sd: 0.0118
cond_win: 0.0154
cond_place: 0.0095
all_win: 0.0197
all_place: 0.0129
cond_earnings: 0.0124
all_earnings: 0.0170
weight: 0.0101
cond_show: 0.0100
all_show: 0.0153
cond_starts: 0.0121
all_starts: 0.0161
class_rating: 0.0171
age_at_race_day: 0.0117
distance: 0.0117
horse_id: 0.0104
claimprice: 0.0133
wps_pool: 0.0274
cond_fourth: 0.0115
all_fourth: 0.0141
purse: 0.0180
pstyerl: 0.0195
race_number: 0.0447
start_position: 0.0726


# Predicting Probabilities for Ranking

In [21]:
# Step 1: Predict Probabilities
predicted_probs = rf_model.predict_proba(X_test_scaled)[:, 1]  # Probability for winning



In [22]:
# Step 2: Combine Metadata with Predictions
ranked_df = metadata_test.copy()
ranked_df["predicted_probability"] = predicted_probs

# Step 3: Rank Horses
ranked_df["rank"] = (
    ranked_df.groupby(["race_date", "race_number"])["predicted_probability"]
    .rank(method="first", ascending=False)
)

# Step 4: Sort Ranked DataFrame
ranked_df = ranked_df.sort_values(by=["race_date", "race_number", "rank"])

# Step 5: Save or Display Results
print(ranked_df.head(20))  # Display top 20 ranked horses
ranked_df.to_csv("ranked_horses.csv", index=False)  # Save to CSV if needed

         race_date  race_number  horse_id  predicted_probability  rank
443968  2022-01-01            1    257862                  0.190   1.0
403657  2022-01-01            1     96356                  0.189   2.0
557148  2022-01-01            1     14941                  0.188   3.0
143143  2022-01-01            1    294126                  0.187   4.0
732523  2022-01-01            1    113458                  0.148   5.0
241696  2022-01-01            1      9903                  0.143   6.0
732524  2022-01-01            1     35560                  0.085   7.0
732528  2022-01-01            1    181319                  0.084   8.0
403659  2022-01-01            1     96358                  0.084   9.0
200286  2022-01-01            1    269764                  0.084  10.0
443966  2022-01-01            1     10298                  0.078  11.0
200285  2022-01-01            1     62596                  0.078  12.0
424227  2022-01-01            1    163465                  0.077  13.0
443969

In [23]:
import pandas as pd
import numpy as np

# Assuming X_test corresponds to the feature matrix for your test set
# And you have additional columns `race_date`, `race_number`, `horse_id` in the original data.

# Step 1: Predict Probabilities
proba = rf_model.predict_proba(X_test_scaled)  # Shape: (n_samples, 2)
predicted_probs = proba[:, 1]  # Probability for class 1 (winning)

# Step 2: Create a DataFrame for `X_test` with metadata
# Replace with actual metadata (e.g., from original test data before splitting)
metadata = pd.DataFrame({
    "race_date": race_date,  # Replace with actual race dates
    "race_number": race_number,  # Replace with actual race numbers
    "horse_id": horse_id,  # Replace with actual horse IDs
})

# Combine predicted probabilities with metadata
ranked_df = metadata.copy()
ranked_df["predicted_probability"] = predicted_probs

# Step 3: Rank Horses
# Group by race_date and race_number, then rank by predicted_probability
ranked_df["rank"] = (
    ranked_df.groupby(["race_date", "race_number"])["predicted_probability"]
    .rank(method="first", ascending=False)
)

# Step 4: Sort Ranked DataFrame
ranked_df = ranked_df.sort_values(by=["race_date", "race_number", "rank"])

# Step 5: Save or Display Results
print(ranked_df.head(20))  # Display top 20 ranked horses
ranked_df.to_csv("ranked_horses.csv", index=False)  # Save to CSV if needed



NameError: name 'race_date' is not defined

In [None]:
ranked_df = metadata_test.copy()  # Metadata includes race_date, race_number, horse_id
ranked_df["predicted_probability"] = predicted_probs
ranked_df["actual_label"] = y_test  # Optional: for evaluation purposes

## Predict via Ranking using Test Race from Dataset

You can test the model on a race in your dataset by excluding the target variable (official_fin) and making predictions as if it were a new race. Here’s how you can proceed:

Step 1: Select a Race for Testing

Extract a specific race from your dataset based on race_date and race_number.

race_to_test = df[
    (df["race_date"] == "2023-05-15") & (df["race_number"] == 5)
].drop(columns=["official_fin"])  # Drop the target variable

Step 2: Prepare the Features

Ensure that the extracted race has the same feature processing (scaling, encoding) as was done during training.

# Extract features
X_race = race_to_test[feature_cols]  # Ensure `feature_cols` matches the training feature set

# Scale features
X_race_scaled = scaler.transform(X_race)  # Use the scaler fitted during training

Step 3: Predict Probabilities

Use the model to predict probabilities for this specific race.

# Predict probabilities
race_probs = rf_model.predict_proba(X_race_scaled)

# Attach probabilities back to the metadata
race_to_test["predicted_probability"] = race_probs[:, 1]  # Assuming class 1 is 'winning'

# Rank horses by predicted probability
race_to_test["rank"] = race_to_test["predicted_probability"].rank(ascending=False)
race_to_test = race_to_test.sort_values(by="rank")

Step 4: Compare to Actual Results

If you still have the actual official_fin values in a backup, compare the model’s ranking against the real results.

# Add actual finish positions for comparison (if available)
actual_results = df[
    (df["race_date"] == "2023-05-15") & (df["race_number"] == 5)
][["horse_id", "official_fin"]]

race_to_test = race_to_test.merge(actual_results, on="horse_id", how="left")

# Display results
print(race_to_test[["horse_id", "predicted_probability", "rank", "official_fin"]])

Testing Without horse_id, race_date, or race_number

If you remove horse_id, race_date, or race_number, the model should still be able to make predictions if those columns are not part of the feature set. However, you won’t be able to group or rank the predictions by race because race_date and race_number are critical for distinguishing horses in the same race.

To simulate a future race:
	1.	Select a race that the model hasn’t seen during training (e.g., from a holdout set).
	2.	Remove any identifying metadata (e.g., horse_id, race_date, race_number).
	3.	Process the features the same way as during training.
	4.	Use the model to predict probabilities for the horses in that race.
	5.	Rank the predictions by probability.

Testing with a Future Race

For a future race:
	1.	Collect horse features for that race (e.g., from Equibase).
	2.	Process the data the same way as the training data.
	3.	Use the model to make predictions.
	4.	Rank horses by predicted probabilities.
	5.	Wait for the race results and compare the model’s rankings to the actual outcomes.

This approach ensures the model is evaluated on unseen data, simulating a real-world scenario.

## Hyperparameter

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "max_features": ["auto", "sqrt", 0.5],
    "class_weight": [None, "balanced"]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",    # or "f1", "balanced_accuracy", etc.
    cv=3,                  # 3-fold cross-validation
    n_jobs=-1             # use all CPU cores
)

grid_search.fit(X_train, y_train)
print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Then evaluate on the test set:
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
print("Final Test Accuracy:", (y_pred_best == y_test).mean())