# CatBoost Model Preparation


In [None]:
#!conda install catboost -y

In [1]:
# Setup Environment
import os
import logging
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import set_config
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
import pyspark.sql.functions as F
import xgboost as xgb
from sklearn import set_config
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_spark, 
                identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

In [3]:

# Project helper from data_utils. Judging by the unpacked names it yields the Spark
# session, the JDBC URL/properties, the parquet directory and the log file path.
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


Spark session created successfully.


In [4]:
# This dataset was already cleaned up in the LGB notebook and saved as a starting point.
# It now just needs to be converted to pandas and run through the GBDT variant models
# (LGB, XGB, CatBoost).
race_df = spark.read.parquet(os.path.join(parquet_dir, "race_df_p2.parquet"))



In [5]:
# Row count of the Spark DataFrame, as a sanity check before converting to pandas.
race_df.count()

324041

# Switching to Pandas

In [6]:
# Convert the Spark DataFrame to pandas. `race_df` is rebound to a different type
# here, so the guard keeps the cell safe to re-run: without it a second execution
# fails because a pandas DataFrame has no `.toPandas()` method.
if not isinstance(race_df, pd.DataFrame):
    race_df = race_df.toPandas()

                                                                                

# Set race_id

In [7]:
# Per-race key of the form "<course_cd>_<race_date>_<race_number>", built from the
# string form of the three identifying columns.
race_df["race_id"] = (
    race_df["course_cd"].astype(str)
    .add("_")
    .add(race_df["race_date"].astype(str))
    .add("_")
    .add(race_df["race_number"].astype(str))
)


# Set Rank/Label

In [9]:
# Ranking label: the best horse must carry the largest label, so the finishing
# position is flipped. Assuming at most 20 runners per race, label = 21 - official_fin
# (official_fin=1 -> 20, official_fin=2 -> 19, ...). With a smaller maximum field,
# e.g. 14, (15 - official_fin) would work just as well.
race_df["rank"] = race_df["official_fin"].rsub(21)

In [10]:
# The finishing position is now encoded in `rank`, so drop the raw column.
# `errors="ignore"` makes this a no-op when the column is already gone (the cell can
# be re-run), and rebinding the name avoids mutating the frame with `inplace=True`.
race_df = race_df.drop(columns=["official_fin"], errors="ignore")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns to integer-encode.
# NOTE(review): none of these columns appear in the `features` list used for training
# later in this notebook, so this encoding currently does not reach the model —
# confirm whether they were meant to be added as features.
cat_cols = [
    "course_cd", "sex", "equip", "surface", "trk_cond", "weather",
    "med", "race_type", "stk_clm_md", "turf_mud_mark", "layoff_cat",
]
for cat_col in cat_cols:
    # A fresh encoder per column; every value is cast to str before encoding.
    race_df[cat_col] = LabelEncoder().fit_transform(race_df[cat_col].astype(str))

# Sort by race_id

In [11]:
# Order rows by race key so that all runners of one race sit in adjacent rows
# (the group ids built later in this notebook rely on that ordering being kept).
race_df = race_df.sort_values(by="race_id")

# Set Features

In [12]:
# Model input columns, grouped by origin. The arrays X_all / y_all / race_ids are
# extracted from these in the next cell ("Extract X, y, race_id"), so that step is
# not repeated here.
#
# NOTE(review): the "Sectionals / GPS" columns (running_time, speed_q*, avgtime_gate*,
# total_distance_ran, ...) look like measurements of the race being predicted. If so,
# they leak the outcome into the features — confirm they are pre-race aggregates.
features = [
    # Basic numeric columns
    "purse",
    "wps_pool",
    "weight",
    "claimprice",
    "power",
    "morn_odds",
    "distance_meters",
    "avgspd",
    "class_rating",
    "todays_cls",
    "net_sentiment",
    "avg_spd_sd",
    "ave_cl_sd",
    "hi_spd_sd",
    "pstyerl",

    # Cumulative performance stats
    "all_starts",
    "all_win",
    "all_place",
    "all_show",
    "all_fourth",
    "all_earnings",
    "cond_starts",
    "cond_win",
    "cond_place",
    "cond_show",
    "cond_fourth",
    "cond_earnings",

    # Recent form metrics
    "avg_fin_3",
    "avg_beaten_3",
    "avg_speed_3",
    "avg_fin_5",
    "avg_beaten_5",
    "avg_speed_5",
    "speed_improvement",
    "days_off",

    # Sectionals / GPS
    "avgtime_gate1",
    "avgtime_gate2",
    "avgtime_gate3",
    "avgtime_gate4",
    "total_distance_ran",
    "running_time",
    "speed_q1",
    "speed_q2",
    "speed_q3",
    "speed_q4",
    "total_dist_covered",
    "avg_acceleration",
    "net_progress_gain",
    "gps_avg_stride_length",

    # Jockey/Trainer stats
    "jock_win_percent",
    "jock_itm_percent",
    "trainer_win_percent",
    "trainer_itm_percent",
    "jt_win_percent",
    "jt_itm_percent",
    "jock_win_track",
    "jock_itm_track",
    "trainer_win_track",
    "trainer_itm_track",
    "jt_win_track",
    "jt_itm_track",

    # Other
    "age_at_race_day",
    "is_first_race",
]

# A repeated name would silently duplicate a column in the feature matrix.
assert len(features) == len(set(features)), "duplicate feature names in `features`"

# Extract X, y, race_id

In [13]:
# Pull plain arrays out of the frame, one entry per row:
# feature matrix, ranking label, and race key.
X_all, y_all, race_ids = (
    race_df[features].values,
    race_df["rank"].values,
    race_df["race_id"].values,
)

# Split Data

In [14]:
from sklearn.utils import shuffle

# Split at race level (not row level) so that all runners of a race land on the
# same side: shuffle the distinct race keys with a fixed seed, then cut 80/20.
unique_races = shuffle(race_df["race_id"].unique(), random_state=42)

train_ratio = 0.8
cut = int(len(unique_races) * train_ratio)
train_races, valid_races = set(unique_races[:cut]), set(unique_races[cut:])

# Row-level boolean masks derived from race membership.
train_mask = race_df["race_id"].isin(train_races)
valid_mask = race_df["race_id"].isin(valid_races)

X_train, y_train, race_id_train = X_all[train_mask], y_all[train_mask], race_ids[train_mask]
X_valid, y_valid, race_id_valid = X_all[valid_mask], y_all[valid_mask], race_ids[valid_mask]

# Convert race_id → integer group_id for CatBoost

In [15]:
import numpy as np

def make_group_id(race_id_array):
    """Translate race identifiers into integer group ids.

    The distinct identifiers are sorted and numbered 0..k-1 in that order; each
    input element is then replaced by the number of its identifier.

    Parameters
    ----------
    race_id_array : sequence of hashable, mutually comparable values
        One race identifier per row.

    Returns
    -------
    numpy.ndarray
        Integer array (dtype ``int``) with the same length as the input.
    """
    gid_by_race = {}
    for gid, race in enumerate(sorted(set(race_id_array))):
        gid_by_race[race] = gid
    return np.array(list(map(gid_by_race.__getitem__, race_id_array)), dtype=int)

# Group ids are numbered independently for each split (both start at 0); each array
# is only ever paired with the rows of its own split.
train_gid, valid_gid = make_group_id(race_id_train), make_group_id(race_id_valid)

# CatBoost

### Build CatBoost Pool objects (with group_id=...)

In [26]:
from catboost import Pool

# One Pool per split; group_id carries one integer race id per row so the ranker
# knows which rows belong to the same race.
# (Pass cat_features=<column indices> here as well if categorical features are added.)
train_pool = Pool(data=X_train, label=y_train, group_id=train_gid)
valid_pool = Pool(data=X_valid, label=y_valid, group_id=valid_gid)

### Define Param Grid

In [38]:
# Candidate values per hyper-parameter; the search tries every combination (2^4 = 16).
param_grid = dict(
    iterations=[300, 500],
    depth=[6, 8],
    learning_rate=[0.01, 0.05],
    l2_leaf_reg=[3, 5],
)

In [39]:
#!pip list catboost

In [40]:
import itertools

from catboost import CatBoostRanker, Pool

# Key under which get_best_score() reports the custom "NDCG:top=5" metric.
NDCG_KEY = "NDCG:top=5;type=Base"

best_score = float("-inf")
best_params = None
best_model = None

# Every combination of the grid, visited in the same order as nested loops over
# iterations -> depth -> learning_rate -> l2_leaf_reg.
grid = itertools.product(
    param_grid["iterations"],
    param_grid["depth"],
    param_grid["learning_rate"],
    param_grid["l2_leaf_reg"],
)

for iters, depth, lr, l2_reg in grid:
    trial_params = {
        "loss_function": "QueryRMSE",    # GPU-friendly ranking objective
        "eval_metric":   "QueryRMSE",    # drives early stopping / best-model choice
        "custom_metric": ["NDCG:top=5"], # additionally report NDCG@5
        "task_type":     "GPU",          # run training on GPU
        "devices":       "0",            # or "0-1" for multiple GPUs
        "iterations":    iters,
        "depth":         depth,
        "learning_rate": lr,
        "l2_leaf_reg":   l2_reg,
        "random_seed":   42,
        "verbose":       50,
    }

    model = CatBoostRanker(**trial_params)
    model.fit(
        train_pool,
        eval_set=[valid_pool],   # single validation set in a list
        early_stopping_rounds=50,
        use_best_model=True,
    )

    # Shape: {"learn": {"QueryRMSE": ...},
    #         "validation": {"QueryRMSE": ..., "NDCG:top=5;type=Base": ...}}
    # NOTE(review): these are each metric's best values over the iterations, which
    # need not coincide with the iteration kept by use_best_model (picked on
    # QueryRMSE) — confirm this is the intended selection criterion.
    scores_dict = model.get_best_score()
    print("Scores dictionary:", scores_dict)

    val_score = scores_dict.get("validation", {}).get(NDCG_KEY)
    if val_score is None:
        # Metric missing: report once and skip this configuration.
        print("WARNING: Could not retrieve NDCG@5 metric. Full dict:", scores_dict)
        continue

    print(f"Params={trial_params}")
    print(f"Validation NDCG@5= {val_score:.6f}")

    # Keep the configuration with the highest validation NDCG@5 seen so far.
    if val_score > best_score:
        best_score, best_params, best_model = val_score, trial_params, model

if best_params is None:
    print("WARNING: no trial produced a validation NDCG@5; best_params/best_model are unset.")
else:
    print("Best Validation NDCG@5:", best_score)
    print("Best Params:", best_params)

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4235264	test: 2.4260599	best: 2.4260599 (0)	total: 6.89ms	remaining: 2.06s
50:	learn: 2.1118006	test: 2.1166271	best: 2.1166271 (50)	total: 152ms	remaining: 744ms
100:	learn: 1.9218710	test: 1.9283758	best: 1.9283758 (100)	total: 294ms	remaining: 580ms
150:	learn: 1.8005624	test: 1.8078247	best: 1.8078247 (150)	total: 434ms	remaining: 429ms
200:	learn: 1.7181410	test: 1.7255796	best: 1.7255796 (200)	total: 566ms	remaining: 279ms
250:	learn: 1.6573591	test: 1.6647119	best: 1.6647119 (250)	total: 698ms	remaining: 136ms
299:	learn: 1.6099226	test: 1.6167601	best: 1.6167601 (299)	total: 826ms	remaining: 0us
bestTest = 1.616760055
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.6099226174137145}, 'validation': {'NDCG:top=5;type=Base': 0.9777671781210945, 'QueryRMSE': 1.6167600549484034}}
Validation NDCG@5: 0.977767
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4235278	test: 2.4260614	best: 2.4260614 (0)	total: 6.03ms	remaining: 1.8s
50:	learn: 2.1118684	test: 2.1166940	best: 2.1166940 (50)	total: 147ms	remaining: 719ms
100:	learn: 1.9219640	test: 1.9284680	best: 1.9284680 (100)	total: 281ms	remaining: 553ms
150:	learn: 1.8006012	test: 1.8078408	best: 1.8078408 (150)	total: 415ms	remaining: 409ms
200:	learn: 1.7183319	test: 1.7257657	best: 1.7257657 (200)	total: 549ms	remaining: 270ms
250:	learn: 1.6574856	test: 1.6648293	best: 1.6648293 (250)	total: 680ms	remaining: 133ms
299:	learn: 1.6100981	test: 1.6169032	best: 1.6169032 (299)	total: 809ms	remaining: 0us
bestTest = 1.616903198
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.6100980664111204}, 'validation': {'NDCG:top=5;type=Base': 0.9777283809116417, 'QueryRMSE': 1.616903198308388}}
Validation NDCG@5: 0.977728
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 30

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3918512	test: 2.3947121	best: 2.3947121 (0)	total: 5.88ms	remaining: 1.76s
50:	learn: 1.6514351	test: 1.6586456	best: 1.6586456 (50)	total: 140ms	remaining: 682ms
100:	learn: 1.4850341	test: 1.4916851	best: 1.4916851 (100)	total: 271ms	remaining: 534ms
150:	learn: 1.4142588	test: 1.4217592	best: 1.4217592 (150)	total: 403ms	remaining: 397ms
200:	learn: 1.3739173	test: 1.3824831	best: 1.3824831 (200)	total: 536ms	remaining: 264ms
250:	learn: 1.3454935	test: 1.3550314	best: 1.3550314 (250)	total: 669ms	remaining: 131ms
299:	learn: 1.3236214	test: 1.3338067	best: 1.3338067 (299)	total: 799ms	remaining: 0us
bestTest = 1.333806727
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.323621435503455}, 'validation': {'NDCG:top=5;type=Base': 0.98700181069445, 'QueryRMSE': 1.333806727308941}}
Validation NDCG@5: 0.987002
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 300,

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3918594	test: 2.3947201	best: 2.3947201 (0)	total: 6.02ms	remaining: 1.8s
50:	learn: 1.6522970	test: 1.6595997	best: 1.6595997 (50)	total: 142ms	remaining: 693ms
100:	learn: 1.4860541	test: 1.4924275	best: 1.4924275 (100)	total: 272ms	remaining: 537ms
150:	learn: 1.4154510	test: 1.4228539	best: 1.4228539 (150)	total: 403ms	remaining: 398ms
200:	learn: 1.3749008	test: 1.3832679	best: 1.3832679 (200)	total: 536ms	remaining: 264ms
250:	learn: 1.3461824	test: 1.3555679	best: 1.3555679 (250)	total: 669ms	remaining: 131ms
299:	learn: 1.3242134	test: 1.3341530	best: 1.3341530 (299)	total: 798ms	remaining: 0us
bestTest = 1.334152951
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.3242134215542636}, 'validation': {'NDCG:top=5;type=Base': 0.9869995934672174, 'QueryRMSE': 1.3341529506485401}}
Validation NDCG@5: 0.987000
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 3

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4229420	test: 2.4254433	best: 2.4254433 (0)	total: 6.28ms	remaining: 1.88s
50:	learn: 2.0930514	test: 2.0976122	best: 2.0976122 (50)	total: 196ms	remaining: 957ms
100:	learn: 1.8942053	test: 1.9006305	best: 1.9006305 (100)	total: 387ms	remaining: 762ms
150:	learn: 1.7689930	test: 1.7765745	best: 1.7765745 (150)	total: 584ms	remaining: 576ms
200:	learn: 1.6852134	test: 1.6934707	best: 1.6934707 (200)	total: 779ms	remaining: 384ms
250:	learn: 1.6226136	test: 1.6311451	best: 1.6311451 (250)	total: 974ms	remaining: 190ms
299:	learn: 1.5745908	test: 1.5831471	best: 1.5831471 (299)	total: 1.17s	remaining: 0us
bestTest = 1.583147088
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.574590793688483}, 'validation': {'NDCG:top=5;type=Base': 0.9788688286234886, 'QueryRMSE': 1.5831470875304041}}
Validation NDCG@5: 0.978869
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 3

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4229476	test: 2.4254486	best: 2.4254486 (0)	total: 6.85ms	remaining: 2.05s
50:	learn: 2.0932405	test: 2.0977994	best: 2.0977994 (50)	total: 209ms	remaining: 1.02s
100:	learn: 1.8942242	test: 1.9005093	best: 1.9005093 (100)	total: 407ms	remaining: 803ms
150:	learn: 1.7691453	test: 1.7765474	best: 1.7765474 (150)	total: 605ms	remaining: 597ms
200:	learn: 1.6852509	test: 1.6934188	best: 1.6934188 (200)	total: 802ms	remaining: 395ms
250:	learn: 1.6226690	test: 1.6310337	best: 1.6310337 (250)	total: 998ms	remaining: 195ms
299:	learn: 1.5747548	test: 1.5831890	best: 1.5831890 (299)	total: 1.19s	remaining: 0us
bestTest = 1.583189029
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.574754784309436}, 'validation': {'NDCG:top=5;type=Base': 0.9788733434265531, 'QueryRMSE': 1.5831890289372437}}
Validation NDCG@5: 0.978873
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 3

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3889250	test: 2.3916265	best: 2.3916265 (0)	total: 7.23ms	remaining: 2.16s
50:	learn: 1.6156653	test: 1.6239027	best: 1.6239027 (50)	total: 205ms	remaining: 1000ms
100:	learn: 1.4539305	test: 1.4630243	best: 1.4630243 (100)	total: 400ms	remaining: 787ms
150:	learn: 1.3853301	test: 1.3970034	best: 1.3970034 (150)	total: 600ms	remaining: 592ms
200:	learn: 1.3437682	test: 1.3582103	best: 1.3582103 (200)	total: 801ms	remaining: 395ms
250:	learn: 1.3139106	test: 1.3308571	best: 1.3308571 (250)	total: 1s	remaining: 196ms
299:	learn: 1.2906974	test: 1.3096901	best: 1.3096901 (299)	total: 1.2s	remaining: 0us
bestTest = 1.309690148
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.2906973817946736}, 'validation': {'NDCG:top=5;type=Base': 0.9877085020715555, 'QueryRMSE': 1.3096901477595986}}
Validation NDCG@5: 0.987709
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 300

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3889514	test: 2.3916526	best: 2.3916526 (0)	total: 7.12ms	remaining: 2.13s
50:	learn: 1.6159079	test: 1.6242572	best: 1.6242572 (50)	total: 203ms	remaining: 989ms
100:	learn: 1.4538639	test: 1.4632088	best: 1.4632088 (100)	total: 395ms	remaining: 779ms
150:	learn: 1.3853699	test: 1.3969298	best: 1.3969298 (150)	total: 592ms	remaining: 585ms
200:	learn: 1.3437992	test: 1.3577014	best: 1.3577014 (200)	total: 791ms	remaining: 390ms
250:	learn: 1.3138097	test: 1.3299983	best: 1.3299983 (250)	total: 990ms	remaining: 193ms
299:	learn: 1.2911246	test: 1.3091124	best: 1.3091124 (299)	total: 1.18s	remaining: 0us
bestTest = 1.309112398
bestIteration = 299
Scores dictionary: {'learn': {'QueryRMSE': 1.29112464703315}, 'validation': {'NDCG:top=5;type=Base': 0.987752749521722, 'QueryRMSE': 1.3091123980030372}}
Validation NDCG@5: 0.987753
Params={'loss_function': 'QueryRMSE', 'eval_metric': 'QueryRMSE', 'custom_metric': ['NDCG:top=5'], 'task_type': 'GPU', 'devices': '0', 'iterations': 300

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4235265	test: 2.4260601	best: 2.4260601 (0)	total: 5.47ms	remaining: 2.73s
50:	learn: 2.1118009	test: 2.1166272	best: 2.1166272 (50)	total: 139ms	remaining: 1.22s
100:	learn: 1.9218708	test: 1.9283758	best: 1.9283758 (100)	total: 271ms	remaining: 1.07s
150:	learn: 1.8005623	test: 1.8078247	best: 1.8078247 (150)	total: 402ms	remaining: 929ms
200:	learn: 1.7181411	test: 1.7255795	best: 1.7255795 (200)	total: 535ms	remaining: 796ms
250:	learn: 1.6573590	test: 1.6647120	best: 1.6647120 (250)	total: 668ms	remaining: 662ms
300:	learn: 1.6090583	test: 1.6158825	best: 1.6158825 (300)	total: 800ms	remaining: 529ms
350:	learn: 1.5694985	test: 1.5760386	best: 1.5760386 (350)	total: 933ms	remaining: 396ms
400:	learn: 1.5370039	test: 1.5432929	best: 1.5432929 (400)	total: 1.06s	remaining: 263ms
450:	learn: 1.5100671	test: 1.5163443	best: 1.5163443 (450)	total: 1.2s	remaining: 130ms
499:	learn: 1.4880337	test: 1.4943314	best: 1.4943314 (499)	total: 1.33s	remaining: 0us
bestTest = 1.49433

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4235278	test: 2.4260614	best: 2.4260614 (0)	total: 6.05ms	remaining: 3.02s
50:	learn: 2.1118685	test: 2.1166939	best: 2.1166939 (50)	total: 144ms	remaining: 1.27s
100:	learn: 1.9219640	test: 1.9284681	best: 1.9284681 (100)	total: 280ms	remaining: 1.1s
150:	learn: 1.8006012	test: 1.8078408	best: 1.8078408 (150)	total: 415ms	remaining: 960ms
200:	learn: 1.7183319	test: 1.7257657	best: 1.7257657 (200)	total: 549ms	remaining: 817ms
250:	learn: 1.6574855	test: 1.6648292	best: 1.6648292 (250)	total: 682ms	remaining: 677ms
300:	learn: 1.6092203	test: 1.6160163	best: 1.6160163 (300)	total: 815ms	remaining: 539ms
350:	learn: 1.5697807	test: 1.5762681	best: 1.5762681 (350)	total: 947ms	remaining: 402ms
400:	learn: 1.5373034	test: 1.5435691	best: 1.5435691 (400)	total: 1.08s	remaining: 267ms
450:	learn: 1.5103576	test: 1.5166328	best: 1.5166328 (450)	total: 1.21s	remaining: 132ms
499:	learn: 1.4882705	test: 1.4945985	best: 1.4945985 (499)	total: 1.34s	remaining: 0us
bestTest = 1.49459

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3918514	test: 2.3947120	best: 2.3947120 (0)	total: 5.91ms	remaining: 2.95s
50:	learn: 1.6514350	test: 1.6586456	best: 1.6586456 (50)	total: 142ms	remaining: 1.25s
100:	learn: 1.4850341	test: 1.4916852	best: 1.4916852 (100)	total: 272ms	remaining: 1.07s
150:	learn: 1.4142589	test: 1.4217592	best: 1.4217592 (150)	total: 404ms	remaining: 933ms
200:	learn: 1.3739173	test: 1.3824830	best: 1.3824830 (200)	total: 536ms	remaining: 798ms
250:	learn: 1.3454935	test: 1.3550314	best: 1.3550314 (250)	total: 669ms	remaining: 664ms
300:	learn: 1.3232231	test: 1.3334059	best: 1.3334059 (300)	total: 802ms	remaining: 530ms
350:	learn: 1.3052793	test: 1.3162174	best: 1.3162174 (350)	total: 935ms	remaining: 397ms
400:	learn: 1.2902481	test: 1.3020430	best: 1.3020430 (400)	total: 1.07s	remaining: 264ms
450:	learn: 1.2775491	test: 1.2901030	best: 1.2901030 (450)	total: 1.2s	remaining: 130ms
499:	learn: 1.2671980	test: 1.2804402	best: 1.2804402 (499)	total: 1.33s	remaining: 0us
bestTest = 1.28044

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3918593	test: 2.3947202	best: 2.3947202 (0)	total: 5.8ms	remaining: 2.89s
50:	learn: 1.6522972	test: 1.6595997	best: 1.6595997 (50)	total: 140ms	remaining: 1.23s
100:	learn: 1.4860540	test: 1.4924275	best: 1.4924275 (100)	total: 272ms	remaining: 1.07s
150:	learn: 1.4154572	test: 1.4228450	best: 1.4228450 (150)	total: 404ms	remaining: 933ms
200:	learn: 1.3750596	test: 1.3834498	best: 1.3834498 (200)	total: 537ms	remaining: 799ms
250:	learn: 1.3462353	test: 1.3555277	best: 1.3555277 (250)	total: 672ms	remaining: 666ms
300:	learn: 1.3238427	test: 1.3337221	best: 1.3337221 (300)	total: 804ms	remaining: 532ms
350:	learn: 1.3055555	test: 1.3162080	best: 1.3162080 (350)	total: 939ms	remaining: 399ms
400:	learn: 1.2907951	test: 1.3022607	best: 1.3022607 (400)	total: 1.07s	remaining: 265ms
450:	learn: 1.2783976	test: 1.2904410	best: 1.2904410 (450)	total: 1.21s	remaining: 131ms
499:	learn: 1.2679624	test: 1.2806378	best: 1.2806378 (499)	total: 1.34s	remaining: 0us
bestTest = 1.28063

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4229421	test: 2.4254432	best: 2.4254432 (0)	total: 7.04ms	remaining: 3.51s
50:	learn: 2.0930515	test: 2.0976121	best: 2.0976121 (50)	total: 202ms	remaining: 1.78s
100:	learn: 1.8942054	test: 1.9006306	best: 1.9006306 (100)	total: 398ms	remaining: 1.57s
150:	learn: 1.7689929	test: 1.7765745	best: 1.7765745 (150)	total: 593ms	remaining: 1.37s
200:	learn: 1.6852134	test: 1.6934707	best: 1.6934707 (200)	total: 787ms	remaining: 1.17s
250:	learn: 1.6226135	test: 1.6311452	best: 1.6311452 (250)	total: 980ms	remaining: 973ms
300:	learn: 1.5737236	test: 1.5822759	best: 1.5822759 (300)	total: 1.17s	remaining: 776ms
350:	learn: 1.5346148	test: 1.5432112	best: 1.5432112 (350)	total: 1.37s	remaining: 580ms
400:	learn: 1.5029655	test: 1.5117176	best: 1.5117176 (400)	total: 1.56s	remaining: 385ms
450:	learn: 1.4770301	test: 1.4860694	best: 1.4860694 (450)	total: 1.75s	remaining: 191ms
499:	learn: 1.4558899	test: 1.4652592	best: 1.4652592 (499)	total: 1.95s	remaining: 0us
bestTest = 1.4652

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.4229474	test: 2.4254486	best: 2.4254486 (0)	total: 7.08ms	remaining: 3.53s
50:	learn: 2.0932409	test: 2.0977992	best: 2.0977992 (50)	total: 198ms	remaining: 1.75s
100:	learn: 1.8942243	test: 1.9005094	best: 1.9005094 (100)	total: 390ms	remaining: 1.54s
150:	learn: 1.7691453	test: 1.7765475	best: 1.7765475 (150)	total: 582ms	remaining: 1.34s
200:	learn: 1.6852508	test: 1.6934188	best: 1.6934188 (200)	total: 774ms	remaining: 1.15s
250:	learn: 1.6226690	test: 1.6310337	best: 1.6310337 (250)	total: 965ms	remaining: 957ms
300:	learn: 1.5738908	test: 1.5823260	best: 1.5823260 (300)	total: 1.16s	remaining: 764ms
350:	learn: 1.5348150	test: 1.5433439	best: 1.5433439 (350)	total: 1.34s	remaining: 571ms
400:	learn: 1.5032936	test: 1.5119428	best: 1.5119428 (400)	total: 1.53s	remaining: 379ms
450:	learn: 1.4773366	test: 1.4862830	best: 1.4862830 (450)	total: 1.73s	remaining: 188ms
499:	learn: 1.4562891	test: 1.4655475	best: 1.4655475 (499)	total: 1.91s	remaining: 0us
bestTest = 1.4655

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3889250	test: 2.3916263	best: 2.3916263 (0)	total: 6.94ms	remaining: 3.46s
50:	learn: 1.6156650	test: 1.6239027	best: 1.6239027 (50)	total: 200ms	remaining: 1.76s
100:	learn: 1.4539303	test: 1.4630242	best: 1.4630242 (100)	total: 391ms	remaining: 1.55s
150:	learn: 1.3853301	test: 1.3970034	best: 1.3970034 (150)	total: 589ms	remaining: 1.36s
200:	learn: 1.3437681	test: 1.3582104	best: 1.3582104 (200)	total: 787ms	remaining: 1.17s
250:	learn: 1.3139106	test: 1.3308571	best: 1.3308571 (250)	total: 985ms	remaining: 977ms
300:	learn: 1.2902855	test: 1.3093498	best: 1.3093498 (300)	total: 1.18s	remaining: 781ms
350:	learn: 1.2717259	test: 1.2928048	best: 1.2928048 (350)	total: 1.38s	remaining: 585ms
400:	learn: 1.2557614	test: 1.2791505	best: 1.2791505 (400)	total: 1.57s	remaining: 389ms
450:	learn: 1.2423272	test: 1.2678769	best: 1.2678769 (450)	total: 1.77s	remaining: 193ms
499:	learn: 1.2308391	test: 1.2584205	best: 1.2584205 (499)	total: 1.97s	remaining: 0us
bestTest = 1.2584

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3889516	test: 2.3916526	best: 2.3916526 (0)	total: 6.93ms	remaining: 3.46s
50:	learn: 1.6159079	test: 1.6242573	best: 1.6242573 (50)	total: 203ms	remaining: 1.78s
100:	learn: 1.4538638	test: 1.4632088	best: 1.4632088 (100)	total: 396ms	remaining: 1.56s
150:	learn: 1.3853700	test: 1.3969298	best: 1.3969298 (150)	total: 593ms	remaining: 1.37s
200:	learn: 1.3437992	test: 1.3577014	best: 1.3577014 (200)	total: 793ms	remaining: 1.18s
250:	learn: 1.3138098	test: 1.3299983	best: 1.3299983 (250)	total: 993ms	remaining: 985ms
300:	learn: 1.2907045	test: 1.3087376	best: 1.3087376 (300)	total: 1.19s	remaining: 788ms
350:	learn: 1.2722403	test: 1.2920747	best: 1.2920747 (350)	total: 1.39s	remaining: 591ms
400:	learn: 1.2565006	test: 1.2784461	best: 1.2784461 (400)	total: 1.59s	remaining: 393ms
450:	learn: 1.2432420	test: 1.2671553	best: 1.2671553 (450)	total: 1.79s	remaining: 194ms
499:	learn: 1.2320592	test: 1.2578968	best: 1.2578968 (499)	total: 1.98s	remaining: 0us
bestTest = 1.2578

In [35]:
# Report NDCG@5 of the model selected by the grid search. `model` is only the last
# configuration trained in the loop, which is not necessarily the best one, so read
# the scores from `best_model` instead.
cat_scores = best_model.get_best_score()
cat_ndcg_5 = cat_scores["validation"]["NDCG:top=5;type=Base"]
print("Final NDCG@5:", cat_ndcg_5)

Final NDCG@5: 0.9897410523380372


# Re-train on Combined Data

In [41]:
import numpy as np
from catboost import CatBoostRanker, Pool

# 1) Stack features, labels and race keys of both splits (train rows first).
X_full       = np.concatenate([X_train, X_valid], axis=0)
y_full       = np.concatenate([y_train, y_valid], axis=0)
race_id_full = np.concatenate([race_id_train, race_id_valid], axis=0)

# 2) Integer group ids over the whole data set, via the same helper used for the
#    per-split pools, so CatBoost knows which rows belong to the same race.
full_gid = make_group_id(race_id_full)

# 3) Pool over everything.
#    (Pass cat_features=<column indices> here as well if categorical features are added.)
full_pool = Pool(data=X_full, label=y_full, group_id=full_gid)

# 4) Re-use the winning configuration of the hyper-parameter search.
if best_params is None:
    raise RuntimeError(
        "best_params is not set: run the hyper-parameter search cell first "
        "(and check that it produced a validation NDCG@5)."
    )
final_model = CatBoostRanker(**best_params)

# 5) Fit on the entire data set. There is no eval_set, hence no early stopping and
#    no best-model selection.
final_model.fit(full_pool, use_best_model=False)

# 6) Save the final model. The destination defaults to the original location and can
#    be overridden with the CATBOOST_MODEL_PATH environment variable.
model_path = os.environ.get(
    "CATBOOST_MODEL_PATH",
    "/home/exx/myCode/horse-racing/FoxRiverAIRacing/src/models/catboost_ranker_final.cbm",
)
os.makedirs(os.path.dirname(model_path), exist_ok=True)
final_model.save_model(model_path)

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.3894909	total: 4.36ms	remaining: 2.18s
50:	learn: 1.6166574	total: 190ms	remaining: 1.67s
100:	learn: 1.4535758	total: 368ms	remaining: 1.46s
150:	learn: 1.3852295	total: 553ms	remaining: 1.28s
200:	learn: 1.3440591	total: 738ms	remaining: 1.1s
250:	learn: 1.3146208	total: 923ms	remaining: 916ms
300:	learn: 1.2918449	total: 1.11s	remaining: 732ms
350:	learn: 1.2736144	total: 1.29s	remaining: 549ms
400:	learn: 1.2582432	total: 1.48s	remaining: 365ms
450:	learn: 1.2453043	total: 1.66s	remaining: 181ms
499:	learn: 1.2343491	total: 1.84s	remaining: 0us
