# Horse ID Embedding

In [24]:
# Setup Environment
import time
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
import os
import logging
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
import joblib # Used for encoding horse_id
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import optuna
import optuna.visualization as viz
from catboost import CatBoostRanker, CatBoostRegressor, CatBoostClassifier, Pool
import numpy as np
import itertools
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment 
# Set global references to None
spark = None
master_results_df = None
race_df = None
df = None
training_data = None
train_df = None

In [25]:
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()

Spark session created successfully.


In [185]:
# This dataset has already been cleaned up in the LGB notebook and saved as a starting point
# It now just needs to be converted to Panadas and run in the GBDT variant model (LGB, XGB, CatBoost)
horse_embedding = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/horse_embedding_data-20250222_1453.parquet")


# Last Operation Carried Forward (LOCF)

In [186]:
horse_embedding.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: double (nullable = true)
 |-- horse_id: double (nullable = true)
 |-- axciskey: string (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- horse_name: string (nullable = true)
 |-- official_fin: long (nullable = true)
 |-- time_behind: double (nullable = true)
 |-- pace_delta_time: double (nullable = true)
 |-- running_time: double (nullable = true)
 |-- dist_bk_gate4: double (nullable = true)
 |-- total_distance_ran: double (nullable = true)
 |-- speed_rating: double (nullable = true)
 |-- prev_speed_rating: double (nullable = true)
 |-- previous_class: double (nullable = true)
 |-- purse: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- equip: string (nullable = true)
 |-- claimprice: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- distance_mete

In [187]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Assume horse_embedding is your original Spark DataFrame.
df = horse_embedding

# Split the DataFrame into future and historical records.
future_df = df.filter(F.col("official_fin").isNull())
historical_df = df.filter(F.col("official_fin").isNotNull())

# Join future rows with historical rows for the same horse, 
# where the historical race is before the future race.
joined = future_df.alias("f").join(
    historical_df.alias("h"),
    on = (
         (F.col("f.horse_id") == F.col("h.horse_id")) &
         (
             (F.col("h.race_date") < F.col("f.race_date")) |
             ((F.col("h.race_date") == F.col("f.race_date")) & (F.col("h.race_number") < F.col("f.race_number")))
         )
    ),
    how = "left"
)

# Define a window to select the most recent historical record for each future row.
w = Window.partitionBy("f.horse_id", "f.race_date", "f.race_number")\
          .orderBy(F.col("h.race_date").desc(), F.col("h.race_number").desc())

# For each future row, keep only the most recent historical record.
joined = joined.withColumn("rn", F.row_number().over(w)).filter(F.col("rn") == 1).drop("rn")

# Define the columns that should be updated from historical data.
update_cols = [
    "combined_4", "combined_3", "combined_2", "combined_1", "combined_0",
    "par_diff_ratio", "class_offset", "class_multiplier", "wide_factor", "standardized_score",
    "base_speed", "official_distance", "normalized_score", "par_time", "raw_performance_score",
    "recent_avg_speed"
]

# From the future rows, get all columns except those in update_cols.
f_cols = [c for c in future_df.columns if c not in update_cols]

# Build the updated future DataFrame by taking columns from the future side ("f")
# and overlaying the updated values from the historical side ("h").
updated_future = joined.select(
    *[F.col("f." + c) for c in f_cols],
    F.col("h.combined_4").alias("combined_4"),
    F.col("h.combined_3").alias("combined_3"),
    F.col("h.combined_2").alias("combined_2"),
    F.col("h.combined_1").alias("combined_1"),
    F.col("h.combined_0").alias("combined_0"),
    F.col("h.par_diff_ratio").alias("par_diff_ratio"),
    F.col("h.class_offset").alias("class_offset"),
    F.col("h.class_multiplier").alias("class_multiplier"),
    F.col("h.wide_factor").alias("wide_factor"),
    F.col("h.standardized_score").alias("standardized_score"),
    F.col("h.base_speed").alias("base_speed"),
    F.col("h.official_distance").alias("official_distance"),
    F.col("h.normalized_score").alias("normalized_score"),
    F.col("h.par_time").alias("par_time"),
    F.col("h.raw_performance_score").alias("raw_performance_score"),
    F.col("h.recent_avg_speed").alias("recent_avg_speed")
)

# Union the unchanged historical rows with the updated future rows.
final_df = historical_df.unionByName(updated_future, allowMissingColumns=True)

# Now, filter out any rows where combined_0 is still null.
final_df = final_df.filter(F.col("combined_0").isNotNull())

# Count the number of rows where official_fin is null.
num_future = final_df.filter(F.col("official_fin").isNull()).count()
print("Number of rows with official_fin is null:", num_future)



Number of rows with official_fin is null: 2068


In [188]:
# Save the final DataFrame to Parquet.
output_path = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/horse_embedding_updated.parquet"
final_df.write.mode("overwrite").parquet(output_path)
print("Saved updated horse_embedding DataFrame to:", output_path)



Saved updated horse_embedding DataFrame to: /home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/horse_embedding_updated.parquet




In [180]:
horse_embedding = horse_embedding.filter(F.col("combined_0").isNotNull()).count()
print("Number of rows with official_fin is null:", horse_embedding)

Number of rows with official_fin is null: 774750


In [181]:
from pyspark.sql import functions as F

num_rows = horse_embedding.filter(
    F.col("official_fin").isNull() & F.col("combined_0").isNotNull()
).count()

print("Number of rows where official_fin is null and combined_0 is not null:", num_rows)

AttributeError: 'int' object has no attribute 'filter'

In [134]:
global_speed_accuracy.select("avg_speed", "std_speed", "avg_relevance", "std_relevance").show(100)

+------------------+------------------+------------------+------------------+
|         avg_speed|         std_speed|     avg_relevance|     std_relevance|
+------------------+------------------+------------------+------------------+
| 85.31865575842825|6.3797155106320735| 23.44026889869125|12.972248359346512|
| 61.67822718816071| 4.975797494272431|24.908285341335358|13.044257719174238|
| 60.74313865341632| 4.371697311218189|22.233011553891977|12.812347111994077|
|61.115280808685455| 4.891721774427513|24.908285341335354|13.044257719174238|
| 86.98869299163557| 7.819833912726544|29.113737142750022|12.381541592127373|
| 89.25888117318901| 7.825770056094534|29.113737142750022|12.381541592127373|
|125.68819409238668|4.3002496989856915|26.741913514175362|12.927914556239877|
| 62.76168487630285| 6.298989127568237|32.330717679880514|10.677592749534051|
| 88.03345714691501|7.8260053244128445|29.113737142750022|12.381541592127373|
| 87.07553991777517| 7.820572548383406|29.113737142750022|12.381

In [121]:
global_speed_accuracy.count()

109481

In [110]:
# Check if "Hypnus" made it into train_data
print(train_df.filter(F.col("horse_name") == "Hypnus").count())

3


In [111]:
# Filter the DataFrame for the specific race.
race_df = train_df.filter(
    (F.col("course_cd") == "TOP") &
    (F.col("race_date") == F.lit("2025-02-22").cast("date")) &
    (F.col("race_number") == 11)
)

# Select and order the columns of interest.
race_df.select("saddle_cloth_number", "horse_name", "course_cd", "race_date", "race_number") \
       .orderBy("saddle_cloth_number") \
       .show(truncate=False)

+-------------------+------------------+---------+----------+-----------+
|saddle_cloth_number|horse_name        |course_cd|race_date |race_number|
+-------------------+------------------+---------+----------+-----------+
|1                  |Coal Battle       |TOP      |2025-02-22|11.0       |
|10                 |Bullard           |TOP      |2025-02-22|11.0       |
|11                 |Speed King        |TOP      |2025-02-22|11.0       |
|12                 |Brereton's Baytown|TOP      |2025-02-22|11.0       |
|13                 |Tiztastic         |TOP      |2025-02-22|11.0       |
|14                 |Hot Gunner        |TOP      |2025-02-22|11.0       |
|2                  |Admiral Dennis    |TOP      |2025-02-22|11.0       |
|3                  |Sandman           |TOP      |2025-02-22|11.0       |
|4                  |Hypnus            |TOP      |2025-02-22|11.0       |
|5                  |Madaket Road      |TOP      |2025-02-22|11.0       |
|6                  |Publisher        

In [112]:
train_df.count()

808887

In [113]:
speed_figure.count()


808887

In [114]:
# Check if "Hypnus" made it into train_data
print(speed_figure.filter(F.col("horse_name") == "Hypnus").count())

3


In [115]:
# Filter the DataFrame for the specific race.
race_df = speed_figure.filter(
    (F.col("course_cd") == "TOP") &
    (F.col("race_date") == F.lit("2025-02-22").cast("date")) &
    (F.col("race_number") == 11)
)

# Select and order the columns of interest.
race_df.select("saddle_cloth_number", "horse_name", "course_cd", "race_date", "race_number") \
       .orderBy("saddle_cloth_number") \
       .show(truncate=False)

+-------------------+------------------+---------+-------------------+-----------+
|saddle_cloth_number|horse_name        |course_cd|race_date          |race_number|
+-------------------+------------------+---------+-------------------+-----------+
|1                  |Coal Battle       |TOP      |2025-02-22 00:00:00|11.0       |
|10                 |Bullard           |TOP      |2025-02-22 00:00:00|11.0       |
|11                 |Speed King        |TOP      |2025-02-22 00:00:00|11.0       |
|12                 |Brereton's Baytown|TOP      |2025-02-22 00:00:00|11.0       |
|13                 |Tiztastic         |TOP      |2025-02-22 00:00:00|11.0       |
|14                 |Hot Gunner        |TOP      |2025-02-22 00:00:00|11.0       |
|2                  |Admiral Dennis    |TOP      |2025-02-22 00:00:00|11.0       |
|3                  |Sandman           |TOP      |2025-02-22 00:00:00|11.0       |
|4                  |Hypnus            |TOP      |2025-02-22 00:00:00|11.0       |
|5  

In [53]:
horse_embeddings.count()

334135

In [55]:
# Assuming train_data is a Spark DataFrame
filtered_race_df = train_data.filter(
    (F.col("race_date") == "2025-02-23") &
    (F.col("course_cd") == "TOP") &
    (F.col("race_number") == 11)
).select("saddle_cloth_number", "horse_name")

# Sort the filtered DataFrame by saddle_cloth_number
sorted_race_df = filtered_race_df.orderBy("saddle_cloth_number")

# Show the sorted DataFrame
sorted_race_df.show()

+-------------------+------------------+
|saddle_cloth_number|        horse_name|
+-------------------+------------------+
|                  1|       Coal Battle|
|                 10|           Bullard|
|                 11|        Speed King|
|                 12|Brereton's Baytown|
|                 13|         Tiztastic|
|                 14|        Hot Gunner|
|                  2|    Admiral Dennis|
|                  3|           Sandman|
|                  5|      Madaket Road|
|                  6|         Publisher|
|                  7|       Dreaminblue|
|                  8|         Innovator|
|                  9|     Smoken Wicked|
+-------------------+------------------+



In [33]:
# Assuming speed_figure is a Spark DataFrame
filtered_race_df = speed_figure.filter(
    (F.col("race_date") == "2025-02-22") &
    (F.col("course_cd") == "TOP") &
    (F.col("race_number") == 11)
).select("saddle_cloth_number", "horse_name", "global_speed_score")

# Sort the filtered DataFrame by saddle_cloth_number
sorted_race_df = filtered_race_df.orderBy("saddle_cloth_number")

# Show the sorted DataFrame
sorted_race_df.show()

+-------------------+------------------+------------------+
|saddle_cloth_number|        horse_name|global_speed_score|
+-------------------+------------------+------------------+
|                  1|       Coal Battle| 98.38528977696492|
|                 10|           Bullard| 78.93151400222774|
|                 11|        Speed King|109.59972817695649|
|                 12|Brereton's Baytown| 98.17746131641078|
|                 13|         Tiztastic| 97.87619459625377|
|                 14|        Hot Gunner|45.717996494973896|
|                  2|    Admiral Dennis|136.14234762827152|
|                  3|           Sandman|104.64979423491829|
|                  5|      Madaket Road|108.95725623485843|
|                  6|         Publisher| 71.04359361667235|
|                  7|       Dreaminblue|102.48212308746481|
|                  8|         Innovator| 102.6969932820621|
|                  9|     Smoken Wicked|114.19928204407404|
+-------------------+------------------+

# Switching to Pandas

In [8]:
speed_figure = speed_figure.toPandas()

                                                                                

### Set target_metric as Rank


In [9]:
# 2) Convert horse_id into integer indices
unique_horses = speed_figure["horse_id"].unique()
horse_id_to_idx = {h: i for i, h in enumerate(unique_horses)}
horse_idx = speed_figure["horse_id"].map(horse_id_to_idx)

# Use pd.concat to avoid fragmentation
speed_figure = pd.concat([speed_figure, horse_idx.rename("horse_idx")], axis=1)

In [10]:
# 3) Select numeric columns for embedding input  - 
			 
embedding_features = [
        "custom_speed_figure","off_finish_last_race","time_behind","pace_delta_time",
        "all_starts","all_win","all_place","all_show","all_fourth","horse_itm_percentage",
        "sire_itm_percentage","sire_roi","dam_itm_percentage","dam_roi","age_at_race_day",
        "power","speed_rating","prev_speed_rating","previous_class","class_rating", 
        "speed_improvement","avg_dist_bk_gate1_5","avg_dist_bk_gate2_5","avg_dist_bk_gate3_5",
        "avg_dist_bk_gate4_5","avg_speed_fullrace_5","avg_stride_length_5","avg_strfreq_q1_5",
        "avg_strfreq_q2_5","avg_strfreq_q3_5","avg_strfreq_q4_5"
    ]

In [11]:
# Our target to predict (e.g., finishing position or next speed rating)
target_col = "perf_target"

In [12]:
# 4) Create X and y arrays
X_numerical = speed_figure[embedding_features].astype(float).values  # shape: [num_samples, num_numeric_feats]
X_horse_idx = speed_figure["horse_idx"].values  # shape: [num_samples]
y = speed_figure[target_col].values  # shape: [num_samples]


In [13]:
# Check correlations
for col in embedding_features:
    corr = speed_figure[col].corr(speed_figure["custom_speed_figure"])
    print(f"Correlation between {col} and custom_speed_figure: {corr}")


Correlation between custom_speed_figure and custom_speed_figure: 1.0
Correlation between off_finish_last_race and custom_speed_figure: 0.16209691191607623
Correlation between time_behind and custom_speed_figure: -0.45284133665436904
Correlation between pace_delta_time and custom_speed_figure: -0.08537723436621214
Correlation between all_starts and custom_speed_figure: 0.04418085884888885
Correlation between all_win and custom_speed_figure: 0.08160827509159436
Correlation between all_place and custom_speed_figure: 0.11930363748047343
Correlation between all_show and custom_speed_figure: 0.08244804192409771
Correlation between all_fourth and custom_speed_figure: 0.040757903071475594
Correlation between horse_itm_percentage and custom_speed_figure: 0.21282397297510683
Correlation between sire_itm_percentage and custom_speed_figure: 0.11091873461285694
Correlation between sire_roi and custom_speed_figure: 0.03875439502980622
Correlation between dam_itm_percentage and custom_speed_figure: 0

In [14]:
# 5) Simple train/val split (use time-based if possible!)
X_num_train, X_num_val, X_horse_train, X_horse_val, y_train, y_val = train_test_split(
    X_numerical, X_horse_idx, y, test_size=0.2, random_state=42
)

# -----------------------------------------------------------------------------
# Define the dict inputs that Keras expects
#    train_inputs and val_inputs must exist BEFORE objective() is called
# -----------------------------------------------------------------------------
train_inputs = {
    "numeric_input": X_num_train,
    "horse_id_input": X_horse_train
}
val_inputs = {
    "numeric_input": X_num_val,
    "horse_id_input": X_horse_val
}

# Building a Keras Model with an Embedding Layer

> We’ll have two inputs to our model:

    1.	horse_id input (integer indices) fed into an Embedding layer.
	2.	numeric features (like custom_speed_figure and sire/dam stats) fed into a small Dense network.

Then we’ll concatenate these two outputs and produce a regression output (1 node with a linear activation for MSE).

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

num_horses = len(unique_horses)
embedding_dim = 8  # hyperparameter you can tune
num_numeric_feats = len(embedding_features)
print(num_horses)

51601


In [16]:
# 1) Define horse_id input
horse_id_input = keras.Input(shape=(), name="horse_id_input", dtype=tf.int32)
# Embedding layer for horse IDs
horse_embedding_layer = layers.Embedding(
    input_dim=num_horses, 
    output_dim=embedding_dim, 
    name="horse_embedding"
)
horse_embedded = horse_embedding_layer(horse_id_input)  # shape: (batch, embedding_dim)

# The embedding output will be 2D [batch_size, embedding_dim].
# Optionally, you can Flatten() if you want a 1D vector
horse_embedded = layers.Flatten()(horse_embedded)


I0000 00:00:1738032166.813892 3434948 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1738032166.814098 3434948 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1738032166.833094 3434948 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1738032166.833304 3434948 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [17]:
# 2) Define numeric input
numeric_input = keras.Input(shape=(num_numeric_feats,), name="numeric_input")
x_numeric = layers.Dense(16, activation="relu")(numeric_input)
x_numeric = layers.Dense(16, activation="relu")(x_numeric)

In [18]:
# 3) Concatenate the numeric output and the embedding
combined = layers.Concatenate()([x_numeric, horse_embedded])


In [19]:
# 4) Final output layer for regression
output = layers.Dense(1, activation="linear", name="output")(combined)



In [20]:
# 5) Build the model
model = keras.Model(
    inputs=[numeric_input, horse_id_input],
    outputs=output
)

In [21]:
# 6) Compile the model with MSE or MAE
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",  # for regression
    metrics=["mae"] 
)

model.summary()

### Summary of the Model
	•	horse_id_input -> Embedding -> Flatten -> (None, embedding_dim)
	•	numeric_input -> 2 Dense layers -> (None, 16)
	•	Concatenate -> Final dense(1) for regression.

# Train the Network
	•	We’ll feed two inputs into .fit(): one for the numeric features and another for the horse ID indices.

In [22]:
def objective(trial):
    # -----------------------------
    #  Hyperparameter Search Space
    # -----------------------------
    embedding_dim = trial.suggest_categorical("embedding_dim", [2, 4, 8, 16, 32, 64])
    n_hidden_layers = trial.suggest_int("n_hidden_layers", 1, 5)
    units = trial.suggest_int("units_per_layer", 16, 512, step=16)
    activation = trial.suggest_categorical("activation", ["relu", "selu", "tanh", "gelu", "softplus"])
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [128, 256, 512, 1024])
    epochs = trial.suggest_int("epochs", 5, 50, step=5)

    # OPTIONAL: dropout rate
    use_dropout = trial.suggest_categorical("use_dropout", [False, True])
    dropout_rate = 0.0
    if use_dropout:
        dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)

    # -----------------------------
    #  Build the Model
    # -----------------------------
    # Horse ID input
    horse_id_input = keras.Input(shape=(), name="horse_id_input", dtype=tf.int32)

    # Numeric input
    numeric_input = keras.Input(shape=(X_num_train.shape[1],), name="numeric_input")

    # Embedding layer for horse_id
    horse_embedding_layer = layers.Embedding(
        input_dim=num_horses,  # you must define this globally or pass it in
        output_dim=embedding_dim,
        name="horse_embedding"
    )
    horse_embedded = horse_embedding_layer(horse_id_input)  # shape: [batch, 1, embedding_dim]
    horse_embedded = layers.Flatten()(horse_embedded)       # shape: [batch, embedding_dim]

    # Dense layers for numeric features
    x = numeric_input
    for _ in range(n_hidden_layers):
        x = layers.Dense(units, activation=activation)(x)
        # Optional dropout for each hidden layer
        if use_dropout:
            x = layers.Dropout(dropout_rate)(x)

    # Concatenate embedding + numeric branch
    combined = layers.Concatenate()([x, horse_embedded])
    
    # Final output (regression)
    output = layers.Dense(1, activation="linear")(combined)

    model = keras.Model([numeric_input, horse_id_input], outputs=output)

    # Compile
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
        metrics=["mae"]
    )

    # Early stopping callback (tweak if you want to let it train longer)
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=3,
        restore_best_weights=True
    )

    # -----------------------------
    #  Train
    # -----------------------------
    history = model.fit(
        train_inputs,  # {"numeric_input": X_num_train, "horse_id_input": X_horse_train}
        y_train,
        validation_data=(val_inputs, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose= 50
    )

    # Evaluate on validation set
    val_loss, val_mae = model.evaluate(val_inputs, y_val, verbose=0)
    return val_loss  # Minimizing MSE

In [23]:
# Suppose we have:
# - num_horses = length of the unique horse IDs
# - X_num_train, X_horse_train, y_train
# - X_num_val, X_horse_val, y_val
# and we have train_inputs = {"numeric_input": X_num_train, "horse_id_input": X_horse_train}
#                    val_inputs = {"numeric_input": X_num_val,   "horse_id_input": X_horse_val}

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # or more, if you can afford it

print("Best Trial:")
best_trial = study.best_trial
print(f"  Value (Val MSE): {best_trial.value}")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2025-01-27 20:43:17,575][0m A new study created in memory with name: no-name-75260bdc-2e9b-4516-ac8a-20f99a402ecb[0m


Epoch 1/15


I0000 00:00:1738032198.094285 1609377 service.cc:146] XLA service 0x7f7c00003120 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738032198.094309 1609377 service.cc:154]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
I0000 00:00:1738032198.094312 1609377 service.cc:154]   StreamExecutor device (1): NVIDIA RTX A6000, Compute Capability 8.6
2025-01-27 20:43:18.108084: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-01-27 20:43:18.163088: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90300
I0000 00:00:1738032198.439779 1609377 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


[32m[I 2025-01-27 20:43:35,060][0m Trial 0 finished with value: 2.478759765625 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 3, 'units_per_layer': 400, 'activation': 'softplus', 'learning_rate': 0.00415086820745941, 'batch_size': 128, 'epochs': 15, 'use_dropout': False}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


[32m[I 2025-01-27 20:43:42,869][0m Trial 1 finished with value: 2.600701332092285 and parameters: {'embedding_dim': 32, 'n_hidden_layers': 5, 'units_per_layer': 320, 'activation': 'gelu', 'learning_rate': 0.009896115413005542, 'batch_size': 1024, 'epochs': 30, 'use_dropout': False}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40


[32m[I 2025-01-27 20:43:51,519][0m Trial 2 finished with value: 2.5033724308013916 and parameters: {'embedding_dim': 2, 'n_hidden_layers': 3, 'units_per_layer': 512, 'activation': 'softplus', 'learning_rate': 0.008862914144945836, 'batch_size': 256, 'epochs': 40, 'use_dropout': False}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


[32m[I 2025-01-27 20:43:57,376][0m Trial 3 finished with value: 41.040287017822266 and parameters: {'embedding_dim': 2, 'n_hidden_layers': 4, 'units_per_layer': 496, 'activation': 'gelu', 'learning_rate': 0.00033404374946953386, 'batch_size': 1024, 'epochs': 50, 'use_dropout': True, 'dropout_rate': 0.2}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


[32m[I 2025-01-27 20:44:06,796][0m Trial 4 finished with value: 3.4195053577423096 and parameters: {'embedding_dim': 8, 'n_hidden_layers': 3, 'units_per_layer': 16, 'activation': 'selu', 'learning_rate': 0.06636253081595567, 'batch_size': 256, 'epochs': 25, 'use_dropout': True, 'dropout_rate': 0.30000000000000004}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


[32m[I 2025-01-27 20:44:15,397][0m Trial 5 finished with value: 2.49621319770813 and parameters: {'embedding_dim': 8, 'n_hidden_layers': 5, 'units_per_layer': 288, 'activation': 'gelu', 'learning_rate': 0.009518601608918282, 'batch_size': 256, 'epochs': 10, 'use_dropout': False}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40


[32m[I 2025-01-27 20:44:27,165][0m Trial 6 finished with value: 3.9644932746887207 and parameters: {'embedding_dim': 2, 'n_hidden_layers': 5, 'units_per_layer': 176, 'activation': 'gelu', 'learning_rate': 0.008713520255645306, 'batch_size': 256, 'epochs': 40, 'use_dropout': True, 'dropout_rate': 0.4}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


[32m[I 2025-01-27 20:44:31,873][0m Trial 7 finished with value: 2.868446111679077 and parameters: {'embedding_dim': 8, 'n_hidden_layers': 1, 'units_per_layer': 368, 'activation': 'tanh', 'learning_rate': 0.0018833779749211606, 'batch_size': 1024, 'epochs': 15, 'use_dropout': True, 'dropout_rate': 0.30000000000000004}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


[32m[I 2025-01-27 20:44:46,430][0m Trial 8 finished with value: 2.777034282684326 and parameters: {'embedding_dim': 32, 'n_hidden_layers': 2, 'units_per_layer': 208, 'activation': 'selu', 'learning_rate': 0.0074745220598561, 'batch_size': 128, 'epochs': 50, 'use_dropout': True, 'dropout_rate': 0.30000000000000004}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


[32m[I 2025-01-27 20:44:58,599][0m Trial 9 finished with value: 2.832414150238037 and parameters: {'embedding_dim': 8, 'n_hidden_layers': 5, 'units_per_layer': 352, 'activation': 'selu', 'learning_rate': 0.0027151104466475906, 'batch_size': 512, 'epochs': 30, 'use_dropout': True, 'dropout_rate': 0.4}. Best is trial 0 with value: 2.478759765625.[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2025-01-27 20:45:09,525][0m Trial 10 finished with value: 2.364327907562256 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 2, 'units_per_layer': 432, 'activation': 'softplus', 'learning_rate': 5.81832870474256e-05, 'batch_size': 128, 'epochs': 5, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2025-01-27 20:45:20,126][0m Trial 11 finished with value: 2.421159029006958 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 2, 'units_per_layer': 432, 'activation': 'softplus', 'learning_rate': 2.1669732330787634e-05, 'batch_size': 128, 'epochs': 5, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2025-01-27 20:45:30,565][0m Trial 12 finished with value: 2.654259204864502 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 1, 'units_per_layer': 448, 'activation': 'softplus', 'learning_rate': 1.6671931330526642e-05, 'batch_size': 128, 'epochs': 5, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/5











Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2025-01-27 20:45:44,820][0m Trial 13 finished with value: 2.514456033706665 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 2, 'units_per_layer': 432, 'activation': 'relu', 'learning_rate': 1.6380111197629825e-05, 'batch_size': 128, 'epochs': 5, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


[32m[I 2025-01-27 20:46:02,264][0m Trial 14 finished with value: 2.4290668964385986 and parameters: {'embedding_dim': 64, 'n_hidden_layers': 2, 'units_per_layer': 224, 'activation': 'softplus', 'learning_rate': 7.300459127256586e-05, 'batch_size': 128, 'epochs': 20, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2025-01-27 20:46:07,422][0m Trial 15 finished with value: 2.6423447132110596 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 2, 'units_per_layer': 128, 'activation': 'softplus', 'learning_rate': 9.615513650655776e-05, 'batch_size': 512, 'epochs': 5, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


[32m[I 2025-01-27 20:46:34,137][0m Trial 16 finished with value: 2.5510401725769043 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 1, 'units_per_layer': 464, 'activation': 'tanh', 'learning_rate': 4.914012889967623e-05, 'batch_size': 128, 'epochs': 15, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/10











Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


[32m[I 2025-01-27 20:46:51,642][0m Trial 17 finished with value: 2.450634717941284 and parameters: {'embedding_dim': 16, 'n_hidden_layers': 4, 'units_per_layer': 400, 'activation': 'relu', 'learning_rate': 0.0002672195574177068, 'batch_size': 128, 'epochs': 10, 'use_dropout': False}. Best is trial 10 with value: 2.364327907562256.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


[32m[I 2025-01-27 20:47:24,695][0m Trial 18 finished with value: 2.339867353439331 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 2, 'units_per_layer': 288, 'activation': 'softplus', 'learning_rate': 1.1325232398915463e-05, 'batch_size': 128, 'epochs': 20, 'use_dropout': False}. Best is trial 18 with value: 2.339867353439331.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[32m[I 2025-01-27 20:47:39,437][0m Trial 19 finished with value: 2.3296332359313965 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 4, 'units_per_layer': 256, 'activation': 'softplus', 'learning_rate': 3.4701076120024216e-05, 'batch_size': 512, 'epochs': 20, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


[32m[I 2025-01-27 20:47:56,965][0m Trial 20 finished with value: 2.5154781341552734 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 4, 'units_per_layer': 112, 'activation': 'softplus', 'learning_rate': 1.1969989484409361e-05, 'batch_size': 512, 'epochs': 25, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


[32m[I 2025-01-27 20:48:09,094][0m Trial 21 finished with value: 2.3589682579040527 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 4, 'units_per_layer': 256, 'activation': 'softplus', 'learning_rate': 3.693415502731378e-05, 'batch_size': 512, 'epochs': 20, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[32m[I 2025-01-27 20:48:23,692][0m Trial 22 finished with value: 2.3354666233062744 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 4, 'units_per_layer': 272, 'activation': 'softplus', 'learning_rate': 3.5374993961715374e-05, 'batch_size': 512, 'epochs': 20, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[32m[I 2025-01-27 20:48:38,140][0m Trial 23 finished with value: 2.373223066329956 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 4, 'units_per_layer': 288, 'activation': 'softplus', 'learning_rate': 1.1822469258537194e-05, 'batch_size': 512, 'epochs': 20, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35


[32m[I 2025-01-27 20:48:55,206][0m Trial 24 finished with value: 2.331352949142456 and parameters: {'embedding_dim': 4, 'n_hidden_layers': 3, 'units_per_layer': 256, 'activation': 'softplus', 'learning_rate': 3.1177992803316876e-05, 'batch_size': 512, 'epochs': 35, 'use_dropout': False}. Best is trial 19 with value: 2.3296332359313965.[0m


Epoch 1/35







Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35


[33m[W 2025-01-27 20:49:02,436][0m Trial 25 failed with parameters: {'embedding_dim': 4, 'n_hidden_layers': 3, 'units_per_layer': 192, 'activation': 'tanh', 'learning_rate': 0.00014541699398932937, 'batch_size': 512, 'epochs': 35, 'use_dropout': False} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/exx/anaconda3/envs/mamba_env/envs/tf_310/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3434948/3514476866.py", line 70, in objective
    history = model.fit(
  File "/home/exx/anaconda3/envs/mamba_env/envs/tf_310/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
    return fn(*args, **kwargs)
  File "/home/exx/anaconda3/envs/mamba_env/envs/tf_310/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 368, in fit
    logs = self.train_function(iterator)
  File "/home/exx/anaconda

# Train a Final Model with Best Hyperparams (Optional)

After you find the best hyperparameters, you can build a final model using those hyperparams and optionally train it on the combined (train+val) set or just the train set:

In [None]:
best_params = study.best_params

# Rebuild the model with best params:
embedding_dim = best_params["embedding_dim"]
n_hidden_layers = best_params["n_hidden_layers"]
units = best_params["units_per_layer"]
activation = best_params["activation"]
learning_rate = best_params["learning_rate"]
batch_size = best_params["batch_size"]
epochs = best_params["epochs"]

horse_id_input = keras.Input(shape=(), name="horse_id_input", dtype=tf.int32)
numeric_input  = keras.Input(shape=(X_num_train.shape[1],), name="numeric_input")

horse_embedding_layer = layers.Embedding(input_dim=num_horses, output_dim=embedding_dim, name="horse_embedding")
horse_embedded = horse_embedding_layer(horse_id_input)
horse_embedded = layers.Flatten()(horse_embedded)

x = numeric_input
for _ in range(n_hidden_layers):
    x = layers.Dense(units, activation=activation)(x)

combined = layers.Concatenate()([x, horse_embedded])
output = layers.Dense(1, activation="linear", name="output")(combined)
final_model = keras.Model(inputs=[numeric_input, horse_id_input], outputs=output)

final_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss="mse",
    metrics=["mae"]
)

# Train (optionally use all data or keep same splits)
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)
final_model.fit(
    train_inputs, y_train,
    validation_data=(val_inputs, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate final
val_loss, val_mae = final_model.evaluate(val_inputs, y_val, verbose=0)
print(f"Final Model - Val MSE: {val_loss:.4f}, Val MAE: {val_mae:.4f}")

# Extract Embeddings
	•	After training, extract the weights from the Embedding layer.
	•	Save these embeddings for use in the CatBoost model.

In [None]:
# The embedding weights (shape: [num_horses, embedding_dim])
embedding_weights = horse_embedding_layer.get_weights()[0]
# This is a numpy array of shape (num_horses, embedding_dim)


# We already have a mapping from horse_id to the row index in that 
# embedding matrix (horse_id_to_idx). Let’s invert that dictionary 
# to reconstruct each horse’s embedding:

idx_to_horse_id = {v: k for k, v in horse_id_to_idx.items()}

embed_list = []
for i in range(num_horses):
    horse_id = idx_to_horse_id[i]
    emb_vec = embedding_weights[i].tolist()  # convert to Python list
    embed_list.append([horse_id] + emb_vec)

# Create a DataFrame with columns: ["horse_id", "embed_0", ..., "embed_7"]
embed_cols = ["horse_id"] + [f"embed_{k}" for k in range(embedding_dim)]
embed_df = pd.DataFrame(embed_list, columns=embed_cols)

print(embed_df.head())


*Note: Now embed_df has one row per horse_id with its learned embedding vector.

# Merging Embeddings Back into Your Main Data

Finally, you can join embed_df with your main “training” DataFrame for CatBoost:

In [None]:
# If your main data is still in df, merge on horse_id:
df_final = pd.merge(
    speed_figure,       # original DataFrame with race-level rows
    embed_df, # the embedding vectors
    on="horse_id",
    how="left"
)

# Save to Parquet or CSV
df_final.to_parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/CatBoost_Embedding_data.parquet")