# LSTM

The plan is to model each horse's per-race sequence of positions, velocities, and accelerations.


In [1]:
# Setup Environment

# Setup Environment
import time
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
import os
import logging
import datetime
import numpy as np
import pandas as pd
import optuna
import optuna.visualization as viz
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql

In [3]:
# Initialise the project environment via the project helper and unpack the five
# values it returns. Presumably these are the Spark session, JDBC connection
# settings, the Parquet output directory and the log-file path (names only;
# verify against initialize_environment).
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()

Spark session created successfully.


In [4]:
def gps_sql_queries():
    """Return the SQL text used to pull GPS and sectional data.

    Returns:
        dict[str, str]: maps a DataFrame name to its SQL query.
            - "gps_horse": GPS points joined to results_entries and horse.
            - "sectionals": sectional timing rows with normalised
              course_cd / saddle_cloth_number.

    The queries are raw strings so the regex ``\s+$`` reaches the database
    unchanged; in a normal string literal ``\s`` is an invalid escape sequence
    and triggers a warning on recent Python versions.
    """
    queries = {
        # NOTE(review): the join compares the raw g.saddle_cloth_number with
        # re.program_num, while the SELECT list returns the trimmed/upper-cased
        # value - confirm the raw values really match.
        "gps_horse": r"""
            SELECT g.course_cd, g.race_date,g.race_number,
            REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, time_stamp, 
            longitude, latitude, speed, progress, stride_frequency, g.post_time, location,
            re.axciskey, h.horse_id, re.official_fin, h.horse_name
            FROM gpspoint g
            JOIN results_entries re on g.course_cd = re.course_cd
                AND g.race_date = re.race_date
                AND g.race_number = re.race_number
                AND g.saddle_cloth_number = re.program_num
            JOIN horse h on re.axciskey = h.axciskey        
            """,
        "sectionals": r"""
            SELECT REGEXP_REPLACE(TRIM(UPPER(course_cd)), '\s+$', '') AS course_cd, race_date, 
            race_number, REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, 
            gate_name, gate_numeric,
                length_to_finish, sectional_time, running_time, distance_back, distance_ran,
                number_of_strides, post_time
            FROM sectionals
            """
    }
    return queries

In [5]:
# queries = gps_sql_queries()
# dfs = load_data_from_postgresql(spark, jdbc_url, jdbc_properties, queries, parquet_dir)
#         # Suppose we have a dictionary of queries
# for name, df in dfs.items():
#     logging.info(f"DataFrame '{name}' loaded. Schema:")
#     df.printSchema()
#     if name == "gps_horse":
#         gps_horse_df = df
#     elif name == "sectionals":
#         sectionals_df = df    
#     else:
#         logging.error(f"Unknown DataFrame name: {name}")
#         continue


In [6]:

# start_time = time.time()
# gps_horse_df.write.mode("overwrite").parquet(f"{parquet_dir}/gps_horse_df")
# sectionals_df.write.mode("overwrite").parquet(f"{parquet_dir}/sectionals_df")
# logging.info(f"Data written to Parquet in {time.time() - start_time:.2f} seconds")
    

In [15]:
# Horse-embedding feature table produced by an earlier pipeline run.
# NOTE(review): absolute, machine-specific path - consider deriving it from parquet_dir.
horse_embedding = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/horse_embedding_data-20250318_2235.parquet")

# The cells below operate on gps_horse_df, but the cells that loaded it from
# PostgreSQL are commented out, so on a fresh kernel the name is undefined.
# Reload it from the Parquet copy written by the (commented-out) export cell.
gps_horse_df = spark.read.parquet(f"{parquet_dir}/gps_horse_df")
#sectionals_df = spark.read.parquet("/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/sectionals_df")


In [8]:
from pyspark.sql.functions import col, concat_ws, lpad, date_format

# race_id = "<course_cd>_<yyyyMMdd>_<race number left-padded with zeros to 2 characters>"
race_day_part = date_format(col("race_date"), "yyyyMMdd")
race_number_part = lpad(col("race_number").cast("string"), 2, "0")

gps_horse_df = gps_horse_df.withColumn(
    "race_id",
    concat_ws("_", col("course_cd"), race_day_part, race_number_part),
)

In [16]:
# Number of rows in the horse-embedding table (sanity check after loading).
horse_embedding.count()

386324

In [17]:
# Print column names and types of the horse-embedding table.
horse_embedding.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: double (nullable = true)
 |-- group_id: string (nullable = true)
 |-- class_rating: double (nullable = true)
 |-- horse_id: double (nullable = true)
 |-- axciskey: string (nullable = true)
 |-- race_id: string (nullable = true)
 |-- post_time: string (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- horse_name: string (nullable = true)
 |-- official_fin: long (nullable = true)
 |-- par_time: double (nullable = true)
 |-- running_time: double (nullable = true)
 |-- total_distance_ran: double (nullable = true)
 |-- avgtime_gate1: double (nullable = true)
 |-- avgtime_gate2: double (nullable = true)
 |-- avgtime_gate3: double (nullable = true)
 |-- avgtime_gate4: double (nullable = true)
 |-- dist_bk_gate1: double (nullable = true)
 |-- dist_bk_gate2: double (nullable = true)
 |-- dist_bk_gate3: double (nullable = true)
 |-- dist_bk_gate4: double (nullable = true)

## Sort

In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, dense_rank, row_number, to_date

# Ensure race_date is in proper date format
gps_horse_df = gps_horse_df.withColumn("race_date", to_date(col("race_date")))

# One window per horse, ordered chronologically by race date and post time
window_spec = Window.partitionBy("horse_id").orderBy(col("race_date"), col("post_time"))

# gps_horse_df holds one row per GPS point, so every point of a race ties on
# (race_date, post_time). row_number() would number the individual points (with
# arbitrary order inside a race); dense_rank() gives all points of the same race
# the same ordinal: 1 for the horse's first race, 2 for the next, and so on.
gps_horse_df = gps_horse_df.withColumn("race_ordinal", dense_rank().over(window_spec))


## Padding Sequences to a Fixed Length

###  PySpark Code to Count GPS Points per Horse per Race

In [14]:
from pyspark.sql import functions as F

# Sequence length = number of GPS rows recorded for each (race, horse) pair.
df_seq_len = (
    gps_horse_df
    .groupBy("race_id", "horse_id")
    .agg(F.count("*").alias("seq_length"))
)

# Summary statistics of the lengths, as a sanity check before bucketing.
df_seq_len.describe().show()

# Label each sequence by length: <=100 "short", 101-150 "medium", otherwise "long".
seq_len_col = F.col("seq_length")
df_buckets = df_seq_len.withColumn(
    "bucket",
    F.when(seq_len_col <= 100, "short").when(seq_len_col <= 150, "medium").otherwise("long"),
)

# Attach the per-sequence length and bucket label to every GPS row.
df_binned = gps_horse_df.join(df_buckets, on=["race_id", "horse_id"])

# Preview: one row per GPS point, so values repeat within a (race, horse) pair.
df_binned.select("race_id", "horse_id", "seq_length", "bucket").show(10, truncate=False)

# Row counts per bucket (these count GPS points, not sequences).
df_binned.groupBy("bucket").count().show()

                                                                                

+-------+---------------+------------------+------------------+
|summary|        race_id|          horse_id|        seq_length|
+-------+---------------+------------------+------------------+
|  count|         353237|            353237|            353237|
|   mean|           null|250595.63236297443|  105.952827704912|
| stddev|           null|334779.99393681716|21.089323837123167|
|    min|AQU_20221229_01|                 1|                 8|
|    max|TWO_20241215_13|           2275277|               344|
+-------+---------------+------------------+------------------+



                                                                                

+---------------+--------+----------+------+
|race_id        |horse_id|seq_length|bucket|
+---------------+--------+----------+------+
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
|AQU_20221229_01|6303    |103       |medium|
+---------------+--------+----------+------+
only showing top 10 rows





+------+--------+
|bucket|   count|
+------+--------+
|  long|  515559|
|medium|22505959|
| short|14404941|
+------+--------+





### 🚀 Next Steps: Aggregate GPS Data & Pad Sequences for LSTM

Now that the bucket distribution looks good, let's aggregate the GPS data per horse per race, then pad sequences within each bucket for LSTM training.

### 📌 Step 2: Aggregate GPS Data into Time-Ordered Sequences

Collect GPS points as a time-ordered sequence per (horse_id, race_id).


In [13]:
from pyspark.sql.window import Window

# Features kept for every GPS point, in the order the padding step expects.
SEQ_FEATURES = ["longitude", "latitude", "speed", "stride_frequency"]

# Build one time-ordered sequence per (horse_id, race_id, bucket).
#
# Collecting over an ordered window materialises a growing prefix list on every
# GPS row and then relies on max() over arrays to pick the longest one. Instead,
# collect each group once with time_stamp as the leading struct field, sort the
# array (structs sort field by field, so this orders by time_stamp), and then
# strip time_stamp so each element again holds only the four features.
df_agg = (
    df_binned
    .groupBy("horse_id", "race_id", "bucket")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("time_stamp", *SEQ_FEATURES))
        ).alias("seq_with_ts")
    )
    .withColumn(
        "seq",
        F.transform(
            "seq_with_ts",
            lambda point: F.struct(
                *[point.getField(name).alias(name) for name in SEQ_FEATURES]
            ),
        ),
    )
    .drop("seq_with_ts")
)

# Number of GPS points in each sequence, for validation against the bucket label
df_agg = df_agg.withColumn("seq_length", F.size("seq"))

# Show sample results
df_agg.select("horse_id", "race_id", "bucket", "seq_length").show(10, truncate=False)

[Stage 28:>                                                         (0 + 1) / 1]

+--------+---------------+------+----------+
|horse_id|race_id        |bucket|seq_length|
+--------+---------------+------+----------+
|19      |SAR_20230713_05|medium|138       |
|48      |PEN_20220818_07|short |95        |
|120     |PIM_20230922_02|medium|125       |
|127     |LRL_20230618_02|short |87        |
|154     |TAM_20231220_05|medium|110       |
|162     |TGP_20220514_03|short |77        |
|181     |LRL_20221028_06|short |95        |
|201     |LRL_20231027_10|medium|127       |
|203     |PIM_20220923_02|medium|127       |
|203     |TAM_20230219_09|medium|124       |
+--------+---------------+------+----------+
only showing top 10 rows



                                                                                

### 📌 Step 3: Pad Sequences Based on Buckets

Each bucket will be padded to its own max sequence length.

In [None]:
from pyspark.sql.types import ArrayType, FloatType

# Target padded length for each bucket label
BUCKETS = {
    "short": 100,      # 0-100 points -> pad to 100
    "medium": 150,     # 101-150 points -> pad to 150
    "long": 300        # 151+ points -> pad to 300 (longer sequences are truncated)
}

# Padding UDF
def pad_sequence(seq, bucket):
    """Pad (or truncate) a sequence of GPS points to its bucket's length.

    Args:
        seq: iterable of 4-value points (e.g. Spark Rows or tuples); None or
            empty is treated as an empty sequence.
        bucket: bucket label; labels not in BUCKETS use the largest bucket size.

    Returns:
        list[list[float | None]]: exactly ``pad_length`` rows. Real points come
        first with every non-null value converted to ``float``; the remainder
        are ``[0.0, 0.0, 0.0, 0.0]`` padding rows.
    """
    pad_length = BUCKETS.get(bucket, max(BUCKETS.values()))  # Default to max bucket size

    # Truncate first so values that would be discarded are never converted.
    points = list(seq)[:pad_length] if seq else []

    # The UDF is declared as array<array<float>>. Values that are not Python
    # floats (ints, Decimals from numeric DB columns) are not coerced by PySpark
    # and would come back as null, so convert explicitly. Nulls stay null.
    rows = [[None if value is None else float(value) for value in point] for point in points]

    # Build each padding row separately so rows never alias one shared list.
    rows.extend([0.0, 0.0, 0.0, 0.0] for _ in range(pad_length - len(rows)))

    return rows

# Wrap the padding function as a Spark UDF returning array<array<float>>
pad_udf = F.udf(pad_sequence, returnType=ArrayType(ArrayType(FloatType())))

# Add the padded sequence column and drop the unpadded one
df_padded = (
    df_agg
    .withColumn("padded_seq", pad_udf(F.col("seq"), F.col("bucket")))
    .drop("seq")
)


### 📌 Step 4: Save to Parquet for LSTM Training

In [None]:
# Write the padded sequences to Parquet under parquet_dir; "overwrite" mode
# replaces any existing output at this path.
df_padded.write.mode("overwrite").parquet(f"{parquet_dir}/lstm_data.parquet")
print("âœ… Padded sequences saved for LSTM training!")

### 🚀 Step 5: Preview the Saved Parquet File

#### 📌 Load & Inspect the Parquet File in PySpark

In [None]:
# Read back the Parquet dataset written in the previous step
df_check = spark.read.parquet(f"{parquet_dir}/lstm_data.parquet")

# Show the schema to confirm the structure
df_check.printSchema()

# Display a sample of the dataset.
# truncate=False prints every padded sequence in full, so the output is large.
df_check.select("horse_id", "race_id", "seq_length", "bucket", "padded_seq").show(5, truncate=False)

# Min / max / mean of the seq_length column stored in the file
df_check.select(F.min("seq_length"), F.max("seq_length"), F.avg("seq_length")).show()

### 🚀 Step 6: Load the Parquet File into NumPy for LSTM Training

#### 📌 Load the Parquet File & Convert to NumPy

### 🔥 Step 1: Fix the seq_length Computation

In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

# Load the Parquet dataset into a pandas DataFrame via pyarrow
df = pq.read_table(f"{parquet_dir}/lstm_data.parquet").to_pandas()

# Convert each padded_seq value from a NumPy array to a Python list; values that
# are not NumPy arrays are left unchanged.
df["padded_seq"] = df["padded_seq"].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Overwrite seq_length with the length of the padded list (0 for non-list values).
# NOTE: this replaces whatever seq_length was stored in the Parquet file.
df["seq_length"] = df["padded_seq"].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Verify the new sequence lengths
print(df["seq_length"].describe())
print(df[["horse_id", "race_id", "bucket", "seq_length"]].head())

### 🔥 Step 2: Handle NaN Values in padded_seq

In [None]:
# Replace NaNs with 0.0
def fix_nans(seq):
    """Return ``seq`` as nested lists of floats with missing values set to 0.0.

    Args:
        seq: sequence of equal-length points (lists, tuples or NumPy arrays)
            holding numbers, NaN or None.

    Returns:
        list: same layout as ``seq`` with every NaN / None replaced by 0.0 and
        every other value returned as a Python float.
    """
    # Converting with a float dtype maps None to NaN, so both kinds of missing
    # value are handled by the single isnan mask below (np.isnan(None) on its
    # own raises TypeError).
    values = np.array(seq, dtype=np.float64)
    return np.where(np.isnan(values), 0.0, values).tolist()

# Clean every list-valued padded_seq; non-list values are passed through unchanged.
df["padded_seq"] = df["padded_seq"].apply(lambda x: fix_nans(x) if isinstance(x, list) else x)

### 🔥 Step 3: Convert to NumPy for LSTM Training

In [None]:
import numpy as np
import pandas as pd

# Recompute seq_length as the length of each padded list (0 for non-list values)
df["seq_length"] = df["padded_seq"].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Display unique sequence lengths
print("Unique sequence lengths:", df["seq_length"].unique())

# Check if any sequences are missing or empty
print("Empty sequences count:", df[df["seq_length"] == 0].shape[0])

# Sample a few sequences whose length is not 300 (the largest bucket size);
# these still need padding before they can be stacked into one array.
print(df[df["seq_length"] != 300]["padded_seq"].head())

### 🔥 Step 4: Fix the Padding in NumPy


In [None]:
# Define max sequence length (should match your longest padded bucket)
MAX_SEQ_LEN = 300

def pad_numpy(seq, target_length=MAX_SEQ_LEN, num_features=4):
    """Return ``seq`` as a float32 array with exactly ``target_length`` rows.

    Args:
        seq: list, tuple or ndarray of points, shaped (n, features) or flat with
            a multiple of ``num_features`` values. Any other value (None, a
            scalar NaN, ...) is treated as an empty sequence.
        target_length: number of rows in the result; shorter input is
            zero-padded at the end, longer input is truncated.
        num_features: row width used for empty input and to reshape flat input.

    Returns:
        np.ndarray: float32 array of shape (target_length, features).
    """
    if isinstance(seq, (list, tuple, np.ndarray)):
        points = np.asarray(seq, dtype=np.float32)
    else:
        points = np.zeros((0, num_features), dtype=np.float32)

    # Ensure the sequence is 2D (seq_length, num_features)
    if points.ndim == 1:
        points = points.reshape(-1, num_features)

    # Allocate the result as float32 up front: stacking a float32 sequence onto
    # a default (float64) zero block would upcast the padded rows, making the
    # dtype depend on whether padding happened.
    padded = np.zeros((target_length, points.shape[1]), dtype=np.float32)
    n_rows = min(len(points), target_length)
    padded[:n_rows] = points[:n_rows]

    return padded

# Pad/truncate every sequence to MAX_SEQ_LEN rows so all rows share one shape
df["padded_seq_fixed"] = df["padded_seq"].apply(lambda x: pad_numpy(x))

# Stack the per-row arrays into one 3-D array (possible because shapes now match)
X = np.stack(df["padded_seq_fixed"].values)  # Now stacking works!

# Verify shape
print("âœ… Fixed Dataset Shape:", X.shape)  # Expected: (num_samples, 300, 4)

# Save as NumPy file for LSTM training
np.save(f"{parquet_dir}/lstm_input.npy", X)

### 🚀 Build & Train the LSTM Model


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.optimizers import Adam

# Load the processed NumPy array
X = np.load(f"{parquet_dir}/lstm_input.npy")

# Verify dataset shape
print("âœ… Dataset Shape:", X.shape)  # Expected: (num_samples, 300, 4)

# Define the number of features (longitude, latitude, speed, stride_frequency)
num_features = X.shape[2]  # Should be 4
seq_length = X.shape[1]  # Should be 300

# Create Y (Target Variable) - Example: Predict next speed value.
# Y[:, t] is the speed at step t + 1 (speed is feature index 2).
# NOTE(review): Y has one target per time step, shape (num_samples, seq_length),
# while the models defined below end in Dense(1) after an LSTM with
# return_sequences=False (one value per sequence) - confirm which is intended.
Y = np.roll(X[:, :, 2], shift=-1, axis=1)  # Using speed (column index 2) as the target
# np.roll wraps the first step's speed around into the last column; overwrite it
# with the preceding target instead.
Y[:, -1] = Y[:, -2]

# Verify target shape
print("âœ… Target Shape:", Y.shape)  # Expected: (num_samples, 300)

#### 📌 Step 2: Define the LSTM Model


In [None]:
import tensorflow as tf

# List available GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# Enable memory growth on every GPU so TensorFlow allocates GPU memory on demand
# instead of reserving it all up front.
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    print("âœ… GPU memory growth enabled")

In [None]:
import tensorflow as tf

# List GPUs
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# NOTE(review): this sets both memory growth and a fixed per-GPU memory limit on
# the same device; the TensorFlow GPU guide presents these as alternative ways of
# limiting memory - confirm that combining them has the intended effect.
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)  # Prevent full allocation
            tf.config.experimental.set_virtual_device_configuration(
                gpu,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=34000)]  # Limit to 34GB
            )
        print("âœ… GPU memory growth enabled with limit")
    except RuntimeError as e:
        # Device settings cannot be changed once TensorFlow has initialised the GPUs
        print(f"Error setting memory config: {e}")

# Use MirroredStrategy (default device selection) to enable multi-GPU training
strategy = tf.distribute.MirroredStrategy()

print(f"âœ… Running on {strategy.num_replicas_in_sync} GPUs")

In [None]:
import tensorflow as tf

# List GPUs
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# Enable memory growth (prevents full allocation)
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

# Use MirroredStrategy to enable multi-GPU training.
# The device list is hard-coded to the first two GPUs, so this cell assumes at
# least two GPUs are present.
strategy = tf.distribute.MirroredStrategy(devices=["/GPU:0", "/GPU:1"])

print(f"âœ… Running on {strategy.num_replicas_in_sync} GPUs")

# Build and compile inside the strategy scope so the model's variables are
# created under the distribution strategy.
with strategy.scope():
    model = tf.keras.Sequential([
        # Skip time steps whose features are all 0.0 (the zero padding)
        tf.keras.layers.Masking(mask_value=0.0, input_shape=(seq_length, num_features)),
        tf.keras.layers.LSTM(128, return_sequences=True, activation="tanh"),
        tf.keras.layers.Dropout(0.3),
        # return_sequences=False: only the final step's output is passed on
        tf.keras.layers.LSTM(64, return_sequences=False, activation="tanh"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1)  # Predicting next speed value
    ])

    # Mean-squared-error loss, MAE reported as an additional metric
    model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=["mae"])

print("âœ… Model compiled with MirroredStrategy using NVLink.")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.optimizers import Adam

# Enable multi-GPU training using MirroredStrategy (default device selection).
# NOTE: this cell rebinds `strategy` and `model`, replacing the objects created
# in the previous cell.
strategy = tf.distribute.MirroredStrategy()

print(f"âœ… Running on {strategy.num_replicas_in_sync} GPUs")

# Define the LSTM model within the strategy scope
with strategy.scope():
    model = Sequential([
        # Skip time steps whose features are all 0.0 (the zero padding)
        Masking(mask_value=0.0, input_shape=(seq_length, num_features)),
        LSTM(128, return_sequences=True, activation="tanh"),
        Dropout(0.3),
        # return_sequences=False: only the final step's output is passed on
        LSTM(64, return_sequences=False, activation="tanh"),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(1)  # Predicting the next speed value
    ])

    # Compile the model
    model.compile(loss="mse", optimizer=Adam(learning_rate=0.001), metrics=["mae"])

print("âœ… Model compiled with MirroredStrategy using NVLink.")

In [None]:
import tensorflow as tf

# Enable GPU memory growth so memory is allocated on demand rather than all at once
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)  # Prevents full allocation
        print("âœ… GPU memory growth enabled")
    except RuntimeError as e:
        # Report the failure instead of aborting the notebook
        print(f"Error setting memory growth: {e}")

In [None]:
# Reduce batch size from 32 to 16 or lower
batch_size = 16  

# Define a smaller LSTM model.
# NOTE: this rebinds `model` outside any strategy scope, replacing the
# MirroredStrategy models built in the earlier cells.
model = Sequential([
    Masking(mask_value=0.0, input_shape=(seq_length, num_features)),  # Ignore padded values
    LSTM(64, return_sequences=True, activation="tanh"),  # Reduce from 128 to 64
    Dropout(0.2),
    LSTM(32, return_sequences=False, activation="tanh"),  # Reduce from 64 to 32
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dense(1)  # Predict next speed value
])

# Compile the model
model.compile(loss="mse", optimizer=Adam(learning_rate=0.001), metrics=["mae"])

# Show model summary
model.summary()