# LSTM

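The plan is to model each horse’s per-race sequence of positions, velocities, and accelerations (currently the GPS features `speed`, `progress`, and `stride_frequency`) with an LSTM.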


In [None]:
# Setup Environment
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
os.environ.pop("CUDA_VISIBLE_DEVICES", None)

# Setup Environment
import time
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
import os
import logging
import datetime
import numpy as np
import pandas as pd
import optuna
import optuna.visualization as viz
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql

In [None]:
# Unpack the five values returned by the project's initialize_environment() helper.
# NOTE(review): presumably a SparkSession, JDBC connection settings, the Parquet
# output directory and a log-file path -- confirm against data_utils.
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()

In [None]:
def gps_sql_queries():
    """Return the SQL used to pull GPS points and sectionals from PostgreSQL.

    Returns:
        dict: Maps a result name to its SQL text.

        * ``"gps_horse"`` -- ``gpspoint`` rows joined to ``results_entries``
          and ``horse``, keeping only rows whose ``speed``, ``progress`` and
          ``stride_frequency`` are all non-null.
        * ``"sectionals"`` -- ``sectionals`` rows joined to the same two
          tables, keeping only rows whose timing/distance columns are non-null.
    """
    # Raw strings: the regex '\s+$' must reach PostgreSQL with its backslash
    # intact. In a normal string literal "\s" is an invalid escape sequence,
    # which newer Python versions warn about.
    queries = {
        "gps_horse": r"""
            SELECT g.course_cd, g.race_date,g.race_number,
            REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, time_stamp, 
            longitude, latitude, speed, progress, stride_frequency, g.post_time, location,
            re.axciskey, h.horse_id, re.official_fin, h.horse_name
            FROM gpspoint g
            JOIN results_entries re on g.course_cd = re.course_cd
                AND g.race_date = re.race_date
                AND g.race_number = re.race_number
                AND g.saddle_cloth_number = re.program_num
            JOIN horse h on re.axciskey = h.axciskey
            WHERE speed is not null
            AND progress is not null
            AND stride_frequency is not null
            """,
        "sectionals": r"""
            SELECT s.course_cd, s.race_date, s.race_number, 
            REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, 
            s.gate_name, s.gate_numeric, s.length_to_finish, s.sectional_time, s.running_time, 
            s.distance_back, s.distance_ran, s.number_of_strides, s.post_time, re.official_fin
            FROM sectionals s
            JOIN results_entries re on s.course_cd = re.course_cd
                AND s.race_date = re.race_date
                AND s.race_number = re.race_number
                AND s.saddle_cloth_number = re.program_num
            JOIN horse h on re.axciskey = h.axciskey 
            WHERE length_to_finish is not null
            AND sectional_time is not null
            AND running_time is not null
            AND distance_back is not null
            AND distance_ran is not null
            """
    }
    return queries

In [None]:
queries = gps_sql_queries()
dfs = load_data_from_postgresql(spark, jdbc_url, jdbc_properties, queries, parquet_dir)

# Walk every loaded DataFrame: log and print its schema, then bind the two
# expected results to notebook-level names. Anything else is reported as an error.
for name, df in dfs.items():
    logging.info(f"DataFrame '{name}' loaded. Schema:")
    df.printSchema()
    if name not in ("gps_horse", "sectionals"):
        logging.error(f"Unknown DataFrame name: {name}")
    elif name == "gps_horse":
        gps_horse_df = df
    else:
        sectionals_df = df


In [None]:

# Snapshot both loaded DataFrames as Parquet under parquet_dir, replacing any
# previous output (mode "overwrite"), and log the elapsed wall-clock time.
start_time = time.time()
gps_horse_df.write.mode("overwrite").parquet(f"{parquet_dir}/gps_horse_df")
sectionals_df.write.mode("overwrite").parquet(f"{parquet_dir}/sectionals_df")
logging.info(f"Data written to Parquet in {time.time() - start_time:.2f} seconds")
    

In [None]:
from pyspark.sql.functions import col, concat_ws, lpad, date_format

# Build a string key identifying one race: "<course_cd>_<yyyyMMdd>_<NN>",
# where NN is race_number cast to string and left-padded with "0" to width 2.
gps_horse_df = gps_horse_df.withColumn(
    "race_id",
    concat_ws(
        "_",
        col("course_cd"),
        date_format(col("race_date"), "yyyyMMdd"),
        lpad(col("race_number").cast("string"), 2, "0")
    )
)

## Sort

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, to_date

# Ensure race_date is in proper date format
gps_horse_df = gps_horse_df.withColumn("race_date", to_date(col("race_date")))

# Window partitioned by horse_id and ordered chronologically.
# race_date/post_time alone are not enough: presumably every GPS point of one
# race shares the same values, and row_number() numbers tied rows in an
# arbitrary, non-repeatable order. time_stamp (selected by the gps_horse query)
# breaks those ties so points inside a race are numbered in time order.
window_spec = Window.partitionBy("horse_id").orderBy(
    col("race_date"), col("post_time"), col("time_stamp")
)

# Assign a per-horse running row number over all of its GPS points; later
# cells use it to order the points of each sequence.
gps_horse_df = gps_horse_df.withColumn("race_ordinal", row_number().over(window_spec))


## Padding Sequences to a Fixed Length

###  Step 1: Select Relevant Columns & Order by Sequence

In [None]:

features = ["speed", "progress", "stride_frequency"]  # Add or adjust as needed

# Filter nulls (optional, depending on your quality)
gps_horse_df_filtered = gps_horse_df.dropna(subset=features + ["race_ordinal", "horse_id"])

# Group and collect sequences
from pyspark.sql.functions import collect_list, sort_array, struct, transform

# One row per horse holding all of its GPS points as an ordered array of
# feature structs.
# An orderBy() placed before groupBy().agg(collect_list(...)) does not fix the
# element order, because collect_list depends on row order after the shuffle.
# Instead, collect (race_ordinal, features...) structs, sort the array (structs
# compare field by field, so race_ordinal decides), then drop the ordinal again
# so each element keeps the original schema. transform() needs PySpark >= 3.1.
sequence_df = (
    gps_horse_df_filtered
    .select("horse_id", "race_ordinal", *features)
    .groupBy("horse_id")
    .agg(
        transform(
            sort_array(
                collect_list(struct(col("race_ordinal"), *[col(c) for c in features]))
            ),
            lambda point: struct(*[point.getField(c).alias(c) for c in features]),
        ).alias("sequence")
    )
)

###  PySpark Code to Count GPS Points per Horse per Race

In [None]:
from pyspark.sql import functions as F

# Step 1: Count GPS rows per (race_id, horse_id); this is the sequence length
# of one horse in one race.
df_seq_len = gps_horse_df.groupBy("race_id", "horse_id").agg(F.count("*").alias("seq_length"))

# Show summary statistics of the lengths to sanity-check the bucket thresholds
df_seq_len.describe().show()

# Step 2: Label each sequence by length:
#   "short"  -> seq_length <= 100
#   "medium" -> 100 < seq_length <= 150
#   "long"   -> seq_length > 150
df_buckets = df_seq_len.withColumn(
    "bucket",
    F.when(F.col("seq_length") <= 100, "short")
     .when(F.col("seq_length") <= 150, "medium")
     .otherwise("long")
)

# Step 3: Join the bucket label back onto every GPS row of that race/horse
df_binned = gps_horse_df.join(df_buckets, ["race_id", "horse_id"])

# Show sample results
df_binned.select("race_id", "horse_id", "seq_length", "bucket").show(10, truncate=False)

# Step 4: Row counts per bucket. df_binned has one row per GPS point, so these
# are GPS-point counts, not sequence counts.
df_binned.groupBy("bucket").count().show()

###  Filter by Bucket

In [None]:
# Split the GPS rows into one DataFrame per length bucket.
short_df = df_binned.filter(F.col("bucket") == "short")
medium_df = df_binned.filter(F.col("bucket") == "medium")
long_df = df_binned.filter(F.col("bucket") == "long")

In [None]:
short_df.columns

### Convert to Sequence Format

In [None]:
from pyspark.sql.functions import struct, collect_list, sort_array, transform

def make_sequences(df, features):
    """Collapse GPS rows into one ordered feature sequence per race and horse.

    Args:
        df: Spark DataFrame with ``race_id``, ``horse_id``, ``race_ordinal``
            and every column named in ``features``.
        features: Names of the feature columns to keep in each element.

    Returns:
        Spark DataFrame with columns ``race_id``, ``horse_id`` and
        ``sequence``; ``sequence`` is an array of structs (one field per
        feature) sorted by ``race_ordinal``.
    """
    # An orderBy() before groupBy().agg(collect_list(...)) does not fix the
    # element order after the shuffle. Collect the ordinal with the features,
    # sort the array (structs compare field by field, ordinal first), then
    # strip the ordinal so elements keep their feature-only schema.
    # transform() needs PySpark >= 3.1.
    ordered_points = sort_array(
        collect_list(struct(col("race_ordinal"), *[col(f) for f in features]))
    )
    return (
        df.select("race_id", "horse_id", "race_ordinal", *features)
        .groupBy("race_id", "horse_id")
        .agg(
            transform(
                ordered_points,
                lambda point: struct(*[point.getField(f).alias(f) for f in features]),
            ).alias("sequence")
        )
    )

#### Apply this to each bucket:

In [None]:
# Build one sequence DataFrame per length bucket with the same three features.
features = ["speed", "progress", "stride_frequency"]
short_seq = make_sequences(short_df, features)
medium_seq = make_sequences(medium_df, features)
long_seq = make_sequences(long_df, features)

In [None]:
short_df.columns

In [None]:
short_seq.columns

#### Convert to Pandas + Pad

Convert to Pandas and pad each list to a fixed length:

	•	short → pad to 100
	•	medium → pad to 150
	•	long → pad to max (or truncate at 200–250)


In [None]:
# Bring in official_fin per (race_id, horse_id).
# The bucket DataFrames hold one row per GPS point, so dropDuplicates keeps a
# single official_fin row per key before the left join; each sequence row then
# gets exactly one label value.
short_seq = short_seq.join(
    short_df.select("race_id", "horse_id", "official_fin").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

# Same join for the medium bucket
medium_seq = medium_seq.join(
    medium_df.select("race_id", "horse_id", "official_fin").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

# Same join for the long bucket
long_seq = long_seq.join(
    long_df.select("race_id", "horse_id", "official_fin").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

In [None]:
def pad_sequence(seq, target_len, pad_val=None):
    """Truncate or right-pad a sequence of feature records to ``target_len``.

    Args:
        seq: Ordered records of one sequence. Any sliceable sequence is
            accepted (list, tuple, array); it is not modified.
        target_len: Required length of the result.
        pad_val: Mapping used as the filler record. Defaults to zeros for
            ``speed``, ``progress`` and ``stride_frequency``.

    Returns:
        list: The first ``target_len`` records of ``seq``, followed by as many
        filler records as needed to reach ``target_len``.
    """
    if pad_val is None:
        pad_val = {"speed": 0.0, "progress": 0.0, "stride_frequency": 0.0}
    # list(...) so that tuples and arrays can be concatenated with the padding;
    # "+" on those types either fails or does element-wise arithmetic.
    padded = list(seq[:target_len])
    pad_len = target_len - len(padded)
    # A fresh copy per slot, so editing one padded record cannot change the others.
    return padded + [dict(pad_val) for _ in range(pad_len)]

And turn into NumPy arrays:

In [None]:
# For each bucket: collect the sequence DataFrame to the driver as pandas,
# pad/truncate every sequence with pad_sequence, and stack the three feature
# values of every step into a 3-D NumPy array (samples, steps, features).
short_pd = short_seq.toPandas()
short_pd["padded_seq"] = short_pd["sequence"].apply(lambda x: pad_sequence(x, 100))
X_short = np.array([[ [d["speed"], d["progress"], d["stride_frequency"]] for d in seq] for seq in short_pd["padded_seq"]])

# NOTE(review): target length is 100 here although the "medium" bucket holds
# sequences of 101-150 points, so pad_sequence truncates every one of them.
# The plan above says medium should be padded to 150 -- confirm which is intended.
medium_pd = medium_seq.toPandas()
medium_pd["padded_seq"] = medium_pd["sequence"].apply(lambda x: pad_sequence(x, 100))
X_medium = np.array([[ [d["speed"], d["progress"], d["stride_frequency"]] for d in seq] for seq in medium_pd["padded_seq"]])

# NOTE(review): same for "long" (more than 150 points): everything after the
# first 100 points is dropped, whereas the plan says pad to max / truncate at
# 200-250 -- confirm.
long_pd = long_seq.toPandas()
long_pd["padded_seq"] = long_pd["sequence"].apply(lambda x: pad_sequence(x, 100))
X_long = np.array([[ [d["speed"], d["progress"], d["stride_frequency"]] for d in seq] for seq in long_pd["padded_seq"]])


In [None]:
short_pd.columns

✅ Shape Check

In [None]:
print(X_short.shape)   # (num_samples, 100, 3)
print(X_medium.shape)  # (num_samples, 100, 3)
print(X_long.shape)    # (num_samples, 100, 3)

### 🧠 Step-by-Step: Create PyTorch Dataset and DataLoader from X_short, X_medium, and X_long

Because each sequence will eventually be paired with a label (e.g., `official_fin`, a win flag, etc.), the Dataset class accepts both `X` and an optional `y`.

⸻

✅ Step 1: Install PyTorch


In [None]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class HorseRaceDataset(Dataset):
    """Tensor-backed dataset of feature sequences with optional labels.

    Args:
        X: Array-like of sequences; copied into a ``float32`` tensor.
        y: Optional array-like of labels, one per sequence; copied into a
            ``float32`` tensor. When omitted the dataset yields features only.

    Raises:
        ValueError: If ``y`` is given and its length differs from ``X``.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None
        # Fail at construction; a mismatch would otherwise only show up as an
        # IndexError in the middle of an epoch (or silently ignore extra labels).
        if self.y is not None and len(self.y) != len(self.X):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of sequences."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(X[idx], y[idx])`` when labels exist, else ``X[idx]``."""
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]


✅ Step 3: Prepare Labels 

In [None]:
# Binary "winner" target per sequence: 1 where official_fin equals 1, else 0
# (a missing official_fin compares unequal and therefore becomes 0).
y_short = (short_pd["official_fin"] == 1).astype("int64").values
y_medium = (medium_pd["official_fin"] == 1).astype("int64").values
y_long = (long_pd["official_fin"] == 1).astype("int64").values

In [None]:
# Cast features to float32, the dtype HorseRaceDataset converts to anyway.
X_short = X_short.astype(np.float32)
X_medium = X_medium.astype(np.float32)
X_long = X_long.astype(np.float32)

# Labels are cast to float32 as well; the loss used later (BCEWithLogitsLoss)
# is fed these as its float targets.
y_short = y_short.astype(np.float32)
y_medium = y_medium.astype(np.float32)
y_long = y_long.astype(np.float32)

### Normalize in NumPy

In [None]:
def normalize_feature(tensor_3d, feature_idx, mean=None, std=None):
    """Z-score one feature channel of a 3-D array in place.

    Args:
        tensor_3d: NumPy array indexed as ``[sample, step, feature]``. It is
            modified in place, so it should have a floating-point dtype.
        feature_idx: Index of the feature channel (last axis) to normalise.
        mean: Optional mean to subtract. When omitted it is computed over all
            samples and steps of this channel.
        std: Optional standard deviation to divide by, computed the same way
            when omitted. Passing ``mean``/``std`` lets statistics from one
            array (e.g. training data) be reused on another.

    Returns:
        The same ``tensor_3d`` object, for call chaining.
    """
    feature = tensor_3d[:, :, feature_idx]
    if mean is None:
        mean = np.mean(feature)
    if std is None:
        std = np.std(feature)
    # The epsilon keeps a constant channel (std == 0) from dividing by zero.
    tensor_3d[:, :, feature_idx] = (feature - mean) / (std + 1e-8)
    return tensor_3d

# Z-score each of the 3 feature channels, separately for every bucket.
# Each bucket therefore uses its own mean/std, and because padding was added
# before this step, the padded zero steps are part of those statistics and no
# longer equal zero afterwards.
for i in range(3):  # 3 features
    X_short = normalize_feature(X_short, i)
    X_medium = normalize_feature(X_medium, i)
    X_long = normalize_feature(X_long, i)

✅ Step 4: Create Datasets

In [None]:
# Wrap each bucket's features and labels in a HorseRaceDataset, which copies
# them into float32 tensors. The arrays were already cast to float32 in an
# earlier cell; the astype here is a defensive re-cast.
train_dataset_short = HorseRaceDataset(X_short.astype(np.float32), y_short)
train_dataset_medium = HorseRaceDataset(X_medium.astype(np.float32), y_medium)
train_dataset_long = HorseRaceDataset(X_long.astype(np.float32), y_long)

✅ Step 5: Wrap in DataLoader

In [None]:
# One shuffled loader per bucket, 64 sequences per batch. pin_memory=True asks
# the loader for page-locked host tensors, which speeds up host-to-GPU copies.
train_loader_short = DataLoader(train_dataset_short, batch_size=64, shuffle=True, pin_memory=True)
train_loader_medium = DataLoader(train_dataset_medium, batch_size=64, shuffle=True, pin_memory=True)
train_loader_long = DataLoader(train_dataset_long, batch_size=64, shuffle=True, pin_memory=True)

✅ Sanity Check

In [None]:
for batch_X, batch_y in train_loader_long:
    print("X batch shape:", batch_X.shape)  # torch.Size([64, 100, 3])
    print("y batch shape:", batch_y.shape)  # torch.Size([64])
    break

In [None]:
print(X_short.dtype, X_short.shape)  # float32 (164445, 100, 3)
print(y_short.dtype, y_short.shape)  # float32 (164445,)

In [None]:
# Clamp extreme normalised values to [-10, 10].
X_short = np.clip(X_short, -10, 10)

# np.clip returns a new array, and the dataset built earlier already holds its
# own tensor copy of the unclipped data. Rebuild the dataset and loader so the
# clipped values are what training actually sees.
train_dataset_short = HorseRaceDataset(X_short, y_short)
train_loader_short = DataLoader(train_dataset_short, batch_size=64, shuffle=True, pin_memory=True)

# 🧠 Basic LSTM Model for Sequence Classification

This model assumes:

	•	Input shape: (batch_size, sequence_length=100, num_features=3)
	•	Output: binary classification (e.g. win = 1, not win = 0)


### ✅ HorseRaceLSTM Class

In [None]:
import torch
import torch.nn as nn

class HorseRaceLSTM(nn.Module):
    """LSTM encoder followed by a linear head producing one logit per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; only applied when
            ``num_layers > 1`` (a single layer has no inter-layer connection).
    """

    def __init__(self, input_size=3, hidden_size=64, num_layers=1, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, lengths=None):
        """Return raw logits (no sigmoid) of shape ``(batch,)``.

        Args:
            x: Tensor of shape ``(batch, steps, input_size)``.
            lengths: Optional true length of each sequence (tensor or
                sequence of ints). When given, the logit is computed from the
                LSTM output at each sequence's last real step instead of the
                final step, which is padding for right-padded sequences.
                Values are clamped to ``[1, steps]``.
        """
        out, _ = self.lstm(x)
        if lengths is None:
            # Original behaviour: read the final time step of every sequence.
            last = out[:, -1, :]
        else:
            lengths = torch.as_tensor(lengths, device=out.device, dtype=torch.long)
            last_idx = lengths.clamp(min=1, max=out.size(1)) - 1
            rows = torch.arange(out.size(0), device=out.device)
            last = out[rows, last_idx]
        return self.fc(last).squeeze(1)  # logits; the loss applies the sigmoid

In [None]:
import torch

# Report the CUDA setup PyTorch can see. The device name is only queried when
# CUDA is available: get_device_name(0) raises on a CPU-only host, while the
# rest of the notebook falls back to CPU.
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Device name 0:", torch.cuda.get_device_name(0))
else:
    print("Device name 0: <no CUDA device>")

In [None]:
!pip show torch

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two stacked LSTM layers, so the model's inter-layer dropout (default 0.2) is active.
model = HorseRaceLSTM(input_size=3, hidden_size=64, num_layers=2)
model.to(device)
# The model returns raw logits; BCEWithLogitsLoss applies the sigmoid itself.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


### 🧪 Mini Training Loop

In [None]:
print("Post-normalization check:")
print("Min:", X_short.min(), "Max:", X_short.max())
print("Any NaNs?", np.isnan(X_short).any())

In [None]:
def train_one_epoch(model, dataloader, optimizer, loss_fn, device, verbose=False):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module mapping a feature batch to predictions shaped like the labels.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar tensor.
        device: Device the batches are moved to before the forward pass.
        verbose: When true, print the batch device and the mean/std of every
            parameter gradient for each batch (debugging aid; very noisy and
            forces a device sync per parameter).

    Returns:
        float: Sum of batch losses divided by the number of batches, or
        ``nan`` when the dataloader yields no batches.

    Raises:
        ValueError: If a label batch contains values outside ``[0, 1]``.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Validate before the update, so bad labels never influence the weights.
        # An explicit raise also survives `python -O`, unlike assert.
        if y_batch.min() < 0.0 or y_batch.max() > 1.0:
            raise ValueError("labels must lie in [0, 1]")
        if verbose:
            print("X_batch device:", X_batch.device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        if verbose:
            for name, param in model.named_parameters():
                if param.grad is not None:
                    grad = param.grad
                    print(f"{name}: mean={grad.mean().item():.6f}, std={grad.std().item():.6f}")
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained; report "no loss" instead of dividing by zero.
        return float("nan")
    return running_loss / num_batches

In [None]:
#!pip install pynvml matplotlib

In [None]:
#!pip install gpustat

In [None]:
# Train for 10 epochs on the "short" bucket only and report the mean batch
# loss after each epoch.
for epoch in range(10):
    loss = train_one_epoch(model, train_loader_short, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}: Loss = {loss:.4f}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, device, threshold=0.5):
    """Score a binary classifier on ``dataloader`` and print/return its metrics.

    Args:
        model: Module returning one logit per sequence.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        device: Device the feature batches are moved to for the forward pass.
        threshold: Probability above which a prediction counts as positive.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` as floats,
        the same values that are printed. (Earlier versions returned ``None``.)

    Note:
        The model is left in eval mode.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            logits = model(X_batch.to(device))
            preds = (torch.sigmoid(logits) > threshold).float()

            all_preds.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # visit the device.
            all_labels.extend(y_batch.cpu().tolist())

    metrics = {
        "accuracy": accuracy_score(all_labels, all_preds),
        "precision": precision_score(all_labels, all_preds, zero_division=0),
        "recall": recall_score(all_labels, all_preds, zero_division=0),
        "f1": f1_score(all_labels, all_preds, zero_division=0),
    }

    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1 Score:  {metrics['f1']:.4f}")
    return metrics

In [None]:
evaluate_model(model, train_loader_short, device)

Awesome — your clarity here opens up some very strategic use cases. Let’s align your LSTM with your goal: predicting relative horse performance (pre-race) and ranking horses competitively, especially in the top 3–4.

⸻

🧠 Your Current Insight:

You already nailed it:

“One option is to get scores and put them in as features for my CatBoost YetiRank model.”

That’s an excellent direction. But here are more ways you can apply LSTM effectively in this race prediction pipeline — all using pre-race data.

⸻

🎯 Option 1: Use LSTM as a “Speed Projection Score Generator”

What it does:
	•	Feed in each horse’s last N races (as sequences: speed, stride freq, progress, surface, etc.)
	•	Model outputs a projected performance score for today’s race.

How to use it:
	•	Rank all horses in the race based on this projected score.
	•	Feed this score into:
	•	A CatBoost ranking model
	•	A stacked ensemble model
	•	Or use it standalone as a “FoxSpeedScore”

⸻

🧠 Option 2: Train LSTM to Rank Instead of Classify

How:

Train your LSTM to output a continuous score (like “expected finish time,” or normalized “rank” from 0 to 1).

Then:
	•	Sort horses in a race by this score
	•	Take the top 3–4 as your predicted finish order

You can also:
	•	Use pairwise ranking loss (like in YetiRank or RankNet)
	•	Or assign labels like:

label = (
    1.0 if finish_position == 1 else
    0.75 if finish_position == 2 else
    0.5 if finish_position == 3 else
    0.25 if finish_position == 4 else
    0.0
)



⸻

🔄 Option 3: LSTM as Embedding Generator (for Meta Models)

Train the LSTM to output a vector embedding for each horse’s recent form (from sequences of past races).

Then:
	•	Concatenate that embedding with other race-day features (track, surface, odds, trainer stats)
	•	Feed it into your CatBoost or final model

This gives your meta-model temporal awareness — something tree-based models lack natively.

⸻

🔍 Option 4: Predict “Speed Projection Curve”

Instead of just a win probability, predict the horse’s expected:
	•	Early pace (first call)
	•	Mid-race pace (second call)
	•	Final pace (finish)

From pre-race data, this gives you a 3-point projected speed profile.

Feed those into your betting strategy or exotic models (like for exactas, trifectas).

⸻

🔧 Option 5: Sequence-based Feature Augmentation

You can extract LSTM-informed features like:
	•	“Last-5 Race Speed Trend” (up or down)
	•	“Form Stability Score” (std of last speeds)
	•	“Fatigue Recovery” (from stride frequency drop-off)

Use those as handcrafted features in your CatBoost or XGBoost ensemble.

⸻

🧠 Recap: How to Use LSTM in Your Pipeline

| Use Case | LSTM Role | How to Use It |
| --- | --- | --- |
| Score projection | Predict a win potential score | Feed into CatBoost ranking or rank directly |
| Ranking (not classification) | Predict relative order or rank | Train with ordinal or continuous targets |
| Feature augmentation | Temporal-aware feature creator | Feed to final model as extra inputs |
| Embedding for fusion model | Sequence-to-vector encoder | Concatenate with static features |
| Sectional prediction | Project in-race speed curve | Help strategy for exotic bets |



⸻

Want help restructuring your LSTM to output rank scores, or building embeddings to plug into CatBoost? Or shall we start designing the next phase of the ensemble architecture?


In [None]:
# Persist only the learned weights (state_dict), not the whole module object.
model_path = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/models/LSTM/horse_lstm_short.pt"
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [None]:
# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the saved weights.
model = HorseRaceLSTM(input_size=3, hidden_size=64, num_layers=2)
model.load_state_dict(
    torch.load(
        # Same absolute path the weights were saved to; the bare file name
        # only resolved when the working directory happened to be that folder.
        "/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/models/LSTM/horse_lstm_short.pt",
        # Lets a checkpoint written on a GPU machine load on a CPU-only host.
        map_location=device,
    )
)
model.to(device)
model.eval()