# LSTM: Sequence-aware “FoxSpeedScore” Generator

Train your LSTM to analyze a horse’s past performance sequence and predict a FoxSpeedScore that reflects its projected strength today.

✅ You can then: • Rank horses by this score for simple win/place bets • Or use it as a ranking feature in your CatBoost/YetiRank model

⸻

🔍 How Accurate is Ranking by Speed Score Alone?

🔥 The Good:

If your LSTM is trained on well-normalized, consistent historical speed signals: • Relative ordering can be quite meaningful • Especially in smaller fields or when you’re identifying top 3–4 finishers • Even if absolute values are off, ranking is often more robust

❄️ The Limitations: • LSTM alone won’t account for today’s track conditions, surface bias, jockey/trainer changes, distance, class jump/drop • It may favor horses that ran fast recently but are now outclassed or mispositioned

In [2]:
# Setup Environment
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
os.environ.pop("CUDA_VISIBLE_DEVICES", None)

# Setup Environment
import time
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
import os
import logging
import datetime
import numpy as np
import pandas as pd
import optuna
import optuna.visualization as viz
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql

In [4]:
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()

Spark session created successfully.


In [5]:
def gps_sql_queries():
    queries = {
        "gps_horse": """
            SELECT g.course_cd, g.race_date,g.race_number,
            REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, time_stamp, 
            longitude, latitude, speed, progress, stride_frequency, g.post_time, location,
            re.axciskey, h.horse_id, re.official_fin, h.horse_name
            FROM gpspoint g
            JOIN results_entries re on g.course_cd = re.course_cd
                AND g.race_date = re.race_date
                AND g.race_number = re.race_number
                AND g.saddle_cloth_number = re.program_num
            JOIN horse h on re.axciskey = h.axciskey
            WHERE speed is not null
            AND progress is not null
            AND stride_frequency is not null
            """,
        "sectionals": """
            SELECT s.course_cd, s.race_date, s.race_number, 
            REGEXP_REPLACE(TRIM(UPPER(saddle_cloth_number)), '\s+$', '') AS saddle_cloth_number, 
            s.gate_name, s.gate_numeric, s.length_to_finish, s.sectional_time, s.running_time, 
            s.distance_back, s.distance_ran, s.number_of_strides, s.post_time, re.official_fin
            FROM sectionals s
            JOIN results_entries re on s.course_cd = re.course_cd
                AND s.race_date = re.race_date
                AND s.race_number = re.race_number
                AND s.saddle_cloth_number = re.program_num
            JOIN horse h on re.axciskey = h.axciskey 
            WHERE length_to_finish is not null
            AND sectional_time is not null
            AND running_time is not null
            AND distance_back is not null
            AND distance_ran is not null
            AND distance_ran is not null
            """
    }
    return queries

In [6]:
queries = gps_sql_queries()
dfs = load_data_from_postgresql(spark, jdbc_url, jdbc_properties, queries, parquet_dir)
        # Suppose we have a dictionary of queries
for name, df in dfs.items():
    logging.info(f"DataFrame '{name}' loaded. Schema:")
    df.printSchema()
    if name == "gps_horse":
        gps_horse_df = df
    elif name == "sectionals":
        sectionals_df = df    
    else:
        logging.error(f"Unknown DataFrame name: {name}")
        continue


[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- time_stamp: timestamp (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- progress: double (nullable = true)
 |-- stride_frequency: double (nullable = true)
 |-- post_time: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- axciskey: string (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- horse_name: string (nullable = true)

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- gate_name: string (nullable = true)
 |-- gate_numeric: double (nullable = true)
 |-- length_to_finish: double (nullable = true)
 |-

                                                                                

In [7]:

start_time = time.time()
gps_horse_df.write.mode("overwrite").parquet(f"{parquet_dir}/gps_horse_df")
sectionals_df.write.mode("overwrite").parquet(f"{parquet_dir}/sectionals_df")
logging.info(f"Data written to Parquet in {time.time() - start_time:.2f} seconds")
    

                                                                                

In [8]:
from pyspark.sql.functions import col, concat_ws, lpad, date_format

gps_horse_df = gps_horse_df.withColumn(
    "race_id",
    concat_ws(
        "_",
        col("course_cd"),
        date_format(col("race_date"), "yyyyMMdd"),
        lpad(col("race_number").cast("string"), 2, "0")
    )
)

## Sort

In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, to_date

# Ensure race_date is in proper date format
gps_horse_df = gps_horse_df.withColumn("race_date", to_date(col("race_date")))

# Define a Window partitioned by horse_id, ordered by race_date and post_time
window_spec = Window.partitionBy("horse_id").orderBy(col("race_date"), col("post_time"))

# Assign a row number to ensure strict chronological order
gps_horse_df = gps_horse_df.withColumn("race_ordinal", row_number().over(window_spec))


## Padding Sequences to a Fixed Length

###  Step 1: Select Relevant Columns & Order by Sequence

In [10]:

features = ["speed", "progress", "stride_frequency"]  # Add or adjust as needed

# Filter nulls (optional, depending on your quality)
gps_horse_df_filtered = gps_horse_df.dropna(subset=features + ["race_ordinal", "horse_id"])

# Group and collect sequences
from pyspark.sql.functions import collect_list, struct

sequence_df = gps_horse_df_filtered \
    .select("horse_id", "race_ordinal", *features) \
    .orderBy("horse_id", "race_ordinal") \
    .withColumn("features", struct(*[col(c) for c in features])) \
    .groupBy("horse_id") \
    .agg(collect_list("features").alias("sequence"))

###  PySpark Code to Count GPS Points per Horse per Race

In [11]:
from pyspark.sql import functions as F

# Step 1: Compute sequence lengths per horse per race
df_seq_len = gps_horse_df.groupBy("race_id", "horse_id").agg(F.count("*").alias("seq_length"))

# Show distribution statistics to validate
df_seq_len.describe().show()

# Step 2: Define length buckets
df_buckets = df_seq_len.withColumn(
    "bucket",
    F.when(F.col("seq_length") <= 100, "short")
     .when(F.col("seq_length") <= 150, "medium")
     .otherwise("long")
)

# Step 3: Join back with original data
df_binned = gps_horse_df.join(df_buckets, ["race_id", "horse_id"])

# Show sample results
df_binned.select("race_id", "horse_id", "seq_length", "bucket").show(10, truncate=False)

# Step 4: Get overall distribution of bucket counts
df_binned.groupBy("bucket").count().show()

                                                                                

+-------+---------------+------------------+-----------------+
|summary|        race_id|          horse_id|       seq_length|
+-------+---------------+------------------+-----------------+
|  count|         353595|            353595|           353595|
|   mean|           null|251063.94700716922|96.16060464655892|
| stddev|           null|335519.12547872664|22.24771164873177|
|    min|AQU_20221229_01|                 1|                1|
|    max|TWO_20241215_13|           2279735|              305|
+-------+---------------+------------------+-----------------+



                                                                                

+---------------+--------+----------+------+
|race_id        |horse_id|seq_length|bucket|
+---------------+--------+----------+------+
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
|AQU_20221229_01|6303    |96        |short |
+---------------+--------+----------+------+
only showing top 10 rows





+------+--------+
|bucket|   count|
+------+--------+
|  long|  360411|
|medium|17958421|
| short|15683077|
+------+--------+



                                                                                

###  Filter by Bucket

In [32]:
short_df = df_binned.filter(F.col("bucket") == "short")
medium_df = df_binned.filter(F.col("bucket") == "medium")
long_df = df_binned.filter(F.col("bucket") == "long")

## Assign Target/Relevance:

In [33]:
def assign_piecewise_log_labels_spark(df, alpha = 30.0, beta = 4.0):
    from pyspark.sql import functions as F
    df_out = df.withColumn(
        "relevance",
        F.when(F.col("official_fin") == 1, 70.0)
         .when(F.col("official_fin") == 2, 56.0)
         .when(F.col("official_fin") == 3, 44.0)
         .when(F.col("official_fin") == 4, 34.0)
         .otherwise(F.lit(alpha) / F.log(F.lit(beta) + F.col("official_fin")))
    ).withColumn(
        "top4_label",
        F.when(F.col("official_fin") <= 4, F.lit(1)).otherwise(F.lit(0))
    )
    
    return df_out  # ← 🔥 This line is critical

In [34]:
short_df = assign_piecewise_log_labels_spark(short_df)
medium_df = assign_piecewise_log_labels_spark(medium_df)
long_df = assign_piecewise_log_labels_spark(long_df)

### Convert to Sequence Format

In [35]:
from pyspark.sql.functions import struct, collect_list

def make_sequences(df, features):
    return df.select("race_id", "horse_id", "race_ordinal", *features) \
        .orderBy("race_id", "horse_id", "race_ordinal") \
        .withColumn("features", struct(*[col(f) for f in features])) \
        .groupBy("race_id", "horse_id") \
        .agg(collect_list("features").alias("sequence"))

#### Apply this to each bucket:

In [36]:
features = ["speed", "progress", "stride_frequency"]
short_seq = make_sequences(short_df, features)
medium_seq = make_sequences(medium_df, features)
long_seq = make_sequences(long_df, features)

#### Convert to Pandas + Pad

Convert to Pandas and pad each list to a fixed length:

	•	short → pad to 100
	•	medium → pad to 150
	•	long → pad to max (or truncate at 200–250)


In [37]:
# Bring in official_fin per (race_id, horse_id)
short_seq = short_seq.join(
    short_df.select("race_id", "horse_id", "relevance").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

medium_seq = medium_seq.join(
    medium_df.select("race_id", "horse_id", "relevance").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

long_seq = long_seq.join(
    long_df.select("race_id", "horse_id", "relevance").dropDuplicates(["race_id", "horse_id"]),
    on=["race_id", "horse_id"],
    how="left"
)

In [41]:
import pandas as pd
import numpy as np

def pad_sequence(seq, target_len):
    """
    Pad or truncate a horse's race history sequence to a fixed length.
    Each element of the sequence is a dictionary with keys like 'speed', 'progress', 'stride_frequency'.
    """
    pad_val = {"speed": 0.0, "progress": 0.0, "stride_frequency": 0.0}
    seq = seq[:target_len]
    return seq + [pad_val] * (target_len - len(seq))

def prepare_unified_lstm_dataset(short_pd, medium_pd, long_pd, target_len=150):
    """
    Combine short, medium, and long sequence DataFrames into a single training set for LSTM regression.

    Each row represents a horse in a race and contains:
      - a sequence of past performance features (speed, progress, stride_frequency)
      - a target 'relevance' score representing the outcome of that race

    Returns:
        X_all: 3D np.array [samples, time_steps, features]
        y_all: 1D np.array [samples]
        combined_pd: original DataFrame including padded_seq and metadata
    """
    # Combine datasets and remove duplicates for same race_id + horse_id
    combined_pd = pd.concat([short_pd, medium_pd, long_pd], ignore_index=True)
    combined_pd = combined_pd.drop_duplicates(subset=["race_id", "horse_id"])
    combined_pd = combined_pd[combined_pd["relevance"].notna()]

    # Pad each sequence to target length
    combined_pd["padded_seq"] = combined_pd["sequence"].apply(lambda x: pad_sequence(x, target_len))

    # Extract the 3 feature values for each timestep
    X_all = np.array([
        [[d["speed"], d["progress"], d["stride_frequency"]] for d in seq]
        for seq in combined_pd["padded_seq"]
    ], dtype=np.float32)

    y_all = combined_pd["relevance"].values.astype(np.float32)

    return X_all, y_all, combined_pd

### The following code:
    1.	Combines short, medium, long buckets
    
	2.	Applies the assign_piecewise_log_labels_spark function
    
	3.	Prepares one record per horse/race, with sequence and relevance
    
	4.	Pads all sequences to the same length
    
	5.	Outputs: X_all, y_all


In [42]:
TARGET_SEQ_LEN = 150

short_pd = short_seq.toPandas()
medium_pd = medium_seq.toPandas()
long_pd = long_seq.toPandas()

X_all, y_all, full_pd = prepare_unified_lstm_dataset(short_pd, medium_pd, long_pd, target_len=TARGET_SEQ_LEN)

✅ Shape Check

In [45]:
print(X_all.shape)   # (num_samples, sequence_length, features) = (353595, 150, 3)
print(y_all.shape)   # (num_samples,) — 1 FoxSpeedScore per sample
print(full_pd.shape) # (num_samples, 5) — includes race_id, horse_id, sequence, relevance, padded_seq

(353595, 150, 3)
(353595,)
(353595, 5)


### ✅ PyTorch Dataset + DataLoader Functions

In [64]:
y_all = y_all / y_all.max()

In [65]:
import torch
from torch.utils.data import Dataset, DataLoader

class HorseRaceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [66]:
def build_lstm_dataloaders(X_all, y_all, batch_size=64, val_split=0.2, shuffle=True):
    from sklearn.model_selection import train_test_split

    X_train, X_val, y_train, y_val = train_test_split(
        X_all, y_all, test_size=val_split, random_state=42
    )

    train_dataset = HorseRaceDataset(X_train, y_train)
    val_dataset = HorseRaceDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    return train_loader, val_loader, train_dataset, val_dataset

In [67]:
train_loader, val_loader, train_dataset, val_dataset = build_lstm_dataloaders(X_all, y_all)

✅ Sanity Check

In [68]:
for batch_X, batch_y in train_loader:
    print("X batch shape:", batch_X.shape)  # torch.Size([64, 100, 3])
    print("y batch shape:", batch_y.shape)  # torch.Size([64])
    break

X batch shape: torch.Size([64, 150, 3])
y batch shape: torch.Size([64])


In [69]:
print(X_all.dtype, X_all.shape)  # float32 (164445, 100, 3)
print(y_all.dtype, y_all.shape)  # float32 (164445,)

float32 (353595, 150, 3)
float32 (353595,)


# 🧠 Basic LSTM Model for Sequence Classification

This model assumes:

	•	Input shape: (batch_size, sequence_length=100, num_features=3)
	•	Output: binary classification (e.g. win = 1, not win = 0)


### ✅ HorseRaceLSTM Class

In [70]:
import torch
import torch.nn as nn

class HorseRaceLSTM(nn.Module):
    def __init__(self, input_size=3, hidden_size=64, num_layers=1, dropout=0.2):
        super(HorseRaceLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out.squeeze(1)

In [71]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("Device name 0:", torch.cuda.get_device_name(0))

CUDA available: True
CUDA device count: 2
Device name 0: NVIDIA RTX A6000


In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = HorseRaceLSTM(input_size=3, hidden_size=64, num_layers=2)
model.to(device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


### 🧪 Mini Training Loop

In [73]:
print("Post-normalization check:")
print("Min:", X_short.min(), "Max:", X_short.max())
print("Any NaNs?", np.isnan(X_short).any())

Post-normalization check:
Min: 0.0 Max: 2112.3
Any NaNs? False


In [81]:
def train_one_epoch(model, dataloader, optimizer, loss_fn, device, verbose=False):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()

        if verbose:
            for name, param in model.named_parameters():
                if param.grad is not None:
                    grad = param.grad
                    print(f"{name}: mean={grad.mean().item():.6f}, std={grad.std().item():.6f}")

        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(dataloader)

In [82]:
def evaluate_lstm(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [83]:
for epoch in range(10):
    train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device, verbose=False)
    val_loss = evaluate_lstm(model, val_loader, loss_fn, device, verbose=False)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

Would you like help writing an evaluation script to test how often your top 1–3 LSTM-ranked horses actually finished in the top 3?

This is how you start to trust your model and quantify edge.

In [85]:
projected_scores = {}
model.eval()
with torch.no_grad():
    for i, row in short_pd.iterrows():
        x = torch.tensor(X_short[i], dtype=torch.float32).unsqueeze(0).to(device)
        score = model(x).item()
        projected_scores[row["horse_id"]] = score

# Sort horses
ranked = sorted(projected_scores.items(), key=lambda x: x[1], reverse=True)
print("Top projected finishers:", ranked[:5])

Top projected finishers: [(215667, 0.825968325138092), (4168, 0.8017905950546265), (417398, 0.801307201385498), (151216, 0.8009016513824463), (432162, 0.8001235127449036)]


In [87]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_regression(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model(X_batch)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    mse = mean_squared_error(all_labels, all_preds)
    mae = mean_absolute_error(all_labels, all_preds)
    r2 = r2_score(all_labels, all_preds)

    print(f"MSE:  {mse:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"R²:   {r2:.4f}")

In [88]:
evaluate_regression(model, val_loader, device)

MSE:  0.0674
MAE:  0.2110
R²:   0.2808


In [86]:
torch.save(model.state_dict(), "/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/models/LSTM/foxspeedscore_lstm_20250321.pt")

In [None]:
model = HorseRaceLSTM(input_size=3, hidden_size=64, num_layers=2)
model.load_state_dict(torch.load("horse_lstm_short.pt"))
model.to(device)
model.eval()

In [None]:
with torch.no_grad():
    for horse in race_horses:
        x = horse_sequence_tensor.unsqueeze(0).to(device)  # shape: [1, seq_len, features]
        fox_score = model(x).item()
        projected_scores[horse_id] = fox_score

# Rank by score
ranked_horses = sorted(projected_scores.items(), key=lambda x: x[1], reverse=True)

Would you like me to help rewire your train_one_epoch() and data prep code to support this regression setup with relevance?

In [93]:
def evaluate_racewise_ranking(df, score_col="pred_score", label_col="official_fin", top_k=3):
    """
    Evaluates how often the model's top-k predicted horses actually finished in the top-k.

    Parameters:
        df (pd.DataFrame): Must include 'race_id', 'horse_id', model predictions, and actual finish
        score_col (str): Column name for model's predicted score
        label_col (str): Column name for true finish (1 = winner, 2 = second, ...)
        top_k (int): Number of top horses to evaluate (e.g., top 3 or 4)

    Returns:
        dict: Hit@K, MRR, AvgWinnerRank, Total Races
    """
    hit_at_k = 0
    reciprocal_ranks = []
    true_ranks = []

    for race_id, group in df.groupby("race_id"):
        group_sorted = group.sort_values(by=score_col, ascending=False).reset_index(drop=True)

        top_k_predicted = group_sorted.head(top_k)
        actual_top_k = group[group[label_col] <= top_k]

        # Hit@K: any of predicted top-k finished top-k
        hit = any(horse_id in actual_top_k["horse_id"].values for horse_id in top_k_predicted["horse_id"].values)
        hit_at_k += int(hit)

        # MRR: rank of actual winner
        winner_row = group[group[label_col] == 1]
        if not winner_row.empty:
            winner_id = winner_row["horse_id"].values[0]
            winner_rank = group_sorted[group_sorted["horse_id"] == winner_id].index[0] + 1
            reciprocal_ranks.append(1.0 / winner_rank)
            true_ranks.append(winner_rank)

    num_races = df["race_id"].nunique()
    return {
        f"Hit@{top_k}": round(hit_at_k / num_races, 4) if num_races else 0,
        "MRR": round(np.mean(reciprocal_ranks), 4) if reciprocal_ranks else 0,
        "AvgWinnerRank": round(np.mean(true_ranks), 2) if true_ranks else None,
        "Total Races": num_races
    }

In [94]:
def predict_scores(model, X_all, device, batch_size=1024):
    model.eval()
    preds = []

    with torch.no_grad():
        for i in range(0, len(X_all), batch_size):
            batch = X_all[i:i+batch_size]
            batch_tensor = torch.tensor(batch, dtype=torch.float32).to(device)
            batch_preds = model(batch_tensor).cpu().numpy()
            preds.extend(batch_preds)

    return np.array(preds)

In [95]:
pred_scores = predict_scores(model, X_all, device)
race_results_df = full_pd.copy()
race_results_df["pred_score"] = pred_scores

In [98]:
# Assuming you have it in the Spark side
officials = short_df.select("race_id", "horse_id", "official_fin").dropDuplicates()
officials_pd = officials.toPandas()

# Merge with full_pd
race_results_df = full_pd.merge(officials_pd, on=["race_id", "horse_id"], how="left")
race_results_df["pred_score"] = pred_scores

                                                                                

In [99]:
ranking_metrics = evaluate_racewise_ranking(
    race_results_df,
    score_col="pred_score",
    label_col="official_fin",  # Must be actual 1, 2, 3... positions
    top_k=3
)

print(ranking_metrics)

{'Hit@3': 0.558, 'MRR': 0.6944, 'AvgWinnerRank': 2.05, 'Total Races': 45535}


# ✅ Immediate Goal: Write FoxSpeedScores to foxspeedscore_model_output



In [112]:
import pandas as pd
from datetime import datetime

# Build Pandas DF with prediction results
output_pd = full_pd[["race_id", "horse_id"]].copy()
output_pd["score"] = pred_scores
output_pd["sequence_len"] = full_pd["sequence"].apply(len)
output_pd["model_version"] = "lstm_v1"
output_pd["run_timestamp"] = datetime.utcnow()

# Convert to Spark DataFrame
output_df = spark.createDataFrame(output_pd)

In [113]:

staging_table = "foxspeedscore_model_output"
(
        output_df.write.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", staging_table)
        .option("user", jdbc_properties["user"])
        .option("driver", jdbc_properties["driver"])
        .mode("overwrite")
        .save()
)
