In [1]:
# Setup Environment
import time
from optuna.importance import MeanDecreaseImpurityImportanceEvaluator
import os
import logging
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
import joblib # Used for encoding horse_id
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import optuna
import optuna.visualization as viz
from catboost import CatBoostRanker, CatBoostRegressor, CatBoostClassifier, Pool
import numpy as np
import itertools
import pyspark.sql.functions as F
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
from src.data_preprocessing.data_prep1.data_utils import initialize_environment 
# Reset the notebook-level handles to None so objects left over from an
# earlier run are not picked up by accident before they are re-created.
spark = master_results_df = race_df = None
df = training_data = train_df = None

2025-03-12 23:06:44.143777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-12 23:06:44.152126: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-12 23:06:44.154568: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-12 23:06:44.161113: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Call the project helper and unpack the five values it returns. The captured
# output shows it reports the creation of the Spark session.
# NOTE(review): the exact contents of jdbc_url / jdbc_properties / parquet_dir /
# log_file are defined by initialize_environment() — check the helper for details.
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()

Spark session created successfully.


In [71]:
# All three parquet inputs live in the same directory. Keep that location in a
# single constant so it can be changed in one place instead of in every read.
# NOTE(review): initialize_environment() also returns `parquet_dir`; confirm
# whether it points at this same directory and, if so, use it here instead of
# the hard-coded absolute path.
PARQUET_ROOT = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet"

train_df = spark.read.parquet(os.path.join(PARQUET_ROOT, "train_df"))
global_speed_score = spark.read.parquet(
    os.path.join(PARQUET_ROOT, "global_speed_score.parquet")
)
horse_embedding = spark.read.parquet(
    os.path.join(PARQUET_ROOT, "horse_embedding_data-20250312_1948.parquet")
)

In [72]:
# Split the speed-score table on its data_flag marker into the rows flagged
# "historical" and the rows flagged "future".
is_historical = F.col("data_flag") == "historical"
is_future = F.col("data_flag") == "future"

historical_df = global_speed_score.where(is_historical)
future_df = global_speed_score.where(is_future)


In [73]:
# Row count of the full global_speed_score DataFrame (triggers a Spark job).
global_speed_score.count()

386165

In [74]:
# Row count of the data_flag == "future" slice.
future_df.count()

868

In [75]:
# Build a per-race key of the form "<course_cd>_<yyyy-MM-dd>_<race_number>".
#
# The source is `historical_df` (the data_flag == "historical" slice). The
# previous version read `historical_df_spark` on the right-hand side although
# that name is not assigned anywhere earlier in the visible notebook, so the
# cell raised NameError on a fresh kernel and was not re-runnable.
historical_df_spark = (
    historical_df
    # Render the date as a string first so it can be concatenated into the key.
    .withColumn("race_date_str", F.date_format("race_date", "yyyy-MM-dd"))
    .withColumn(
        "group_id",
        F.concat(
            F.col("course_cd"),
            F.lit("_"),
            F.col("race_date_str"),
            F.lit("_"),
            F.col("race_number").cast("string")
            )
        )
    )


In [76]:
# One row per race (group_id) holding the number of rows, i.e. horses, in it.
race_sizes_df = (
    historical_df_spark
    .groupBy("group_id")
    .count()
    .withColumnRenamed("count", "num_horses")
)

In [77]:


# Distribution of field sizes: for every distinct horse count, how many races
# had exactly that many horses.
race_size_distribution_df = (
    race_sizes_df
    .groupBy("num_horses")
    .agg(F.count("*").alias("num_races"))
)

# Total number of races; used as the denominator of the percentage column.
total_races = race_size_distribution_df.agg(F.sum("num_races")).first()[0]

# Share of all races (in percent, two decimals) for each field size.
pct_of_races = F.round(F.col("num_races") / F.lit(total_races) * 100, 2)
race_size_distribution_df = (
    race_size_distribution_df
    .withColumn("pct_of_races", pct_of_races)
    .orderBy("num_horses")
)

race_size_distribution_df.show(200, truncate=False)

+----------+---------+------------+
|num_horses|num_races|pct_of_races|
+----------+---------+------------+
|3         |4        |0.02        |
|4         |22       |0.12        |
|5         |1796     |10.19       |
|6         |3862     |21.91       |
|7         |4170     |23.66       |
|8         |3165     |17.96       |
|9         |2063     |11.71       |
|10        |1396     |7.92        |
|11        |604      |3.43        |
|12        |495      |2.81        |
|13        |26       |0.15        |
|14        |19       |0.11        |
|17        |1        |0.01        |
+----------+---------+------------+



In [78]:
from src.data_preprocessing.data_prep2.data_healthcheck import time_series_data_healthcheck
import pprint

In [79]:
# Run the project's time-series health check on the full speed-score table and
# pretty-print the report it returns (the captured output shows a dict).
healthcheck_report = time_series_data_healthcheck(global_speed_score)
pprint.pprint(healthcheck_report)

[Stage 4652:>                                                       (0 + 1) / 1]

{'categorical_columns': {'columns': ['course_cd',
                                     'group_id',
                                     'axciskey',
                                     'race_id',
                                     'saddle_cloth_number',
                                     'horse_name',
                                     'sex',
                                     'equip',
                                     'previous_surface',
                                     'med',
                                     'surface',
                                     'trk_cond',
                                     'race_type',
                                     'stk_clm_md',
                                     'turf_mud_mark',
                                     'data_flag',
                                     'layoff_cat',
                                     'track_name',
                                     'race_date_str'],
                         'suggested_action'

                                                                                

In [65]:
import pyspark.sql.functions as F

# Identify numeric columns (float, double, int, etc.).
# Spark reports decimal columns as "decimal(p,s)", so an exact comparison with
# "decimal" never matches and those columns were silently skipped; match them
# by prefix instead.
numeric_columns = [
    c for c, t in global_speed_score.dtypes
    if t in ("double", "float", "int", "bigint") or t.startswith("decimal")
]

# Build one expression per numeric column that counts its invalid entries
# (NaN, null, +infinity or -infinity).
summary_exprs = []
for c in numeric_columns:
    value = F.col(c)
    is_invalid = (
        F.isnan(value)
        | value.isNull()
        | (value == float("inf"))
        | (value == -float("inf"))
    )
    summary_exprs.append(F.sum(F.when(is_invalid, 1).otherwise(0)).alias(c))

# Produce a single row that shows how many bad values each numeric column has
summary = global_speed_score.select(*summary_exprs)
summary.show(truncate=False)



+-----------+------------+--------+------------+--------+------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------+--------+--------+--------+---------+------------------+--------+--------+--------+--------+----------------+----------------+-------+-------+-------+-------+--------+--------+-------+-------+-------+-------+------------------+----------+----------+----------+----------+-----------------+-----------------+-----------------+--------------+------+----------+-----------------+-----------------+-----+------+------+----------+---------+---------+-------+-----+---------------+---------+----------------+----------------+-------------------+-------------------+-----------------+-----------------+-------------------+--------+------------------+-------+----------+-------+---------+--------+----------+------------+--------------------+-------------+-------------+---------+-----------+------

                                                                                

In [68]:
import pyspark.sql.functions as F

# 1) Identify numeric columns. Spark reports decimal columns as "decimal(p,s)",
#    so they are matched by prefix; an exact comparison with "decimal" never
#    matches.
numeric_columns = [
    c for c, t in global_speed_score.dtypes
    if t in ("double", "float", "int", "bigint") or t.startswith("decimal")
]

# 2) Count every kind of "bad" value for every numeric column in ONE
#    aggregation. The previous version launched four separate count() jobs per
#    column, i.e. four full scans of the table for each numeric column.
bad_value_exprs = []
for idx, col_name in enumerate(numeric_columns):
    value = F.col(col_name)
    # when() without otherwise() yields null for non-matching rows and
    # F.count() ignores nulls, so each expression counts only matching rows.
    bad_value_exprs += [
        F.count(F.when(value.isNull(), 1)).alias(f"null_{idx}"),
        F.count(F.when(F.isnan(value), 1)).alias(f"nan_{idx}"),
        F.count(F.when(value == float("inf"), 1)).alias(f"pos_inf_{idx}"),
        F.count(F.when(value == -float("inf"), 1)).alias(f"neg_inf_{idx}"),
    ]

results = []
if bad_value_exprs:  # agg() needs at least one expression
    counts_row = global_speed_score.agg(*bad_value_exprs).first()
    for idx, col_name in enumerate(numeric_columns):
        # The four counters of a column sit next to each other in the result row.
        null_count, nan_count, pos_inf_count, neg_inf_count = counts_row[4 * idx: 4 * idx + 4]
        results.append((col_name, null_count, nan_count, pos_inf_count, neg_inf_count))

# 3) Convert the list to a Spark DataFrame
bad_counts_df = (
    spark.createDataFrame(
        results,
        ["column", "null_count", "nan_count", "pos_inf_count", "neg_inf_count"]
    )
)

# 4) Show the breakdown
bad_counts_df.show(250, truncate=False)

+------------------------+----------+---------+-------------+-------------+
|column                  |null_count|nan_count|pos_inf_count|neg_inf_count|
+------------------------+----------+---------+-------------+-------------+
|race_number             |0         |0        |0            |0            |
|class_rating            |0         |0        |0            |0            |
|horse_id                |0         |0        |0            |0            |
|official_fin            |536       |0        |0            |0            |
|par_time                |0         |0        |0            |0            |
|running_time            |0         |0        |0            |0            |
|total_distance_ran      |0         |0        |0            |0            |
|avgtime_gate1           |0         |0        |0            |0            |
|avgtime_gate2           |0         |0        |0            |0            |
|avgtime_gate3           |0         |0        |0            |0            |
|avgtime_gat

In [70]:
# Report the shape of global_speed_score as "Rows: <n>, Columns: <m>".
# (The line below is a leftover, commented-out experiment.)
# global_speed_score.cols
num_rows = global_speed_score.count()            # full scan to count rows
num_cols = len(global_speed_score.columns)       # simply the length of the column list
print(f"Rows: {num_rows}, Columns: {num_cols}")

Rows: 133030, Columns: 150


In [None]:
from pyspark.sql import functions as F

# Print every column of future_df that contains nulls, with its null count.
#
# All counts are computed in a single aggregation instead of one Spark job per
# column. The iteration variable is deliberately not called `col`: that name is
# imported from pyspark.sql.functions in the setup cell, and the old loop
# overwrote it with a plain string at notebook scope.
future_null_counts = future_df.agg(
    *[
        # count() ignores the nulls that when() yields for non-matching rows
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(f"nulls_{idx}")
        for idx, column_name in enumerate(future_df.columns)
    ]
).first()

for column_name, null_count in zip(future_df.columns, future_null_counts):
    if null_count > 0:
        print(f"{column_name}: {null_count} null values")

In [None]:
# Print every column of historical_df_spark that contains nulls, with its null
# count.
#
# All counts are computed in a single aggregation instead of one Spark job per
# column. The iteration variable is deliberately not called `col`: that name is
# imported from pyspark.sql.functions in the setup cell, and the old loop
# overwrote it with a plain string at notebook scope.
historical_null_counts = historical_df_spark.agg(
    *[
        # count() ignores the nulls that when() yields for non-matching rows
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(f"nulls_{idx}")
        for idx, column_name in enumerate(historical_df_spark.columns)
    ]
).first()

for column_name, null_count in zip(historical_df_spark.columns, historical_null_counts):
    if null_count > 0:
        print(f"{column_name}: {null_count} null values")

In [None]:
# Print the column names and types of global_speed_score.
global_speed_score.printSchema()

In [None]:
# Partition global_speed_score by data_flag: one DataFrame for the rows flagged
# "historical" and one for the rows flagged "future".
flag = F.col("data_flag")
historical_df = global_speed_score.where(flag == "historical")
future_df = global_speed_score.where(flag == "future")


In [None]:
from pyspark.sql.types import FloatType, DoubleType
import pyspark.sql.functions as F

# Get all floating-point columns (float and double); only these can hold NaN
# or infinity.
numeric_cols = [field.name for field in global_speed_score.schema.fields
                if isinstance(field.dataType, (FloatType, DoubleType))]

# Count NaN / +Inf / -Inf for every column in ONE aggregation. The previous
# version ran three separate count() jobs (three full scans) per column.
issue_exprs = []
for idx, col_name in enumerate(numeric_cols):
    value = F.col(col_name)
    # when() without otherwise() yields null for non-matching rows and
    # F.count() ignores nulls, so each expression counts only matching rows.
    issue_exprs += [
        F.count(F.when(F.isnan(value), 1)).alias(f"nan_{idx}"),
        F.count(F.when(value == float("inf"), 1)).alias(f"pos_inf_{idx}"),
        F.count(F.when(value == float("-inf"), 1)).alias(f"neg_inf_{idx}"),
    ]

cols_with_issues = []
if issue_exprs:  # agg() needs at least one expression
    counts_row = global_speed_score.agg(*issue_exprs).first()
    for idx, col_name in enumerate(numeric_cols):
        # The three counters of a column sit next to each other in the result row.
        nan_count, pos_inf_count, neg_inf_count = counts_row[3 * idx: 3 * idx + 3]
        total_issues = nan_count + pos_inf_count + neg_inf_count
        if total_issues > 0:
            cols_with_issues.append({
                "column": col_name,
                "total_issues": total_issues,
                "nan_count": nan_count,
                "pos_inf_count": pos_inf_count,
                "neg_inf_count": neg_inf_count
            })

print("Numeric columns with NaN or Infinity values:")
for info in cols_with_issues:
    print(f"{info['column']}: total issues={info['total_issues']} (NaN: {info['nan_count']}, +Inf: {info['pos_inf_count']}, -Inf: {info['neg_inf_count']})")

In [None]:
# Row counts per data_flag value, computed with a single groupBy instead of
# three independent full-table count() jobs.
flag_counts = {
    row["data_flag"]: row["count"]
    for row in global_speed_score.groupBy("data_flag").count().collect()
}

count_hist = flag_counts.get("historical", 0)
count_fut = flag_counts.get("future", 0)
# The total includes rows with any other (or null) data_flag, as count() did.
count_total = sum(flag_counts.values())

print(f"Final DF total count: {count_total}")
print(f"Final DF count for historical: {count_hist}")
print(f"Final DF count for future: {count_fut}")

In [None]:
# Basic descriptive statistics (count, mean, stddev, min, max) for the
# global_speed_score_iq column.
global_speed_score_stats = global_speed_score.describe("global_speed_score_iq")
global_speed_score_stats.show()

In [None]:
# Summary statistics for global_speed_score_iq; called without arguments,
# summary() reports its default set of statistics, which includes quartiles.
global_speed_score_quantiles = global_speed_score.select("global_speed_score_iq").summary()
global_speed_score_quantiles.show()

In [None]:
# Summary of global_speed_score_iq with the statistics spelled out explicitly;
# this list is the same set summary() produces when called without arguments.
summary_stats = ("count", "mean", "stddev", "min", "25%", "50%", "75%", "max")
global_speed_score_quantiles = (
    global_speed_score
    .select("global_speed_score_iq")
    .summary(*summary_stats)
)
global_speed_score_quantiles.show()

In [None]:
import pyspark.sql.functions as F

# Bucket global_speed_score_iq into bins of width 10, each labelled by its
# lower edge, and count the rows that fall into each bin.
score_bin_expr = F.floor(F.col("global_speed_score_iq") / 10) * 10
scores_binned = global_speed_score.withColumn("score_bin", score_bin_expr)

df_bins = scores_binned.groupBy("score_bin").count().orderBy("score_bin")

df_bins.show(100, truncate=False)

In [None]:
# Bin global_speed_score_iq with a bin width of 0.5 (each bin labelled by its
# lower edge) and count the rows per bin.
# NOTE(review): the original note called this an example for "standardized_score",
# but the column binned here is global_speed_score_iq — confirm which was intended.
df_bins_std = (
    global_speed_score.withColumn("score_bin", F.floor(F.col("global_speed_score_iq") / 0.5) * 0.5)
      .groupBy("score_bin")
      .count()
      .orderBy("score_bin")
)
df_bins_std.show(200, truncate=False)

In [None]:
import matplotlib.pyplot as plt

# Compute the histogram through the RDD API, which returns bin edges and counts
# rather than the raw column values.
hist_result = (
    global_speed_score.select("global_speed_score_iq")
      .rdd
      .flatMap(lambda x: x)  # each Row has a single field; flatten to bare values
      .histogram(10)         # 10 evenly spaced buckets (the count is explicit, not a default)
)

# hist_result is a tuple (bins, counts)
# bins: list of bin boundaries (one more entry than counts)
# counts: list of counts in each bin

bins = hist_result[0]
counts = hist_result[1]

plt.figure(figsize=(8, 5))
# bins[:-1] are the LEFT edges of the buckets, so the bars must be edge-aligned.
# With the default align="center" every bar was drawn half a bin too far left.
plt.bar(x=bins[:-1], height=counts, width=(bins[1] - bins[0]) * 0.9, align="edge")
# Label the axis and title with the column that is actually plotted.
plt.xlabel("global_speed_score_iq")
plt.ylabel("Count")
plt.title("Histogram of global_speed_score_iq")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Compute the histogram through the RDD API, which returns bin edges and counts
# rather than the raw column values.
hist_result = (
    global_speed_score.select("global_speed_score_iq")
      .rdd
      .flatMap(lambda x: x)  # each Row has a single field; flatten to bare values
      .histogram(10)         # 10 evenly spaced buckets (the count is explicit, not a default)
)

# hist_result is a tuple (bins, counts)
# bins: list of bin boundaries (one more entry than counts)
# counts: list of counts in each bin

bins = hist_result[0]
counts = hist_result[1]

plt.figure(figsize=(8, 5))
# bins[:-1] are the LEFT edges of the buckets, so the bars must be edge-aligned.
# With the default align="center" every bar was drawn half a bin too far left.
plt.bar(x=bins[:-1], height=counts, width=(bins[1] - bins[0]) * 0.9, align="edge")
plt.xlabel("global_speed_score_iq")
plt.ylabel("Count")
# Title now names the column that is actually plotted.
plt.title("Histogram of global_speed_score_iq")
plt.show()

In [None]:
# Pull the whole global_speed_score_iq column to the driver as a pandas frame
# and draw a 50-bin histogram of it.
pdf = global_speed_score.select("global_speed_score_iq").toPandas()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(pdf["global_speed_score_iq"], bins=50, edgecolor='black')
ax.set_xlabel("global_speed_score_iq")
ax.set_ylabel("Count")
ax.set_title("Histogram of global_speed_score")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pyspark.sql.functions as F

# Work on a reproducible 1% sample (seed 42) of the non-null scores so that only
# a small frame is brought to the driver.
non_null_scores = global_speed_score.select("global_speed_score_iq").dropna()
sampled_scores = non_null_scores.sample(withReplacement=False, fraction=0.01, seed=42)
pdf = sampled_scores.toPandas()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(pdf["global_speed_score_iq"], bins=350, edgecolor='black')
ax.set_title("Distribution of global_speed_score_iq (sampled)")
ax.set_xlabel("global_speed_score_iq")
ax.set_ylabel("Count")
plt.show()

In [None]:
import pyspark.sql.functions as F

# Rows whose global_speed_score_iq lies in the closed range [110, 140].
score = F.col("global_speed_score_iq")
high_scores_df = (
    global_speed_score
    .select("horse_id", "horse_name", "global_speed_score_iq")
    .where((score >= 110) & (score <= 140))
)

# Tabular preview
high_scores_df.show(truncate=False)

# Bring every matching row to the driver and print it line by line
results = high_scores_df.collect()
for row in results:
    print(row["horse_id"], row["horse_name"], row["global_speed_score_iq"])

In [None]:
# Uniqueness check: a (course_cd, race_date, race_number, horse_id) combination
# that occurs more than once shows up as a row in dup_check_df.
pk_columns = ["course_cd", "race_date", "race_number", "horse_id"]
dup_check_df = (
    global_speed_score
    .groupBy(*pk_columns)
    .count()
    .filter("count > 1")
)

dup_check_df.show(truncate=False)

In [None]:
# A race is identified by (course_cd, race_date, race_number); count the
# distinct combinations present in global_speed_score.
race_key_columns = ["course_cd", "race_date", "race_number"]
unique_races = global_speed_score.select(*race_key_columns).distinct()
race_count = unique_races.count()

# Report the number of unique races
print(f"Total number of races: {race_count}")

In [None]:
from pyspark.sql import functions as F

# Remove a horse_count column left over from an earlier run. DataFrame.drop is
# a no-op when the named column is not in the schema, so no existence check is
# required.
global_speed_score = global_speed_score.drop("horse_count")

In [None]:
# Count the number of horses in each race. F.count on a column counts its
# non-null values, so rows with a null saddle_cloth_number are not counted.
race_keys = ["course_cd", "race_date", "race_number"]
race_horse_counts = (
    global_speed_score
    .groupBy(*race_keys)
    .agg(F.count("saddle_cloth_number").alias("horse_count"))  # Count horses per race
)

# Join the count back to the main DataFrame.
# Any horse_count left from a previous execution is dropped first (drop is a
# no-op if the column is absent). Without this, running the cell twice produced
# two columns named horse_count and later references to it became ambiguous.
global_speed_score = (
    global_speed_score
    .drop("horse_count")
    .join(race_horse_counts, on=race_keys, how="left")
)

In [None]:
# Race to inspect (edit these three values to look at a different race)
selected_course = "TOP"
selected_date = "2025-03-01"  # Adjust as needed
selected_race = 2  # Adjust as needed

# Condition selecting exactly that race
is_selected_race = (
    (F.col("course_cd") == selected_course)
    & (F.col("race_date") == selected_date)
    & (F.col("race_number") == selected_race)
)

# Columns to display for each horse in the race
race_view_columns = [
    "course_cd",
    "race_date",
    "race_number",
    "horse_name",
    "saddle_cloth_number",
    "horse_count",
]

# Keep only the selected race and list its horses in ascending saddle cloth order
filtered_race = (
    global_speed_score
    .where(is_selected_race)
    .select(*race_view_columns)
    .orderBy("saddle_cloth_number")
)

# Display the result
filtered_race.show(truncate=False)

In [None]:
# Filter the DataFrame for the specific race (TGP, 2025-02-23, race 2).
#
# The previous version filtered `speed_score`, a name that is not assigned
# anywhere in the visible notebook, so the cell raised NameError on a fresh
# kernel.
# NOTE(review): global_speed_score is assumed to be the intended source, since
# other cells select these same columns from it — confirm.
race_df = global_speed_score.filter(
    (F.col("course_cd") == "TGP") &
    (F.col("race_date") == F.lit("2025-02-23").cast("date")) &
    (F.col("race_number") == 2)
)

# Select and order the columns of interest.
race_df.select("saddle_cloth_number", "horse_name", "course_cd", "race_date", "race_number") \
       .orderBy("saddle_cloth_number") \
       .show(truncate=False)

In [None]:
# Print the column names and types of horse_embedding.
horse_embedding.printSchema()

In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Tracks to restrict the gap check to
tpd_tracks = [
    'CNL', 'SAR', 'PIM', 'TSA', 'BEL', 'MVR', 'TWO', 'CLS', 'KEE', 'TAM',
    'TTP', 'TKD', 'ELP', 'PEN', 'HOU', 'DMR', 'TLS', 'AQU', 'MTH', 'TGP',
    'TGG', 'CBY', 'LRL', 'TED', 'IND', 'CTD', 'ASD', 'TCD', 'LAD', 'TOP',
]

# One window per race, rows ordered by saddle cloth number
race_key = ["course_cd", "race_date", "race_number"]
race_window = Window.partitionBy(*race_key).orderBy("saddle_cloth_number")

# Cast saddle_cloth_number to int, then number the horses of each race 1, 2, 3, ...
# in saddle cloth order; that running index is the "expected" number.
# NOTE(review): the int cast yields null for any non-numeric saddle cloth value,
# and this overwrites the column on horse_embedding itself — confirm that is acceptable.
horse_embedding = (
    horse_embedding
    .withColumn("saddle_cloth_number", F.col("saddle_cloth_number").cast("int"))
    .withColumn("expected_number", F.row_number().over(race_window))
)

# A race has a gap when some horse's saddle cloth number differs from its
# expected position; keep only such races at the listed tracks.
number_mismatch = F.col("saddle_cloth_number") != F.col("expected_number")
on_listed_track = F.col("course_cd").isin(tpd_tracks)
races_with_gaps = (
    horse_embedding
    .where(number_mismatch & on_listed_track)
    .select(*race_key)
    .distinct()
)

# Display the races with non-contiguous saddle cloth numbers
races_with_gaps.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Track(s) to restrict the gap check to; edit to look at other tracks
selected_tracks = ['TOP']

# One window per race, rows ordered by saddle cloth number
race_key = ["course_cd", "race_date", "race_number"]
race_window = Window.partitionBy(*race_key).orderBy("saddle_cloth_number")

# Cast saddle_cloth_number to int, then number the horses of each race 1, 2, 3, ...
# in saddle cloth order; that running index is the "expected" number.
# NOTE(review): the int cast yields null for any non-numeric saddle cloth value,
# and this overwrites the column on horse_embedding itself — confirm that is acceptable.
horse_embedding = (
    horse_embedding
    .withColumn("saddle_cloth_number", F.col("saddle_cloth_number").cast("int"))
    .withColumn("expected_number", F.row_number().over(race_window))
)

# A race has a gap when some horse's saddle cloth number differs from its
# expected position; keep only such races at the selected track(s).
number_mismatch = F.col("saddle_cloth_number") != F.col("expected_number")
on_selected_track = F.col("course_cd").isin(selected_tracks)
races_with_gaps = (
    horse_embedding
    .where(number_mismatch & on_selected_track)
    .select(*race_key)
    .distinct()
)

# Display the races with non-contiguous saddle cloth numbers
races_with_gaps.show()

### Approach

1. Find the races whose saddle cloth numbers are non-contiguous (these are the races already identified in `races_with_gaps`).

2. Join that list back to the main dataset (`horse_embedding`) to retrieve every horse in those races, together with its saddle cloth number.

In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# One window per race, rows ordered by saddle cloth number
race_key = ["course_cd", "race_date", "race_number"]
race_window = Window.partitionBy(*race_key).orderBy("saddle_cloth_number")

# Cast saddle_cloth_number to int, then number the horses of each race 1, 2, 3, ...
# in saddle cloth order; that running index is the "expected" number.
# NOTE(review): the int cast yields null for any non-numeric saddle cloth value,
# and this overwrites the column on horse_embedding itself — confirm that is acceptable.
horse_embedding = (
    horse_embedding
    .withColumn("saddle_cloth_number", F.col("saddle_cloth_number").cast("int"))
    .withColumn("expected_number", F.row_number().over(race_window))
)

# Races in which at least one horse's saddle cloth number differs from its
# expected position
races_with_gaps = (
    horse_embedding
    .where(F.col("saddle_cloth_number") != F.col("expected_number"))
    .select(*race_key)
    .distinct()
)

# Every horse that ran in one of those races, so the saddle cloth numbers that
# are actually present can be inspected race by race
horses_in_missing_races = (
    horse_embedding
    .join(races_with_gaps, race_key)  # inner join keeps only the races with gaps
    .select(*race_key, "horse_name", "saddle_cloth_number")
    .orderBy(*race_key, "saddle_cloth_number")
)

# Display the first 50 rows
horses_in_missing_races.show(50, truncate=False)

In [None]:
# Filter the DataFrame for the specific race (AQU, 2025-02-23, race 10).
#
# The previous version filtered `horse_enhanced`, a name that is not assigned
# anywhere in the visible notebook, so the cell raised NameError on a fresh
# kernel.
# NOTE(review): horse_embedding is assumed to be the intended source, since the
# neighbouring cells select these same columns from it — confirm.
race_df = horse_embedding.filter(
    (F.col("course_cd") == "AQU") &
    (F.col("race_date") == F.lit("2025-02-23").cast("date")) &
    (F.col("race_number") == 10)
)

# Select and order the columns of interest.
race_df.select("saddle_cloth_number", "horse_name", "course_cd", "race_date", "race_number") \
       .orderBy("saddle_cloth_number") \
       .show(truncate=False)