# Results Dataset Preparation

A lot of work has gone into compiling the current dataset. I have merged gps_df, sectionals_df and results_df. I have limited the amount of Equibase data I am using, both to keep the focus on the TPD GPS data and to leave room for some feature engineering. However, the Equibase data does contain some good metrics: basic measures that could be obtained from any racebook sheet.

## Get Started

1. I am going to load the parquet DataFrame from disk and do some imputation, one-hot encoding, string indexing, and scaling, then run it through XGBoost to see how it's looking. At that point I will do the integration of route data and add the GPS aggregations. I just want to see the minimum I can do and how it's working before I go down the wrong path. If XGBoost doesn't do any better than the LSTM, at least I won't have wasted any more time on it.

### Load master_results_df.parquet file

In [None]:
# Stop the current Spark session.
# NOTE(review): `spark` is only assigned in later cells, so on a fresh kernel
# this line raises NameError -- presumably it is meant to be run by hand to
# shut down a session from an earlier run; confirm.
spark.stop()

In [1]:
# Setup Environment

import os
import logging
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import set_config
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
import pyspark.sql.functions as F
import xgboost as xgb
from sklearn import set_config
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as F_min, max as F_max , upper, trim,
                                   row_number, mean as F_mean, countDistinct, last, first, when)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_spark, 
                identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Start with every notebook-wide handle unset.
spark = master_results_df = race_df = df = None

In [3]:

# Unpack the five values returned by the project's initialize_environment():
# the Spark session, the JDBC URL and properties, the parquet directory and
# the log file path.
spark, jdbc_url, jdbc_properties, parquet_dir, log_file = initialize_environment()


Spark session created successfully.


In [4]:
    # Get the project's query definitions and hand them to the loader; the
    # result is iterated below as name -> DataFrame pairs.
    queries = sql_queries()
    dfs = load_data_from_postgresql(spark, jdbc_url, jdbc_properties, queries, parquet_dir)

    # Only the "results" entry is kept; workouts_df is declared but never set.
    race_df, workouts_df = None, None
    for name, df in dfs.items():
        logging.info(f"DataFrame '{name}' loaded. Schema:")
        if name != "results":
            continue
        race_df = df


                                                                                

In [5]:
# ------------------------------------------------
# 1) Basic logging
# ------------------------------------------------
def setup_logging(script_dir, log_file):
    """Route root-logger output to ``log_file``, starting from an empty file.

    The root logger's existing handlers are replaced by a single
    ``FileHandler`` at INFO level using the format
    ``<timestamp> - <level> - <message>``.

    Args:
        script_dir: Not used by this function; kept so existing call sites
            keep working.
        log_file: Path of the log file. It is truncated on every call.

    Raises:
        SystemExit: With exit code 1 (after printing the reason to stderr)
            if the log file cannot be opened or the handler cannot be set up.
    """
    # Imported here because the notebook's import cell never brings `sys`
    # into scope; without it the error path below died with NameError
    # instead of reporting the real problem.
    import sys

    try:
        # mode="w" truncates the file. The handler is created before the
        # root logger is touched so that a bad path leaves the current
        # logging configuration intact.
        file_handler = logging.FileHandler(log_file, mode="w")
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(
            logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            )
        )

        logger = logging.getLogger()
        # Close the handlers being replaced so re-running the cell does not
        # leak open file handles.
        for old_handler in list(logger.handlers):
            logger.removeHandler(old_handler)
            old_handler.close()

        logger.setLevel(logging.INFO)
        logger.addHandler(file_handler)

        logging.info("Logging initialized.")
    except Exception as e:
        print(f"Failed to set up logging: {e}", file=sys.stderr)
        sys.exit(1)

# NOTE(review): this hard-coded absolute path replaces the log_file value
# unpacked from initialize_environment() earlier -- confirm the override is
# intended.
log_file = "/home/exx/myCode/horse-racing/FoxRiverAIRacing/logs/Data_Model_training.log"
# parquet_dir fills setup_logging's first parameter (script_dir), which the
# function body does not use.
setup_logging(parquet_dir, log_file)

logging.info("Data prep logging ready")

In [6]:
# Print the column names and types of the results DataFrame.
race_df.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- horse_name: string (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(10,2) (nullable = true)
 |-- weight: decimal(10,2) (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- equip: string (nullable = true)
 |-- claimprice: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- trk_cond: string (nullable = true)
 |-- weather: string (nullable = true)
 |-- distance: decimal(10,2) (nullable = true)
 |-- dist_unit: string (nullable = true)
 |-- power: decimal(10,2) (nullable = true)
 |-- med: string (nullable = true)
 |-- morn_odds: decimal(10,2) (nullable = true)
 |-- avgspd: double (nullable = true)
 |-- race_type: string (nullable =

In [None]:
# Guard: each (course_cd, race_date, race_number, horse_id) combination must
# appear on at most one row.
primary_keys = ["course_cd", "race_date", "race_number", "horse_id"]

# Count rows per key and keep only the keys seen more than once.
duplicates = race_df.groupBy(*primary_keys).agg(F.count("*").alias("cnt")).where(F.col("cnt") > 1)

dup_count = duplicates.count()
if dup_count:
    # Show the offending keys before aborting the run.
    print(f"Found {dup_count} duplicate primary key combinations.")
    duplicates.show()
    raise ValueError(f"Duplicates found: {dup_count}. Deduplication required.")

print(dup_count)

In [None]:
# 2. Convert Decimal Columns to Double
# Each column is listed once ("wps_pool" previously appeared twice).
decimal_cols = [
    "wps_pool", "weight", "power", "distance", "morn_odds",
    "todays_cls", "all_earnings", "cond_earnings",
    "jock_win_percent", "jock_itm_percent", "trainer_itm_percent",
    "trainer_win_percent", "jt_win_percent", "jt_itm_percent",
    "jock_win_track", "jock_itm_track", "trainer_win_track", "trainer_itm_track",
    "jt_win_track", "jt_itm_track",
]

# Report every absent column at once rather than failing on the first one.
missing_decimal_cols = [c for c in decimal_cols if c not in race_df.columns]
if missing_decimal_cols:
    raise ValueError(f"Cannot cast missing columns to double: {missing_decimal_cols}")

# A single projection replaces the per-column withColumn loop: each
# withColumn call adds another projection to the plan, which the PySpark
# documentation warns against doing in a loop. Column order and names are
# unchanged.
race_df = race_df.select(
    *[
        F.col(c).cast("double").alias(c) if c in decimal_cols else F.col(c)
        for c in race_df.columns
    ]
)
print("2. Decimal columns converted to double.")

In [None]:
def impute_date_of_birth_with_median(race_df):
    """Fill null ``date_of_birth`` values with the median observed birth date.

    The median is taken over the non-null dates, converted to epoch seconds,
    using ``approxQuantile`` with a relative error of 0. When no non-null
    date exists, 2000-01-01 is used instead.

    Args:
        race_df: Spark DataFrame containing a ``date_of_birth`` column.

    Returns:
        The DataFrame with null ``date_of_birth`` values filled. The
        temporary ``date_of_birth_ts`` column is dropped before returning.
    """
    with_ts = race_df.withColumn(
        "date_of_birth_ts",
        F.col("date_of_birth").cast("timestamp").cast("long"),
    )

    # approxQuantile can return an empty list when there is nothing to
    # sample, so check before indexing; otherwise the fallback date below
    # could never be reached.
    quantiles = (
        with_ts
        .filter(F.col("date_of_birth_ts").isNotNull())
        .approxQuantile("date_of_birth_ts", [0.5], 0)
    )
    median_ts = quantiles[0] if quantiles else None

    if median_ts is None:
        median_date = F.lit("2000-01-01").cast("date")
    else:
        # The quantile comes back as a float; pass whole seconds to
        # from_unixtime.
        median_date = F.from_unixtime(F.lit(int(median_ts))).cast("date")

    imputed_df = with_ts.withColumn(
        "date_of_birth",
        F.when(F.col("date_of_birth").isNull(), median_date).otherwise(F.col("date_of_birth"))
    ).drop("date_of_birth_ts")
    print("3a. Missing date_of_birth values imputed with median date.")
    # Return the imputed frame; the original returned the unrelated global `df`.
    return imputed_df

In [None]:
# 3b. Create age_at_race_day
# Age in years: days between date_of_birth and race_date divided by 365.25.
race_df = race_df.withColumn(
    "age_at_race_day",
    F.datediff("race_date", "date_of_birth") / 365.25,
)
print("3b. Created age_at_race_day.")

In [None]:
# 3c. Replace nulls in three categorical columns with explicit placeholder labels.
categorical_defaults = dict(weather="UNKNOWN", turf_mud_mark="MISSING", trk_cond="UNKNOWN")
race_df = race_df.na.fill(categorical_defaults)

In [None]:

# Replace empty-string `med` values with the label "NONE"; every other value
# (including null) is passed through unchanged.
race_df = race_df.withColumn(
    "med",
    F.when(F.col("med") == "", F.lit("NONE")).otherwise(F.col("med")),
)


In [None]:
# Replace empty-string `turf_mud_mark` values with the label "MISSING"; every
# other value (including null) is passed through unchanged.
race_df = race_df.withColumn(
    "turf_mud_mark",
    F.when(F.col("turf_mud_mark") == "", F.lit("MISSING")).otherwise(F.col("turf_mud_mark")),
)

In [None]:
# Drop the raw birth date; age_at_race_day was derived from it in an earlier cell.
race_df = race_df.drop("date_of_birth")

In [None]:
# Jockey, trainer and jockey/trainer win and in-the-money columns that are
# overwritten below.
columns_to_update = [
    'jock_itm_percent', 'jock_itm_track', 'jock_win_percent', 'jock_win_track',
    'jt_itm_percent', 'jt_itm_track', 'jt_win_percent', 'jt_win_track',
    'trainer_itm_percent', 'trainer_itm_track', 'trainer_win_percent', 'trainer_win_track'
]

# Set the specified columns to 0
# NOTE(review): this replaces every existing value (not only nulls) with the
# integer literal 0, discarding the values cast to double in an earlier cell
# -- confirm this is intended rather than a null-only fill.
for column in columns_to_update:
    race_df = race_df.withColumn(column, lit(0))


In [None]:
# Re-select every column under its name with leading/trailing whitespace removed.
race_df = race_df.select(
    *(F.col(name).alias(name.strip()) for name in race_df.columns)
)

In [None]:
# Mean of wps_pool over the rows where it is present (the aggregate skips nulls).
mean_value = race_df.agg(F.mean("wps_pool").alias("mean_wps_pool")).first()["mean_wps_pool"]

# Null wps_pool values take the mean; existing values are kept.
race_df = race_df.withColumn(
    "wps_pool",
    F.coalesce(F.col("wps_pool"), F.lit(mean_value)),
)


## Missing Horse Form Values

speed_improvement, days_off, avg_speed_3, avg_speed_5, avg_beaten_3, avg_beaten_5, avg_fin_3, avg_fin_5

In [None]:
# The eight form features that drive the is_first_race flag.
columns_to_update = [
    "speed_improvement", "days_off", "avg_speed_3", "avg_speed_5",
    "avg_beaten_3", "avg_beaten_5", "avg_fin_3", "avg_fin_5"
]

# A row is flagged only when every one of those features is null.
all_form_missing = F.col(columns_to_update[0]).isNull()
for column in columns_to_update[1:]:
    all_form_missing = all_form_missing & F.col(column).isNull()

race_df = race_df.withColumn(
    "is_first_race",
    F.when(all_form_missing, 1).otherwise(0),
)

# On flagged rows, replace the missing form values with the sentinel -1;
# all other rows keep their values.
for column in columns_to_update:
    race_df = race_df.withColumn(
        column,
        F.when(F.col("is_first_race") == 1, F.lit(-1)).otherwise(F.col(column)),
    )

In [None]:
# Drop rows with a missing avgtime_gate4 or sa_dist_bk_gate4: per the author's
# note these values impact the target label, and only 22 and 29 rows
# respectively are affected.
race_df = race_df.where(
    ~(F.col("avgtime_gate4").isNull() | F.col("sa_dist_bk_gate4").isNull())
)

# Any remaining nulls in these four features are set to 0.
missing_values_to_fill = dict.fromkeys(
    ["avg_acceleration", "speed_q2", "speed_q3", "speed_q4"], 0
)
race_df = race_df.na.fill(missing_values_to_fill)


In [None]:
#save_parquet(spark, race_df, "race_df_p1", parquet_dir)

In [None]:
# Drop the reference to the in-memory DataFrame; the next cell rebinds
# race_df from a parquet file.
race_df = None

In [None]:
# Reload the prepared dataset from parquet_dir.
# NOTE(review): the save_parquet call for "race_df_p1" is commented out in an
# earlier cell, so this presumably relies on a file written by a previous
# run -- confirm the file name and that it is up to date.
race_df = spark.read.parquet(os.path.join(parquet_dir, "race_df_p1.parquet"))

In [None]:
# Row count of the reloaded DataFrame.
race_df.count()

In [None]:
# 