# Generating Jockey and Trainer Stats without Data Leakage

## trainer_accum_stats and jock_accum_stats

It's all about time: what were the jockey's and the trainer's stats at the time of the race being analyzed? Using their lifetime career stats leads to data leakage -- the model would be given information that was not yet available when the race was run.

This section explains how I developed trainer_accum_stats and jock_accum_stats, which provide statistics accumulated up to, but not including, the date of the current race.

## 1. Create Base Data with Race Results

### Environment Setup

In [31]:
# Environment setup

import logging
import os
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text
import geopandas as gpd
from datetime import datetime
import configparser
from src.data_ingestion.ingestion_utils import (
    get_db_connection, update_tracking, load_processed_files
)
from src.data_ingestion.eqb_ppData import process_pluspro_data
from src.data_ingestion.eqb_resultsCharts import process_resultscharts_data
from src.data_ingestion.tpd_datasets import (
    process_tpd_sectionals_data,
    process_tpd_gpsdata_data
)

# Load the configuration file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list and fail with a clear error
# instead of an opaque KeyError on the first config['database'] lookup below.
config_path = '/home/exx/myCode/horse-racing/FoxRiverAIRacing/config.ini'
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read configuration file: {config_path}")

# Set up logging for consistent logging behavior in Notebook
logging.basicConfig(level=logging.INFO)

# Retrieve database credentials from config file
db_host = config['database']['host']
db_port = config['database']['port']
db_name = config['database']['dbname']  # Corrected from 'name' to 'dbname'
db_user = config['database']['user']

# Establish connection using get_db_connection
conn = get_db_connection(config)

# Create the SQLAlchemy engine used by the pandas/SQL cells below.
# NOTE(review): the URL carries no password -- presumably authentication is
# handled outside the URL (e.g. ~/.pgpass or trust auth); confirm.
engine = create_engine(f'postgresql+psycopg2://{db_user}@{db_host}:{db_port}/{db_name}')

Assemble a dataset that includes each race entry with the necessary details.

In [47]:
query = """ 
WITH race_results AS (
    SELECT
        r2.jock_key,
        r2.stat_type,
        r2.race_date AS as_of_date,
        re.official_fin AS finish_position,
        re.win_payoff,
        re.place_payoff,
        re.show_payoff

    FROM
        v_runners r2
    JOIN
        v_racedata r ON r.course_cd = r2.course_cd
                   AND r.race_date = r2.race_date
                   AND r.race_number = r2.race_number
    JOIN
        v_results_entries re ON re.course_cd = r.course_cd
                           AND re.race_date = r.race_date
                           AND re.race_number = r.race_number
                           AND re.program_num = r2.saddle_cloth_number
    WHERE
        re.official_fin IS NOT NULL
)
Select * from race_results
"""

# Run the base query (one row per runner that has an official finish) and
# materialize the result as a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)

# Preview the first few rows
df.head()


Unnamed: 0,jock_key,stat_type,as_of_date,finish_position,win_payoff,place_payoff,show_payoff
0,134310,DIRT_RTE,2023-06-28,6,0.0,0.0,0.0
1,122644,DIRT_RTE,2022-07-03,7,0.0,0.0,0.0
2,99823,DIRT_RTE,2022-02-16,3,0.0,0.0,4.0
3,99823,DIRT_RTE,2022-02-16,10,0.0,0.0,0.0
4,150151,DIRT_RTE,2022-02-16,3,0.0,0.0,11.6


Use window functions to calculate cumulative sums up to but not including each race date.

In [60]:
# Populate jock_accum_stats with leakage-free running totals.
#
# The statement is built from three CTEs:
#   race_results     - one row per runner with a non-NULL official finish,
#                      joining v_runners -> v_racedata -> v_results_entries on
#                      course, date and race number, and matching
#                      program_num to saddle_cloth_number.
#   daily_stats      - per (jock_key, stat_type, as_of_date): number of starts,
#                      counts of finishes in positions 1-4, and summed payoffs
#                      (NULL payoffs counted as 0).
#   cumulative_stats - running sums of the daily figures. Each window frame
#                      ends at "1 PRECEDING", so the row for a given date only
#                      contains earlier dates and never that date's own races.
# The first date in each partition has an empty frame (NULL sums); the final
# SELECT turns those into 0. Rows are upserted on
# (jock_key, stat_type, as_of_date).
query = """
WITH race_results AS (
    SELECT
        r2.jock_key,
        r2.stat_type,
        r2.race_date AS as_of_date,
        re.official_fin AS finish_position,
        re.win_payoff,
        re.place_payoff,
        re.show_payoff
    FROM
        v_runners r2
    JOIN
        v_racedata r ON r.course_cd = r2.course_cd
                   AND r.race_date = r2.race_date
                   AND r.race_number = r2.race_number
    JOIN
        v_results_entries re ON re.course_cd = r.course_cd
                           AND re.race_date = r.race_date
                           AND re.race_number = r.race_number
                           AND re.program_num = r2.saddle_cloth_number
    WHERE
        re.official_fin IS NOT NULL
),
daily_stats AS (
    SELECT
        rr.jock_key,
        rr.stat_type,
        rr.as_of_date,
        COUNT(*) AS daily_starts,
        SUM(CASE WHEN rr.finish_position = 1 THEN 1 ELSE 0 END) AS daily_win,
        SUM(CASE WHEN rr.finish_position = 2 THEN 1 ELSE 0 END) AS daily_place,
        SUM(CASE WHEN rr.finish_position = 3 THEN 1 ELSE 0 END) AS daily_show,
        SUM(CASE WHEN rr.finish_position = 4 THEN 1 ELSE 0 END) AS daily_fourth,
        SUM(COALESCE(rr.win_payoff, 0)) AS daily_win_earnings,
        SUM(COALESCE(rr.place_payoff, 0)) AS daily_place_earnings,
        SUM(COALESCE(rr.show_payoff, 0)) AS daily_show_earnings
    FROM
        race_results rr
    GROUP BY
        rr.jock_key,
        rr.stat_type,
        rr.as_of_date
),
cumulative_stats AS (
    SELECT
        jock_key,
        stat_type,
        as_of_date,
        SUM(daily_starts) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS starts,
        SUM(daily_win) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS win,
        SUM(daily_place) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS place,
        SUM(daily_show) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS show,
        SUM(daily_fourth) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS fourth,
        SUM(daily_win_earnings) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS win_earnings,
        SUM(daily_place_earnings) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS place_earnings,
        SUM(daily_show_earnings) OVER (
            PARTITION BY jock_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS show_earnings
    FROM
        daily_stats
)
INSERT INTO jock_accum_stats (
    jock_key, stat_type, as_of_date, win, place, show, fourth, starts,
    win_earnings, place_earnings, show_earnings
)
SELECT
    jock_key,
    stat_type,
    as_of_date,
    COALESCE(win, 0) AS win,
    COALESCE(place, 0) AS place,
    COALESCE(show, 0) AS show,
    COALESCE(fourth, 0) AS fourth,
    COALESCE(starts, 0) AS starts,
    COALESCE(win_earnings, 0) AS win_earnings,
    COALESCE(place_earnings, 0) AS place_earnings,
    COALESCE(show_earnings, 0) AS show_earnings
FROM
    cumulative_stats
ON CONFLICT (jock_key, stat_type, as_of_date) DO UPDATE SET
    win = EXCLUDED.win,
    place = EXCLUDED.place,
    show = EXCLUDED.show,
    fourth = EXCLUDED.fourth,
    starts = EXCLUDED.starts,
    win_earnings = EXCLUDED.win_earnings,
    place_earnings = EXCLUDED.place_earnings,
    show_earnings = EXCLUDED.show_earnings;  
   
"""
# Execute the query inside an explicit transaction. engine.begin() commits
# when the block exits normally and rolls back if it raises; a plain
# engine.connect() never commits on its own under SQLAlchemy 2.x, so the
# upsert would be discarded when the connection is closed.
try:
    with engine.begin() as connection:
        connection.execute(text(query))
except Exception as e:
    # Keep the notebook running, but record the failure with its traceback.
    logging.exception("An error occurred: %s", e)
else:
    # Only reached after the transaction has been committed.
    logging.info("Data inserted/updated successfully.")

In [70]:
# Sample jock_accum_stats: keep only rows whose total_earnings is positive
query = """
SELECT *
FROM jock_accum_stats
WHERE total_earnings > 0;
"""

# Load the matching rows into a DataFrame
df = pd.read_sql_query(sql=query, con=engine)

# Preview the first few rows
df.head()

Unnamed: 0,jock_key,stat_type,win,place,show,starts,fourth,as_of_date,win_percentage,itm_percentage,win_earnings,place_earnings,show_earnings,total_earnings
0,100175,TURF_RTE,73,70,74,619,91,2024-07-04,11.79,35.06,876.8,973.0,959.0,2808.8
1,101167,TURF_RTE,12,14,20,237,29,2024-10-31,5.06,19.41,137.8,125.0,180.3,443.1
2,106513,DIRT_RTE,93,100,76,473,65,2023-03-30,19.66,56.87,894.7,794.0,748.3,2437.0
3,109183,DIRT_RTE,137,141,112,761,90,2023-12-03,18.0,51.25,1089.98,1243.6,1242.42,3576.0
4,109183,TURF_RTE,130,131,109,841,97,2023-10-26,15.46,44.0,1243.2,1341.5,1344.12,3928.82


In [68]:
# Populating trainer_accum_stats with leakage-free running totals.
#
# The statement is built from three CTEs:
#   race_results     - one row per runner with a non-NULL official finish,
#                      joining v_runners -> v_racedata -> v_results_entries on
#                      course, date and race number, and matching
#                      program_num to saddle_cloth_number.
#   daily_stats      - per (train_key, stat_type, as_of_date): number of
#                      starts, counts of finishes in positions 1-4, and summed
#                      payoffs (NULL payoffs counted as 0).
#   cumulative_stats - running sums of the daily figures. Each window frame
#                      ends at "1 PRECEDING", so the row for a given date only
#                      contains earlier dates and never that date's own races.
# The first date in each partition has an empty frame (NULL sums); the final
# SELECT turns those into 0. Rows are upserted on
# (train_key, stat_type, as_of_date).
query = """
WITH race_results AS (
    SELECT
        r2.train_key,
        r2.stat_type,
        r2.race_date AS as_of_date,
        re.official_fin AS finish_position,
        re.win_payoff,
        re.place_payoff,
        re.show_payoff
    FROM
        v_runners r2
    JOIN
        v_racedata r ON r.course_cd = r2.course_cd
                   AND r.race_date = r2.race_date
                   AND r.race_number = r2.race_number
    JOIN
        v_results_entries re ON re.course_cd = r.course_cd
                           AND re.race_date = r.race_date
                           AND re.race_number = r.race_number
                           AND re.program_num = r2.saddle_cloth_number
    WHERE
        re.official_fin IS NOT NULL
),
daily_stats AS (
    SELECT
        rr.train_key,
        rr.stat_type,
        rr.as_of_date,
        COUNT(*) AS daily_starts,
        SUM(CASE WHEN rr.finish_position = 1 THEN 1 ELSE 0 END) AS daily_win,
        SUM(CASE WHEN rr.finish_position = 2 THEN 1 ELSE 0 END) AS daily_place,
        SUM(CASE WHEN rr.finish_position = 3 THEN 1 ELSE 0 END) AS daily_show,
        SUM(CASE WHEN rr.finish_position = 4 THEN 1 ELSE 0 END) AS daily_fourth,
        SUM(COALESCE(rr.win_payoff, 0)) AS daily_win_earnings,
        SUM(COALESCE(rr.place_payoff, 0)) AS daily_place_earnings,
        SUM(COALESCE(rr.show_payoff, 0)) AS daily_show_earnings
    FROM
        race_results rr
    GROUP BY
        rr.train_key,
        rr.stat_type,
        rr.as_of_date
),
cumulative_stats AS (
    SELECT
        train_key,
        stat_type,
        as_of_date,
        SUM(daily_starts) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS starts,
        SUM(daily_win) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS win,
        SUM(daily_place) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS place,
        SUM(daily_show) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS show,
        SUM(daily_fourth) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS fourth,
        SUM(daily_win_earnings) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS win_earnings,
        SUM(daily_place_earnings) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS place_earnings,
        SUM(daily_show_earnings) OVER (
            PARTITION BY train_key, stat_type
            ORDER BY as_of_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS show_earnings
    FROM
        daily_stats
)
INSERT INTO trainer_accum_stats (
    train_key, stat_type, as_of_date, win, place, show, fourth, starts,
    win_earnings, place_earnings, show_earnings
)
SELECT
    train_key,
    stat_type,
    as_of_date,
    COALESCE(win, 0) AS win,
    COALESCE(place, 0) AS place,
    COALESCE(show, 0) AS show,
    COALESCE(fourth, 0) AS fourth,
    COALESCE(starts, 0) AS starts,
    COALESCE(win_earnings, 0) AS win_earnings,
    COALESCE(place_earnings, 0) AS place_earnings,
    COALESCE(show_earnings, 0) AS show_earnings
FROM
    cumulative_stats
ON CONFLICT (train_key, stat_type, as_of_date) DO UPDATE SET
    win = EXCLUDED.win,
    place = EXCLUDED.place,
    show = EXCLUDED.show,
    fourth = EXCLUDED.fourth,
    starts = EXCLUDED.starts,
    win_earnings = EXCLUDED.win_earnings,
    place_earnings = EXCLUDED.place_earnings,
    show_earnings = EXCLUDED.show_earnings;  
"""

# Execute the query inside an explicit transaction. engine.begin() commits
# when the block exits normally and rolls back if it raises; a plain
# engine.connect() never commits on its own under SQLAlchemy 2.x, so the
# upsert would be discarded when the connection is closed.
try:
    with engine.begin() as connection:
        connection.execute(text(query))
except Exception as e:
    # Keep the notebook running, but record the failure with its traceback.
    logging.exception("An error occurred: %s", e)
else:
    # Only reached after the transaction has been committed.
    logging.info("Data inserted/updated successfully.")

In [72]:
# Sample trainer_accum_stats: keep only rows whose total_earnings is positive
query = """
SELECT *
FROM trainer_accum_stats
WHERE total_earnings > 0;
"""

# Load the matching rows into a DataFrame
df = pd.read_sql_query(sql=query, con=engine)

# Preview the first few rows
df.head()

Unnamed: 0,train_key,stat_type,win,place,show,starts,fourth,as_of_date,win_percentage,itm_percentage,win_earnings,place_earnings,show_earnings,total_earnings
0,10056,ALL_WEATHR,0,1,0,5,0,2023-05-14,0.0,20.0,0.0,23.0,8.5,31.5
1,10067,DIRT_RTE,1,0,0,1,0,2023-03-19,100.0,100.0,4.2,3.2,2.6,10.0
2,10067,DIRT_RTE,1,0,0,2,0,2024-01-06,50.0,50.0,4.2,3.2,2.6,10.0
3,10081,ALL_WEATHR,0,1,0,3,0,2022-10-16,0.0,33.33,0.0,7.7,4.1,11.8
4,10081,ALL_WEATHR,0,1,0,4,0,2022-11-19,0.0,25.0,0.0,7.7,4.1,11.8
