# Dataset

- 2,000 races
- 3 racing tracks in the US (AQU = Aqueduct, BEL = Belmont, SAR = Saratoga)
- Different race track conditions (e.g., muddy, soft) or race types (e.g., Stakes, Handicap)
- For each race and horse, the dataset contains the coordinates frame-by-frame in a fixed time window (so we can calculate speeds)

In [None]:
import pandas as pd
import numpy as np
import os



In [13]:
# Location of the raw tracking data, given relative to the notebook's working directory.
file__path = r"../data/raw/nyra_2019_complete.parquet"

# Read the parquet file into a DataFrame and preview the first rows.
df = pd.read_parquet(file__path)
df.head()

Unnamed: 0,AQU,2019-01-01,9,6,72,40.6729017197787,-73.8276065972899,600,D,GD,48,CLM,25000.00,00420,120,Andre Shivnarine Worrie,2090,8
0,AQU,2019-01-01,9,6,73,40.672946,-73.827587,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
1,AQU,2019-01-01,9,6,74,40.67299,-73.827568,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
2,AQU,2019-01-01,9,6,63,40.67251,-73.827781,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
3,AQU,2019-01-01,9,6,64,40.672553,-73.827762,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
4,AQU,2019-01-01,9,6,65,40.672596,-73.827742,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8


In [14]:
# (rows, columns) of the loaded frame.
df.shape

(5228429, 18)

## Data Preprocessing

In [15]:
# Replace the column labels that came with the file by descriptive names
# (positional: one name per existing column, in order).
# NOTE(review): the labels shown by the preview right after loading look like a
# real data record (AQU, 2019-01-01, 9, 6, 72, ...). If the file was written
# without a header row, that first record is discarded by this assignment —
# confirm against the raw file and recover it if needed.
df.columns = [
    'track_id',
    'race_date',
    'race_number',
    'program_number',
    'trakus_index',
    'latitude',
    'longitude',
    'distance_id',
    'course_type',
    'track_condition',
    'run_up_distance',
    'race_type',
    'purse',
    'post_time',
    'weight_carried',
    'jockey',
    'odds',
    'position_at_finish',
]
df.head()

Unnamed: 0,track_id,race_date,race_number,program_number,trakus_index,latitude,longitude,distance_id,course_type,track_condition,run_up_distance,race_type,purse,post_time,weight_carried,jockey,odds,position_at_finish
0,AQU,2019-01-01,9,6,73,40.672946,-73.827587,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
1,AQU,2019-01-01,9,6,74,40.67299,-73.827568,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
2,AQU,2019-01-01,9,6,63,40.67251,-73.827781,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
3,AQU,2019-01-01,9,6,64,40.672553,-73.827762,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
4,AQU,2019-01-01,9,6,65,40.672596,-73.827742,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8


In [16]:
# `program_number` is not purely numeric (values such as "1A" or "3X" occur) and
# the raw values carry trailing whitespace
# (NOTE(review): presumably padded to a fixed width of 3 characters — confirm).
# Normalise every value to a right-stripped string. The vectorised `.str`
# accessor replaces a per-element Python lambda, which is slow on millions of rows.

df["program_number"] = df["program_number"].astype(str).str.rstrip()
df["program_number"].unique()

array(['6', '2', '9', '3', '1', '11', '5', '4', '10', '7', '1A', '8',
       '13', '2B', '12', '14', '15', '16', '3X', '1X'], dtype=object)

`horse_id` = `track_id` + `race_date` + `race_number` + `program_number`, joined with `_` (identifies one horse within one race)

In [18]:
# Build an identifier for one horse within one race by joining the race key
# (track, date, race number) and the horse's program number with "_".
# Column-wise string concatenation is used instead of a row-wise `df.apply`:
#   * it avoids a Python-level call per row on a multi-million-row frame, and
#   * it avoids re-using double quotes inside a double-quoted f-string, which
#     is a SyntaxError on Python versions before 3.12.
df["horse_id"] = (
    df["track_id"].astype(str)
    + "_" + df["race_date"].astype(str)
    + "_" + df["race_number"].astype(str)
    + "_" + df["program_number"].astype(str)
)
df["horse_id"].head()

0    AQU_2019-01-01_9_6
1    AQU_2019-01-01_9_6
2    AQU_2019-01-01_9_6
3    AQU_2019-01-01_9_6
4    AQU_2019-01-01_9_6
Name: horse_id, dtype: object

In [17]:
# Binary target column: 1 where the horse's finishing position is 1, else 0.
df["win"] = np.where(df["position_at_finish"].eq(1), 1, 0)

`rid` = `track_id` + `race_date` + `race_number`, joined with `_` (identifies one race)

In [19]:
# Add a race identifier (rid): track, race date and race number joined with "_".
# Column-wise string concatenation is used instead of a row-wise `df.apply`:
# it avoids a Python-level call per row on a multi-million-row frame, and it
# avoids nested double quotes inside an f-string (a SyntaxError before Python 3.12).

df["rid"] = (
    df["track_id"].astype(str)
    + "_" + df["race_date"].astype(str)
    + "_" + df["race_number"].astype(str)
)


In [20]:
# Preview the generated race identifiers.
df["rid"].head()

0    AQU_2019-01-01_9
1    AQU_2019-01-01_9
2    AQU_2019-01-01_9
3    AQU_2019-01-01_9
4    AQU_2019-01-01_9
Name: rid, dtype: object

**Table reading example**

In [26]:
# Take a single race and collapse its per-frame rows to one row per horse.
# `GroupBy.first()` keeps, for each column, the first non-null value in the group.
example = (
    df.loc[df["rid"] == "AQU_2019-01-01_9"]
    .groupby("horse_id")
    .first()
    .reset_index()
)

# Show only the identifying columns and the outcome; 20 rows is an upper bound.
example.loc[:, ["horse_id", "win", "jockey", "program_number", "rid"]].head(20)

Unnamed: 0,horse_id,win,jockey,program_number,rid
0,AQU_2019-01-01_9_1,0,Harry Hernandez,1,AQU_2019-01-01_9
1,AQU_2019-01-01_9_10,0,Luis R. Reyes,10,AQU_2019-01-01_9
2,AQU_2019-01-01_9_11,0,Rajiv Maragh,11,AQU_2019-01-01_9
3,AQU_2019-01-01_9_3,1,Benjamin Hernandez,3,AQU_2019-01-01_9
4,AQU_2019-01-01_9_4,0,Joel Sone,4,AQU_2019-01-01_9
5,AQU_2019-01-01_9_6,0,Andre Shivnarine Worrie,6,AQU_2019-01-01_9
6,AQU_2019-01-01_9_7,0,Manuel Franco,7,AQU_2019-01-01_9
7,AQU_2019-01-01_9_8,0,Reylu Gutierrez,8,AQU_2019-01-01_9
8,AQU_2019-01-01_9_9,0,Joey R. Martinez,9,AQU_2019-01-01_9


In [25]:
# List the column labels, including the derived `win`, `horse_id` and `rid` columns.
df.columns

Index(['track_id', 'race_date', 'race_number', 'program_number',
       'trakus_index', 'latitude', 'longitude', 'distance_id', 'course_type',
       'track_condition', 'run_up_distance', 'race_type', 'purse', 'post_time',
       'weight_carried', 'jockey', 'odds', 'position_at_finish', 'win',
       'horse_id', 'rid'],
      dtype='object')