In [16]:
# Assert minimum versions
import re
import sys
assert sys.version_info >= (3, 5), "Python 3.5 or newer is required"
import sklearn
# Compare (major, minor) as integers; comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_major_minor = tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert _sklearn_major_minor >= (0, 20), "scikit-learn 0.20 or newer is required"
 
# Import packages and modules that will be used 
import numpy as np 
import pandas as pd
from sklearn import linear_model
from sklearn import metrics 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import accuracy_score 

# Import and configure matplotlib 
%matplotlib inline  
import matplotlib as mpl  
import matplotlib.pyplot as plt 
mpl.rc('figure', dpi=120) # set good resolution

# Set a seed for reproducability
import random
random.seed(42)
# numpy needs a random seed, too
np.random.seed(42)

import datetime
import os
from datetime import date

## Load datasets

### Races

The purpose of this notebook is to add historical speed data (each horse's speed in its previous races) for every horse in the scope of the study.

In [17]:
# Locate the data directory via the HORSE_DATA_DIR environment variable so the
# notebook can run on other machines; the default is the original location.
DATA_DIR = os.environ.get('HORSE_DATA_DIR', '/Users/phillipmonk/research_paper/horse_code/data')

# Load the race results and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was saved together with its index; consider index_col=0 once it is confirmed
# that no later cell relies on that column.
df_races = pd.read_csv(os.path.join(DATA_DIR, 'race_data.csv'))
df_races.head()

Unnamed: 0,race_course,race_no,datetime,distance,class,track_cond,track_rail,race_time,prize_money,position,...,jockey,weight,prize,800m,400m,margin,horse_adjusted_t1_speed,sp,s_tab_win,s_tab_place
0,Flemington,2,2022-01-01 11:05:00,1400,BM 70,4,Out 6m Entire Circuit,83.79,130000,1,...,J.Kah,61.5,71500,4.0,3.0,0.0,62.038672,0.0,4.6,1.9
1,Flemington,2,2022-01-01 11:05:00,1400,BM 70,4,Out 6m Entire Circuit,83.79,130000,2,...,J.Richards (a2),60.5,23400,3.0,4.0,0.1,62.027217,0.0,0.0,1.9
2,Flemington,2,2022-01-01 11:05:00,1400,BM 70,4,Out 6m Entire Circuit,83.79,130000,3,...,N.Heywood,62.0,11700,1.0,1.0,0.6,61.969947,26.0,0.0,4.2
3,Flemington,2,2022-01-01 11:05:00,1400,BM 70,4,Out 6m Entire Circuit,83.79,130000,4,...,M.J.Cartwright (a2),59.5,6500,2.0,2.0,0.7,61.958494,10.0,0.0,0.0
4,Flemington,2,2022-01-01 11:05:00,1400,BM 70,4,Out 6m Entire Circuit,83.79,130000,5,...,D.Oliver,61.0,3900,8.0,8.0,1.95,61.815337,0.0,0.0,0.0


In [19]:
# Number of previous runs to expose as lagged speed features.
N_LAGS = 3

# Order each horse's runs by datetime (ties broken by course and race number)
# so that shifting within a horse steps back through its earlier runs.
# NOTE(review): "datetime" is sorted exactly as loaded (read_csv is called
# without parse_dates), so this assumes an ISO-style "YYYY-MM-DD HH:MM:SS"
# format where text order equals chronological order. TODO confirm.
df_races_sorted = df_races.sort_values(by=["horse_name", "datetime", "race_course", "race_no"])

# Build the per-horse grouping once and reuse it for every lag.
speed_by_horse = df_races_sorted.groupby("horse_name")["horse_adjusted_t1_speed"]
for lag in range(1, N_LAGS + 1):
    # Each row receives the speed from the same horse's run `lag` rows earlier;
    # the value is NaN when the horse has fewer than `lag` earlier rows.
    df_races_sorted["horse_adjusted_t-{}_speed".format(lag)] = speed_by_horse.shift(lag)

df_races_sorted

Unnamed: 0,race_course,race_no,datetime,distance,class,track_cond,track_rail,race_time,prize_money,position,...,800m,400m,margin,horse_adjusted_t1_speed,sp,s_tab_win,s_tab_place,horse_adjusted_t-1_speed,horse_adjusted_t-2_speed,horse_adjusted_t-3_speed
404,Flemington,6,2022-01-21 17:30:00,1000,BM 70,3,Out 12m Entire Circuit,57.74,50000,10,...,2.0,2.0,5.55,61.454456,6.0,0.0,0.0,,,
6902,Royal Randwick,4,2022-12-24 14:05:00,1100,BM 70,4,+9m Entire,63.57,120000,4,...,8.0,7.0,1.32,62.870796,13.0,0.0,0.0,,,
5145,Royal Randwick,1,2022-10-15 12:30:00,1400,Listed,7,True,83.94,160000,1,...,10.0,9.0,0.00,62.852105,9.0,9.2,3.0,,,
5358,Royal Randwick,7,2022-10-22 16:10:00,1600,Group 2,6,+4m Entire Course,98.89,1000000,3,...,7.0,8.0,2.58,61.044756,5.5,0.0,1.8,62.852105,,
7894,Royal Randwick,7,2023-02-11 16:00:00,1200,Group 2,4,True,70.20,250000,10,...,12.0,12.0,2.80,62.342791,51.0,0.0,0.0,61.044756,62.852105,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4784,Caulfield,10,2022-10-08 17:45:00,1200,Group 3,5,True Entire Circuit,70.05,200000,8,...,12.0,12.0,3.05,62.753113,15.0,0.0,0.0,61.517441,61.704272,61.557003
4566,Flemington,1,2022-10-01 12:40:00,1000,Listed,4,Out 9m Entire Circuit,57.90,175000,1,...,1.0,1.0,0.00,62.487047,8.0,8.3,2.1,,,
5644,Flemington,1,2022-11-01 10:45:00,1000,Group 3,5,Out 2m Entire Circuit,58.52,200000,5,...,2.0,3.0,3.00,61.650918,4.0,0.0,0.0,62.487047,,
8567,Rosehill Gardens,2,2023-03-11 12:55:00,1200,Group 3,4,True,70.56,200000,7,...,2.0,2.0,4.78,61.758726,5.0,0.0,0.0,61.650918,62.487047,
