In [39]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt

In [72]:
data = pd.read_csv('raw_data.csv')

  data = pd.read_csv('raw_data.csv')


In [73]:
data['surface_name'].unique()

array(['Turf', 'Dirt', 'Synthetic', 'Downhill Turf', 'Steeplechase'],
      dtype=object)

In [74]:
data.head()

Unnamed: 0,date,time,race_distance,race_type,race_class,surface_name,default_condition,horse_number_1,runner_odds_1,morning_odds_1,...,horse_last_class_2,horse_num_races_2,early_2,middle_2,finish_2,jockey_trainer_starts_2,jockey_trainer_1st_2,jockey_trainer_2nd_2,jockey_trainer_3rd_2,result
0,2022-03-15,08:07 PM,7f,Thoroughbred,Maiden Special,Turf,Firm,1,9/2,9,...,-,0.0,0.0,0.0,0.0,0,0,0,0,1
1,2022-03-15,08:07 PM,7f,Thoroughbred,Maiden Special,Turf,Firm,1,9/2,9,...,-,0.0,0.0,0.0,0.0,0,0,0,0,1
2,2022-03-15,08:07 PM,7f,Thoroughbred,Maiden Special,Turf,Firm,1,9/2,9,...,-,0.0,0.0,0.0,0.0,0,0,0,0,1
3,2022-03-15,08:07 PM,7f,Thoroughbred,Maiden Special,Turf,Firm,1,9/2,9,...,-,0.0,0.0,0.0,0.0,0,0,0,0,1
4,2022-03-15,08:07 PM,7f,Thoroughbred,Maiden Special,Turf,Firm,2,14,11,...,-,0.0,0.0,0.0,0.0,0,0,0,0,1


## Preprocessing plan

* date - convert into day_of_week, year_sine, year_cosine
* time - convert into time_sine, time_cosine
* race_distance - convert into consistent units
* race_type - one-hot encode
* race_class - one-hot encode
* surface_name - one-hot encode
* (?) horse_number - maybe normalize between 0 and 1 since larger numbers have to run slightly farther.
* runner_odds - drop
* morning_odds - drop
* horse_name -
* horse_age - 
* horse_gender - one-hot encode
* horse_siredam - 
* trainer - 
* horse_med - drop
* horse_weight
* jockey
* horse_power_rating
* horse_wins/starts
* horse_days_off
* horse_avg_speed
* horse_avg_distance
* horse_high_speed
* horse_avg_class
* horse_last_class
* horse_num_races
* early
* middle
* finish
* jockey_trainer_starts
* jockey_trainer_1st
* jockey_trainer_2nd
* jockey_trainer_3rd

In [75]:
from sklearn.model_selection import train_test_split

X = data.iloc[:,:-1]
y = data['result']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=40)

In [76]:
Xs = { 'X_train': X_train, 'X_val': X_val }

for name, Xi in Xs.items():
    # date, time - convert to cycles
    date_time = (Xi['date'] + ' ' + Xi['time']).apply(lambda x: dt.datetime.strptime(
        x, '%Y-%m-%d %I:%M %p'))
    def date_to_nth_day(date):
        new_year_day = pd.Timestamp(year=date.year, month=1, day=1)
        return (date - new_year_day).days + 1
    def nth_day_to_cycle(n):
        radians = n*(2*np.pi)/(365.25)
        return np.cos(radians), np.sin(radians)
    Xi['date_cos'] = date_time.apply(lambda x: nth_day_to_cycle(date_to_nth_day(x))[0])
    Xi['date_sin'] = date_time.apply(lambda x: nth_day_to_cycle(date_to_nth_day(x))[1])
    def mins_to_cycle(mins):
        radians = mins*(2*np.pi)/(60*24)
        return np.cos(radians), np.sin(radians)
    Xi['time_cos'] = date_time.apply(lambda x: mins_to_cycle(x.hour*60 + x.minute)[0])
    Xi['time_sin'] = date_time.apply(lambda x: mins_to_cycle(x.hour*60 + x.minute)[1])
    Xi.drop(['date','time'], axis=1, inplace=True)

    # race_distance - standardize units to meters
    def conv_dist(dist):
        dist_to_meters = {'f': 201.168,
                          'mtr': 1,     # mtr (meter) comes before m (mile) in search
                          'm': 1609.34,
                          'y': 0.9144}
        for k,v in dist_to_meters.items():
            if k in dist:
                return float(dist[:-len(k)]) * v
    Xi['race_distance_meters'] = Xi['race_distance'].apply(lambda x: conv_dist(x))
    Xi.drop(['race_distance'], axis=1, inplace=True)

    # race_type - one-hot encode
    def fixed_one_hot(df, column, categories):
        for c in categories:
            df[c] = (df[column] == c)*1
    fixed_one_hot(Xi, 'race_type', ['Thoroughbred', 'Harness', 'Mixed', 'QuarterHorse', 'Arabian'])
    Xi.drop(['race_type'], axis=1, inplace=True)

    # race_class (TODO)
    Xi.drop(['race_class'], axis=1, inplace=True)

    # surface_name - one-hot encode
    fixed_one_hot(Xi, 'surface_name', ['Turf', 'Dirt', 'Synthetic', 'Downhill Turf', 'Steeplechase'])
    Xi.drop(['surface_name'], axis=1, inplace=True)

    # default_condition (TODO - see if there's a relationship e.g. good > good to soft > soft)
    Xi.drop(['default_condition'], axis=1, inplace=True)

    for i in ['1','2']:
        # horse number (TODO - normalize somehow. also has ints and strings.)
        Xi.drop(['horse_number_'+i], axis=1, inplace=True)

        # runner odds - drop
        Xi.drop(['runner_odds_'+i], axis=1, inplace=True)

        # morning odds - drop
        Xi.drop(['morning_odds_'+i], axis=1, inplace=True)

        # horse name (TODO)
        Xi.drop(['horse_name_'+i], axis=1, inplace=True)

        # horse age - as is

        # horse gender (TODO - a bunch of different "genders")
        Xi.drop(['horse_gender_'+i], axis=1, inplace=True)

        # horse siredam
        Xi.drop(['horse_siredam_'+i], axis=1, inplace=True)

        # horse med - drop
        Xi.drop(['horse_med_'+i], axis=1, inplace=True)

        # horse trainer (TODO)
        Xi.drop(['trainer_'+i], axis=1, inplace=True)

        # horse weight - convert to int and fill na with mean.
        def str_to_int(x):
            return int(x) if x.isdigit() else np.nan
        weight_ints = Xi['horse_weight_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['jockey_weight_'+i] = weight_ints.fillna(weight_ints.mean())
        Xi.drop(['horse_weight_'+i], axis=1, inplace=True)

        # horse jockey (TODO)
        Xi.drop(['jockey_'+i], axis=1, inplace=True)

        # horse power rating - convert to int and fill na with mean.
        power_ints = Xi['horse_power_rating_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_power_rating_'+i] = power_ints.fillna(power_ints.mean())

        # horse wins/starts (TODO - handle confidence with increasing number of starts)
        Xi.drop(['horse_wins/starts_'+i], axis=1, inplace=True)

        # horse days off - convert to int and fill na with mean.
        days_off_ints = Xi['horse_days_off_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_days_off_'+i] = days_off_ints.fillna(days_off_ints.mean())

        # avg speed - convert to int and fill na with mean.
        avg_speed_ints = Xi['horse_avg_speed_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_avg_speed_'+i] = avg_speed_ints.fillna(avg_speed_ints.mean())

        # avg distance - convert to int and fill na with mean.
        avg_distance_ints = Xi['horse_avg_distance_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_avg_distance_'+i] = avg_distance_ints.fillna(avg_distance_ints.mean())

        # high speed - convert to int and fill na with mean.
        high_speed_ints = Xi['horse_high_speed_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_high_speed_'+i] = high_speed_ints.fillna(high_speed_ints.mean())

        # avg class - convert to int and fill na with mean.
        avg_class_ints = Xi['horse_avg_class_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_avg_class_'+i] = avg_class_ints.fillna(avg_class_ints.mean())

        # last class - convert to int and fill na with mean.
        last_class_ints = Xi['horse_last_class_'+i].apply(lambda x: str_to_int(x.replace(' ','')))
        Xi['horse_last_class_'+i] = last_class_ints.fillna(last_class_ints.mean())

        # num races - as is

        # early - as is

        # middle - as is

        # finish - as is

        # starts - as is

        # 1st - as is

        # 2nd - as is

        # 3rd - as is
        
    Xs[name] = Xi

In [77]:
Xs['X_train'].head()

Unnamed: 0,horse_age_1,horse_power_rating_1,horse_days_off_1,horse_avg_speed_1,horse_avg_distance_1,horse_high_speed_1,horse_avg_class_1,horse_last_class_1,horse_num_races_1,early_1,...,Mixed,QuarterHorse,Arabian,Turf,Dirt,Synthetic,Downhill Turf,Steeplechase,jockey_weight_1,jockey_weight_2
79638,9.0,56.096866,17.0,52.0,44.0,55.0,66.0,68.0,10.0,6.0,...,0,0,0,0,1,0,0,0,122.0,122.0
16014,5.0,56.096866,16.0,66.0,65.0,80.0,82.0,80.0,4.0,3.8,...,0,0,0,0,1,0,0,0,124.0,124.0
14166,5.0,56.096866,13.0,86.0,86.0,96.0,95.0,97.0,10.0,3.9,...,0,0,0,0,1,0,0,0,124.0,118.0
54731,5.0,56.096866,23.0,68.0,67.0,82.0,87.0,86.0,8.0,6.5,...,0,0,0,0,1,0,0,0,122.0,122.0
83742,4.0,56.096866,16.0,68.166203,63.468122,75.957577,77.630104,78.777988,0.0,0.0,...,0,0,0,1,0,0,0,0,130.0,125.0


In [56]:
Xs['X_train'].isna().sum()

horse_age_1                0
horse_power_rating_1       0
horse_days_off_1           0
horse_avg_speed_1          0
horse_avg_distance_1       0
horse_high_speed_1         0
horse_avg_class_1          0
horse_last_class_1         0
horse_num_races_1          0
early_1                    0
middle_1                   0
finish_1                   0
jockey_trainer_starts_1    0
jockey_trainer_1st_1       0
jockey_trainer_2nd_1       0
jockey_trainer_3rd_1       0
horse_age_2                0
horse_power_rating_2       0
horse_days_off_2           0
horse_avg_speed_2          0
horse_avg_distance_2       0
horse_high_speed_2         0
horse_avg_class_2          0
horse_last_class_2         0
horse_num_races_2          0
early_2                    0
middle_2                   0
finish_2                   0
jockey_trainer_starts_2    0
jockey_trainer_1st_2       0
jockey_trainer_2nd_2       0
jockey_trainer_3rd_2       0
date_cos                   0
date_sin                   0
time_cos      

## Models

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
rf = RandomForestClassifier()
rf.fit(Xs['X_train'], y_train)

RandomForestClassifier()

In [58]:
preds = rf.predict(Xs['X_val'])

In [59]:
from sklearn.metrics import confusion_matrix

confusion_matrix(preds, y_val)

array([[6567, 2483],
       [2533, 6661]])