In [132]:
import pandas as pd
import numpy as np
import pylogit
from scipy.special import logit
from datetime import timedelta
from sklearn.metrics import brier_score_loss
pd.options.display.max_columns = 100

In [3]:
df = pd.read_csv('xc.csv')
df = df.loc[df.race_type == 'flat_race']

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,race_id,stadium_id,distance_m,going,race_type,race_grade,dog_id,place,time,decimal_price,comment,box,kg,winner,dt,origTime,dnf
0,1,4418717,13025,515,-0.1,flat_race,A6,2176330,4,31.41,11.0,"BadlyBlkVW1,Crd3",5,27.75,0,2019-01-01,31.41,0
1,2,4418717,13025,515,-0.1,flat_race,A6,2207348,3,31.33,7.0,"CrdRnUp&1,Led2To 3/4",1,31.25,0,2019-01-01,31.33,0
2,3,4418717,13025,515,-0.1,flat_race,A6,2216661,5,31.49,7.0,"BadlyBlkWide1,Blk3",6,28.5,0,2019-01-01,31.49,0
3,4,4418717,13025,515,-0.1,flat_race,A6,2338001,2,31.31,3.5,"SAw,BBlk 1/4,Crowded3",4,35.5,0,2019-01-01,31.31,0
4,5,4418717,13025,515,-0.1,flat_race,A6,2342199,6,31.7,4.0,"EP,CrdRnUp&1& 1/4&3&4",3,34.0,0,2019-01-01,31.7,0


In [7]:
dups = df.groupby('race_id').box.agg(lambda x : len(x)-x.nunique())
dups = dups.loc[dups>0]

In [8]:
df = df.loc[~df.race_id.isin(dups.index)].copy()

In [9]:
win_choice = df.loc[df.winner == 1].sample(frac = 1.0).groupby('race_id').head(1).loc[:,['race_id','box']].copy()
win_choice['twinner'] = 1

In [10]:
dfm = df.merge(win_choice, on = ['race_id','box'], how = 'left')
dfm.twinner = dfm.twinner.fillna(0)
dfm = dfm.sort_values(['race_id','box'])

In [34]:
dfm = dfm.sort_values('dt')

In [135]:
startupTIME = 1.75
distDEFAULT = 400
distEXPONENT = 0.11
dfm['mps'] = dfm.time - startupTIME
dfm['mmps'] = dfm.distance_m/dfm.mps * (dfm.distance_m**distEXPONENT)/(distDEFAULT**distEXPONENT)

In [136]:
dfm['avgmps'] = dfm.groupby('dog_id').mmps.apply(lambda x: x.shift().expanding().mean())

In [137]:
# filtering out races with less than 4 dogs
flattrack = dfm.groupby('race_id').dog_id.agg(lambda x: x.count())
flattrack = flattrack.loc[flattrack >= 4]
dfmf = dfm.loc[dfm.race_id.isin(flattrack.index)].copy()

In [138]:
# add column to count number of prior races for each dog
dfmf['prior_races'] = dfmf.groupby('dog_id').cumcount()

In [139]:
# filter out races where all dogs did not have at least 3 prior races
priors = dfmf[dfmf['prior_races'] >= 3].groupby('race_id').prior_races.count()/dfmf.groupby('race_id').prior_races.count()
priors = priors.loc[priors == 1]
dfmfp = dfmf.loc[dfmf.race_id.isin(priors.index)].copy()

In [140]:
# create column with date of last race run by that dog
dfmfp['last_race'] = dfmfp.groupby('dog_id').dt.shift()

In [141]:
# convert dates to datetime
dfmfp['dt'] = pd.to_datetime(dfmfp['dt'])
dfmfp['last_race'] = pd.to_datetime(dfmfp['last_race'])

In [142]:
# filter out races where all dogs have not run a race in the past 90 days
ninety = dfmfp.loc[dfmfp.dt > dfmfp.last_race + timedelta(90)].race_id.unique()
df_final = dfmfp.loc[~dfmfp.race_id.isin(ninety)].copy()

In [143]:
train = df_final[df_final.dt.between('2019-07-01','2020-01-31')].sort_values(['race_id','box'])
test = df_final[df_final.dt > '2020-01-31'].sort_values(['race_id','box'])

In [126]:
def mlogit(formula, df, obs_id, alt_id) :
    """
    df must be sorted by obs_id to use this function
    """
    from patsy import dmatrices
    from collections import OrderedDict
    import pylogit as pl
    import numpy as np
    data = dmatrices(formula+' -1', df, return_type = "dataframe")
    all_alts = sorted(df.loc[:,alt_id].unique())
    
    spec = OrderedDict()
    names = OrderedDict()
    spec['intercept'] = all_alts[1:]
    names['intercept'] = [f'(Intercept):{alt}' for alt in all_alts[1:]]
    c_names = data[1].design_info.column_names
    choice = data[0].design_info.column_names[0]
    data[1][choice] = data[0]
    data[1][[obs_id,alt_id]] = df[[obs_id,alt_id]]    
    for c in c_names :
        spec[c] = [all_alts]
        names[c] = [c]
    model = pl.create_choice_model(data = data[1], alt_id_col = alt_id, obs_id_col = obs_id, 
                                   choice_col = choice, specification = spec, names = names, model_type = 'MNL')
    model.fit_mle(np.zeros(model.design.shape[1]), method = 'Powell')
    return model

In [144]:
model = mlogit('twinner ~ avgmps', train, 'race_id', 'box')

Log-likelihood at zero: -36,672.1279
Initial Log-likelihood: -36,672.1279


  warn('Method %s does not use gradient information (jac).' % method,
  warn('Method %s does not use Hessian information (hess).' % method,
  results = minimize(estimator.calc_neg_log_likelihood_and_neg_gradient,


Estimation Time for Point Estimation: 6.20 seconds.
Final log-likelihood: -36,293.5730


In [145]:
model.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,twinner,No. Observations:,20721.0
Model:,Multinomial Logit Model,Df Residuals:,20715.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 27 Apr 2021",Pseudo R-squ.:,0.01
Time:,23:08:51,Pseudo R-bar-squ.:,0.01
AIC:,72599.146,Log-Likelihood:,-36293.573
BIC:,72646.779,LL-Null:,-36672.128

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
(Intercept):2,-0.0456,0.024,-1.892,0.059,-0.093,0.002
(Intercept):3,0.0168,0.024,0.706,0.480,-0.030,0.063
(Intercept):4,-0.0141,0.024,-0.586,0.558,-0.061,0.033
(Intercept):5,-0.1060,0.025,-4.316,0.000,-0.154,-0.058
(Intercept):6,0.0245,0.024,1.024,0.306,-0.022,0.071
avgmps,1.2394,0.047,26.457,0.000,1.148,1.331


In [146]:
ytrue = test.twinner
ypreds = model.predict(test)

In [148]:
print('Brier score loss on games after 01/31/2020:', brier_score_loss(ytrue, ypreds))

Brier score loss on games after 01/31/2020: 0.14005503205974762
