# HKJC Backtester

By: Shan Ali

## Load Data

In [1]:
# import libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Evaluation & Back Test Functions

#### Pseudo R2 Function

In [3]:
# load logit library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# define custom pseudo-R2 function from log-liklihood
def pseudo_r2(model, X_test):
    y_pred = pd.DataFrame(model.predict_proba(X_test))
    pred_0 = y_pred.copy()
    pred_0.iloc[:,:] = 1/14
    L_model = log_loss(y_test, y_pred)
    L_0 = log_loss(y_test, pred_0)
    r2 = 1 - (L_model/L_0)
    return r2

#### Accuracy Function

In [4]:
# define true positive acuracy for 1st place function
def p_accuracy(y_hat):
    y_hat_1 = y_hat == 1
    y_1 = y_test == 1
    y_0 = y_test != 0
    tp = round(np.mean(y_hat_1 & y_1),4)
    fp = round(np.mean(y_hat_1 & y_0),4)
    return tp, fp

#### Backtest Functions

In [166]:
# define test/train split function
def backtest_split(date, df, cat=False, op=False):
    # isolate non-catagorical features
    cols = ['won','date','race_id','place','race','horse_no','weight_adj','weight_horse_declared','draw','win_odds',
            'distance','pool','rating','age','race_count','days_since_race','recent_result','average_result',
            'recent_lengths_behind','average_lengths_behind','recent_normal_speed','average_normal_speed',
            'recent_draw','average_draw','recent_actual_weight','average_actual_weight','distance_pref',
            'surface_pref','venue_pref','recent_jockey_perf','average_jockey_perf','recent_trainer_perf',
            'average_trainer_perf','recent_jockey_skill','average_jockey_skill','going_pref','win_div',
            'place_div_1','place_div_2','place_div_3','quinella_div','quinella_place_div_1','quinella_place_div_2',
            'quinella_place_div_3','forecast_div','tierce_div','trio_div','first4_div','quartet_div']
    cats = ['race_track','sex','colour','surface']
    if cat: dummines = pd.get_dummies(df[cats], drop_first=True)
    df = df[cols]
    if cat: df = pd.merge(df, dummines, right_index=True,left_index=True)
    
    # save all win odds
    cols = ['race_id','place','pool','win_odds','win_div','place_div_1','place_div_2','place_div_3',
            'quinella_div','quinella_place_div_1','quinella_place_div_2','quinella_place_div_3',
            'forecast_div','tierce_div','trio_div','first4_div','quartet_div','horse_no']
    win_odds = df[cols].copy()
    
    # set test and train -> approx 90/10 split
    train_mask = df['date'] < date
    test_mask = df['date'] == date
    if op: 
        train_mask = df['date'] < '2020-09-06'
        test_mask = df['date'].between('2020-09-06', date, inclusive=True)
    train = df[train_mask].reset_index(drop=True)
    test = df[test_mask].reset_index(drop=True)
    
    # define impution targets, initialize, fit, & transform df
    impute = ['age','recent_lengths_behind','average_lengths_behind']
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    train.loc[:,impute] = imputer.fit_transform(train.loc[:,impute])
    test.loc[:,impute] = imputer.transform(test.loc[:,impute])
    
    # drop nans & set X test and train
    train = train.dropna().reset_index(drop=True)
    test = test.dropna().reset_index(drop=True)
    X_train = train.drop(columns=['place','won']).reset_index(drop=True)
    X_test = test.drop(columns=['place','won']).reset_index(drop=True)
    
    # set y test and train
    y = 'place'
    y_train = train[y].reset_index(drop=True)
    y_test = test[y].reset_index(drop=True)
    
    # display split
    split = pd.DataFrame()
    split['runs'] = [X_train.shape[0], X_test.shape[0]]
    split['%'] = [X_train.shape[0]/df.shape[0],X_test.shape[0]/df.shape[0]]
    split.rename(index={0:'Train',1:'Test'})

    # save win odds & divs
    cols = ['race_id','win_odds']
    other_train, other_test = X_train[cols].copy(), X_test[cols].copy()
    other_train['place'], other_test['place'] = y_train, y_test
    cols = ['date','win_div','race_id','place_div_1','place_div_2','place_div_3','quinella_div',
            'quinella_place_div_1','quinella_place_div_2','quinella_place_div_3','forecast_div',
            'tierce_div','trio_div','first4_div','quartet_div']
    X_train, X_test = X_train.drop(columns=cols), X_test.drop(columns=cols)

    # scale fit and transform sets 
    scale = ['race','horse_no','weight_adj','weight_horse_declared','draw','win_odds','distance',
             'pool','rating','age','race_count','days_since_race','recent_result','average_result',
             'recent_lengths_behind','average_lengths_behind','recent_normal_speed','average_normal_speed',
             'recent_draw','average_draw','recent_actual_weight','average_actual_weight','distance_pref',
             'surface_pref','venue_pref','recent_jockey_perf','average_jockey_perf','recent_trainer_perf',
             'average_trainer_perf','recent_jockey_skill','average_jockey_skill','going_pref']
    scaler = MinMaxScaler()
    scaler.fit(X_train[scale])
    X_train[scale] = scaler.transform(X_train[scale])
    X_test[scale] = scaler.transform(X_test[scale])

    return X_train, X_test, y_train, y_test, other_train, other_test, split, win_odds

#### Expected Earning Function

In [183]:
# load estimaton library
from sklearn.linear_model import LogisticRegression

# define function to combine target and public probabilities
def combine_prob(df, df2, wo):
    
    # merge in dropped horse-odds pairs
    races = df['race_id'].unique()
    wo = wo[wo['race_id'].isin(races)]
    df = df.merge(wo, on=['race_id','place'],how='right',suffixes=['_1',''])
    df = df.drop(columns='win_odds_1')
    
    # adj implied probability for take for test data
    df['prob'] = 1/df['win_odds'] # get implied probability
    race_total_prob = df.groupby('race_id')[['prob']].sum().rename(columns={'prob':'total_prob'})
    temp = df[['race_id','prob']].merge(race_total_prob, left_on='race_id',right_index=True)
    df['public_prob'] = temp['prob']/temp['total_prob']

    # adj implied probability for take for train data
    df2['prob'] = 1/df2['win_odds'] # get implied probability
    race_total_prob = df2.groupby('race_id')[['prob']].sum().rename(columns={'prob':'total_prob'})
    temp = df2[['race_id','prob']].merge(race_total_prob, left_on='race_id',right_index=True)
    df2['public_prob'] = temp['prob']/temp['total_prob']
    
    # define structures for combiner model for win bets
    input_cols = ['public_prob','target_prob','place']
    X, X2 = df2[input_cols].copy().dropna(), df[input_cols].drop(columns=['place']).copy().dropna()
    y, X = X['place'], X.drop(columns=['place'])
    
    # combine probabilities
    X2 = df[['public_prob','target_prob']].copy()
    X2['target_prob'] = np.where(df['target_prob'].isna(),df['public_prob'],df['target_prob'])
    logit = LogisticRegression(max_iter=10000).fit(X,y)
    df['combined_prob'] = pd.DataFrame(logit.predict_proba(X2))[0]
    return df.reset_index(drop=True)

# define function to transform race df to accomidate different bet types
def bet_trasformer(df, target='Win'):
    temp = df.copy()
    
    # select features by target
    if target in ['Win','WPlace']: 
        return temp
    
    # generate win place df
    elif target == 'Place': 
        # calculate show or place odds
        races = temp['race_id'].unique()
        temp3 = pd.DataFrame()
        for race in races: # for each race
            temp2 = temp[temp['race_id'] == race]
            cp, pp = [], []
            for i in temp2.index: # for each horse
                # define win probability
                cpwi = temp2.loc[i,'combined_prob']
                ppwi = temp2.loc[i,'public_prob']
                pd3 = temp2.loc[i,'place_div_3']
                cpsi, ppsi, cpti, ppti = 0, 0, 0, 0
                                    
                # calc second probability
                for j in temp2.index:
                    if j != i:
                        cpwj = temp2.loc[j,'combined_prob']
                        ppwj = temp2.loc[j,'public_prob']
                        cpsi += cpwj*cpwi/(1-cpwj)
                        ppsi += ppwj*ppwi/(1-ppwj)
                        
                        # calc third probability
                        if pd3 != 0:
                            for k in temp2.index:
                                if k !=j:
                                    cpwk = temp2.loc[k,'combined_prob']
                                    ppwk = temp2.loc[k,'public_prob']
                                    cpti += cpwk*(cpwj/(1-cpwk))*(cpwi/(1-cpwk-cpwj))
                                    ppti += ppwk*(ppwj/(1-ppwk))*(ppwi/(1-ppwk-ppwj))
                                    
                # sum to get show/place prob
                cp += [cpwi + cpsi + cpti]
                pp += [ppwi + ppsi + ppti]
            
            # save output to race-level df
            temp2['combined_prob'], temp2['public_prob'] = cp, pp
            temp3 = temp3.append(temp2)
        temp = temp3
        return temp.reset_index(drop=True)
      
    # generate quinella or forecast df
    elif target in ['Quinella','Forecast','QPlace']:
        # merge to generate 1,2 pairs
        temp_cols = ['place','combined_prob','public_prob','race_id']
        temp = temp.merge(temp[temp_cols], on='race_id', suffixes=['_1','_2']) # generate pairs
        temp = temp.rename(columns={'combined_prob_1':'target_1','combined_prob_2':'target_2','public_prob_1':'public_1','public_prob_2':'public_2'})
        temp = temp[temp['place_1'] != temp['place_2']] # remove self-pairs
        
        if target == 'Forecast':
            # calc 1,2 probabilities
            temp['combined_prob'] = (temp['target_1']*temp['target_2'])/(1-temp['target_1'])
            temp['public_prob'] = (temp['public_1']*temp['public_2'])/(1-temp['public_1'])
            temp['place'] = np.where((temp['place_1'] == 1)&(temp['place_2'] == 2),1,0)
            return temp.reset_index(drop=True)
        
        # remove duplicate pairs
        races = temp['race_id'].unique()
        temp3 = pd.DataFrame()
        for race in races:
            temp2 = temp[temp['race_id'] == race]
            for i in temp2.index:
                try: p1, p2 = temp2.loc[i,'place_1'], temp2.loc[i,'place_2']
                except: p1, p2 = 0, 0
                for j in temp2.index:
                    if j != i:
                        p11, p22 = temp2.loc[j,'place_1'], temp2.loc[j,'place_2']
                        if p1 == p22 and p2 == p11: temp2 = temp2.drop(index=j)
            temp3 = temp3.append(temp2)
        temp = temp3

        if target == 'Quinella':
            # calc 1,2 or 2,1 probabilities
            temp['combined_prob'] = (temp['target_1']*temp['target_2'])/(1-temp['target_1']) + (temp['target_1']*temp['target_2'])/(1-temp['target_2'])
            temp['public_prob'] = (temp['public_1']*temp['public_2'])/(1-temp['public_1']) + (temp['public_1']*temp['public_2'])/(1-temp['public_2'])
            temp['place'] = np.where((temp['place_1'].isin([1,2]))&(temp['place_2'].isin([1,2])),1,0)
        
        # !! Broken
        elif taget == 'QPlace':
            # calc 1,2 or 1,3 or 2,1 or 2,3 or 3,2 or 3,1 probabilities
            temp['combined_prob'] = (temp['target_1']*temp['target_2'])/(1-temp['target_1']) + (temp['target_1']*temp['target_2'])/(1-temp['target_2'])
            temp['public_prob'] = (temp['public_1']*temp['public_2'])/(1-temp['public_1']) + (temp['public_1']*temp['public_2'])/(1-temp['public_2'])
            
            # save place metrics
            temp['qplace'] = np.where((temp['place_1'].isin([1,2]))&(temp['place_2'].isin([1,2])),1,0)
            temp['qplace'] = np.where((temp['place_1'].isin([1,3]))&(temp['place_2'].isin([1,3])),2,temp['place'])
            temp['qplace'] = np.where((temp['place_1'].isin([2,3]))&(temp['place_2'].isin([2,3])),3,temp['qplace'])
        
        return temp.reset_index(drop=True)
      
    # !! Broken
    # generate tierce or trio df
    elif target in ['Tierce','Trio']:
        # merge to generate 1,2,3 pairs
        temp_cols = ['place','combined_prob','public_prob','race_id']
        temp2 = temp.merge(temp[temp_cols], on='race_id', suffixes=['_1','_2']) # generate pairs
        temp = temp2.merge(temp[temp_cols], on='race_id') # generate pairs
        temp = temp.rename(columns={'combined_prob_1':'target_1','combined_prob_2':'target_2',
                                    'combined_prob':'target_3','public_prob_1':'public_1',
                                    'public_prob_2':'public_2','public_prob':'public_3','place':'place_3'})
        temp = temp[temp['place_1'] != temp['place_2']] # remove 1,2 self-pairs
        temp = temp[temp['place_1'] != temp['place_3']] # remove 1,3 self-pairs
        temp = temp[temp['place_2'] != temp['place_3']] # remove 2,3 self-pairs
        
        if target == 'Tierce':
            # calc 1,2,3 in order
            temp['combined_prob'] = (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_1'])*(1-temp['target_2']-temp['target_1']))
            temp['public_prob'] = (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_1'])*(1-temp['public_2']-temp['public_1']))
            temp['place'] = np.where((temp['place_1'] == 1)&(temp['place_2'] == 2)&(temp['place_3'] == 3),1,0)
            return temp.reset_index(drop=True)
        
        # remove duplicate trios
        races = temp['race_id'].unique()
        temp3 = pd.DataFrame()
        for race in races:
            temp2 = temp[temp['race_id'] == race]
            for i in temp2.index:
                try: p1, p2, p3 = temp2.loc[i,'place_1'], temp2.loc[i,'place_2'], temp2.loc[i,'place_3']
                except: break
                for j in temp2.index:
                    if j != i:
                        p11, p22, p33 = temp2.loc[j,'place_1'], temp2.loc[j,'place_2'], temp2.loc[j,'place_3']
                        if p1 == p22 and p2 == p33 and p3 == p11: temp2 = temp2.drop(index=j)
                        elif p1 == p22 and p2 == p11 and p3 == p22: temp2 = temp2.drop(index=j)
                        elif p1 == p33 and p2 == p11 and p3 == p22: temp2 = temp2.drop(index=j)
            temp3 = temp3.append(temp2)
        temp = temp3

        # calc 1,2,3 in any order
        temp['combined_prob'] = (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_1'])*(1-temp['target_1']-temp['target_2']))
        temp['combined_prob'] += (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_1'])*(1-temp['target_1']-temp['target_3']))
        temp['combined_prob'] += (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_2'])*(1-temp['target_2']-temp['target_1']))
        temp['combined_prob'] += (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_2'])*(1-temp['target_2']-temp['target_3']))
        temp['combined_prob'] += (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_3'])*(1-temp['target_3']-temp['target_1']))
        temp['combined_prob'] += (temp['target_1']*temp['target_2']*temp['target_3'])/((1-temp['target_3'])*(1-temp['target_3']-temp['target_2']))
        temp['public_prob'] = (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_1'])*(1-temp['public_1']-temp['public_2']))
        temp['public_prob'] += (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_1'])*(1-temp['public_1']-temp['public_3']))
        temp['public_prob'] += (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_2'])*(1-temp['public_2']-temp['public_1']))
        temp['public_prob'] += (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_2'])*(1-temp['public_2']-temp['public_3']))
        temp['public_prob'] += (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_3'])*(1-temp['public_3']-temp['public_1']))
        temp['public_prob'] += (temp['public_1']*temp['public_2']*temp['public_3'])/((1-temp['public_3'])*(1-temp['public_3']-temp['public_2']))
        temp['place'] = np.where((temp['place_1'].isin([1,2,3]))&(temp['place_2'].isin([1,2,3]))&(temp['place_3'].isin([1,2,3])),1,0)
        return temp.reset_index(drop=True)
        
    '''
    elif target == 'First4':
    elif target == 'Quartet':
    '''

# define bet making function
def make_bets(df, target='Win', strat='Unit', w=1000, k=0.25, u=1):
    
    # define take
    if target in ['Forecast','Trio']: take = .195
    elif target in ['Tierce','First4','Quartet']: take = .25
    else: take = .175
        
    # calc earnings values
    df['div'] = (1 - take)/df['public_prob'] # calc'ed divs -> only used for approx. expected return
    df['er'] = df['combined_prob']*df['div'] # expected return (er)
    df['adv'] = df['er'] - 1
    
    # select betting strategy
    if strat == 'Unit': df['units'] = 1*round(u)
    elif strat == 'Kelly' and w <= 0: df['units'] = 0
    else: df['units'] = round(((df['adv']/(df['div'] - 1))*k*w)/10) # bet units from Kelly optimization
    
    # calc betting performance
    bets = df[df['er'] > 1].reset_index(drop=True)
    
    # initiate wins df
    cols = ['win_winnings','win_units','place_winnings','place_units','quinella_winnings',
            'quinella_units','qplace_winnings','qplace_units','forecast_winnings','forecast_units',
            'trio_winnings','trio_units','tierce_winnings','tierce_units']
    for col in cols: bets[col] = 0
    
    # asign winning dividend
    if target == 'Win': 
        bets['win_winnings'] = np.where(bets['place']==1,bets['win_div']*bets['units'],0)
        bets['win_units'] = bets['units']
    elif target in ['Place','WPlace']:
        bets['place_winnings'] = np.where(bets['place']==1,bets['place_div_1']*bets['units'],0)
        bets['place_winnings'] = np.where(bets['place']==2,bets['place_div_2']*bets['units'],bets['place_winnings'])
        bets['place_winnings'] = np.where(bets['place']==3,bets['place_div_3']*bets['units'],bets['place_winnings'])
        bets['place_units'] = bets['units']
    elif target == 'Quinella': 
        bets['quinella_winnings'] = np.where(bets['place']==1,bets['quinella_div']*bets['units'],0)
        bets['quinella_units'] = bets['units']
    elif target == 'QPlace':
        bets['qplace_winnings'] = np.where(bets['qplace']==1,bets['quinella_place_div_1']*bets['units'],0)
        bets['qplace_winnings'] = np.where(bets['qplace']==2,bets['quinella_place_div_2']*bets['units'],bets['qplace_winnings'])
        bets['qplace_winnings'] = np.where(bets['qplace']==3,bets['quinella_place_div_3']*bets['units'],bets['qplace_winnings'])
        bets['qplace_units'] = bets['units']
    elif target == 'Forecast':
        bets['forecast_winnings'] = np.where(bets['place']==1,bets['forecast_div']*bets['units'],0)
        bets['forecast_units'] = bets['units']
    elif target == 'Trio':
        bets['trio_winnings'] = np.where(bets['place']==1,bets['trio_div']*bets['units'],0)
        bets['trio_units'] = bets['units']
    elif target == 'Tierce':
        bets['tierce_winnings'] = np.where(bets['place']==1,bets['tierce_div']*bets['units'],0)
        bets['tierce_units'] = bets['units']
        
    # combine winnings and merge
    bets['winnings'] = sum([bets['win_winnings'], bets['place_winnings'], bets['quinella_winnings'],
                            bets['qplace_winnings'], bets['forecast_winnings'], bets['trio_winnings'],
                            bets['tierce_winnings']])
    bets['bet_type'] = target    
    return bets

# define function to combile bets into a results output
def compile_bets(bets, w=1000):
    out = pd.DataFrame(np.zeros(1))
    
    # calculate bets made
    out['Win Bets'] = round(bets[bets['bet_type'] == 'Win'].shape[0])
    out['Place Bets'] = round(bets[bets['bet_type'].isin(['Place','WPlace'])].shape[0])
    out['Quinella Bets'] = round(bets[bets['bet_type'] == 'Quinella'].shape[0])
    out['QPlace Bets'] = round(bets[bets['bet_type'] == 'QPlace'].shape[0])
    out['Forecast Bets'] = round(bets[bets['bet_type'] == 'Forecast'].shape[0])
    out['Trio Bets'] = round(bets[bets['bet_type'] == 'Trio'].shape[0])
    out['Tierce Bets'] = round(bets[bets['bet_type'] == 'Tierce'].shape[0])
    out['Bets'] = sum([out['Win Bets'], out['Place Bets'], out['Quinella Bets'], out['QPlace Bets'], 
                       out['Forecast Bets'], out['Trio Bets'], out['Tierce Bets']])
    
    # combile total costs
    out['Win Cost'] = round(bets['win_units'].sum()*10,2)
    out['Place Cost'] = round(bets['place_units'].sum()*10,2)
    out['Quinella Cost'] = round(bets['quinella_units'].sum()*10,2)
    out['QPlace Cost'] = round(bets['qplace_units'].sum()*10,2)
    out['Forecast Cost'] = round(bets['forecast_units'].sum()*10,2)
    out['Trio Cost'] = round(bets['trio_units'].sum()*10,2)
    out['Tierce Cost'] = round(bets['tierce_units'].sum()*10,2)
    out['Cost'] = sum([out['Win Cost'], out['Place Cost'], out['Quinella Cost'], out['QPlace Cost'], 
                       out['Forecast Cost'], out['Trio Cost'], out['Tierce Cost']])
    
    # combile total wins
    out['Win Wins'] = round(bets[bets['win_winnings'] > 0].shape[0])
    out['Place Wins'] = round(bets[bets['place_winnings'] > 0].shape[0])
    out['Quinella Wins'] = round(bets[bets['quinella_winnings'] > 0].shape[0])
    out['QPlace Wins'] = round(bets[bets['qplace_winnings'] > 0].shape[0])
    out['Forecast Wins'] = round(bets[bets['forecast_winnings'] > 0].shape[0])
    out['Trio Wins'] = round(bets[bets['trio_winnings'] > 0].shape[0])
    out['Tierce Wins'] = round(bets[bets['tierce_winnings'] > 0].shape[0])
    out['Wins'] = sum([out['Win Wins'], out['Place Wins'], out['Quinella Wins'], out['QPlace Wins'], 
                       out['Forecast Wins'], out['Trio Wins'], out['Tierce Wins']])
    
    # combile total winnings
    out['Win Winnings'] = round(bets['win_winnings'].sum(),2)
    out['Place Winnings'] = round(bets['place_winnings'].sum(),2)
    out['Quinella Winnings'] = round(bets['quinella_winnings'].sum(),2)
    out['QPlace Winnings'] = round(bets['qplace_winnings'].sum(),2)
    out['Forecast Winnings'] = round(bets['forecast_winnings'].sum(),2)
    out['Trio Winnings'] = round(bets['trio_winnings'].sum(),2)
    out['Tierce Winnings'] = round(bets['tierce_winnings'].sum(),2)
    out['Winnings'] = round(bets['winnings'].sum(),2)
    
    # combile total profit
    out['Win Profit'] = out['Win Winnings'] - out['Win Cost']
    out['Place Profit'] = out['Place Winnings'] - out['Place Cost']
    out['Quinella Profit'] = out['Quinella Winnings'] - out['Quinella Cost']
    out['QPlace Profit'] = out['QPlace Winnings'] - out['QPlace Cost']
    out['Forecast Profit'] = out['Forecast Winnings'] - out['Forecast Cost']
    out['Trio Profit'] = out['Trio Winnings'] - out['Trio Cost']
    out['Tierce Profit'] = out['Tierce Winnings'] - out['Tierce Cost']
    out['Profit'] = sum([out['Win Profit'], out['Place Profit'], out['Quinella Profit'], out['QPlace Profit'], 
                       out['Forecast Profit'], out['Trio Profit'], out['Tierce Profit']])
    out['Profit per Bet'] = round(out['Profit']/out['Bets'],2)
    
    # combile total returns and final wealth
    out['Win Return'] = round(((out['Win Winnings'] - out['Win Cost'])/out['Win Cost'])*100,2)
    out['Place Return'] = round(((out['Place Winnings'] - out['Place Cost'])/out['Place Cost'])*100,2)
    out['Quinella Return'] = round(((out['Quinella Winnings'] - out['Quinella Cost'])/out['Quinella Cost'])*100,2)
    out['QPlace Return'] = round(((out['QPlace Winnings'] - out['QPlace Cost'])/out['QPlace Cost'])*100,2)
    out['Forecast Return'] = round(((out['Forecast Winnings'] - out['Forecast Cost'])/out['Forecast Cost'])*100,2)
    out['Trio Return'] = round(((out['Trio Winnings'] - out['Trio Cost'])/out['Trio Cost'])*100,2)
    out['Tierce Return'] = round(((out['Tierce Winnings'] - out['Tierce Cost'])/out['Tierce Cost'])*100,2)
    out['Return'] = round(((out['Winnings'] - out['Cost'])/out['Cost'])*100,2)
    out['Wealth'] = w + out['Profit']
    out = out.iloc[:,1:].T[0]
    return out

In [7]:
# define expected earnings function
def expected_earnings(other_test, other_train, wo, strat='Unit', w=1000, k=0.25, u=1, targets=['Win']):
    
    # get combined df, tranform, make bets, & output results for each target
    bets = pd.DataFrame()
    df = combine_prob(other_test, other_train, wo)
    for target in targets:
        bet_df = bet_trasformer(df, target=target)
        bet = make_bets(bet_df, target=target, strat=strat, w=w, k=k, u=u)
        bets = bets.append(bet)
    out = compile_bets(bets, w=w)
    
    return out, bets

#### Expected Earnings & Bets Test/Exploration

In [184]:
t = 'Trio'
#df, df2 = other_test.copy(), other_train.copy()
#bets_df = combine_prob(df, df2, wo)
#df3 = bet_trasformer(bets_df, target=t)
#bet2 = make_bets(df3, target=t)

In [172]:
df3[df3['place']==1][['race_id','pool','place','combined_prob','tierce_div','div','er']]
#bet2[['date','race_id','bet_type','place','target_prob_1','public_prob','combined_prob','win_odds','win_div','div','er']]

Unnamed: 0,race_id,pool,place,combined_prob,tierce_div,div,er
0,737,750000,1,0.0001276906,1417.0,140.515406,0.017942
990,738,750000,1,5.043957e-06,2747.0,98.582738,0.000497
1980,739,1000000,1,1.366235e-06,7271.0,408.794745,0.000559
3308,740,1000000,1,1.566304e-05,3335.0,103.122745,0.001615
4620,741,1000000,1,7.822077e-05,2247.0,130.442934,0.010203
6924,742,1000000,1,3.525897e-06,9394.0,734.646865,0.00259
8988,743,1500000,1,1.827231e-05,2774.0,167.695647,0.003064
14400,746,2200000,1,6.621634e-07,6006.0,1830.847963,0.001212


In [149]:
targets = ['Win','WPlace'] #,'Place','Quinella','Forecast','Tierce']
ee, bets = expected_earnings(other_test, other_train, wo, w=1000, targets=targets)

In [186]:
bets_df

Unnamed: 0,race_id,place,target_prob,pool,win_odds,win_div,place_div_1,place_div_2,place_div_3,quinella_div,...,quinella_place_div_3,forecast_div,tierce_div,trio_div,first4_div,quartet_div,horse_no,prob,public_prob,combined_prob
0,737,1,0.418182,750000,2.6,26.5,14.0,37.5,21.0,187.5,...,119.0,320.0,1417.0,291.0,505.0,9208.0,5,0.384615,0.316644,0.412666
1,737,2,0.030303,750000,13.0,26.5,14.0,37.5,21.0,187.5,...,119.0,320.0,1417.0,291.0,505.0,9208.0,11,0.076923,0.063329,0.003719
2,737,3,0.175758,750000,7.3,26.5,14.0,37.5,21.0,187.5,...,119.0,320.0,1417.0,291.0,505.0,9208.0,10,0.136986,0.112777,0.028521
3,737,4,0.181818,750000,7.2,26.5,14.0,37.5,21.0,187.5,...,119.0,320.0,1417.0,291.0,505.0,9208.0,1,0.138889,0.114344,0.030945
4,737,5,0.018182,750000,36.0,26.5,14.0,37.5,21.0,187.5,...,119.0,320.0,1417.0,291.0,505.0,9208.0,7,0.027778,0.022869,0.002932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,744,11,,3000000,13.0,80.5,33.5,85.5,41.5,1415.0,...,621.0,2225.0,26413.0,4317.0,4043.0,153205.0,9,0.076923,0.075306,0.007038
118,745,7,,1500000,13.0,24.0,13.5,24.0,92.0,116.5,...,388.5,179.0,2698.0,1318.0,2601.0,29104.0,5,0.076923,0.068379,0.006339
119,745,14,,1500000,200.0,24.0,13.5,24.0,92.0,116.5,...,388.5,179.0,2698.0,1318.0,2601.0,29104.0,2,0.005000,0.004445,0.002332
120,746,1,,2200000,5.4,54.0,16.5,32.0,60.0,321.5,...,347.5,454.0,6006.0,2067.0,1215.0,25777.0,2,0.185185,0.150873,0.020923


#### Manual Bet Check

In [None]:
#out, bets = expected_earnings(other_train, other_test)
won = bets[bets['place']==1][['race_id','place','win_odds','win_div','div']]
ex = pd.read_csv('extracted.csv')
ex = ex[ex['race_id'].isin(won['race_id'])]
ex = ex[ex['place']==1][['race_id','date','race','horse_no']]
ex = pd.merge(ex, won, on='race_id')
ex#[['date','race','race_id']]

## Backtest

#### Initialize Back Test

In [2]:
# open extracted and pre-process for backtest
backtest_df = pd.read_csv('extracted.csv')

# remove any duplicate races
temp = backtest_df[['date','race','place']].drop_duplicates()
temp = temp[[]]
backtest_df = backtest_df.merge(temp, right_index=True, left_index=True)

# codifiy win (target) feature
backtest_df['won'] = np.where(backtest_df['place'] == 1, 1, 0)

# drop columns no longer needed
cols = ['race_idx','race_id.1','horse_id','position_finish_time','length_behind_winner',
        'pos_1','pos_2','pos_3','pos_4','pos_5','pos_6','time1','time2','time3','time4',
        'time5','time6','sec1','sec2','sec3','sec4','sec5','sec6','horse','jockey','trainer',
        'win_combination','place_combination_1','place_combination_2','place_combination_3']
cats = ['class','ratings','going','surface_type','g_race','country',
        'sire','dam','dams_sire','race_track','sex','colour','surface']
backtest_df = backtest_df.drop(columns=cols)

# define general dataset & isolate races & dates for indexing
df = backtest_df.copy()
index = df['date'].unique()
s = len(index[index <= '2020-09-06']) - 1
m = len(index)

In [3]:
backtest_df

Unnamed: 0,date,race,place,horse_no,weight_adj,weight_horse_declared,draw,win_odds,race_id,class,...,surface_pref,venue_pref,recent_jockey_perf,average_jockey_perf,recent_trainer_perf,average_trainer_perf,recent_jockey_skill,average_jockey_skill,going_pref,won
0,2019-09-01,1,1,3,131,1072,6,2.2,0,Class 5,...,8.000000,6.857143,3.0,4.411837,3.0,5.960986,4.845238,6.541275,,1
1,2019-09-01,1,2,4,129,1179,7,4.9,0,Class 5,...,10.000000,9.200000,7.0,4.594679,8.0,6.718232,7.042885,7.274772,,0
2,2019-09-01,1,3,7,121,1053,4,18.0,0,Class 5,...,5.583333,5.375000,4.0,5.592417,6.0,7.538077,4.523358,6.227424,,0
3,2019-09-01,1,4,2,132,1076,3,5.7,0,Class 5,...,6.656250,9.571429,7.0,7.762857,1.0,6.557377,8.557312,6.867467,,0
4,2019-09-01,1,5,1,133,1155,8,7.0,0,Class 5,...,6.695652,6.400000,6.0,8.539171,3.0,6.708618,5.594980,6.274691,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9788,2021-01-01,11,10,14,116,1188,9,15.0,1003,Class 2,...,3.333333,5.000000,10.0,6.861732,2.0,5.891173,4.632900,5.486506,4.333333,0
9789,2021-01-01,11,11,1,133,1148,8,29.0,1003,Class 2,...,5.384615,4.909091,5.0,5.820433,8.0,6.062500,7.363636,5.816784,5.625000,0
9790,2021-01-01,11,12,7,122,973,2,70.0,1003,Class 2,...,,,11.0,7.213732,10.0,7.274684,,,,0
9791,2021-01-01,11,13,5,127,1072,10,59.0,1003,Class 2,...,10.000000,10.000000,4.0,6.978814,10.0,8.076271,4.633441,6.978814,10.000000,0


#### Back Test Test

In [179]:
# optimize initial set
date = index[s]
#dtc = DecisionTreeClassifier(random_state=10, criterion='entropy', class_weight=None)
#dtc = BaggingClassifier(base_estimator=dtc, n_estimators=165, random_state=10)
X_train, X_test, y_train, y_test, other_train, other_test, split, wo = backtest_split(date, df, cat=True)
#dtc.fit(X_train, y_train)
other_test['target_prob'] = pd.DataFrame(dtc.predict_proba(X_test)).iloc[:,0]
other_train['target_prob'] = pd.DataFrame(dtc.predict_proba(X_train)).iloc[:,0]
#ee, bets = expected_earnings(other_test, other_train, wo, w=1000)

#### Main Back Test

In [187]:
# define general dataset & isolate races & dates for indexing
df = backtest_df.copy()
index = df['date'].unique()
s = len(index[index <= '2020-09-06']) - 1
m = len(index)

# test parameters
'''
for c in ['gini', 'entropy']:
for md in [1,5,10,15,None]:
for n in [0,10,25,50,80,100,150,500,1000]:
for w in [True,1000,5000,10000]:
'''

# define constants & structures
feature, cat, st = False, True, 'Unit'
c, md, wealth, n = 'entropy', None, 1000, 215
#out1 = pd.DataFrame(np.zeros(1))
features = pd.DataFrame(np.zeros(1))

# main backtest loop
n=25
output, bets = pd.DataFrame(), pd.DataFrame()
for i in range(s,m)[:]:

    # process data and get split
    d = index[i]
    X_train, X_test, y_train, y_test, other_train, other_test, split, wo = backtest_split(d, df, cat=cat)

    # run & fit model and get target probability
    dtc = DecisionTreeClassifier(random_state=10, criterion=c, class_weight=None, max_depth=md)
    if n > 0: dtc = BaggingClassifier(base_estimator=dtc, n_estimators=n, random_state=10)
    dtc.fit(X_train, y_train)
    other_test['target_prob'] = pd.DataFrame(dtc.predict_proba(X_test)).iloc[:,0]
    other_train['target_prob'] = pd.DataFrame(dtc.predict_proba(X_train)).iloc[:,0]

    # get expected earnings, test predictions, & prf scores
    targets = ['Win']#,'Place','Quinella','Forecast']
    ee, b = expected_earnings(other_test, other_train, wo, w=wealth, targets=targets)
    wealth = ee['Wealth']
    test_pred = dtc.predict(X_test)
    prf = precision_recall_fscore_support(y_test, test_pred, average=None, beta=1.0, zero_division=0)
    if feature: feature_importances = np.mean([tree.feature_importances_ for tree in dtc.estimators_], axis=0)
    if feature: features = features.append(pd.DataFrame(feature_importances).T)

    # calc results
    out = pd.DataFrame(np.zeros(1))
    out['Train Runs'], out['Test Runs'] = split.iloc[0,0], split.iloc[1,0]
    out['Train Acc'] = round(dtc.score(X_train, y_train),4)
    out['Test Acc'] = round(dtc.score(X_test, y_test),4)
    out['R2'] = round(r2_score(y_test, test_pred),4)
    try: out['AUC'] = round(roc_auc_score(y_test, dtc.predict_proba(X_test), multi_class='ovr'),4)
    except: out['AUC'] = np.nan
    out['TPR'], out['FPR'] = p_accuracy(test_pred)
    out['precision'], out['recall'] = prf[0][0], prf[1][0]
    out['fscore'], out['support']  = prf[2][0], prf[3][0]
    out = out.merge(pd.DataFrame(ee).T, left_index=True, right_index=True)
    cols = str(n) + ', ' + str(d)
    output[cols] = out.iloc[:,1:].T[0]
    b['date'] = d
    bets = bets.append(b)

    print(i,m,cols)

# save results
col = 'N: ' + str(n)    
out2 = output.T.apply(sum).round(3)
out2['Col'], out2['n'] = col, n
avg_cols = ['Train Runs','Test Runs','Train Acc','Test Acc','R2','AUC','TPR',
            'FPR','precision','recall','fscore','support']
out2[avg_cols] = output.T.apply(np.mean)[avg_cols].round(3)
out2['Wealth'] = output.T.loc[cols,'Wealth']
out2['Return'] = round(((out2['Winnings'] - out2['Cost'])/out2['Cost'])*100,2)
out2['Profit per Bet'] = round(out2['Profit']/out2['Bets'],2)
out2['Wealth'] = output.T.loc[:,'Wealth'][-1]
out1 = out1.append(out2, ignore_index=True)
print(col,'\n')

end = True

75 103 25, 2020-09-06
76 103 25, 2020-09-09
77 103 25, 2020-09-13
78 103 25, 2020-09-16
79 103 25, 2020-09-20
80 103 25, 2020-09-23
81 103 25, 2020-09-27
82 103 25, 2020-10-01
83 103 25, 2020-10-07
84 103 25, 2020-10-11
85 103 25, 2020-10-14
86 103 25, 2020-10-21
87 103 25, 2020-10-24
88 103 25, 2020-10-28
89 103 25, 2020-11-08
90 103 25, 2020-11-11
91 103 25, 2020-11-14
92 103 25, 2020-11-18
93 103 25, 2020-11-25
94 103 25, 2020-12-02
95 103 25, 2020-12-06
96 103 25, 2020-12-09
97 103 25, 2020-12-13
98 103 25, 2020-12-16
99 103 25, 2020-12-20
100 103 25, 2020-12-23
101 103 25, 2020-12-26
102 103 25, 2021-01-01
N: 25 



#### Bet Output

In [189]:
bets[['date','race_id','bet_type','place','target_prob','public_prob','combined_prob','win_odds','win_div','div','er']]

Unnamed: 0,date,race_id,bet_type,place,target_prob,public_prob,combined_prob,win_odds,win_div,div,er
0,2020-09-06,737,Win,1,0.52,0.316644,0.765167,2.6,26.5,2.605448,1.993603
1,2020-09-06,743,Win,13,0.32,0.141676,0.198656,5.9,30.0,5.823140,1.156800
2,2020-09-06,745,Win,1,0.64,0.370387,0.937088,2.4,24.0,2.227397,2.087266
0,2020-09-09,752,Win,11,0.12,0.003090,0.013235,267.0,37.5,266.986062,3.533473
0,2020-09-13,762,Win,1,0.68,0.598100,0.934008,1.4,14.0,1.379369,1.288341
...,...,...,...,...,...,...,...,...,...,...,...
0,2021-01-01,994,Win,1,0.44,0.288233,0.555316,3.0,30.0,2.862266,1.589464
1,2021-01-01,994,Win,3,0.28,0.078609,0.122795,11.0,30.0,10.494976,1.288735
2,2021-01-01,995,Win,5,0.68,0.508463,0.955912,1.6,75.0,1.622538,1.551004
3,2021-01-01,1000,Win,2,0.40,0.264176,0.421630,3.1,87.0,3.122919,1.316716


#### Single Output

In [72]:
# explore single backtest output
o = output.T.copy()
o = output.T.apply(sum).round(3)
avg_cols = ['Train Runs','Test Runs','Train Acc','Test Acc','R2','AUC','TPR',
            'FPR','precision','recall','fscore','support']
o[avg_cols] = output.T.apply(np.mean)[avg_cols].round(3)
o['Return'] = round(((o['Winnings'] - o['Cost'])/o['Cost'])*100,2)
o['Profit per Bet'] = round(o['Profit']/o['Bets'],2)
o['Wealth'] = output.T.loc[:,'Wealth'][-1]
o = pd.DataFrame(o).dropna()
o

Unnamed: 0,0
Train Runs,6555.036
Test Runs,79.964
Train Acc,1.0
Test Acc,0.124
R2,-0.201
AUC,0.64
TPR,0.033
FPR,0.155
precision,0.194
recall,0.37


#### Compliled Output

In [190]:
cols = ['recall','R2','Test Acc','AUC','Bets','Wins','Cost','Winnings','Profit','Return','Wealth']
o = out3.copy()
o = o.set_index('Col',drop=True).iloc[1:,1:].round(3)
o = o.sort_values(by='n', ascending=True)[cols]
o#[2:]['Profit'].plot()

Unnamed: 0_level_0,recall,R2,Test Acc,AUC,Bets,Wins,Cost,Winnings,Profit,Return,Wealth
Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
N: 0,0.192,-0.481,0.106,0.513,197.0,37.0,1970.0,1373.5,-596.5,-30.28,403.5
N: 10,0.329,-0.354,0.112,0.576,170.0,31.0,1700.0,1427.5,-272.5,-16.03,131.0
N: 25,0.345,-0.257,0.118,0.607,67.0,19.0,670.0,575.5,-94.5,-14.1,36.5
N: 50,0.387,-0.223,0.127,0.617,36.0,13.0,360.0,286.5,-73.5,-20.42,-37.0
N: 80,0.369,-0.229,0.126,0.622,29.0,12.0,290.0,243.0,-47.0,-16.21,-84.0
N: 100,0.418,-0.195,0.136,0.632,24.0,11.0,240.0,225.5,-14.5,-6.04,-98.5
N: 150,0.39,-0.173,0.125,0.633,20.0,9.0,200.0,182.5,-17.5,-8.75,-116.0
N: 155,0.376,-0.185,0.126,0.633,20.0,9.0,200.0,182.5,-17.5,-8.75,982.5
N: 160,0.384,-0.175,0.125,0.632,20.0,9.0,200.0,182.5,-17.5,-8.75,982.5
N: 165,0.377,-0.186,0.126,0.633,19.0,10.0,190.0,200.0,10.0,5.26,992.5


In [191]:
# explore compiled backtest outputs
'''
cols = ['recall','R2','Test Acc','AUC','Bets','Win Wins','Place Wins','Wins','Win Cost',
        'Place Cost','Cost','Win Profit','Place Profit','Profit','Win Winnings','Place Winnings',
        'Winnings','Win Profit','Place Profit','Profit','Win Return','Place Return','Return','Wealth']
'''
cols = ['recall','R2','Test Acc','AUC','Bets','Wins','Cost','Winnings','Profit','Return','Wealth']
o = out1.copy()
o = o.set_index('Col',drop=True).iloc[1:,1:].round(3)
o.sort_values(by='n', ascending=False)[cols]
#o['Profit'].plot()

# explore metric-PnL correlation
#metrics = [Profit','recall','precision','fscore','R2','Train Acc','Test Acc','AUC','TPR','FPR']
#o[metrics].corr().apply(lambda x: round(x,4))['Profit'].sort_values(ascending=False)

Unnamed: 0_level_0,recall,R2,Test Acc,AUC,Bets,Wins,Cost,Winnings,Profit,Return,Wealth
Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
N: 215,0.394,-0.204,0.127,0.637,17.0,10.0,170.0,205.5,35.5,20.88,1035.5
N: 25,0.345,-0.257,0.118,0.607,67.0,19.0,670.0,575.5,-94.5,-14.1,905.5
N: 10,0.329,-0.354,0.112,0.576,170.0,31.0,1700.0,1427.5,-272.5,-16.03,727.5


#### Feature Importance

In [334]:
# plot feature importance
f = pd.DataFrame(features.iloc[1:,:].apply(np.mean)).rename(columns={0:'importance'})
f['feature'] = list(X_test.columns)
f = f.set_index('feature',drop=True).sort_values('importance')
f.plot(kind='barh', figsize=(12,8))

Unnamed: 0_level_0,recall,R2,Test Acc,AUC,Bets Made,Cost,Wins,Winnings,Profit,Return
Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
N: 500,0.378,-0.186,0.13,0.637,18.0,180.0,10.0,217.5,37.5,20.83
N: 475,0.383,-0.188,0.128,0.636,18.0,180.0,10.0,217.5,37.5,20.83
N: 450,0.383,-0.178,0.131,0.637,19.0,190.0,10.0,200.5,10.5,5.53
N: 425,0.383,-0.175,0.129,0.636,20.0,200.0,11.0,216.5,16.5,8.25
N: 400,0.386,-0.178,0.13,0.635,20.0,200.0,11.0,216.0,16.0,8.0
N: 375,0.379,-0.179,0.13,0.636,21.0,210.0,11.0,216.0,6.0,2.86
N: 350,0.383,-0.178,0.13,0.636,21.0,210.0,12.0,257.5,47.5,22.62
N: 325,0.386,-0.181,0.13,0.637,21.0,210.0,11.0,213.5,3.5,1.67
N: 300,0.381,-0.189,0.134,0.637,18.0,180.0,8.0,175.5,-4.5,-2.5
N: 275,0.381,-0.172,0.134,0.636,22.0,220.0,10.0,221.0,1.0,0.45


#### Statistical Tests