In [74]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import ensemble
from sklearn.preprocessing import QuantileTransformer, quantile_transform
from sklearn.neural_network import MLPRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge

def split_random(df, N_samples):
    
    """this creates test, dev splits for consistent comparison 
    across models
    
    it returns a dictionary of test and train data sets by fold"""
    
    N_dev = int(df.shape[0] * .1)
    
    N_folds = int(N_samples/N_dev)

    
    folds = {}
    
    for fold in range(N_folds):
    
        df_train = df.sample(frac=.9, random_state = fold).copy()
        df_dev = df.drop(df_train.index).copy()
        
        folds[fold] = {'train': df_train, 'test': df_dev}
    
    return folds

def train_probs_lr(df_train, points_model, points_features, prob_features):
    
    """logistic regression model to predict probability of team winning"""
    
    y_pred = points_model.predict(df_train[points_features])
    
    df_train['model_home_p'] = y_pred[:,1]
    df_train['model_away_p'] = y_pred[:,0]
    
    df_train['model_total_p'] = df_train['model_home_p']+df_train['model_away_p']
    df_train['diff_p'] = df_train['model_away_p']-df_train['model_home_p']
    
    df_train['away_win'] = np.where(df_train['AWAY_points'] > df_train['HOME_points'],1,0)
    
    lr = linear_model.LogisticRegression(C=100.,solver='liblinear', intercept_scaling =100)
    
    lr.fit(df_train[prob_features], df_train['away_win'])
    
    df_train['away_win_p'] = lr.predict_proba(df_train[prob_features])[:,1]
    
    return df_train, lr



def run_drake_v1(df_train, df_test):
    
    """single function that trains points model and win probability model on
    training data and then produces predictions on test data for use in evaluation"""
    
    points_features = ['AWAY_p_allowed', 'HOME_p_allowed', 'AWAY_p_scored', 'HOME_p_scored']
    prob_features = ['diff_p'] + ['model_total_p'] 
    
    points_model = linear_model.Ridge(alpha=50.)
    #points_model = ensemble.RandomForestRegressor(min_samples_leaf = 15,n_estimators = 200)
    
    points_model.fit(df_train[points_features], df_train[['AWAY_points','HOME_points']])
    
    df_train, prob_model = train_probs_lr(df_train, points_model, points_features, prob_features)
    
    predicted_points = points_model.predict(df_test[points_features])
    
    df_test['model_away_p'] = predicted_points[:,0]
    df_test['model_home_p'] = predicted_points[:,1]
    
    df_test['model_total_p'] = df_test['model_away_p']  + df_test['model_home_p']
    df_test['diff_p'] = df_test['model_away_p']-df_test['model_home_p']
                         
    away_p = prob_model.predict(df_test[prob_features])
    df_test['away_win_p'] = prob_model.predict_proba(df_test[prob_features])[:,1]
    df_test['away_win'] = np.where(df_test['AWAY_points'] > df_test['HOME_points'],1,0)
    
    return df_train, df_test


def run_layer_one(df_train, df_test, points_model, points_features = 
                  ['AWAY_p_allowed', 'HOME_p_allowed', 'AWAY_p_scored', 
                       'HOME_p_scored', 'combined_pace','abs_eff_diff']):
    
    """single function that trains points model and win probability model on
    training data and then produces predictions on test data for use in evaluation"""
    
    
    
    points_model.fit(df_train[points_features], df_train[['AWAY_points','HOME_points']])
    
    
    predicted_points = points_model.predict(df_test[points_features])
    predicted_points_train = points_model.predict(df_train[points_features])
    
    df_train['model_away_p'] = predicted_points_train[:,0]
    df_train['model_home_p'] = predicted_points_train[:,1]
    
    df_train['model_total_p'] = df_train['model_away_p']  + df_train['model_home_p']
    df_train['diff_p'] = df_train['model_away_p']-df_train['model_home_p']
    
    df_train['away_win'] = np.where(df_train['AWAY_points'] > df_train['HOME_points'],1,0)

    df_test['model_away_p'] = predicted_points[:,0]
    df_test['model_home_p'] = predicted_points[:,1]
    
    df_test['model_total_p'] = df_test['model_away_p']  + df_test['model_home_p']
    df_test['diff_p'] = df_test['model_away_p']-df_test['model_home_p']
    df_test['away_win'] = np.where(df_test['AWAY_points'] > df_test['HOME_points'],1,0)

    prob_model = linear_model.LogisticRegression(C=1., solver='liblinear', intercept_scaling =100)
    
    prob_features = ['diff_p','model_total_p']
    prob_model.fit(df_train[prob_features],df_train['away_win'])

    df_test['away_win_p'] = prob_model.predict_proba(df_test[prob_features])[:,1]
    
    return df_train, df_test

def moneyline_return(df_test):
    
    """produces a simple money line score for betting the same amount on 
    every positive expected value moneyline bet. Useful in comparing
    to the simple strategy of betting on all over/unders and spreads """
    
    #convert moneylines to payouts
    df_test['home_payout'] = np.where(df_test.home_moneyline > 0, 
                                      1 + df_test.home_moneyline/100., 
                                      1 + 100./(-1*df_test.away_money_line.astype(float)))
    
    df_test['away_payout'] = np.where(df_test.away_money_line > 0, 
                                      1 + df_test.away_money_line/100., 
                                      1 + 100/(-1*df_test.away_money_line.astype(float)))
    
    bet_away = df_test['away_win_p'] * df_test['away_payout'] > 1.03
    
    result_betting_away  = df_test['away_payout'] * df_test['away_win']
    
    bet_home = (df_test['away_win_p']-1)*df_test['home_payout'] > 1.03
    
    result_betting_home = df_test['home_payout'] * (1-df_test['away_win_p'])
    
    result_no_bet = 1
    
    game_net = -1 + np.where(bet_away, result_betting_away,
         np.where(bet_home, result_betting_home, result_no_bet))
    
    return game_net.mean()

def spread_score(df_test):
    
    """produces a simple spread score for betting the same amount on 
    every positive expected value spread bet. Useful in comparing
    to the simple strategy of betting on all moneyline and spreads """
    standard_payout = 100/110 + 1
    
    bet_home = df_test['home_v_spread_p'] > .523
    
    result_betting_home  =  standard_payout * df_test['home_v_spread']
    
    bet_away = df_test['home_v_spread_p'] < .523
    
    result_betting_away = standard_payout  * (1-df_test['home_v_spread'])
    
    result_no_bet = 1
    
    game_net = -1 + np.where(bet_away, result_betting_away,
         np.where(bet_home, result_betting_home, result_no_bet))
    
    return game_net.mean()

def compute_dev_metrics(df_test):
    
    standard_payout = 1 + 100./110
    
    accuracy = np.where((df_test['away_win_p'] > .5) == 
                 (df_test['AWAY_points']>df_test['HOME_points']), 1, 0).mean()
    
    mae_away = np.abs(df_test['model_away_p'] - df_test['AWAY_points']).mean()
    bias_away = (df_test['model_away_p'] - df_test['AWAY_points']).mean()
    mae_home = np.abs(df_test['model_home_p'] - df_test['HOME_points']).mean()
    bias_home = (df_test['model_home_p'] - df_test['HOME_points']).mean()

    
    beat_spread = ((df_test['diff_p'] < df_test['home_spread']) == 
                ((df_test.AWAY_points - df_test.HOME_points) < (df_test['home_spread']))).mean()
    
    beat_total_game = ((df_test['model_total_p'] > df_test['over-under']) == 
                ((df_test.AWAY_points + df_test.HOME_points) > (df_test['over-under'])))
    
    has_over_under = df_test['over-under'].notnull()
    beat_total = beat_total_game[has_over_under].mean()

    #beat_total_aug = ((df_test['over_p'] > .5) == 
    #            ((df_test.AWAY_points + df_test.HOME_points) > (df_test['over-under']))).mean()
    
    over_under_return =  (beat_total * standard_payout) - 1.

    spread_return =  (beat_spread * standard_payout) - 1.

    #stack_return = (beat_total_aug * standard_return) - 1.
    
    moneyline_net = moneyline_return(df_test)
    #stack_return = spread_score(df_test)
    
    return {'MAE_away': mae_away,
                        'MAE_home' : mae_home,
            'bias_away': bias_away,
            'bias_home': bias_home,
                        'Accuracy': accuracy,
                       'beat_spread': beat_spread,
            'beat_spread': beat_spread,            
           'moneyline_return': moneyline_net,
           'over_under_return': over_under_return,
           'spread_return': spread_return}

def compute_test_metrics(df_test):
    
    standard_return = 1 + 100./110
    
    accuracy = np.where((df_test['away_win_p'] > .5) == 
                 (df_test['AWAY_points']>df_test['HOME_points']), 1, 0).mean()
    
    mae_away = np.abs(df_test['model_away_p'] - df_test['AWAY_points']).mean()
    bias_away = (df_test['model_away_p'] - df_test['AWAY_points']).mean()
    mae_home = np.abs(df_test['model_home_p'] - df_test['HOME_points']).mean()
    bias_home = (df_test['model_home_p'] - df_test['HOME_points']).mean()


    
    beat_spread = ((df_test['diff'] < df_test['home_spread']) == 
                ((df_test.AWAY_points - df_test.HOME_points) < (df_test['home_spread']))).mean()
    
    beat_total_game = ((df_test['model_total_p'] > df_test['over-under']) == 
                ((df_test.AWAY_points + df_test.HOME_points) > (df_test['over-under'])))
    
    has_over_under = df_test['over-under'].notnull()
    beat_total = beat_total_game[has_over_under].mean()
    #beat_total_aug = ((df_test['over_p'] > .5) == 
    #            ((df_test.AWAY_points + df_test.HOME_points) > (df_test['over-under']))).mean()
    
    total_return =  (beat_total * standard_return) - 1.
    spread_return =  (beat_spread * standard_return) - 1.
    #stack_return = (beat_total_aug * standard_return) - 1.
    
    moneyline_net = moneyline_score(df_test)
    stack_return = spread_score(df_test)
    
    return {'MAE_away': mae_away,
                        'MAE_home' : mae_home,
            'bias_away': bias_away,
            'bias_home': bias_home,
                        'Accuracy': accuracy,
                       'beat_spread': beat_spread,
                       'beat_total' : beat_total,
           'moneyline_return': moneyline_net,
           'total_return': total_return,
           'spread_return': spread_return,
           'stack_return': stack_return}

def run_stack(dev, test):
    
    """single function that trains points model and win probability model on
    training data and then produces predictions on test data for use in evaluation"""
    
    
    spread_model = linear_model.LogisticRegression(C=0.03,solver='liblinear', intercept_scaling =100)
    win_model = linear_model.LogisticRegression(C=100,solver='liblinear', intercept_scaling =100)
    
    #spread_model = ensemble.RandomForestClassifier(n_estimators=200, min_samples_leaf=200)
    #win_model = ensemble.RandomForestClassifier(n_estimators=200, min_samples_leaf=200)
    
    dev['home_v_spread'] = np.where((dev['HOME_points']-dev['AWAY_points']) 
                                         + dev['home_spread'] > 0,1, 0)
    test['home_v_spread'] = np.where((test['HOME_points']-test['AWAY_points']) 
                                         + test['home_spread'] > 0,1, 0)
    
    spread_features = ['diff_diff','spread','model_home_p']
    win_features = ['diff_p','home_spread']
    
    dev['diff_diff'] = dev['home_spread'] - dev['diff_p']
    test['diff_diff'] = test['home_spread'] - test['diff_p']
    
    
    spread_model.fit(dev[spread_features], dev['home_v_spread'])
    win_model.fit(dev[win_features], dev['away_win'])
               
    test['home_v_spread_p'] = spread_model.predict_proba(test[spread_features])[:,1]
    test['away_win_p'] = win_model.predict_proba(test[win_features])[:,1]
    
    return dev, test

# read data

In [60]:
backtest_data = pd.read_csv('../data/raptor_elo_backtest_2015_2019.csv')
backtest_data.drop(columns='Unnamed: 0',inplace=True)

In [25]:
backtest_data.game_index.max()

'202001190SAS'

# add some features

In [61]:
backtest_data['season'] = backtest_data.date_y.map(lambda x: int(x[:4]) if int(x[:6:8]) < 7 else int(x[:4])+1)
#don't fill the missing over unders unless you need to
#backtest_data['over-under'].fillna(backtest_data['over-under'].mean())
backtest_data['combined_pace'] = backtest_data.AWAY_pace + backtest_data.HOME_pace
backtest_data['abs_eff_diff'] = np.abs((backtest_data['AWAY_ORT']-backtest_data['AWAY_DRT']) - (backtest_data['HOME_ORT']-backtest_data['HOME_DRT']))
backtest_data['total_eff_diff'] = (backtest_data['AWAY_ORT']-backtest_data['AWAY_DRT']) - (backtest_data['HOME_ORT']-backtest_data['HOME_DRT'])

# create folds. The folds are deterministic given the number input

In [62]:
backtest_data.shape

(4745, 106)

In [63]:
folds = split_random(backtest_data, 20000)
fold_list = folds.keys()

# baseline metrics from Drake v1

In [71]:
dev_metrics = {}
test_metrics = {}
dev = pd.DataFrame() 
for fold in fold_list:
    
    train = folds[fold]['train']
    fold_dev = folds[fold]['test']
    train, fold_dev = run_drake_v1(train, fold_dev)
    dev_metrics[fold] = compute_dev_metrics(fold_dev)
    dev = dev.append(fold_dev)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sor

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sor

In [72]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy             0.702055
MAE_away             8.489154
MAE_home             8.416574
beat_spread          0.576291
bias_away            0.117332
bias_home           -0.051319
moneyline_return     0.042086
over_under_return    0.016510
spread_return        0.100191
dtype: float64

# define models

In [64]:
regr_trans = TransformedTargetRegressor(
    regressor=Ridge(alpha=25.0),
    transformer=QuantileTransformer(n_quantiles=200,
                                    output_distribution='normal'))
ridge_model = linear_model.Ridge(alpha=40.0)
#NN = MLPRegressor(hidden_layer_sizes=(2,), max_iter=200, learning_rate_init=0.04, alpha=10.0)
#points_model = ensemble.GradientBoostingRegressor(max_depth=1, learning_rate=0.1, n_estimators = 100)

In [73]:
backtest_data.iloc[0]

game_index                  201601250CHI
AWAY_p_allowed                   100.588
HOME_p_allowed                   100.821
AWAY_p_scored                    97.3429
HOME_p_scored                    102.973
AWAY_team                     MIAMI HEAT
HOME_team                  CHICAGO BULLS
AWAY_points                           89
HOME_points                           84
AWAY_ORT                        -1.36963
HOME_ORT                        -0.13328
AWAY_DRT                        0.539817
HOME_DRT                        -0.25506
AWAY_pace                       -1.94756
HOME_pace                     -0.0955707
M_02                                   0
M_03                                   0
M_04                                   0
M_10                                   0
M_11                                   0
M_12                                   0
a_BOSTON CELTICS                       0
a_BROOKLYN NETS                        0
a_CHARLOTTE HORNETS                    0
a_CHICAGO BULLS 

In [76]:
dev_metrics = {}
test_metrics = {}
dev = pd.DataFrame() 
for fold in fold_list:
    
    train = folds[fold]['train']
    fold_dev = folds[fold]['test']
    features = ['elo1_pre','elo2_pre']
    train, fold_dev = run_layer_one(train, fold_dev, ridge_model, features)
    dev_metrics[fold] = compute_dev_metrics(fold_dev)
    dev = dev.append(fold_dev)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sor

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sor

In [77]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy             0.660802
MAE_away             9.451037
MAE_home             9.420936
beat_spread          0.526566
bias_away            0.170245
bias_home            0.000835
moneyline_return    -0.026288
over_under_return   -0.024969
spread_return        0.005263
dtype: float64

In [56]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy             0.702351
MAE_away             8.505703
MAE_home             8.418481
beat_spread          0.577112
bias_away            0.173534
bias_home           -0.062355
moneyline_return     0.051511
over_under_return    0.020053
spread_return        0.101760
dtype: float64

In [53]:
backtest_data.head()

Unnamed: 0,game_index,AWAY_p_allowed,HOME_p_allowed,AWAY_p_scored,HOME_p_scored,AWAY_team,HOME_team,AWAY_points,HOME_points,AWAY_ORT,...,game_index_favorite,game_index_underdog,home_spread,away_spread,home_moneyline,away_money_line,season,combined_pace,abs_eff_diff,total_eff_diff
0,201601250CHI,100.588325,100.821416,97.342892,102.973224,MIAMI HEAT,CHICAGO BULLS,89.0,84.0,-1.369625,...,201601250CHI,201601250MIA,-6.5,6.5,-283.0,250.0,2016,-2.04313,2.031222,-2.031222
1,201601250CLE,106.088071,97.817883,100.597678,108.961157,MINNESOTA TIMBERWOLVES,CLEVELAND CAVALIERS,107.0,114.0,-0.310434,...,201601250CLE,201601250MIN,-13.0,13.0,-1400.0,998.0,2016,-0.467693,2.097884,-2.097884
2,201601250GSW,96.189858,99.886379,101.899167,111.936744,SAN ANTONIO SPURS,GOLDEN STATE WARRIORS,90.0,120.0,3.182644,...,201601250GSW,201601250SAS,-4.5,4.5,-200.0,180.0,2016,1.059827,4.549126,-4.549126
3,201601250MEM,103.402286,97.644368,100.804548,101.057761,ORLANDO MAGIC,MEMPHIS GRIZZLIES,92.82,98.28,0.172127,...,201601250MEM,201601250ORL,-5.5,5.5,-205.0,184.0,2016,-1.787644,1.955246,1.955246
4,201601250NOP,107.54266,101.708551,104.050443,103.069717,HOUSTON ROCKETS,NEW ORLEANS PELICANS,112.0,111.0,0.440217,...,201601250NOP,201601250HOU,-4.5,4.5,-177.0,159.0,2016,3.092877,1.563145,1.563145


In [54]:
backtest_data.iloc[0]

game_index                   201601250CHI
AWAY_p_allowed                    100.588
HOME_p_allowed                    100.821
AWAY_p_scored                     97.3429
HOME_p_scored                     102.973
AWAY_team                      MIAMI HEAT
HOME_team                   CHICAGO BULLS
AWAY_points                            89
HOME_points                            84
AWAY_ORT                         -1.36963
HOME_ORT                         -0.13328
AWAY_DRT                         0.539817
HOME_DRT                         -0.25506
AWAY_pace                        -1.94756
HOME_pace                      -0.0955707
M_02                                    0
M_03                                    0
M_04                                    0
M_10                                    0
M_11                                    0
M_12                                    0
a_BOSTON CELTICS                        0
a_BROOKLYN NETS                         0
a_CHARLOTTE HORNETS               

In [29]:
dev_metrics

{0: {'MAE_away': 8.519314316746048,
  'MAE_home': 7.976206412687761,
  'bias_away': 0.4463637146987764,
  'bias_home': 0.3953981380569974,
  'Accuracy': 0.7063197026022305,
  'beat_spread': 0.5780669144981413,
  'moneyline_return': 0.05383884424768941,
  'total_return': -0.03114658360911049,
  'spread_return': 0.10358229131463337},
 1: {'MAE_away': 8.14458581666863,
  'MAE_home': 8.606519733667422,
  'bias_away': 0.2722832008867959,
  'bias_home': -0.25983633013647073,
  'Accuracy': 0.7211895910780669,
  'beat_spread': 0.570631970260223,
  'moneyline_return': 0.03128594031755592,
  'total_return': 0.014705882352941124,
  'spread_return': 0.08938830686042598},
 2: {'MAE_away': 8.455076711567731,
  'MAE_home': 8.363625349250817,
  'bias_away': 0.09273545285037524,
  'bias_home': 0.23836743955974887,
  'Accuracy': 0.7397769516728625,
  'beat_spread': 0.5464684014869888,
  'moneyline_return': 0.016020016243828983,
  'total_return': 0.010210092283526429,
  'spread_return': 0.043257857384251

In [52]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy            0.702200
MAE_away            8.340874
MAE_home            8.291356
beat_spread         0.577112
bias_away           0.187390
bias_home          -0.050507
moneyline_return    0.053776
spread_return       0.101760
total_return        0.039409
dtype: float64

In [18]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy            0.702086
MAE_away            8.348181
MAE_home            8.379620
beat_spread         0.581991
bias_away           0.139381
bias_home          -0.151108
moneyline_return    0.051777
spread_return       0.111074
total_return        0.023917
dtype: float64

In [449]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy            0.702200
MAE_away            8.340874
MAE_home            8.291356
beat_spread         0.577112
moneyline_return    0.053776
spread_return       0.101760
total_return        0.039409
dtype: float64

In [371]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy            0.702200
MAE_away            8.340881
MAE_home            8.291352
beat_spread_a       0.577112
beat_spread_b       0.577112
beat_total_a        0.544625
beat_total_b        0.544625
moneyline_return    0.053776
spread_return_a     0.101760
spread_return_b     0.101760
total_return_a      0.039739
total_return_b      0.039739
dtype: float64

In [304]:
dev_metrics = {}
test_metrics = {}
dev = pd.DataFrame() 
for fold in fold_list:
    
    train = folds[fold]['train']
    fold_dev = folds[fold]['test']
    train, fold_dev = run_drake_v1(train, fold_dev)
    dev_metrics[fold] = compute_dev_metrics(fold_dev)
    dev = dev.append(fold_dev)

In [314]:
1./(1+100/110.)

0.5238095238095238

In [32]:
dev = dev.drop_duplicates('game_index')

In [33]:
dev['total_diff'] = dev['over-under'] - dev['model_total_p']

In [34]:
dev['total_diff_p'] = np.round(dev['total_diff'],0)

In [36]:
dev['diff_diff'] = dev['home_spread'] - dev['diff_p']

In [37]:
dev['diff_diff_r'] = np.round(dev['diff_diff'],0)

In [38]:
dev['home_spread_r'] = np.round(dev['home_spread'],0)

In [42]:
dev['over'] = np.where((dev['HOME_points']+dev['AWAY_points']) > dev['over-under'],1, 0)

In [43]:
dev['home_v_spread'] = np.where((dev['HOME_points']-dev['AWAY_points']) 
                                         + dev['home_spread'] > 0,1, 0)

In [45]:
dev[['home_spread','diff_p','diff_diff','home_v_spread','HOME_points','AWAY_points']].head()

Unnamed: 0,home_spread,diff_p,diff_diff,home_v_spread,HOME_points,AWAY_points
0,-6.5,-2.848812,-3.651188,0,84.0,89.0
21,-5.5,-7.014735,1.514735,1,103.0,83.0
24,-8.0,-5.645706,-2.354294,1,113.0,94.0
25,-8.0,-9.609249,1.609249,1,91.0,79.0
63,-7.0,-8.162282,1.162282,0,84.0,81.0


In [46]:
an = dev.groupby(['diff_diff_r']).home_v_spread.agg(['count','mean'])

In [342]:
an = dev[dev['over-under'].notnull()].groupby(['total_diff_p']).over.agg(['count','mean'])

In [47]:
an

Unnamed: 0_level_0,count,mean
diff_diff_r,Unnamed: 1_level_1,Unnamed: 2_level_1
-17.0,2,0.0
-15.0,5,0.0
-14.0,5,0.0
-12.0,9,0.111111
-11.0,9,0.222222
-10.0,9,0.222222
-9.0,19,0.263158
-8.0,29,0.172414
-7.0,68,0.397059
-6.0,110,0.363636


In [48]:
dev_folds = split_random(dev, 20000)
dev_fold_list = dev_folds.keys()

In [51]:
for fold_num in dev_fold_list:
    
    dev_fold = dev_folds[fold_num]['train']
    test = dev_folds[fold_num]['test']
    dev_fold, test = run_stack(dev_fold, test)
    test_metrics[fold_num] = compute_test_metrics(test)


KeyError: "['spread'] not in index"

In [330]:
pd.DataFrame(test_metrics).T.mean()

Accuracy            0.701014
MAE_away            8.466144
MAE_home            8.378733
beat_spread         0.573045
beat_total          0.535913
moneyline_return    0.049513
spread_return       0.093994
stack_return        0.104157
total_return        0.023107
dtype: float64

In [262]:
pd.DataFrame(test_metrics).T.mean()

Accuracy            0.701014
MAE_away            8.466144
MAE_home            8.378733
beat_spread         0.573045
beat_total          0.535913
moneyline_return    0.049564
spread_return       0.093994
stack_return        0.095460
total_return        0.023107
dtype: float64

In [240]:
pd.DataFrame(dev_metrics).T.mean()

Accuracy            0.702351
MAE_away            8.505703
MAE_home            8.418481
beat_spread         0.576610
beat_total          0.534314
moneyline_return    0.051511
spread_return       0.100801
total_return        0.020053
dtype: float64

In [144]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.693811
MAE_away            8.536424
MAE_home            8.461620
beat_spread         0.564001
beat_total          0.540455
moneyline_return    0.030449
spread_return       0.076729
total_return        0.031777
dtype: float64

In [108]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.693761
MAE_away            8.537651
MAE_home            8.454859
beat_spread         0.564302
beat_total          0.597358
moneyline_return    0.028756
spread_return       0.077304
total_return        0.140410
dtype: float64

In [102]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.693962
MAE_away            8.545354
MAE_home            8.458927
beat_spread         0.563247
beat_total          0.598814
moneyline_return    0.025250
spread_return       0.075290
total_return        0.143191
dtype: float64

In [85]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.702351
MAE_away            8.505703
MAE_home            8.418481
beat_spread         0.576610
beat_total          0.594142
moneyline_return    0.051511
spread_return       0.100801
total_return        0.134272
dtype: float64

In [150]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.702071
MAE_away            8.546218
MAE_home            8.449309
beat_spread         0.560011
beat_total          0.541423
moneyline_return    0.032336
spread_return       0.069111
total_return        0.033627
dtype: float64

In [147]:
pd.DataFrame(evaluation).T.mean()

Accuracy            0.692605
MAE_away            8.542138
MAE_home            8.463545
beat_spread         0.563699
beat_total          0.541128
moneyline_return    0.027913
spread_return       0.076153
total_return        0.033063
dtype: float64

In [62]:
df_train.columns[-30:]

Index(['h_PHILADELPHIA 76ERS', 'h_PHOENIX SUNS', 'h_PORTLAND TRAIL BLAZERS',
       'h_SACRAMENTO KINGS', 'h_SAN ANTONIO SPURS', 'h_TORONTO RAPTORS',
       'h_UTAH JAZZ', 'h_WASHINGTON WIZARDS', 'date_x', 'time', 'favorite',
       'underdog', 'favorite moneyline', 'underdog moneyline', 'line',
       'over-under', 'date_y', 'game_index_favorite', 'game_index_underdog',
       'home_spread', 'away_spread', 'home_moneyline', 'away_money_line',
       'season', 'model_home_p', 'model_away_p', 'model_total_p', 'diff',
       'away_win', 'away_win_p'],
      dtype='object')

In [92]:
backtest_data[backtest_data['home_moneyline'].isnull()].groupby('season').count()

Unnamed: 0_level_0,game_index,AWAY_p_allowed,HOME_p_allowed,AWAY_p_scored,HOME_p_scored,AWAY_team,HOME_team,AWAY_points,HOME_points,AWAY_ORT,...,underdog moneyline,line,over-under,date_y,game_index_favorite,game_index_underdog,home_spread,away_spread,home_moneyline,away_money_line
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [91]:
backtest_data.columns

Index(['game_index', 'AWAY_p_allowed', 'HOME_p_allowed', 'AWAY_p_scored',
       'HOME_p_scored', 'AWAY_team', 'HOME_team', 'AWAY_points', 'HOME_points',
       'AWAY_ORT', 'HOME_ORT', 'AWAY_DRT', 'HOME_DRT', 'AWAY_pace',
       'HOME_pace', 'M_02', 'M_03', 'M_04', 'M_10', 'M_11', 'M_12',
       'a_BOSTON CELTICS', 'a_BROOKLYN NETS', 'a_CHARLOTTE HORNETS',
       'a_CHICAGO BULLS', 'a_CLEVELAND CAVALIERS', 'a_DALLAS MAVERICKS',
       'a_DENVER NUGGETS', 'a_DETROIT PISTONS', 'a_GOLDEN STATE WARRIORS',
       'a_HOUSTON ROCKETS', 'a_INDIANA PACERS', 'a_LOS ANGELES CLIPPERS',
       'a_LOS ANGELES LAKERS', 'a_MEMPHIS GRIZZLIES', 'a_MIAMI HEAT',
       'a_MILWAUKEE BUCKS', 'a_MINNESOTA TIMBERWOLVES',
       'a_NEW ORLEANS PELICANS', 'a_NEW YORK KNICKS',
       'a_OKLAHOMA CITY THUNDER', 'a_ORLANDO MAGIC', 'a_PHILADELPHIA 76ERS',
       'a_PHOENIX SUNS', 'a_PORTLAND TRAIL BLAZERS', 'a_SACRAMENTO KINGS',
       'a_SAN ANTONIO SPURS', 'a_TORONTO RAPTORS', 'a_UTAH JAZZ',
       'a_WASHINGTON 

In [124]:
beat_total = ((df_test['model_total_p'] > df_test['over-under']) == 
                ((df_test.AWAY_points + df_test.HOME_points) > (df_test['over-under'])))

In [130]:
i = df_test['over-under'].isnull()

In [132]:
beat_total[i].sum()

64

In [140]:
backtest_data[backtest_data['home_spread'].isnull()].shape

(0, 95)

In [136]:
675/float(backtest_data.shape[0])

0.1254413677755064

In [145]:
backtest_data['home_spread'].describe()

count    5381.000000
mean       -2.387567
std         6.246701
min       -22.000000
25%        -7.000000
50%        -2.500000
75%         2.000000
max        17.500000
Name: home_spread, dtype: float64

In [159]:
((100/110.) + 1)*x=1

0.9090909090909091

In [160]:
1/((100/110.) + 1)

0.5238095238095238

In [184]:
df_test[['home_spread','diff','model_home_p','model_away_p','diff_diff','home_v_spread','home_v_spread_p','HOME_points','AWAY_points']].head()

Unnamed: 0,home_spread,diff,model_home_p,model_away_p,diff_diff,home_v_spread,home_v_spread_p,HOME_points,AWAY_points
2,-4.5,-4.376849,102.719612,98.342763,-0.123151,1,0.492048,120.0,90.0
7,-3.0,2.19329,100.15325,102.34654,-5.19329,0,0.310842,91.0,116.0
11,-5.0,-5.599851,106.364546,100.764695,0.599851,1,0.51814,112.0,97.0
21,-5.5,-6.927591,102.02492,95.097329,1.427591,1,0.550959,103.0,83.0
23,-7.5,-7.085173,105.228102,98.142929,-0.414827,0,0.480656,113.0,117.0


In [179]:
df_test.head()

Unnamed: 0,game_index,AWAY_p_allowed,HOME_p_allowed,AWAY_p_scored,HOME_p_scored,AWAY_team,HOME_team,AWAY_points,HOME_points,AWAY_ORT,...,model_total_p,diff,away_win_p,away_win,home_payout,away_payout,total_diff,home_v_spread,home_v_spread_p,diff_diff
2,201601250GSW,96.189858,99.886379,101.899167,111.936744,SAN ANTONIO SPURS,GOLDEN STATE WARRIORS,90.0,120.0,3.182644,...,201.062375,-4.376849,0.32902,0,0.444444,2.8,-107.780388,1,0.535317,-8.876849
7,201601250WAS,102.873361,102.437562,103.466121,103.282532,BOSTON CELTICS,WASHINGTON WIZARDS,116.0,91.0,0.819735,...,202.49979,2.19329,0.580819,1,0.264706,2.36,-113.84675,0,0.514049,-0.80671
11,201601260POR,108.26392,102.183572,102.562446,105.01874,SACRAMENTO KINGS,PORTLAND TRAIL BLAZERS,97.0,112.0,-1.447698,...,207.129241,-5.599851,0.291389,0,0.456522,2.84,-108.135454,1,0.527938,-10.599851
21,201601280MEM,105.999604,97.672727,99.034616,100.738179,MILWAUKEE BUCKS,MEMPHIS GRIZZLIES,83.0,103.0,-1.416279,...,197.122249,-6.927591,0.245057,0,0.481865,2.93,-90.97508,1,0.55079,-12.427591
23,201601280WAS,107.30365,100.994562,100.249065,103.987884,DENVER NUGGETS,WASHINGTON WIZARDS,117.0,113.0,-1.09221,...,203.37103,-7.085173,0.243631,1,0.628253,3.69,-106.771898,0,0.544103,-14.585173
