In [56]:
import pandas as pd
import numpy as np
import warnings
import pickle
import inspect 
import json
from datetime import datetime
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import Huber
import tensorflow as tf 
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

## Time Series Cross Validation Strategy
1. Because of the potential for underlying changes in the data generation process (in this case, changes in how teams strategize/play in the NBA), we should avoid using traditional k-fold cross validation and instead use a time series cross validation approach. This will also help us avoid any unintended data leakage.

2. Identify optimal hyper-parameters.
    
    a. Use "blocked" rolling train/test splits (eg. train on 2015, test on random 25-50% of 2016, etc.) and average the MSE across the years to find the best parameters. 

3. After identifying optimal hyper-parameters, train on a 4-year rolling window basis (ie. train on 2015-2018, test on 2019). Repeat this several times so that you get a better sense of what your profit/loss will look like by seeing how you would have done in 2019, 2020, 2021, 2022, etc.

#### Profit / Loss Simulation
1. Use rolling 4 year training period (eg. 2015-2018) and test P/L on following year (eg. 2019) at various point spread differentials (predicted spread vs. actual spread > X, for a series of X values)

# Model 1: Random Forest

The first thing we want to do is figure out which hyper-parameters work best for each model we are going to build. We figure this out by training on 1 year of data and then testing on a portion of the following year. By repeating this many times for different hyper-parameter combinations and taking the average MSE, we can figure out the best options. Then, we will use those config parameters when we train the actual models (using a rolling 4 years of data and testing on the entire subsequent year).

In [8]:
train_df = pd.read_csv('../../generated_datasets/train_df.csv')

# Hyper-Parameter Tuning (RANDOM FOREST)
def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)):  # For seasons 2015 to 2021 (2022 not included because we don't have 2023 season data to validate on yet)
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        yield train_indices, test_indices

# Create a Random Forest Regressor object
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [250, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    # 'min_samples_split': ['auto',5,10,20],
    # 'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Features/Predictors
X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
# Target
y = train_df['HOME PLUS MINUS']

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=-1, verbose=0)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
#for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
#    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")


Best parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 1000}
Individual test scores for the best parameters: [173.33191075932405, 167.21777385437267, 178.83493002132587, 174.03943791618076, 205.30173915523574, 217.5988714489783, 171.85858765701377]
Average MSE: 184.02617868749016


In [9]:
# Now that we have identified the optimal hyper-parameters, let's build a series of 4-year training/1 year validation models to track MSE (and $ won or lost) over multiple years

def train_and_validate_models(df, best_params, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive) 
    # So, Model 1 will train using 2015 - 2018 and test on 2019. Model 2 will train on 2016 - 2019 and test on 2020, etc.
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Train the model using best hyperparameters identified previously
        model = RandomForestRegressor(**best_params)
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores, X_test, y_test

# Call the function
rf_models, rf_scores, X_test, y_test = train_and_validate_models(train_df, best_params)


Trained for seasons 2015-2018, MSE on season 2019: 174.1848
Trained for seasons 2016-2019, MSE on season 2020: 200.8323
Trained for seasons 2017-2020, MSE on season 2021: 214.6412
Trained for seasons 2018-2021, MSE on season 2022: 169.0366


In [10]:
# Get feature importances for a given model (i = 0 is 2015-2018, i = 1 is 2016-2019 etc.)
i = 0
importances = rf_models[i].feature_importances_

# Sort and display the features by their importance
feature_importances = pd.DataFrame(rf_models[i].feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
HOME TEAM WIN PCT,0.142549
AWAY TEAM WIN PCT,0.104766
HOME TEAM PP100P,0.094644
AWAY TEAM PP100P,0.067472
AWAY TEAM 2PT PCT,0.0576
HOME TEAM OPP PP100P,0.052597
HOME TEAM WIN PCT LAST 10,0.052054
HOME TEAM SOS,0.047305
HOME TEAM 2PT PCT,0.046686
AWAY TEAM DRB PCT,0.034361


In [11]:
START_YEAR = min(train_df.SEASON) # For our data, the first year should be 2015
# Bet amount each game
BET_AMOUNT = 100
# Average betting site take rate
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
for THRESHOLD in [8,10,12,14]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(rf_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)



---------------------------------------
THRESHOLD: 8
Test Year 2019 Summary | Bets made: 32 | Bets won: 19 | Bets lost: 13 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.59 | Final bankroll: 410.0
Test Year 2020 Summary | Bets made: 44 | Bets won: 18 | Bets lost: 25 | Bets pushed: 1 | Game count: 1029 | Bet win rate: 0.42 | Final bankroll: -880.0
Test Year 2021 Summary | Bets made: 46 | Bets won: 26 | Bets lost: 19 | Bets pushed: 1 | Game count: 1180 | Bet win rate: 0.58 | Final bankroll: 440.0
Test Year 2022 Summary | Bets made: 89 | Bets won: 41 | Bets lost: 46 | Bets pushed: 2 | Game count: 1074 | Bet win rate: 0.47 | Final bankroll: -910.0
---------------------------------------
THRESHOLD: 10
Test Year 2019 Summary | Bets made: 15 | Bets won: 8 | Bets lost: 7 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.53 | Final bankroll: 20.0
Test Year 2020 Summary | Bets made: 10 | Bets won: 5 | Bets lost: 5 | Bets pushed: 0 | Game count: 1029 | Bet win rate: 0.5 | Final bankroll:

In [44]:
# Save model files!

def pickle_models(model_list):
    """
    Save a list of models to individual files using pickle. Filenames are derived 
    from the variable name of the model list.

    Parameters:
    - model_list: The actual variable containing the list of models.
    
    Returns:
    - None
    """

    # Get the variable's name
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    model_var_name = [var_name for var_name, var_val in callers_local_vars if var_val is model_list][0]

    for idx, model in enumerate(model_list):
        filename = f"../../saved_models/{model_var_name}_{idx}.pkl"
        with open(filename, 'wb') as file:
            pickle.dump(model, file)
    
    print("Models saved successfully!")

In [46]:
pickle_models(rf_models)
pickle_models(nn_models)
pickle_models(xgb_models)

Models saved successfully!
Models saved successfully!
Models saved successfully!


In [17]:
# Save the model to disk
model = rf_models[-1] # Most recent RF model
filename = '../random_forest_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

Result interpretation: the maximum return is generated when we set our THRESHOLD value to 14 (meaning our model differs from the projected line by 14 or more points). Our bet accuracy tends to increase with this THRESHOLD value, but the number of bets that the algorithm makes continues to decrease. 

Let's try out some other models.

# Model 2: Neural Network

Again, the first step will be to identify the optimal hyper-parameters for the network. Then we will build our models and simulate the profit and loss.

In [18]:
# NN attempt 2 with different scaling mechanisms

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

train_df = pd.read_csv('../../generated_datasets/train_df.csv')

def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)): 
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        yield train_indices, test_indices

X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
y = train_df['HOME PLUS MINUS']

# Scale the features because MLP is sensitive to feature scaling. We want to try a few different scaling options in our grid search.
# First, Define a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Placeholder, will be set by GridSearchCV
    ('mlp', MLPRegressor(max_iter=5000, random_state=42, tol=0.1))
])

# Define the parameter grid for MLP
param_grid = {
    'scaler': [StandardScaler(), RobustScaler(), PowerTransformer()],
    'mlp__hidden_layer_sizes': [(50, 50), (50, 100), (100,100), (100,200)],  # Test different neural net architectures
    'mlp__activation': ['relu','tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.1, 1, 10],  # Regularization term
    'mlp__learning_rate_init': [0.00001, 0.0001, 0.001],
    'mlp__learning_rate': ['adaptive', 'constant']
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=-1, verbose=0)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
#for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
#    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")



Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 10, 'mlp__hidden_layer_sizes': (100, 100), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 0.0001, 'mlp__solver': 'adam', 'scaler': PowerTransformer()}


In [19]:
# What do the individual year MSE values look like? How much variance is there?
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")

Individual test scores for the best parameters: [171.25189994802156, 166.30640118246978, 176.70143688281576, 167.1062586304809, 200.15068006639126, 216.3076761772237, 169.73099695820355]
Average MSE: 181.07933569222953


In [20]:
from sklearn.pipeline import Pipeline

def train_and_validate_models(df, best_params, best_scaler, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive)
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Setup a pipeline using best scaler and model parameters
        model = Pipeline([
            ('scaler', best_scaler),
            ('mlp', MLPRegressor(**best_params, random_state=42))
        ])

        # Train the model using best hyperparameters identified previously
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores

# Extract the best parameters and scaler from GridSearchCV results
best_params = {k.replace('mlp__', ''): v for k, v in grid_search.best_params_.items() if 'mlp__' in k}
best_scaler = grid_search.best_params_['scaler']

# Call the function with the best parameters and the best scaler
nn_models, nn_scores = train_and_validate_models(train_df, best_params, best_scaler)


Trained for seasons 2015-2018, MSE on season 2019: 172.9143
Trained for seasons 2016-2019, MSE on season 2020: 194.9238
Trained for seasons 2017-2020, MSE on season 2021: 213.1592
Trained for seasons 2018-2021, MSE on season 2022: 168.1855


In [26]:
START_YEAR = 2015
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
# THRESHOLD = 4

for THRESHOLD in [7,8,9,10]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(nn_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)



---------------------------------------
THRESHOLD: 7
Test Year 2019 Summary | Bets made: 64 | Bets won: 31 | Bets lost: 33 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.48 | Final bankroll: -510.0
Test Year 2020 Summary | Bets made: 102 | Bets won: 58 | Bets lost: 44 | Bets pushed: 0 | Game count: 1029 | Bet win rate: 0.57 | Final bankroll: 820.0
Test Year 2021 Summary | Bets made: 102 | Bets won: 52 | Bets lost: 49 | Bets pushed: 1 | Game count: 1180 | Bet win rate: 0.51 | Final bankroll: -220.0
Test Year 2022 Summary | Bets made: 154 | Bets won: 80 | Bets lost: 71 | Bets pushed: 3 | Game count: 1074 | Bet win rate: 0.53 | Final bankroll: 100.0
---------------------------------------
THRESHOLD: 8
Test Year 2019 Summary | Bets made: 48 | Bets won: 25 | Bets lost: 23 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.52 | Final bankroll: -50.0
Test Year 2020 Summary | Bets made: 65 | Bets won: 43 | Bets lost: 22 | Bets pushed: 0 | Game count: 1029 | Bet win rate: 0.66 | Final b

In [27]:
# Save the model to disk
model = nn_models[-1] # Most recent NN model
filename = '../neural_network_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# Model 3: XGBoost

In [28]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

train_df = pd.read_csv('../../generated_datasets/train_df.csv')

def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)):  # For seasons 2015 to 2021
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        # Randomly sample x% of the test indices
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        
        yield train_indices, test_indices

X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
y = train_df['HOME PLUS MINUS']

# Create an XGBoost regressor object
xgb_regressor = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=3, verbose=0)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
#for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
#    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")



Best parameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 1, 'reg_lambda': 2, 'subsample': 0.8}


In [29]:
# What do the individual year MSE values look like? How much variance is there?
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")

Individual test scores for the best parameters: [175.76775671915334, 170.14983697940517, 179.57394756528757, 173.84372050821688, 208.7242433165926, 224.0036140329535, 180.29722327116374]
Average MSE: 187.48004891325328


In [30]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def train_and_validate_models(df, best_params, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive)
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Train the model using best hyperparameters identified previously
        model = xgb.XGBRegressor(**best_params, random_state=42)
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores

# Call the function
xgb_models, xgb_scores = train_and_validate_models(train_df, best_params)


Trained for seasons 2015-2018, MSE on season 2019: 170.6674
Trained for seasons 2016-2019, MSE on season 2020: 199.2154
Trained for seasons 2017-2020, MSE on season 2021: 213.6061
Trained for seasons 2018-2021, MSE on season 2022: 168.2240


In [54]:
START_YEAR = 2015
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
# THRESHOLD = 4

for THRESHOLD in [1,4,7,10,12,14]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(xgb_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)

---------------------------------------
THRESHOLD: 1
Test Year 2019 Summary | Bets made: 749 | Bets won: 373 | Bets lost: 364 | Bets pushed: 12 | Game count: 997 | Bet win rate: 0.51 | Final bankroll: -2830.0
Test Year 2020 Summary | Bets made: 837 | Bets won: 419 | Bets lost: 408 | Bets pushed: 10 | Game count: 1029 | Bet win rate: 0.51 | Final bankroll: -3090.0
Test Year 2021 Summary | Bets made: 931 | Bets won: 419 | Bets lost: 497 | Bets pushed: 15 | Game count: 1180 | Bet win rate: 0.46 | Final bankroll: -11990.0
Test Year 2022 Summary | Bets made: 910 | Bets won: 432 | Bets lost: 470 | Bets pushed: 8 | Game count: 1074 | Bet win rate: 0.48 | Final bankroll: -8120.0
---------------------------------------
THRESHOLD: 4
Test Year 2019 Summary | Bets made: 199 | Bets won: 100 | Bets lost: 97 | Bets pushed: 2 | Game count: 997 | Bet win rate: 0.51 | Final bankroll: -700.0
Test Year 2020 Summary | Bets made: 329 | Bets won: 169 | Bets lost: 156 | Bets pushed: 4 | Game count: 1029 | Bet

In [32]:
# Save the model to disk
model = xgb_models[-1] # Most recent XGB model
filename = '../xgb_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

In [None]:
# TABNET AND AUTOINT models
#poetry add keras-tuner
#poetry add deepctr

# WEIGHT THE DIFFERENT MODELS IN THE META MODEL BY THEIR VALIDATION LOSS (lower loss = higher weight)

# Maybe keep all of the original vector data + the 3 model predictions to see if the performance improves

In [55]:
X_test.head()

Unnamed: 0,HOME TEAM DAYS REST,HOME TEAM HOME PRIOR,HOME TEAM SOS,HOME TEAM SOS LAST 10,HOME TEAM WIN PCT,HOME TEAM WIN PCT LAST 10,HOME TEAM 3PT PCT,HOME TEAM 2PT PCT,HOME TEAM PP100P,HOME TEAM ORB PCT,...,AWAY TEAM WIN PCT,AWAY TEAM WIN PCT LAST 10,AWAY TEAM 3PT PCT,AWAY TEAM 2PT PCT,AWAY TEAM PP100P,AWAY TEAM ORB PCT,AWAY TEAM DRB PCT,AWAY TEAM OPP 3PT PCT,AWAY TEAM OPP 2PT PCT,AWAY TEAM OPP PP100P
0,1,0,14.901235,17.2,0.530864,0.5,0.342593,0.538537,109.572427,0.211075,...,0.419753,0.5,0.345847,0.539955,108.212445,0.221682,0.717764,0.350301,0.566289,110.534516
1,1,0,14.777778,14.3,0.481481,0.5,0.362858,0.555081,111.053269,0.191156,...,0.209877,0.1,0.352247,0.516595,107.154384,0.234513,0.670035,0.361954,0.568362,115.0721
2,1,0,15.679012,15.2,0.493827,0.5,0.335133,0.524266,111.080029,0.258483,...,0.716049,0.7,0.368276,0.557308,112.196853,0.234792,0.76056,0.354302,0.513276,108.37322
3,1,1,16.296296,16.3,0.691358,0.7,0.375218,0.567315,114.9269,0.207902,...,0.506173,0.6,0.353204,0.547396,113.005593,0.234015,0.706579,0.35412,0.562514,112.705425
4,1,1,15.308642,18.6,0.555556,0.6,0.37824,0.560154,112.60185,0.188049,...,0.654321,0.5,0.386303,0.550897,114.626792,0.202014,0.730845,0.347762,0.553923,109.655383


In [58]:
# Mapping of old keys to new keys
key_mapping = {
    "HOME TEAM DAYS REST": "home_team_days_rest",
    "HOME TEAM HOME PRIOR": "home_team_home_prior",
    "HOME TEAM SOS": "home_team_sos",
    "HOME TEAM SOS LAST 10": "home_team_sos_last_10",
    "HOME TEAM WIN PCT": "home_team_win_pct",
    "HOME TEAM WIN PCT LAST 10": "home_team_win_pct_last_10",
    "HOME TEAM 3PT PCT": "home_team_3pt_pct",
    "HOME TEAM 2PT PCT": "home_team_2pt_pct",
    "HOME TEAM PP100P": "home_team_pp100p",
    "HOME TEAM ORB PCT": "home_team_orb_pct",
    "HOME TEAM DRB PCT": "home_team_drb_pct",
    "HOME TEAM OPP 3PT PCT": "home_team_opp_3pt_pct",
    "HOME TEAM OPP 2PT PCT": "home_team_opp_2pt_pct",
    "HOME TEAM OPP PP100P": "home_team_opp_pp100p",
    "AWAY TEAM DAYS REST": "away_team_days_rest",
    "AWAY TEAM HOME PRIOR": "away_team_home_prior",
    "AWAY TEAM SOS": "away_team_sos",
    "AWAY TEAM SOS LAST 10": "away_team_sos_last_10",
    "AWAY TEAM WIN PCT": "away_team_win_pct",
    "AWAY TEAM WIN PCT LAST 10": "away_team_win_pct_last_10",
    "AWAY TEAM 3PT PCT": "away_team_3pt_pct",
    "AWAY TEAM 2PT PCT": "away_team_2pt_pct",
    "AWAY TEAM PP100P": "away_team_pp100p",
    "AWAY TEAM ORB PCT": "away_team_orb_pct",
    "AWAY TEAM DRB PCT": "away_team_drb_pct",
    "AWAY TEAM OPP 3PT PCT": "away_team_opp_3pt_pct",
    "AWAY TEAM OPP 2PT PCT": "away_team_opp_2pt_pct",
    "AWAY TEAM OPP PP100P": "away_team_opp_pp100p"
}

In [64]:
# Get the first row of test data to send to our local API endpoint!
data_dict = X_test.iloc[0].to_dict()

# Create a new dictionary with the updated keys
new_data = {key_mapping[key]: value for key, value in data_dict.items() if key in key_mapping}

# The server expects the data to be under "games" and it should be a list of games
request_body = {
    "games": [new_data]
}

# Convert the request_body dictionary to a JSON string
json_str = json.dumps(request_body)

# Create the curl request
curl_command = f"curl -X POST -H 'Content-Type: application/json' localhost:8000/predict -d '{json_str}'"
print(curl_command)

curl -X POST -H 'Content-Type: application/json' localhost:8000/predict -d '{"games": [{"home_team_days_rest": 1.0, "home_team_home_prior": 0.0, "home_team_sos": 14.901234567901234, "home_team_sos_last_10": 17.2, "home_team_win_pct": 0.5308641975308642, "home_team_win_pct_last_10": 0.5, "home_team_3pt_pct": 0.3425925925925926, "home_team_2pt_pct": 0.5385365853658537, "home_team_pp100p": 109.57242744879647, "home_team_orb_pct": 0.21107544141252, "home_team_drb_pct": 0.6857142857142857, "home_team_opp_3pt_pct": 0.3672903672903673, "home_team_opp_2pt_pct": 0.5694227769110765, "home_team_opp_pp100p": 109.99381570810142, "away_team_days_rest": 1.0, "away_team_home_prior": 0.0, "away_team_sos": 14.728395061728396, "away_team_sos_last_10": 13.7, "away_team_win_pct": 0.419753086419753, "away_team_win_pct_last_10": 0.5, "away_team_3pt_pct": 0.3458466453674121, "away_team_2pt_pct": 0.5399553571428571, "away_team_pp100p": 108.21244455101306, "away_team_orb_pct": 0.2216815355501487, "away_team_drb

In [66]:
X_test.head()

Unnamed: 0,HOME TEAM DAYS REST,HOME TEAM HOME PRIOR,HOME TEAM SOS,HOME TEAM SOS LAST 10,HOME TEAM WIN PCT,HOME TEAM WIN PCT LAST 10,HOME TEAM 3PT PCT,HOME TEAM 2PT PCT,HOME TEAM PP100P,HOME TEAM ORB PCT,...,AWAY TEAM WIN PCT,AWAY TEAM WIN PCT LAST 10,AWAY TEAM 3PT PCT,AWAY TEAM 2PT PCT,AWAY TEAM PP100P,AWAY TEAM ORB PCT,AWAY TEAM DRB PCT,AWAY TEAM OPP 3PT PCT,AWAY TEAM OPP 2PT PCT,AWAY TEAM OPP PP100P
0,1,0,14.901235,17.2,0.530864,0.5,0.342593,0.538537,109.572427,0.211075,...,0.419753,0.5,0.345847,0.539955,108.212445,0.221682,0.717764,0.350301,0.566289,110.534516
1,1,0,14.777778,14.3,0.481481,0.5,0.362858,0.555081,111.053269,0.191156,...,0.209877,0.1,0.352247,0.516595,107.154384,0.234513,0.670035,0.361954,0.568362,115.0721
2,1,0,15.679012,15.2,0.493827,0.5,0.335133,0.524266,111.080029,0.258483,...,0.716049,0.7,0.368276,0.557308,112.196853,0.234792,0.76056,0.354302,0.513276,108.37322
3,1,1,16.296296,16.3,0.691358,0.7,0.375218,0.567315,114.9269,0.207902,...,0.506173,0.6,0.353204,0.547396,113.005593,0.234015,0.706579,0.35412,0.562514,112.705425
4,1,1,15.308642,18.6,0.555556,0.6,0.37824,0.560154,112.60185,0.188049,...,0.654321,0.5,0.386303,0.550897,114.626792,0.202014,0.730845,0.347762,0.553923,109.655383


In [65]:
y_test

0       13
1       22
2       16
3        6
4      -29
        ..
1069   -12
1070    -4
1071    -8
1072    -7
1073    14
Name: HOME PLUS MINUS, Length: 1074, dtype: int64

# Model 4: Stacked Meta Model Using Linear Regression

For each of our annual models (2018, 2019, 2020, 2021), predict the HOME PLUS MINUS using the values from the 3 base models (RF, NN, XGB). These predictions will generate a training dataset (features are base model predictions for a given game), where the target is the HOME PLUS MINUS (stacked model)



In [17]:
START_YEAR = 2015
num_years_modeled = len(rf_models)
stacked_df = pd.DataFrame()

for i in range(num_years_modeled):
    tmp_df = pd.DataFrame()
    test_year = START_YEAR + 4 + i
    test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

    # Prepare data for inference
    X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
    
    # Target values will be stored for training on the stacked model
    y = test_data['HOME PLUS MINUS']

    # Create your vector of predictions, to be used as features in the stacked model
    y_pred1 = rf_models[i].predict(X)
    y_pred2 = nn_models[i].predict(X)
    y_pred3 = xgb_models[i].predict(X)

    tmp_df['SEASON'], tmp_df['rf'], tmp_df['nn'], tmp_df['xgb'], tmp_df['HOME SPREAD'], tmp_df['HOME PLUS MINUS'] = test_data['SEASON'], y_pred1, y_pred2, y_pred3, test_data['HOME SPREAD'], y
    stacked_df = pd.concat([stacked_df, tmp_df], ignore_index=True)

stacked_df.head()

Unnamed: 0,SEASON,rf,nn,xgb,HOME SPREAD,HOME PLUS MINUS
0,2019,7.692683,7.738207,8.783263,-5.0,-14
1,2019,1.248847,-1.66238,0.387412,0.0,4
2,2019,1.316124,-3.000574,-0.178026,1.5,-20
3,2019,5.996429,10.033599,4.985899,-8.0,-2
4,2019,2.542743,4.219421,3.22963,-5.5,10


FYI: The 'SEASON' column here represents data in the year we are trying to predict. Therefore, the models that generated the rf, nn, and xgb predictions were trained on the 4 years PRIOR to 'SEASON'.

Now, let's build a STACKED model for each season. Start by training a first META MODEL on 2019 data (What was the best linear combination of the 2019 RF / NN / XGB model weights, based on the actual result in 2019 games?)

Then, use this 2019 STACKED META MODEL to make predictions on 2020 NBA games, etc.

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

seasons = sorted(stacked_df['SEASON'].unique())
coefficients_list = []
mse_list = []
mse_list2 = []
predictions_list = []
avg_predictions_list = []
meta_models = []

# Train on one season and test on the next (to see if our stacked model can outperform the base models)
for i in range(len(seasons) - 1):
    train = stacked_df[stacked_df['SEASON'] == seasons[i]]
    test = stacked_df[stacked_df['SEASON'] == seasons[i + 1]]

    X_train = train[['rf', 'nn', 'xgb']]
    y_train = train['HOME PLUS MINUS']

    X_test = test[['rf', 'nn', 'xgb']]
    y_test = test['HOME PLUS MINUS']

    meta_model = LinearRegression()
    meta_model.fit(X_train, y_train)
    
    # Store coefficients
    coefficients = meta_model.coef_
    coefficients_list.append(coefficients)

    # Predict using the meta-model on the test set and store predictions
    y_pred = meta_model.predict(X_test)
    avg_y_pred = (X_test['rf'] + X_test['nn'] + X_test['xgb']) / 3.0
    predictions_list.extend(y_pred.tolist())
    avg_predictions_list.extend(avg_y_pred.tolist())

    # Calculate and store MSE for this season's predictions
    mse = mean_squared_error(y_test, y_pred)
    mse2 = mean_squared_error(y_test, avg_y_pred)
    mse_list.append(mse)
    mse_list2.append(mse2)

    # Store the trained models
    meta_models.append(meta_model)

# DataFrames for coefficients and MSE
coeff_df = pd.DataFrame(coefficients_list, columns=['random_forest_coef', 'neural_network_coef', 'xgboost_coef'])
coeff_df['SEASON'] = seasons[0:3]
mse_df = pd.DataFrame({'SEASON': seasons[1:], 'MSE': mse_list, 'MSE V2': mse_list2})

print('-- Optimal Model Coefficients ---')
display(coeff_df)
print('-- Updated Loss Calculations on Subsequent Season ---')
display(mse_df)


-- Optimal Model Coefficients ---


Unnamed: 0,random_forest_coef,neural_network_coef,xgboost_coef,SEASON
0,0.303533,0.272669,0.512543,2019
1,1.286994,0.624559,-0.468314,2020
2,0.444161,0.579428,0.024735,2021


-- Updated Loss Calculations on Subsequent Season ---


Unnamed: 0,SEASON,MSE,MSE V2
0,2020,194.422252,195.774449
1,2021,212.199944,210.950929
2,2022,166.46931,165.61388


The coefficients for the base models are not particularly consistent, but the mean square error values using the meta model predictions do out-perform all 3 individual base models individually, every year. 

Let's update our win-loss $ if we substitute in these meta model predictions for 2020, 2021, and 2022.

In [19]:
len(predictions_list), len(train_df[train_df['SEASON']==2020]) + len(train_df[train_df['SEASON']==2021]) + len(train_df[train_df['SEASON']==2022])

(3283, 3283)

In [20]:
stacked_df.head()

Unnamed: 0,SEASON,rf,nn,xgb,HOME SPREAD,HOME PLUS MINUS
0,2019,7.692683,7.738207,8.783263,-5.0,-14
1,2019,1.248847,-1.66238,0.387412,0.0,4
2,2019,1.316124,-3.000574,-0.178026,1.5,-20
3,2019,5.996429,10.033599,4.985899,-8.0,-2
4,2019,2.542743,4.219421,3.22963,-5.5,10


In [21]:
START_YEAR = 2020 #First year we can test our meta-model on
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)

for THRESHOLD in [7,8,9,10,11,12]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(meta_models):
        test_year = START_YEAR + i
        test_data = stacked_df[stacked_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)

---------------------------------------
THRESHOLD: 7
Test Year 2020 Summary | Bets made: 54 | Bets won: 28 | Bets lost: 25 | Bets pushed: 1 | Game count: 1029 | Bet win rate: 0.53 | Final bankroll: 20.0
Test Year 2021 Summary | Bets made: 95 | Bets won: 50 | Bets lost: 42 | Bets pushed: 3 | Game count: 1180 | Bet win rate: 0.54 | Final bankroll: 300.0
Test Year 2022 Summary | Bets made: 120 | Bets won: 62 | Bets lost: 55 | Bets pushed: 3 | Game count: 1074 | Bet win rate: 0.53 | Final bankroll: 80.0
---------------------------------------
THRESHOLD: 8
Test Year 2020 Summary | Bets made: 28 | Bets won: 16 | Bets lost: 12 | Bets pushed: 0 | Game count: 1029 | Bet win rate: 0.57 | Final bankroll: 240.0
Test Year 2021 Summary | Bets made: 53 | Bets won: 29 | Bets lost: 24 | Bets pushed: 0 | Game count: 1180 | Bet win rate: 0.55 | Final bankroll: 210.0
Test Year 2022 Summary | Bets made: 79 | Bets won: 42 | Bets lost: 35 | Bets pushed: 2 | Game count: 1074 | Bet win rate: 0.55 | Final bankr

In [22]:
# Build the 2022 META model we will use to make predictions for the 2023 season! (still need to load vectors for inference from 2023 season when it starts, 
# #in order to generate the rf/nn/xgb predictions)

seasons = sorted(stacked_df['SEASON'].unique())
coefficients_list = []
mse_list = []
predictions_list = []
meta_models = []

# Train on one season and test on the next (to see if our stacked model can outperform the base models)
train = stacked_df[stacked_df['SEASON'] <= seasons[-1]]

X_train = train[['rf', 'nn', 'xgb']]
y_train = train['HOME PLUS MINUS']

meta_model = LinearRegression()
meta_model.fit(X_train, y_train)
    
# Store coefficients
coefficients = meta_model.coef_
coefficients_list.append(coefficients)

# Store the trained models. The SEASON value is the year of data to test the model on (not the year of data the model was trained on).
meta_models.append((seasons[-1]+1, meta_model))

# DataFrames for coefficients and MSE
coeff_df = pd.DataFrame(coefficients_list, columns=['random_forest_coef', 'neural_network_coef', 'xgboost_coef'])
coeff_df['SEASON'] = seasons[-1]

print('-- Optimal Model Coefficients ---')
display(coeff_df)
print('-- Updated Loss Calculations on Subsequent Season ---')
print('TBD!')


-- Optimal Model Coefficients ---


Unnamed: 0,random_forest_coef,neural_network_coef,xgboost_coef,SEASON
0,0.571915,0.522629,0.052365,2022


-- Updated Loss Calculations on Subsequent Season ---
TBD!


In [23]:
X_test.head()

Unnamed: 0,rf,nn,xgb
3206,1.588834,4.073304,1.704126
3207,6.802264,12.215064,7.053037
3208,-0.857822,4.021768,-0.930403
3209,5.333563,5.545704,5.755358
3210,1.456784,-0.820238,0.024593


In [25]:
# Save the model to disk
model = xgb_models[-1] # Most recent XGB model
filename = '../xgb_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(model, file)

In [22]:
for col, dtype in X_test.dtypes.items():
    print(f"{col}: {dtype}")

HOME TEAM DAYS REST: int64
HOME TEAM HOME PRIOR: int64
HOME TEAM SOS: float64
HOME TEAM SOS LAST 10: float64
HOME TEAM WIN PCT: float64
HOME TEAM WIN PCT LAST 10: float64
HOME TEAM 3PT PCT: float64
HOME TEAM 2PT PCT: float64
HOME TEAM PP100P: float64
HOME TEAM ORB PCT: float64
HOME TEAM DRB PCT: float64
HOME TEAM OPP 3PT PCT: float64
HOME TEAM OPP 2PT PCT: float64
HOME TEAM OPP PP100P: float64
AWAY TEAM DAYS REST: int64
AWAY TEAM HOME PRIOR: int64
AWAY TEAM SOS: float64
AWAY TEAM SOS LAST 10: float64
AWAY TEAM WIN PCT: float64
AWAY TEAM WIN PCT LAST 10: float64
AWAY TEAM 3PT PCT: float64
AWAY TEAM 2PT PCT: float64
AWAY TEAM PP100P: float64
AWAY TEAM ORB PCT: float64
AWAY TEAM DRB PCT: float64
AWAY TEAM OPP 3PT PCT: float64
AWAY TEAM OPP 2PT PCT: float64
AWAY TEAM OPP PP100P: float64
