In [1]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer, mean_squared_error


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import Huber
import tensorflow as tf 
import matplotlib.pyplot as plt


warnings.filterwarnings("ignore")

2023-10-01 23:39:31.991257: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Time Series Cross Validation Strategy
1. Because of the potential for underlying changes in the data generation process (in this case, changes in how teams strategize/play in the NBA), we should avoid using traditional k-fold cross validation and instead use a time series cross validation approach. This will also help us avoid any unintended data leakage.

2. Identify optimal hyper-parameters.
    
    a. Use "blocked" rolling train/test splits (eg. train on 2015, test on random 25-50% of 2016, etc.) and average the MSE across the years to find the best parameters. 

3. After identifying optimal hyper-parameters, train on a 4-year rolling window basis (ie. train on 2015-2018, test on 2019). Repeat this several times so that you get a better sense of what your profit/loss will look like by seeing how you would have done in 2019, 2020, 2021, 2022, etc.

#### Profit / Loss Simulation
1. Use rolling 4 year training period (eg. 2015-2018) and test P/L on following year (eg. 2019) at various point spread differentials (predicted spread vs. actual spread > X, for a series of X values)

# Model 1: Random Forest

The first thing we want to do is figure out which hyper-parameters work best for each model we are going to build. We figure this out by training on 1 year of data and then testing on a portion of the following year. By repeating this many times for different hyper-parameter combinations and taking the average MSE, we can figure out the best options. Then, we will use those config parameters when we train the actual models (using a rolling 4 years of data and testing on the entire subsequent year).

In [25]:
train_df = pd.read_csv('train_df.csv')

# Hyper-Parameter Tuning (RANDOM FOREST)
def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)):  # For seasons 2015 to 2021 (2022 not included because we don't have 2023 season data to validate on yet)
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        # Randomly sample 50% of the test indices
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        yield train_indices, test_indices

# Create a Random Forest Regressor object
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [200, 400, 800],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    # 'min_samples_split': ['auto',5,10,20],
    # 'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Features/Predictors
X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
# Target
y = train_df['HOME PLUS MINUS']

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=-1)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")

Average Loss (MSE): 186.6484 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}
Average Loss (MSE): 185.6764 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 400}
Average Loss (MSE): 185.4488 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 800}
Average Loss (MSE): 186.3076 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
Average Loss (MSE): 185.4606 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'n_estimators': 400}
Average Loss (MSE): 185.0521 | Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'n_estimators': 800}
Average Loss (MSE): 185.0478 | Parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 200}
Average Loss (MSE): 184.6598 | Parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimato

In [28]:
# Now that we have identified the optimal hyper-parameters, let's build a series of 4-year training/1 year validation models to track MSE (and $ won or lost) over multiple years

def train_and_validate_models(df, best_params, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive) 
    # So, Model 1 will train using 2015 - 2018 and test on 2019. Model 2 will train on 2016 - 2019 and test on 2020, etc.
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Train the model using best hyperparameters identified previously
        model = RandomForestRegressor(**best_params)
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores

# Call the function
rf_models, rf_scores = train_and_validate_models(train_df, best_params)


Trained for seasons 2015-2018, MSE on season 2019: 173.6428
Trained for seasons 2016-2019, MSE on season 2020: 201.0587
Trained for seasons 2017-2020, MSE on season 2021: 214.2709
Trained for seasons 2018-2021, MSE on season 2022: 169.4954


In [29]:
# Get feature importances for a given model (i = 0 is 2015-2018, i = 1 is 2016-2019 etc.)
i = 0
importances = rf_models[i].feature_importances_

# Sort and display the features by their importance
feature_importances = pd.DataFrame(rf_models[i].feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
HOME TEAM WIN PCT,0.145571
AWAY TEAM WIN PCT,0.107585
HOME TEAM PP100P,0.096231
AWAY TEAM PP100P,0.067626
AWAY TEAM 2PT PCT,0.057603
HOME TEAM WIN PCT LAST 10,0.057143
HOME TEAM OPP PP100P,0.053099
HOME TEAM SOS,0.04369
HOME TEAM 2PT PCT,0.04334
AWAY TEAM DRB PCT,0.035919


In [30]:
START_YEAR = min(train_df.SEASON) # For our data, the first year should be 2015
# Bet amount each game
BET_AMOUNT = 100
# Average betting site take rate
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
for THRESHOLD in [6,8,10,12,14]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(rf_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)



---------------------------------------
THRESHOLD: 6
Test Year 2019 Summary | Bets made: 104 | Bets won: 46 | Bets lost: 57 | Bets pushed: 1 | Game count: 997 | Bet win rate: 0.45 | Final bankroll: -1560.0
Test Year 2020 Summary | Bets made: 164 | Bets won: 73 | Bets lost: 88 | Bets pushed: 3 | Game count: 1029 | Bet win rate: 0.45 | Final bankroll: -2230.0
Test Year 2021 Summary | Bets made: 123 | Bets won: 55 | Bets lost: 66 | Bets pushed: 2 | Game count: 1180 | Bet win rate: 0.45 | Final bankroll: -1650.0
Test Year 2022 Summary | Bets made: 190 | Bets won: 91 | Bets lost: 95 | Bets pushed: 4 | Game count: 1074 | Bet win rate: 0.49 | Final bankroll: -1310.0
---------------------------------------
THRESHOLD: 8
Test Year 2019 Summary | Bets made: 32 | Bets won: 19 | Bets lost: 13 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.59 | Final bankroll: 410.0
Test Year 2020 Summary | Bets made: 45 | Bets won: 19 | Bets lost: 25 | Bets pushed: 1 | Game count: 1029 | Bet win rate: 0.43 | 

Result interpretation: the maximum return is generated when we set our THRESHOLD value to 8 (meaning our model differs from the projected line by 8 or more points). Our bet accuracy tends to increase with this THRESHOLD value, but the number of bets that the algorithm makes continues to decrease. 

Let's try out some other models.

# Model 2: Neural Network

Again, the first step will be to identify the optimal hyper-parameters for the network. Then we will build our models and simulate the profit and loss.

In [24]:
# import numpy as np
# import pandas as pd
# from sklearn.pipeline import Pipeline
# from sklearn.neural_network import MLPRegressor
# from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# train_df = pd.read_csv('train_df.csv')

# def custom_time_series_splits(df, season_col='SEASON'):
#     for season in range(min(train_df.SEASON), max(train_df.SEASON)): 
#         train_indices = df[df[season_col] == season].index.values
#         test_indices = df[df[season_col] == season + 1].index.values
#         # Randomly sample 50% of the test indices
#         test_indices = np.random.choice(test_indices, size=int(0.50 * len(test_indices)), replace=False)
#         yield train_indices, test_indices

# X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
# y = train_df['HOME PLUS MINUS']

# # Scale the features because MLP is sensitive to feature scaling
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# # Create an MLPRegressor object
# MAX_ITER = 5000
# mlp = MLPRegressor(random_state=42, max_iter=MAX_ITER)  # Increased max_iter and increased tolerance to try to ensure convergence

# # Define the parameter grid for MLP
# param_grid = {
#     'hidden_layer_sizes': [(50), (50, 50), (100), (100,100), (200), (200,200)],  # Test different neural net architectures
#     'activation': ['tanh','relu'],
#     'solver': ['adam'],
#     'alpha': [0.01, 0.1, 0.25],  # Regularization term
#     'learning_rate_init': [0.00001, 0.0001, 0.001],
#     'learning_rate': ['adaptive']
# }

# # Define a scorer (using MSE)
# scorer = make_scorer(mean_squared_error, greater_is_better=False)

# # Use the generator for custom CV splits
# cv_splits = list(custom_time_series_splits(train_df))

# grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=-1)
# grid_search.fit(X, y)

# # Obtain the cross-validation results
# cv_results = grid_search.cv_results_

# # Print the average validation score (MSE) for each parameter combination
# for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
#     print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

# best_params = grid_search.best_params_
# print(f"\nBest parameters: {best_params}")


In [31]:
# NN attempt 2 with different scaling mechanisms

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

train_df = pd.read_csv('train_df.csv')

def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)): 
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        # Randomly sample 50% of the test indices
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        yield train_indices, test_indices

X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
y = train_df['HOME PLUS MINUS']

# Scale the features because MLP is sensitive to feature scaling. We want to try a few different scaling options in our grid search.
# First, Define a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Placeholder, will be set by GridSearchCV
    ('mlp', MLPRegressor(max_iter=5000, random_state=42, tol=0.1))
])

# Define the parameter grid for MLP
param_grid = {
    'scaler': [StandardScaler(), RobustScaler(), PowerTransformer()],
    'mlp__hidden_layer_sizes': [(50, 50), (50, 100), (100,100), (100,200)],  # Test different neural net architectures
    'mlp__activation': ['relu','tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.1, 1, 10],  # Regularization term
    'mlp__learning_rate_init': [0.00001, 0.0001, 0.001],
    'mlp__learning_rate': ['adaptive', 'constant']
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=-1)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")


Average Loss (MSE): 210.4478 | Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 1e-05, 'mlp__solver': 'adam', 'scaler': StandardScaler()}
Average Loss (MSE): 213.1875 | Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 1e-05, 'mlp__solver': 'adam', 'scaler': RobustScaler()}
Average Loss (MSE): 210.4714 | Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 1e-05, 'mlp__solver': 'adam', 'scaler': PowerTransformer()}
Average Loss (MSE): 184.0928 | Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 0.0001, 'mlp__solver': 'adam', 'scaler': StandardScaler()}
Average Loss (M

In [33]:
# What do the individual year MSE values look like? How much variance is there?
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")

Individual test scores for the best parameters: [171.2518999480216, 166.30640118246976, 176.70143688281576, 167.1062586304809, 200.15068006639126, 216.3076761772237, 169.82776654548795]
Average MSE: 181.09315991898444


In [34]:
from sklearn.pipeline import Pipeline

def train_and_validate_models(df, best_params, best_scaler, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive)
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Setup a pipeline using best scaler and model parameters
        model = Pipeline([
            ('scaler', best_scaler),
            ('mlp', MLPRegressor(**best_params, random_state=42))
        ])

        # Train the model using best hyperparameters identified previously
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores

# Extract the best parameters and scaler from GridSearchCV results
best_params = {k.replace('mlp__', ''): v for k, v in grid_search.best_params_.items() if 'mlp__' in k}
best_scaler = grid_search.best_params_['scaler']

# Call the function with the best parameters and the best scaler
nn_models, nn_scores = train_and_validate_models(train_df, best_params, best_scaler)


Trained for seasons 2015-2018, MSE on season 2019: 172.9143
Trained for seasons 2016-2019, MSE on season 2020: 194.9238
Trained for seasons 2017-2020, MSE on season 2021: 213.1592
Trained for seasons 2018-2021, MSE on season 2022: 168.0938


In [35]:
START_YEAR = 2015
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
# THRESHOLD = 4

for THRESHOLD in [5,6,7,8,9,10]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(nn_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)



---------------------------------------
THRESHOLD: 5
Test Year 2019 Summary | Bets made: 177 | Bets won: 88 | Bets lost: 86 | Bets pushed: 3 | Game count: 997 | Bet win rate: 0.51 | Final bankroll: -680.0
Test Year 2020 Summary | Bets made: 240 | Bets won: 121 | Bets lost: 116 | Bets pushed: 3 | Game count: 1029 | Bet win rate: 0.51 | Final bankroll: -710.0
Test Year 2021 Summary | Bets made: 277 | Bets won: 137 | Bets lost: 136 | Bets pushed: 4 | Game count: 1180 | Bet win rate: 0.5 | Final bankroll: -1270.0
Test Year 2022 Summary | Bets made: 301 | Bets won: 143 | Bets lost: 153 | Bets pushed: 5 | Game count: 1074 | Bet win rate: 0.48 | Final bankroll: -2430.0
---------------------------------------
THRESHOLD: 6
Test Year 2019 Summary | Bets made: 115 | Bets won: 55 | Bets lost: 59 | Bets pushed: 1 | Game count: 997 | Bet win rate: 0.48 | Final bankroll: -950.0
Test Year 2020 Summary | Bets made: 160 | Bets won: 81 | Bets lost: 78 | Bets pushed: 1 | Game count: 1029 | Bet win rate: 0

# Model 3: XGBoost

In [36]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

train_df = pd.read_csv('train_df.csv')

def custom_time_series_splits(df, season_col='SEASON'):
    for season in range(min(train_df.SEASON), max(train_df.SEASON)):  # For seasons 2015 to 2021
        train_indices = df[df[season_col] == season].index.values
        test_indices = df[df[season_col] == season + 1].index.values
        # Randomly sample 50% of the test indices
        test_indices = np.random.choice(test_indices, size=int(1 * len(test_indices)), replace=False)
        
        yield train_indices, test_indices

X = train_df.drop(['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'], axis=1)
y = train_df['HOME PLUS MINUS']

# Create an XGBoost regressor object
xgb_regressor = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Define a scorer (using MSE)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Use the generator for custom CV splits
cv_splits = list(custom_time_series_splits(train_df))

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=cv_splits, scoring=scorer, n_jobs=3)
grid_search.fit(X, y)

# Obtain the cross-validation results
cv_results = grid_search.cv_results_

# Print the average validation score (MSE) for each parameter combination
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(f"Average Loss (MSE): {-mean_score:.4f} | Parameters: {params}")

best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")


Average Loss (MSE): 189.2777 | Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}
Average Loss (MSE): 190.3559 | Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1}
Average Loss (MSE): 189.2690 | Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1.5, 'subsample': 0.8}
Average Loss (MSE): 190.3090 | Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1.5, 'subsample': 1}
Average Loss (MSE): 189.2944 | Parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 2, 'subsample': 0.8}
Average Loss (MSE): 190.4122 | Parameter

In [37]:
# What do the individual year MSE values look like? How much variance is there?
best_index = grid_search.best_index_

# Extract individual test scores for the best parameters
test_scores = []
for i in range(len(cv_splits)):  
    split_score_key = f"split{i}_test_score"
    test_scores.append(np.abs(grid_search.cv_results_[split_score_key][best_index]))

print(f"Individual test scores for the best parameters: {test_scores}")
print(f"Average MSE: {np.abs(np.mean(test_scores))}")

# best was -193

Individual test scores for the best parameters: [175.7677567191533, 170.14983697940517, 179.57394853183948, 173.84372050821688, 208.72424331659263, 224.00361434710646, 182.18299647101057]
Average MSE: 187.74944526761777


In [39]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def train_and_validate_models(df, best_params, season_col='SEASON'):
    trained_models = []
    validation_scores = []

    # Define the range of starting seasons for training data (2015 to 2018 inclusive)
    for start_season in range(2015, 2019):
        # Split the data based on the seasons
        train_data = df[(df[season_col] >= start_season) & (df[season_col] <= start_season + 3)]
        test_data = df[df[season_col] == start_season + 4]

        # Extract features and target variable
        X_train = train_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_train = train_data['HOME PLUS MINUS']

        X_test = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
        y_test = test_data['HOME PLUS MINUS']

        # Train the model using best hyperparameters identified previously
        model = xgb.XGBRegressor(**best_params, random_state=42)
        model.fit(X_train, y_train)

        # Validate the model on the test set
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        # Store the trained model and validation score
        trained_models.append(model)
        validation_scores.append(mse)

        print(f"Trained for seasons {start_season}-{start_season+3}, MSE on season {start_season+4}: {mse:.4f}")

    return trained_models, validation_scores

# Call the function
xgb_models, xgb_scores = train_and_validate_models(train_df, best_params)


Trained for seasons 2015-2018, MSE on season 2019: 170.6674
Trained for seasons 2016-2019, MSE on season 2020: 199.2154
Trained for seasons 2017-2020, MSE on season 2021: 213.6061
Trained for seasons 2018-2021, MSE on season 2022: 168.4851


In [40]:
START_YEAR = 2015
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)
# THRESHOLD = 4

for THRESHOLD in [0,5,10,15]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(xgb_models):
        test_year = START_YEAR + 4 + i
        #print('Test Year:', test_year)
        test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)

---------------------------------------
THRESHOLD: 0
Test Year 2019 Summary | Bets made: 997 | Bets won: 490 | Bets lost: 489 | Bets pushed: 18 | Game count: 997 | Bet win rate: 0.5 | Final bankroll: -4800.0
Test Year 2020 Summary | Bets made: 1029 | Bets won: 512 | Bets lost: 504 | Bets pushed: 13 | Game count: 1029 | Bet win rate: 0.5 | Final bankroll: -4320.0
Test Year 2021 Summary | Bets made: 1180 | Bets won: 538 | Bets lost: 625 | Bets pushed: 17 | Game count: 1180 | Bet win rate: 0.46 | Final bankroll: -14080.0
Test Year 2022 Summary | Bets made: 1074 | Bets won: 509 | Bets lost: 555 | Bets pushed: 10 | Game count: 1074 | Bet win rate: 0.48 | Final bankroll: -9690.0
---------------------------------------
THRESHOLD: 5
Test Year 2019 Summary | Bets made: 134 | Bets won: 67 | Bets lost: 67 | Bets pushed: 0 | Game count: 997 | Bet win rate: 0.5 | Final bankroll: -670.0
Test Year 2020 Summary | Bets made: 228 | Bets won: 113 | Bets lost: 112 | Bets pushed: 3 | Game count: 1029 | Bet

# Model 4: Stacked Meta Model Using Linear Regression

For each of our annual models (2018, 2019, 2020, 2021), predict the HOME PLUS MINUS using the values from the 3 base models (RF, NN, XGB). These predictions will generate a training dataset (features are base model predictions for a given game), where the target is the HOME PLUS MINUS (stacked model)



In [41]:
START_YEAR = 2015
num_years_modeled = len(rf_models)
stacked_df = pd.DataFrame()

for i in range(num_years_modeled):
    tmp_df = pd.DataFrame()
    test_year = START_YEAR + 4 + i
    test_data = train_df[train_df['SEASON'] == test_year].reset_index(drop=True)

    # Prepare data for inference
    X = test_data.drop(columns=['SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP', 'HOME SPREAD', 'HOME PLUS MINUS'])
    
    # Target values will be stored for training on the stacked model
    y = test_data['HOME PLUS MINUS']

    # Create your vector of predictions, to be used as features in the stacked model
    y_pred1 = rf_models[i].predict(X)
    y_pred2 = nn_models[i].predict(X)
    y_pred3 = xgb_models[i].predict(X)

    tmp_df['SEASON'], tmp_df['rf'], tmp_df['nn'], tmp_df['xgb'], tmp_df['HOME SPREAD'], tmp_df['HOME PLUS MINUS'] = test_data['SEASON'], y_pred1, y_pred2, y_pred3, test_data['HOME SPREAD'], y
    stacked_df = pd.concat([stacked_df, tmp_df], ignore_index=True)

stacked_df.head()

Unnamed: 0,SEASON,rf,nn,xgb,HOME SPREAD,HOME PLUS MINUS
0,2019,7.737118,7.738207,8.783263,-5.0,-14
1,2019,0.987922,-1.66238,0.387412,0.0,4
2,2019,0.987425,-3.000574,-0.178026,1.5,-20
3,2019,5.702579,10.033599,4.985899,-8.0,-2
4,2019,2.993093,4.219421,3.22963,-5.5,10


FYI: The 'SEASON' column here represents data in the year we are trying to predict. Therefore, the models that generated the rf, nn, and xgb predictions were trained on the 4 years PRIOR to 'SEASON'.

Now, let's build a STACKED model for each season. Start by training a first META MODEL on 2019 data (What was the best linear combination of the 2019 RF / NN / XGB model weights, based on the actual result in 2019 games?)

Then, use this 2019 STACKED META MODEL to make predictions on 2020 NBA games, etc.

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

seasons = sorted(stacked_df['SEASON'].unique())
coefficients_list = []
mse_list = []
mse_list2 = []
predictions_list = []
avg_predictions_list = []
meta_models = []

# Train on one season and test on the next (to see if our stacked model can outperform the base models)
for i in range(len(seasons) - 1):
    train = stacked_df[stacked_df['SEASON'] == seasons[i]]
    test = stacked_df[stacked_df['SEASON'] == seasons[i + 1]]

    X_train = train[['rf', 'nn', 'xgb']]
    y_train = train['HOME PLUS MINUS']

    X_test = test[['rf', 'nn', 'xgb']]
    y_test = test['HOME PLUS MINUS']

    meta_model = LinearRegression()
    meta_model.fit(X_train, y_train)
    
    # Store coefficients
    coefficients = meta_model.coef_
    coefficients_list.append(coefficients)

    # Predict using the meta-model on the test set and store predictions
    y_pred = meta_model.predict(X_test)
    avg_y_pred = (X_test['rf'] + X_test['nn'] + X_test['xgb']) / 3.0
    predictions_list.extend(y_pred.tolist())
    avg_predictions_list.extend(avg_y_pred.tolist())

    # Calculate and store MSE for this season's predictions
    mse = mean_squared_error(y_test, y_pred)
    mse2 = mean_squared_error(y_test, avg_y_pred)
    mse_list.append(mse)
    mse_list2.append(mse2)

    # Store the trained models
    meta_models.append(meta_model)

# DataFrames for coefficients and MSE
coeff_df = pd.DataFrame(coefficients_list, columns=['random_forest_coef', 'neural_network_coef', 'xgboost_coef'])
coeff_df['SEASON'] = seasons[0:3]
mse_df = pd.DataFrame({'SEASON': seasons[1:], 'MSE': mse_list, 'MSE V2': mse_list2})

print('-- Optimal Model Coefficients ---')
display(coeff_df)
print('-- Updated Loss Calculations on Subsequent Season ---')
display(mse_df)


-- Optimal Model Coefficients ---


Unnamed: 0,random_forest_coef,neural_network_coef,xgboost_coef,SEASON
0,0.411091,0.270739,0.43499,2019
1,1.082348,0.648043,-0.315557,2020
2,0.469297,0.576307,0.008662,2021


-- Updated Loss Calculations on Subsequent Season ---


Unnamed: 0,SEASON,MSE,MSE V2
0,2020,194.360633,195.956411
1,2021,212.116781,210.920808
2,2022,166.31941,165.64039


The coefficients for the base models are not particularly consistent, but the mean square error values using the meta model predictions do out-perform all 3 individual base models individually, every year. 

Let's update our win-loss $ if we substitute in these meta model predictions for 2020, 2021, and 2022.

In [43]:
len(predictions_list), len(train_df[train_df['SEASON']==2020]) + len(train_df[train_df['SEASON']==2021]) + len(train_df[train_df['SEASON']==2022])

(3283, 3283)

In [44]:
stacked_df.head()

Unnamed: 0,SEASON,rf,nn,xgb,HOME SPREAD,HOME PLUS MINUS
0,2019,7.737118,7.738207,8.783263,-5.0,-14
1,2019,0.987922,-1.66238,0.387412,0.0,4
2,2019,0.987425,-3.000574,-0.178026,1.5,-20
3,2019,5.702579,10.033599,4.985899,-8.0,-2
4,2019,2.993093,4.219421,3.22963,-5.5,10


In [51]:
START_YEAR = 2020 #First year we can test our meta-model on
BET_AMOUNT = 100
TAKE_RATE = 0.1

# Set your margin for level of confidence (higher value means the model needs to be more confident in order to place a bet)

for THRESHOLD in [7,8,9,10,11,12]:
    print('---------------------------------------')
    print('THRESHOLD:', THRESHOLD)


    # Loop through all models constructed and create a dataframe with the bet decision and the bet outcome
    for i, model in enumerate(meta_models):
        test_year = START_YEAR + i
        test_data = stacked_df[stacked_df['SEASON'] == test_year].reset_index(drop=True)

        # Prepare data for inference
        X = test_data.drop(columns=['SEASON', 'HOME SPREAD', 'HOME PLUS MINUS'])

        # Create your vector of predictions
        y_pred = model.predict(X)

        decisions = []
        outcomes = []
        bets_made = 0 
        bets_won = 0
        bets_lost = 0
        bets_push = 0
        games = len(test_data)
        profit = 0
        net_bets_made = 0

        for index, row in test_data.iterrows():

            bet = None
            home_favored_bet_home = False 
            home_favored_bet_away = False
            away_favored_bet_home = False
            away_favored_bet_away = False
            no_favorite_bet_home = False
            no_favorite_bet_away = False 
            
            # Bet when home is favored AND you think home team will outperform the spread by THRESHOLD
            home_favored_bet_home = row['HOME SPREAD'] < 0 and (y_pred[index] - THRESHOLD > abs(row['HOME SPREAD']))

            # Bet when home is favored BUT you think the away team will outperform the spread by THRESHOLD
            home_favored_bet_away = row['HOME SPREAD'] < 0 and ((y_pred[index] < 0 and (abs(row['HOME SPREAD'] + y_pred[index]) > THRESHOLD)) or (y_pred[index] >= 0 and y_pred[index] + THRESHOLD < abs(row['HOME SPREAD'])))

            # Bet when away is favored BUT you think the home team will outperform the spread (you think home team wins outright, by at least a particular relative margin OR you think the away team wins, but not by as much as the market thinks)
            away_favored_bet_home = row['HOME SPREAD'] > 0 and (((y_pred[index] > 0 and y_pred[index] + row['HOME SPREAD'] > THRESHOLD)) or ((y_pred[index] <= 0) and abs(y_pred[index]) + THRESHOLD < row['HOME SPREAD']))

            # Bet when away is favored AND you think away team will outperform the spread by THRESHOLD
            away_favored_bet_away = row['HOME SPREAD'] > 0 and y_pred[index] < 0 and (abs(y_pred[index]) - THRESHOLD > row['HOME SPREAD'])

            # Bet when there is no favorite, but you think HOME will win by more than THRESHOLD
            no_favorite_bet_home = row['HOME SPREAD'] == 0 and (y_pred[index] > THRESHOLD)

            # Bet when there is no favorite, but you think AWAY will win by more than THRESHOLD
            no_favorite_bet_away = row['HOME SPREAD'] == 0 and (abs(y_pred[index]) > THRESHOLD)


            # Bet home or away
            if home_favored_bet_home or away_favored_bet_home or no_favorite_bet_home:
                bet = 'Bet on Home'
            elif home_favored_bet_away or away_favored_bet_away or no_favorite_bet_away:
                bet = 'Bet on Away'
            else: 
                bet = 'No Bet'

            # Outcomes
            if bet == 'Bet on Home':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']:  # HOME COVERED THE SPREAD
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
                else:
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                    
            elif bet == 'Bet on Away':
                bets_made += 1
                if row['HOME PLUS MINUS'] == -row['HOME SPREAD']:   # HOME MATCHED SPREAD
                    outcome = 'Push'
                    bets_push += 1
                elif row['HOME PLUS MINUS'] > -row['HOME SPREAD']: # HOME COVERED THE SPREAD
                    outcome = 'Lost'
                    bets_lost += 1
                    net_bets_made += 1
                    profit -= BET_AMOUNT
                else: 
                    outcome = 'Won'
                    bets_won += 1
                    net_bets_made += 1
                    profit += (1-TAKE_RATE)*BET_AMOUNT
            else: 
                outcome = 'No Bet Placed'

            decisions.append(bet)
            outcomes.append(outcome)        
                        
            #print('Market home plus minus:', -row['HOME SPREAD'], '| Predicted home plus minus:', round(y_pred[index],1), '| Bet:', bet, ' | Actual home plus minus:', row['HOME PLUS MINUS'], '| Bet outcome:', outcome)

        if net_bets_made == 0:
            win_rate = 0
        else:
            win_rate = round(bets_won / net_bets_made, 2)

        print('Test Year {} Summary |'.format(test_year), 'Bets made:', bets_made, '| Bets won:', bets_won, '| Bets lost:', bets_lost, '| Bets pushed:', bets_push, '| Game count:', games, '| Bet win rate:', win_rate, '| Final bankroll:', profit)

---------------------------------------
THRESHOLD: 7
Test Year 2020 Summary | Bets made: 53 | Bets won: 28 | Bets lost: 24 | Bets pushed: 1 | Game count: 1029 | Bet win rate: 0.54 | Final bankroll: 120.0
Test Year 2021 Summary | Bets made: 103 | Bets won: 55 | Bets lost: 45 | Bets pushed: 3 | Game count: 1180 | Bet win rate: 0.55 | Final bankroll: 450.0
Test Year 2022 Summary | Bets made: 119 | Bets won: 61 | Bets lost: 55 | Bets pushed: 3 | Game count: 1074 | Bet win rate: 0.53 | Final bankroll: -10.0
---------------------------------------
THRESHOLD: 8
Test Year 2020 Summary | Bets made: 28 | Bets won: 16 | Bets lost: 12 | Bets pushed: 0 | Game count: 1029 | Bet win rate: 0.57 | Final bankroll: 240.0
Test Year 2021 Summary | Bets made: 55 | Bets won: 30 | Bets lost: 25 | Bets pushed: 0 | Game count: 1180 | Bet win rate: 0.55 | Final bankroll: 200.0
Test Year 2022 Summary | Bets made: 76 | Bets won: 41 | Bets lost: 33 | Bets pushed: 2 | Game count: 1074 | Bet win rate: 0.55 | Final ba

In [47]:
# Build the 2022 META model we will use to make predictions for the 2023 season! (still need to load vectors for inference from 2023 season when it starts, 
# #in order to generate the rf/nn/xgb predictions)

seasons = sorted(stacked_df['SEASON'].unique())
coefficients_list = []
mse_list = []
predictions_list = []
meta_models = []

# Train on one season and test on the next (to see if our stacked model can outperform the base models)
train = stacked_df[stacked_df['SEASON'] <= seasons[-1]]

X_train = train[['rf', 'nn', 'xgb']]
y_train = train['HOME PLUS MINUS']

meta_model = LinearRegression()
meta_model.fit(X_train, y_train)
    
# Store coefficients
coefficients = meta_model.coef_
coefficients_list.append(coefficients)

# Store the trained models. The SEASON value is the year of data to test the model on (not the year of data the model was trained on).
meta_models.append((seasons[-1]+1, meta_model))

# DataFrames for coefficients and MSE
coeff_df = pd.DataFrame(coefficients_list, columns=['random_forest_coef', 'neural_network_coef', 'xgboost_coef'])
coeff_df['SEASON'] = seasons[-1]

print('-- Optimal Model Coefficients ---')
display(coeff_df)
print('-- Updated Loss Calculations on Subsequent Season ---')
print('TBD!')


-- Optimal Model Coefficients ---


Unnamed: 0,random_forest_coef,neural_network_coef,xgboost_coef,SEASON
0,0.597334,0.530263,0.024527,2022


-- Updated Loss Calculations on Subsequent Season ---
TBD!
