In [1]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer, mean_squared_error

warnings.filterwarnings("ignore")

In [3]:
# Load training data
train_df = pd.read_csv('../../generated_datasets/train_df.csv')

# Limit training set to before 2023
X_train = train_df[train_df['DATE']<'2023-01-01']
y_train = train_df[train_df['DATE']<'2023-01-01']['HOME PLUS MINUS']

X_test = train_df[train_df['DATE']>='2023-01-01']
y_test = train_df[train_df['DATE']>='2023-01-01']['HOME PLUS MINUS']

# Drop unneccesary features and the target variable (HOME PLUS MINUS)
X_train = X_train.drop(['HOME PLUS MINUS', 'SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP'], axis=1)
X_test = X_test.drop(['HOME PLUS MINUS', 'SEASON', 'DATE', 'HOME', 'AWAY', 'HOME_PRIOR_GAME_DATE', 'AWAY_PRIOR_GAME_DATE', 'HOME TEAM GP', 'AWAY TEAM GP'], axis=1)

X_train.head()

Unnamed: 0,HOME SPREAD,HOME TEAM DAYS REST,HOME TEAM HOME PRIOR,HOME TEAM SOS,HOME TEAM SOS LAST 10,HOME TEAM WIN PCT,HOME TEAM WIN PCT LAST 10,HOME TEAM 3PT PCT,HOME TEAM 2PT PCT,HOME TEAM PP100P,...,AWAY TEAM WIN PCT,AWAY TEAM WIN PCT LAST 10,AWAY TEAM 3PT PCT,AWAY TEAM 2PT PCT,AWAY TEAM PP100P,AWAY TEAM ORB PCT,AWAY TEAM DRB PCT,AWAY TEAM OPP 3PT PCT,AWAY TEAM OPP 2PT PCT,AWAY TEAM OPP PP100P
685,1.5,0,1,14.342857,16.2,0.457143,0.5,0.370297,0.541829,110.121346,...,0.611111,0.6,0.367908,0.54864,111.271316,0.21764,0.723443,0.365582,0.521225,106.159822
686,2.5,1,0,14.2,14.8,0.428571,0.4,0.346712,0.525734,108.373462,...,0.588235,0.8,0.381333,0.541032,110.523256,0.183908,0.72167,0.333035,0.546171,106.493506
687,3.5,1,0,11.971429,13.9,0.285714,0.3,0.331691,0.518637,105.88397,...,0.5,0.5,0.3392,0.53125,111.000538,0.264822,0.688901,0.342688,0.537716,109.409544
688,7.0,1,1,14.971429,16.5,0.342857,0.5,0.345614,0.537894,106.379498,...,0.555556,0.7,0.359521,0.577793,111.5374,0.176581,0.702114,0.355932,0.549502,109.92642
689,-4.5,0,0,14.184211,12.9,0.5,0.4,0.365918,0.557135,112.203732,...,0.5,0.6,0.341705,0.533037,107.749586,0.199297,0.681873,0.367456,0.561993,107.973103


In [4]:
# Create a Random Forest Regressor object
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model using the training sets
rf.fit(X_train, y_train)

# Use the model to make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 163.51200218978101


In [5]:
# Create a Random Forest Regressor object
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    # 'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define a scorer
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Define the grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring=scorer, n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print(f"Best parameters: {best_params}")

# Fit the model with the best parameters
rf_best = RandomForestRegressor(**best_params, random_state=42)
rf_best.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100}


In [6]:
# Get the best parameters
best_params = grid_search.best_params_

# Set the number of estimators higher 
best_params['n_estimators'] = 300

print(f"Best parameters: {best_params}")

# Fit the model with the best parameters
rf_best = RandomForestRegressor(**best_params, random_state=42)
rf_best.fit(X_train, y_train)

# Use the model to make predictions
y_pred = rf_best.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Best parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 300}
Mean Squared Error: 159.5558709267445


In [7]:
# Establishing a baseline: comparing our initial model against simply using the home spread value (taken from the betting sites) 
mean_squared_error(list(X_test['HOME SPREAD']), -y_test)

154.57007299270072

154 < 159. This outcome suggests that simply using the betting line generates slightly better predictions than our model, which uses the betting line along with additional custom features. Let's now see how important the various input features were in making our decision trees.

In [8]:
# Get importance
importances = rf_best.feature_importances_

# Summarize feature importances

# To sort and visualize the features by their importance
feature_importances = pd.DataFrame(rf_best.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

feature_importances

Unnamed: 0,importance
HOME SPREAD,0.815416
HOME TEAM ORB PCT,0.019249
HOME TEAM SOS LAST 10,0.011784
AWAY TEAM SOS LAST 10,0.011183
AWAY TEAM 3PT PCT,0.011089
HOME TEAM OPP 3PT PCT,0.010115
HOME TEAM DRB PCT,0.009116
AWAY TEAM OPP 3PT PCT,0.008845
HOME TEAM OPP PP100P,0.008216
HOME TEAM WIN PCT,0.00802


Unsurprisingly, the HOME SPREAD feature (from the betting sites) is by far the most important in our model's predictions. We need to find a way to beat the spread performance, so future models will not incorporate this feature!