# Finalizing a Prediction Strategy

In previous notebooks, we explored different data preprocessing techniques, feature engineering, and testing various sklearn models for predicting NBA games.

Now, we will attempt to finalize a prediction strategy using the best performing models and other techniques.

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

## Read the dataset

In [2]:
path_to_data = "../../data/processed/processed_team_dataset.csv"

df = pd.read_csv(path_to_data, index_col=0)
df

Unnamed: 0,date,season,team,team_opp,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,2015-10-29,2016,NYK,ATL,1356.009483,1541.442237,0.379472,-3.051170,1,0,...,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0,106.0
1,2015-10-29,2016,ATL,NYK,1541.442237,1356.009483,0.620528,3.051170,0,1,...,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0,97.0
2,2015-10-29,2016,LAC,DAL,1652.435518,1559.753058,0.751974,6.881516,1,0,...,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0,95.0
3,2015-10-29,2016,IND,MEM,1499.317744,1557.619147,0.559723,1.489236,1,0,...,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0,106.0
4,2015-10-29,2016,MEM,IND,1557.619147,1499.317744,0.440277,-1.489236,0,1,...,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,2024-02-15,2024,MIL,MEM,1539.547927,1410.179450,0.542164,1.048874,0,1,...,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0,114.4
21752,2024-02-15,2024,MEM,MIL,1410.179450,1539.547927,0.457836,-1.048874,1,0,...,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7,113.4
21753,2024-02-15,2024,MIN,POR,1676.024592,1333.586609,0.801482,8.658499,0,1,...,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0,114.5
21754,2024-02-15,2024,POR,MIN,1333.586609,1676.024592,0.198518,-8.658499,1,0,...,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7,103.6


In [3]:
# Check that we have no null values

# Finding the number of missing values in each column
missing_values = df.isnull().sum()

# Printing the number of missing values in each column
print(sum(missing_values))

0


## Separate relevant features from the labels

In [4]:
# drop these columns (most of the are non_numeric or are not useful for machine learning)
drop_columns = ["date", "season", "team", "team_opp", "won"]

selected_columns = df.columns[~df.columns.isin(drop_columns)]

# Selected features
features_df = df[selected_columns]
features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,1356.009483,1541.442237,0.379472,-3.051170,1,0,42.0,93.0,0.4520,9.0,...,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0,106.0
1,1541.442237,1356.009483,0.620528,3.051170,0,1,37.0,82.0,0.4510,8.0,...,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0,97.0
2,1652.435518,1559.753058,0.751974,6.881516,1,0,42.0,80.0,0.5250,6.0,...,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0,95.0
3,1499.317744,1557.619147,0.559723,1.489236,1,0,32.0,86.0,0.3720,9.0,...,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0,106.0
4,1557.619147,1499.317744,0.440277,-1.489236,0,1,29.0,82.0,0.3540,2.0,...,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,1539.547927,1410.179450,0.542164,1.048874,0,1,42.0,87.1,0.4819,15.0,...,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0,114.4
21752,1410.179450,1539.547927,0.457836,-1.048874,1,0,37.4,85.4,0.4387,14.3,...,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7,113.4
21753,1676.024592,1333.586609,0.801482,8.658499,0,1,42.0,83.8,0.5023,13.9,...,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0,114.5
21754,1333.586609,1676.024592,0.198518,-8.658499,1,0,40.4,86.0,0.4683,10.4,...,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7,103.6


In [5]:
# Label we want to predict
label = df["won"]
label

0        False
1         True
2         True
3        False
4         True
         ...  
21751    False
21752     True
21753     True
21754    False
21755    False
Name: won, Length: 21756, dtype: bool

### Remember that elo is about 64.48% accurate at predicting winner of a game

In [6]:
projected_point_spread = df["team_point_diff_proj"]
projected_win_from_elo = projected_point_spread > 0

print(metrics.accuracy_score(label, projected_win_from_elo))

0.6485567199852914


## Scale data for Logistic Regression

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_features_df = pd.DataFrame(scaler.fit_transform(features_df), columns=features_df.columns)
scaled_features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,0.240976,0.517230,0.374891,0.438248,1.0,0.0,0.548387,0.454545,0.562044,0.391304,...,0.264080,0.135632,0.182741,0.062500,0.105856,0.218970,0.000000,0.120419,0.303571,0.447761
1,0.517230,0.240976,0.625109,0.561752,0.0,1.0,0.387097,0.204545,0.558394,0.347826,...,0.005006,0.083908,0.173858,0.071429,0.153153,0.414520,0.116492,0.162304,0.892857,0.313433
2,0.682585,0.544509,0.761551,0.639273,1.0,0.0,0.548387,0.159091,0.828467,0.260870,...,0.230288,0.173563,0.139594,0.107143,0.099099,0.610070,0.056283,0.277487,0.482143,0.283582
3,0.454473,0.541330,0.561992,0.530140,1.0,0.0,0.225806,0.295455,0.270073,0.391304,...,0.180225,0.120690,0.119289,0.294643,0.274775,1.000000,0.057592,0.209424,0.035714,0.447761
4,0.541330,0.454473,0.438008,0.469860,0.0,1.0,0.129032,0.204545,0.204380,0.086957,...,0.280350,0.135632,0.114213,0.035714,0.108108,1.000000,0.079843,0.130890,0.321429,0.447761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,0.514407,0.321677,0.543767,0.521228,0.0,1.0,0.548387,0.320455,0.671168,0.652174,...,0.188611,0.149540,0.314975,0.161607,0.209685,0.258314,0.085471,0.351832,0.500000,0.573134
21752,0.321677,0.514407,0.456233,0.478772,1.0,0.0,0.400000,0.281818,0.513504,0.621739,...,0.258824,0.120000,0.280203,0.170536,0.158559,0.262763,0.263482,0.347120,0.637500,0.558209
21753,0.717727,0.207571,0.812940,0.675237,0.0,1.0,0.548387,0.245455,0.745620,0.604348,...,0.164330,0.111149,0.228680,0.523214,0.193468,0.251522,0.198953,0.372251,0.660714,0.574627
21754,0.207571,0.717727,0.187060,0.324763,1.0,0.0,0.496774,0.295455,0.621533,0.452174,...,0.226658,0.120920,0.296701,0.116518,0.211486,0.366393,0.160733,0.432461,0.762500,0.411940


## Test Train Split

In [8]:
from sklearn.model_selection import train_test_split

test_ratio = 0.3
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing

x_train_scaled

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
13651,0.539817,0.411547,0.754146,0.634516,1.0,0.0,0.438710,0.272727,0.572628,0.630435,...,0.130413,0.108046,0.238198,0.191518,0.275450,0.238056,0.129843,0.336126,0.728571,0.531343
2095,0.018138,0.394220,0.101624,0.245251,0.0,1.0,0.374194,0.327273,0.442701,0.391304,...,0.325782,0.203563,0.255330,0.111161,0.265315,0.300703,0.186780,0.346597,0.571429,0.507463
94,0.457548,0.435536,0.375185,0.438399,0.0,1.0,0.333333,0.371212,0.362530,0.304348,...,0.255632,0.227586,0.218909,0.327009,0.211712,0.548595,0.128599,0.251309,0.330357,0.347015
12820,0.474473,0.571897,0.551523,0.525013,1.0,0.0,0.564516,0.357955,0.661040,0.625000,...,0.226220,0.101580,0.292671,0.118862,0.163007,0.241511,0.129254,0.287304,0.595982,0.447761
6183,0.389620,0.629227,0.170127,0.311466,0.0,1.0,0.483871,0.334091,0.586861,0.313043,...,0.252566,0.151264,0.143020,0.275893,0.125450,0.223419,0.156806,0.391099,0.498214,0.352239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4556,0.216379,0.491725,0.149696,0.294126,0.0,1.0,0.490323,0.372727,0.557664,0.391304,...,0.198373,0.099425,0.210787,0.233929,0.212838,0.179742,0.115052,0.329319,0.644643,0.443284
3607,0.146125,0.350106,0.445054,0.473313,1.0,0.0,0.409677,0.247727,0.563869,0.439130,...,0.149937,0.081494,0.271701,0.220089,0.256757,0.346604,0.170157,0.362304,0.585714,0.368657
6725,0.299001,0.482997,0.464940,0.483010,1.0,0.0,0.354839,0.209091,0.518248,0.478261,...,0.204255,0.120000,0.370305,0.196429,0.236486,0.450937,0.210733,0.237173,0.471429,0.356716
14665,0.590980,0.456489,0.758860,0.637534,1.0,0.0,0.532258,0.309091,0.664234,0.600000,...,0.259324,0.170575,0.247716,0.114732,0.194595,0.224941,0.139136,0.427225,0.639286,0.704478


In [9]:
from sklearn.linear_model import LogisticRegression 

# Logistic Regression with max_iter=200 
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train_scaled, y_train_scaled)
y_pred_log = log_model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred_log))

0.6465451202696492


From our previous tests, we found that logistic regression gave around 65% accuracy. Let us try to find the optimal parameters.

In [10]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'C': [0.001, 0.01, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

# log_model = LogisticRegression(max_iter=1000)
# clf = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', verbose=3)

In [11]:
# clf.fit(x_train_scaled, y_train_scaled)

In [12]:
# print("Best Parameters:", clf.best_params_)

### Train using best parameters

We have found "optimal" parameters with GridSearchCV:

Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

Let's evaluate the model using these parameters.

In [13]:
opt_log_model = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')
opt_log_model.fit(x_train_scaled, y_train_scaled)
y_pred = opt_log_model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred))

[LibLinear]0.6535927685000766


It seems like the parameters do not give much of a performance bump. Let's just use the LogisticRegression with max_iter=1000 and default params.

### See accuracy of "high" confidence predictions

In [14]:
# Predict with a probability for each class
y_pred_prob_log = log_model.predict_proba(x_test_scaled)
y_pred_prob_log

array([[0.59336021, 0.40663979],
       [0.68064233, 0.31935767],
       [0.64623736, 0.35376264],
       ...,
       [0.63341582, 0.36658418],
       [0.84923681, 0.15076319],
       [0.38202781, 0.61797219]])

In [15]:
prob_threshold = 0.7

# Identify predictions where the probability of either class 0 or class 1 meets or exceeds the threshold
high_confidence_indices = np.where((y_pred_prob_log[:, 0] >= prob_threshold) | (y_pred_prob_log[:, 1] >= prob_threshold))

high_confidence_indices = list(high_confidence_indices[0])
print(len(high_confidence_indices))

2252


In [16]:
# Extract the predictions and actual labels for these high confidence predictions
high_conf_predictions = np.argmax(y_pred_prob_log[high_confidence_indices], axis=1)
high_conf_actual = y_test_scaled.reset_index(drop=True)
high_conf_actual = high_conf_actual[high_confidence_indices]
high_conf_actual

3       False
4       False
13       True
14       True
20      False
        ...  
6515     True
6517    False
6518     True
6523     True
6525    False
Name: won, Length: 2252, dtype: bool

In [17]:
print(metrics.accuracy_score(high_conf_actual, high_conf_predictions))

0.761101243339254


In [18]:
## What about the accuracy of those outside of the "high_conf_predictions"?

low_confidence_indices = [i for i in range(len(y_pred_prob_log)) if i not in high_confidence_indices]
print(len(low_confidence_indices))
assert len(low_confidence_indices) + len(high_confidence_indices) == len(y_pred_prob_log)

4275


In [19]:
# Extract the predictions and actual labels for these "low" confidence predictions
low_conf_predictions = np.argmax(y_pred_prob_log[low_confidence_indices], axis=1)
low_conf_actual = y_test_scaled.reset_index(drop=True)
low_conf_actual = low_conf_actual[low_confidence_indices]
low_conf_actual

0        True
1        True
2       False
5        True
6        True
        ...  
6520    False
6521     True
6522    False
6524    False
6526     True
Name: won, Length: 4275, dtype: bool

In [20]:
print(metrics.accuracy_score(low_conf_actual, low_conf_predictions))

0.5861988304093567


In [21]:
x_test_low_confidence = x_test_scaled.reset_index(drop=True)
x_test_low_confidence = x_test_low_confidence.loc[low_confidence_indices]
assert(x_test_low_confidence.index.values.tolist() == low_conf_actual.index.values.tolist())
x_test_low_confidence

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,0.275880,0.536034,0.390206,0.446059,1.0,0.0,0.445161,0.261364,0.597445,0.226087,...,0.226158,0.112989,0.369162,0.221875,0.182883,0.410890,0.142539,0.300524,0.539286,0.395522
1,0.681301,0.835746,0.226401,0.352785,0.0,1.0,0.438710,0.250000,0.590146,0.421739,...,0.348310,0.170920,0.314213,0.084375,0.204279,0.326347,0.249738,0.338220,0.619643,0.382090
2,0.411814,0.544538,0.242474,0.363323,0.0,1.0,0.480645,0.243182,0.653650,0.391304,...,0.405006,0.229885,0.291244,0.132143,0.250676,0.349883,0.118586,0.446597,0.612500,0.568657
5,0.494017,0.447567,0.398519,0.450256,0.0,1.0,0.387097,0.156818,0.609124,0.313043,...,0.170588,0.127816,0.364467,0.099107,0.216667,0.226347,0.126178,0.379581,0.639286,0.383582
6,0.389349,0.373968,0.659479,0.579744,1.0,0.0,0.590323,0.540909,0.550000,0.569565,...,0.159700,0.095862,0.445812,0.233036,0.313964,0.372717,0.139005,0.472775,0.466071,0.480597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6520,0.535131,0.422145,0.742327,0.627100,1.0,0.0,0.583871,0.281818,0.754015,0.517391,...,0.180976,0.123448,0.336294,0.172768,0.158559,0.256440,0.122644,0.295288,0.498214,0.385075
6521,0.487232,0.558438,0.577400,0.537733,1.0,0.0,0.567742,0.338636,0.695985,0.391304,...,0.154443,0.157586,0.320051,0.229464,0.161261,0.331265,0.207068,0.398429,0.537500,0.441791
6522,0.315983,0.513299,0.196724,0.331985,0.0,1.0,0.351613,0.275000,0.457664,0.552174,...,0.215394,0.113908,0.235279,0.180804,0.184685,0.310773,0.100262,0.200524,0.619643,0.402985
6524,0.517388,0.603091,0.279584,0.386137,0.0,1.0,0.506452,0.220455,0.716423,0.339130,...,0.249937,0.122299,0.316371,0.133482,0.158333,0.427986,0.169764,0.352880,0.589286,0.383582


In [22]:
elo_proj_low_confidence = x_test_low_confidence["team_point_diff_proj"] > 0
elo_proj_low_confidence

0       True
1       True
2       True
5       True
6       True
        ... 
6520    True
6521    True
6522    True
6524    True
6526    True
Name: team_point_diff_proj, Length: 4275, dtype: bool

In [23]:
print(metrics.accuracy_score(low_conf_actual, elo_proj_low_confidence))

0.49777777777777776


What we can try to do is a [k-fold cross validation](https://www.youtube.com/watch?v=kituDjzXwfE&t=698s).