# Finalizing a Prediction Strategy

In previous notebooks, we explored different data preprocessing techniques, feature engineering, and testing various sklearn models for predicting NBA games.

Now, we will attempt to finalize a prediction strategy based on what we've learned.

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

## Read the dataset

In [2]:
path_to_data = "../../data/processed/processed_team_dataset.csv"

df = pd.read_csv(path_to_data, index_col=0)
df

Unnamed: 0,date,season,team,team_opp,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,...,orb%_max_allowed_last_10_opp,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp
0,2015-10-29,2016,NYK,ATL,1356.009483,1541.442237,0.379472,-3.051170,1,0,...,18.50,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0
1,2015-10-29,2016,ATL,NYK,1541.442237,1356.009483,0.620528,3.051170,0,1,...,29.60,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0
2,2015-10-29,2016,LAC,DAL,1652.435518,1559.753058,0.751974,6.881516,1,0,...,19.30,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0
3,2015-10-29,2016,IND,MEM,1499.317744,1557.619147,0.559723,1.489236,1,0,...,21.50,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0
4,2015-10-29,2016,MEM,IND,1557.619147,1499.317744,0.440277,-1.489236,0,1,...,22.30,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,2024-02-15,2024,MIL,MEM,1539.547927,1410.179450,0.542164,1.048874,0,1,...,21.53,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0
21752,2024-02-15,2024,MEM,MIL,1410.179450,1539.547927,0.457836,-1.048874,1,0,...,24.05,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7
21753,2024-02-15,2024,MIN,POR,1676.024592,1333.586609,0.801482,8.658499,0,1,...,23.31,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0
21754,2024-02-15,2024,POR,MIN,1333.586609,1676.024592,0.198518,-8.658499,1,0,...,18.39,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7


In [3]:
# Check that we have no null values

# Finding the number of missing values in each column
missing_values = df.isnull().sum()

# Printing the number of missing values in each column
print(sum(missing_values))

0


## Separate relevant features from the labels

In [4]:
# drop these columns (most of the are non_numeric or are not useful for machine learning)
drop_columns = ["date", "season", "team", "team_opp", "won"]

selected_columns = df.columns[~df.columns.isin(drop_columns)]

# Selected features
features_df = df[selected_columns]
features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_max_allowed_last_10_opp,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp
0,1356.009483,1541.442237,0.379472,-3.051170,1,0,42.0,93.0,0.4520,9.0,...,18.50,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0
1,1541.442237,1356.009483,0.620528,3.051170,0,1,37.0,82.0,0.4510,8.0,...,29.60,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0
2,1652.435518,1559.753058,0.751974,6.881516,1,0,42.0,80.0,0.5250,6.0,...,19.30,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0
3,1499.317744,1557.619147,0.559723,1.489236,1,0,32.0,86.0,0.3720,9.0,...,21.50,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0
4,1557.619147,1499.317744,0.440277,-1.489236,0,1,29.0,82.0,0.3540,2.0,...,22.30,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,1539.547927,1410.179450,0.542164,1.048874,0,1,42.0,87.1,0.4819,15.0,...,21.53,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0
21752,1410.179450,1539.547927,0.457836,-1.048874,1,0,37.4,85.4,0.4387,14.3,...,24.05,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7
21753,1676.024592,1333.586609,0.801482,8.658499,0,1,42.0,83.8,0.5023,13.9,...,23.31,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0
21754,1333.586609,1676.024592,0.198518,-8.658499,1,0,40.4,86.0,0.4683,10.4,...,18.39,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7


In [5]:
# Label we want to predict
label = df["won"]
label

0        False
1         True
2         True
3        False
4         True
         ...  
21751    False
21752     True
21753     True
21754    False
21755    False
Name: won, Length: 21756, dtype: bool

### Remember that elo is about 64.48% accurate at predicting winner of a game

In [6]:
projected_point_spread = df["team_point_diff_proj"]
projected_win_from_elo = projected_point_spread > 0

print(metrics.accuracy_score(label, projected_win_from_elo))

0.6485567199852914


## Scale data for Logistic Regression

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_features_df = pd.DataFrame(scaler.fit_transform(features_df), columns=features_df.columns)
scaled_features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_max_allowed_last_10_opp,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp
0,0.240976,0.517230,0.374891,0.438248,1.0,0.0,0.548387,0.454545,0.562044,0.391304,...,0.143007,0.264080,0.135632,0.182741,0.062500,0.105856,0.218970,0.000000,0.120419,0.303571
1,0.517230,0.240976,0.625109,0.561752,0.0,1.0,0.387097,0.204545,0.558394,0.347826,...,0.259727,0.005006,0.083908,0.173858,0.071429,0.153153,0.414520,0.116492,0.162304,0.892857
2,0.682585,0.544509,0.761551,0.639273,1.0,0.0,0.548387,0.159091,0.828467,0.260870,...,0.151420,0.230288,0.173563,0.139594,0.107143,0.099099,0.610070,0.056283,0.277487,0.482143
3,0.454473,0.541330,0.561992,0.530140,1.0,0.0,0.225806,0.295455,0.270073,0.391304,...,0.174553,0.180225,0.120690,0.119289,0.294643,0.274775,1.000000,0.057592,0.209424,0.035714
4,0.541330,0.454473,0.438008,0.469860,0.0,1.0,0.129032,0.204545,0.204380,0.086957,...,0.182965,0.280350,0.135632,0.114213,0.035714,0.108108,1.000000,0.079843,0.130890,0.321429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,0.514407,0.321677,0.543767,0.521228,0.0,1.0,0.548387,0.320455,0.671168,0.652174,...,0.174869,0.188611,0.149540,0.314975,0.161607,0.209685,0.258314,0.085471,0.351832,0.500000
21752,0.321677,0.514407,0.456233,0.478772,1.0,0.0,0.400000,0.281818,0.513504,0.621739,...,0.201367,0.258824,0.120000,0.280203,0.170536,0.158559,0.262763,0.263482,0.347120,0.637500
21753,0.717727,0.207571,0.812940,0.675237,0.0,1.0,0.548387,0.245455,0.745620,0.604348,...,0.193586,0.164330,0.111149,0.228680,0.523214,0.193468,0.251522,0.198953,0.372251,0.660714
21754,0.207571,0.717727,0.187060,0.324763,1.0,0.0,0.496774,0.295455,0.621533,0.452174,...,0.141851,0.226658,0.120920,0.296701,0.116518,0.211486,0.366393,0.160733,0.432461,0.762500


## Test Train Split

In [8]:
from sklearn.model_selection import train_test_split

test_ratio = 0.3
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing

x_train_scaled

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_max_allowed_last_10_opp,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp
3541,0.712284,0.580949,0.756475,0.636003,1.0,0.0,0.454839,0.277273,0.591606,0.526087,...,0.135752,0.184606,0.110230,0.231726,0.199107,0.185360,0.496487,0.240445,0.276440,0.582143
12776,0.299858,0.455009,0.493810,0.497005,1.0,0.0,0.435484,0.405303,0.475669,0.521739,...,0.308798,0.206925,0.161303,0.329949,0.156250,0.218844,0.206089,0.165358,0.581152,0.544643
20545,0.462103,0.518898,0.303895,0.400163,0.0,1.0,0.625806,0.422727,0.679197,0.700000,...,0.196215,0.303504,0.166782,0.301777,0.165625,0.162838,0.302108,0.142539,0.330366,0.601786
3524,0.696263,0.830160,0.241588,0.362754,0.0,1.0,0.500000,0.306818,0.628467,0.447826,...,0.171399,0.267084,0.136207,0.198096,0.196875,0.134009,0.385480,0.125654,0.209948,0.578571
6196,0.304575,0.253758,0.690945,0.596937,1.0,0.0,0.470968,0.297727,0.596350,0.434783,...,0.143323,0.184731,0.108966,0.136041,0.202679,0.172297,0.185363,0.104581,0.333508,0.492857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3671,0.482331,0.362842,0.747396,0.630255,1.0,0.0,0.474194,0.243182,0.645985,0.352174,...,0.160358,0.179474,0.105977,0.271447,0.225893,0.204505,0.263934,0.247775,0.259686,0.580357
11283,0.468316,0.623950,0.493326,0.496770,1.0,0.0,0.487097,0.388636,0.543431,0.508696,...,0.138696,0.312641,0.181494,0.331345,0.167411,0.272748,0.415105,0.144110,0.354450,0.608929
19086,0.479554,0.168773,0.868766,0.723067,1.0,0.0,0.612903,0.425000,0.669708,0.539130,...,0.132913,0.196120,0.106552,0.258629,0.198661,0.224775,0.209602,0.150785,0.303141,0.594643
4141,0.230657,0.595680,0.106255,0.250616,0.0,1.0,0.432258,0.297727,0.540876,0.352174,...,0.154890,0.268711,0.156897,0.312944,0.337500,0.185586,0.411827,0.197644,0.327225,0.646429


In [9]:
from sklearn.linear_model import LogisticRegression 

# Logistic Regression with max_iter=200 
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train_scaled, y_train_scaled)
y_pred_log = log_model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred_log))

0.6445533935958327


From our previous tests, we found that logistic regression gave around 65% accuracy. Let us try to find the optimal parameters.

In [10]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'C': [0.001, 0.01, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

# log_model = LogisticRegression(max_iter=1000)
# clf = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', verbose=3)

In [11]:
# clf.fit(x_train_scaled, y_train_scaled)

In [12]:
# print("Best Parameters:", clf.best_params_)

### Train using best parameters

We have found "optimal" parameters with GridSearchCV:

Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

Let's evaluate the model using these parameters.

In [13]:
opt_log_model = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')
opt_log_model.fit(x_train_scaled, y_train_scaled)
y_pred = opt_log_model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred))

[LibLinear]0.649456105408304


It seems like the parameters do not give much of a performance bump. Nonetheless, let's use this "optimized" model further.

### See accuracy of "high" confidence predictions

In [14]:
# Predict with a probability for each class
y_pred_prob_log = opt_log_model.predict_proba(x_test_scaled)
y_pred_prob_log

array([[0.54708913, 0.45291087],
       [0.63392029, 0.36607971],
       [0.76140046, 0.23859954],
       ...,
       [0.40357684, 0.59642316],
       [0.47767976, 0.52232024],
       [0.329696  , 0.670304  ]])

In [15]:
prob_threshold = 0.7

# Identify predictions where the probability of either class 0 or class 1 meets or exceeds the threshold
high_confidence_indices = np.where((y_pred_prob_log[:, 0] >= prob_threshold) | (y_pred_prob_log[:, 1] >= prob_threshold))

high_confidence_indices = list(high_confidence_indices[0])
print(len(high_confidence_indices))

1392


In [16]:
# Extract the predictions and actual labels for these high confidence predictions
high_conf_predictions = np.argmax(y_pred_prob_log[high_confidence_indices], axis=1)
high_conf_actual = y_test_scaled.reset_index(drop=True)
high_conf_actual = high_conf_actual[high_confidence_indices]
high_conf_actual

2       False
14       True
23      False
28       True
29      False
        ...  
6505    False
6511     True
6512     True
6513     True
6515     True
Name: won, Length: 1392, dtype: bool

In [17]:
print(metrics.accuracy_score(high_conf_actual, high_conf_predictions))

0.7995689655172413


### See accuracy of "low" confidence predictions

In [18]:
## What about the accuracy of those outside of the "high_conf_predictions"?

low_confidence_indices = [i for i in range(len(y_pred_prob_log)) if i not in high_confidence_indices]
print(len(low_confidence_indices))
assert len(low_confidence_indices) + len(high_confidence_indices) == len(y_pred_prob_log)

5135


In [19]:
# Extract the predictions and actual labels for these "low" confidence predictions
low_conf_predictions = np.argmax(y_pred_prob_log[low_confidence_indices], axis=1)
low_conf_actual = y_test_scaled.reset_index(drop=True)
low_conf_actual = low_conf_actual[low_confidence_indices]
low_conf_actual

0       False
1       False
3        True
4        True
5       False
        ...  
6522    False
6523    False
6524    False
6525    False
6526     True
Name: won, Length: 5135, dtype: bool

In [20]:
print(metrics.accuracy_score(low_conf_actual, low_conf_predictions))

0.6087633885102239


In [21]:
x_test_low_confidence = x_test_scaled.reset_index(drop=True)
x_test_low_confidence = x_test_low_confidence.loc[low_confidence_indices]
assert(x_test_low_confidence.index.values.tolist() == low_conf_actual.index.values.tolist())
x_test_low_confidence

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_max_allowed_last_10_opp,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp
0,0.556984,0.522867,0.386682,0.444271,0.0,1.0,0.458065,0.268182,0.610584,0.656522,...,0.147003,0.209887,0.138966,0.174112,0.168304,0.117568,0.428103,0.147775,0.412042,0.625000
1,0.304070,0.427329,0.249692,0.367915,0.0,1.0,0.529032,0.281818,0.682117,0.473913,...,0.221030,0.310013,0.169080,0.326777,0.179911,0.273649,0.259602,0.138613,0.387958,0.560714
3,0.522361,0.474493,0.399887,0.450944,0.0,1.0,0.690323,0.402273,0.791606,0.539130,...,0.128812,0.215519,0.118046,0.198096,0.180804,0.228378,0.415691,0.130628,0.451309,0.701786
4,0.399824,0.498209,0.550569,0.524547,1.0,0.0,0.448387,0.250000,0.600365,0.369565,...,0.107150,0.189862,0.094368,0.290736,0.210268,0.120721,0.238173,0.117016,0.366492,0.523214
5,0.592161,0.577232,0.368515,0.434962,0.0,1.0,0.519355,0.368182,0.601095,0.426087,...,0.113670,0.246934,0.127011,0.228046,0.163393,0.120270,0.167330,0.133508,0.338220,0.591071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,0.585460,0.563481,0.375153,0.438383,0.0,1.0,0.593548,0.368182,0.689416,0.639130,...,0.119558,0.184981,0.114138,0.315736,0.135268,0.202027,0.324941,0.212565,0.410471,0.587500
6523,0.599567,0.598091,0.355977,0.428435,0.0,1.0,0.425806,0.170455,0.643796,0.482609,...,0.133859,0.214143,0.091954,0.182614,0.207589,0.221396,0.271077,0.141230,0.391099,0.533929
6524,0.599134,0.265668,0.677532,0.589510,0.0,1.0,0.529032,0.413636,0.574088,0.382609,...,0.110410,0.207509,0.117471,0.220558,0.170982,0.274550,0.485012,0.160864,0.386387,0.566071
6525,0.534701,0.495856,0.680478,0.591128,1.0,0.0,0.612903,0.340909,0.743431,0.513043,...,0.154784,0.227660,0.135287,0.269162,0.143750,0.152252,0.434895,0.096204,0.323037,0.642857


In [22]:
elo_proj_low_confidence = x_test_low_confidence["team_point_diff_proj"] > 0
elo_proj_low_confidence

0       True
1       True
3       True
4       True
5       True
        ... 
6522    True
6523    True
6524    True
6525    True
6526    True
Name: team_point_diff_proj, Length: 5135, dtype: bool

In [23]:
print(metrics.accuracy_score(low_conf_actual, elo_proj_low_confidence))

0.5036027263875366


## K-fold Cross Validation

Let's see if our log model is good for future data. What we can try to do is a [k-fold cross validation](https://www.youtube.com/watch?v=kituDjzXwfE&t=698s).

In [24]:
# Set X and Y as the data and labels
X = scaled_features_df
Y = label

In [25]:
# Set up the model again
opt_log_model_cv = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')

In [26]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(opt_log_model_cv, X, Y, cv=kf, scoring="accuracy")

# Prining CV scores
print("CV scores:", cv_scores)

# Averaging CV scores
print("Average score:", cv_scores.mean())

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CV scores: [0.64430147 0.65900735 0.67463235 0.67003676 0.63878676 0.65165441
 0.63694853 0.63511029 0.64797794 0.64613971 0.61764706 0.64981618
 0.64981618 0.69209559 0.66360294 0.65533088 0.6448942  0.63845446
 0.64213431 0.65593376]
Average score: 0.6507160577547486


In [27]:
opt_log_model_cv.fit(X, Y)

[LibLinear]

#### Cross Validation with 10 folds showed us that this model is about 65% accurate on average. This is fine but is not much of an improvement on our pure Elo evaluation. Let's investigate which features are most influential.

In [28]:
log_model_coefficients = opt_log_model_cv.coef_
log_model_coefficients

array([[ 6.52433378e-01, -6.76189366e-01,  1.10471060e+00,
         6.40903684e-01,  1.04605210e-01, -1.48656592e-01,
         3.69526158e-02, -2.79059515e-02,  6.79410944e-02,
         5.34090034e-02,  8.25321244e-02, -2.41812142e-02,
        -4.19155531e-03, -2.10899175e-02,  3.34533832e-02,
        -2.93765486e-02,  5.58544238e-02,  3.56077993e-02,
         9.78601655e-02,  3.84256307e-02,  4.73079143e-02,
        -7.19672763e-02, -4.57150737e-02,  6.57746922e-02,
         9.55603976e-02,  8.97496604e-02,  9.42888016e-02,
        -1.40297244e-02, -2.33119338e-02,  6.10389509e-02,
         5.60338303e-02,  8.50025601e-02,  4.50426010e-02,
         6.01817633e-02, -6.67497956e-02,  1.02640207e-01,
        -9.17284355e-02,  1.06670698e-01,  6.66675670e-02,
         2.00576807e-02,  6.66288045e-02,  3.47540093e-02,
         1.11120356e-02,  1.42762314e-01,  1.39948948e-01,
        -7.40138160e-02,  4.66517126e-02, -1.35566926e-02,
         1.25043877e-01,  2.36847439e-03,  4.37338844e-0

In [29]:
feature_names = scaled_features_df.columns
feature_weights = list(zip(feature_names, log_model_coefficients[0]))

In [30]:
feature_weights

[('team_elo_before', 0.6524333783340259),
 ('team_opp_elo_before', -0.6761893663509744),
 ('team_expected_win_probability', 1.1047106045470267),
 ('team_point_diff_proj', 0.6409036837040998),
 ('home', 0.10460520997877565),
 ('home_opp', -0.14865659161124012),
 ('fg_last_10', 0.03695261582172239),
 ('fga_last_10', -0.02790595146815075),
 ('fg%_last_10', 0.06794109436674131),
 ('3p_last_10', 0.053409003359368504),
 ('3pa_last_10', 0.0825321244274476),
 ('3p%_last_10', -0.0241812141842167),
 ('ft_last_10', -0.004191555314634797),
 ('fta_last_10', -0.02108991754763542),
 ('ft%_last_10', 0.03345338324036947),
 ('orb_last_10', -0.0293765486086777),
 ('drb_last_10', 0.05585442376821025),
 ('trb_last_10', 0.03560779927418314),
 ('ast_last_10', 0.09786016547779923),
 ('stl_last_10', 0.038425630653151695),
 ('blk_last_10', 0.04730791434470171),
 ('tov_last_10', -0.07196727627533767),
 ('pf_last_10', -0.04571507372970097),
 ('pts_last_10', 0.06577469218386267),
 ('ts%_last_10', 0.095560397616535

## Training a model without the Elo features

We can observe from the feature weights that Elo related features are most influential when determining the outcome of a game. This might be the reason why our accuracy is so close to the elo-based accuracy. Let's train a model without these elo features and see what is most influential.

In [31]:
# Set up a new model to train
non_elo_log_model = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')

In [32]:
elo_features = ["team_elo_before", "team_opp_elo_before", "team_expected_win_probability", "team_point_diff_proj"]
non_elo_features = scaled_features_df.columns[~scaled_features_df.columns.isin(elo_features)]

In [33]:
X_non_elo = scaled_features_df[non_elo_features]
Y = label

In [34]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(non_elo_log_model, X_non_elo, Y, cv=kf, scoring="accuracy")

# Prining CV scores
print("CV scores:", cv_scores)

# Averaging CV scores
print("Average score:", cv_scores.mean())

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CV scores: [0.62408088 0.64522059 0.66360294 0.67371324 0.63602941 0.62132353
 0.62591912 0.61580882 0.64613971 0.62775735 0.61764706 0.63786765
 0.63878676 0.65349265 0.64522059 0.65257353 0.63201472 0.63477461
 0.61821527 0.64673413]
Average score: 0.6378461276990098


We achieved an average of 64% accuracy without elo features...

In [35]:
non_elo_log_model.fit(X_non_elo, Y)

[LibLinear]

In [36]:
non_elo_log_model_coefficients = non_elo_log_model.coef_

In [37]:
non_elo_log_model_coefficients = list(zip(X_non_elo.columns, non_elo_log_model_coefficients[0]))
non_elo_log_model_coefficients

[('home', 0.30420682676336186),
 ('home_opp', -0.3042068267686932),
 ('fg_last_10', 0.09774620856273501),
 ('fga_last_10', -0.03577828173463356),
 ('fg%_last_10', 0.16526859367772997),
 ('3p_last_10', 0.11281379048771568),
 ('3pa_last_10', 0.1278865728069947),
 ('3p%_last_10', 0.029426454568792056),
 ('ft_last_10', 0.013220934736558011),
 ('fta_last_10', -0.022313797772159295),
 ('ft%_last_10', 0.0968212195950327),
 ('orb_last_10', -0.025481770137302757),
 ('drb_last_10', 0.1353005367559687),
 ('trb_last_10', 0.09554338703723673),
 ('ast_last_10', 0.1573748214656013),
 ('stl_last_10', 0.08387868776329506),
 ('blk_last_10', 0.09890931763526824),
 ('tov_last_10', -0.15820872973604194),
 ('pf_last_10', -0.10612931793154336),
 ('pts_last_10', 0.13509865901966092),
 ('ts%_last_10', 0.20288756876876268),
 ('efg%_last_10', 0.19329492872449477),
 ('3par_last_10', 0.15029515130435675),
 ('ftr_last_10', -0.007744549399201505),
 ('orb%_last_10', -0.0022751704107117168),
 ('drb%_last_10', 0.115572

In [38]:
# Sort the coefficients in descending order
non_elo_log_model_coefficients = sorted(non_elo_log_model_coefficients, key=lambda x: abs(x[1]), reverse=True)
non_elo_log_model_coefficients

[('home_opp', -0.3042068267686932),
 ('home', 0.30420682676336186),
 ('+/-_max_last_10_opp', -0.2960552590026435),
 ('+/-_max_last_10', 0.29605525900186513),
 ('pts_max_last_10', 0.2730127435623734),
 ('pts_max_last_10_opp', -0.273012743556067),
 ('ortg_last_10', 0.24805179705851774),
 ('drtg_allowed_last_10', 0.24805179705851774),
 ('ortg_last_10_opp', -0.2480517970472533),
 ('drtg_allowed_last_10_opp', -0.2480517970472533),
 ('blk_allowed_last_10', -0.24150777495595968),
 ('blk_allowed_last_10_opp', 0.2415077749489685),
 ('pts_allowed_last_10_opp', 0.2345626843628455),
 ('pts_allowed_last_10', -0.23456268435310515),
 ('drtg_max_allowed_last_10', 0.22293520706123043),
 ('drtg_max_allowed_last_10_opp', -0.222935207050671),
 ('+/-_max_allowed_last_10', -0.22098909939487196),
 ('+/-_max_allowed_last_10_opp', 0.22098909938917563),
 ('ft_max_last_10_opp', -0.21480966160932802),
 ('ft_max_last_10', 0.2148096616079338),
 ('fta_max_last_10_opp', -0.2144639161052756),
 ('fta_max_last_10', 0.21

In [51]:
# Setting a threshold for the absolute value of coefficients
threshold = 0.15

# Filtering the coefficients based on the threshold
filtered_coefficients = [coef for coef in non_elo_log_model_coefficients if abs(coef[1]) > threshold]

filtered_coefficients

[('home_opp', -0.3042068267686932),
 ('home', 0.30420682676336186),
 ('+/-_max_last_10_opp', -0.2960552590026435),
 ('+/-_max_last_10', 0.29605525900186513),
 ('pts_max_last_10', 0.2730127435623734),
 ('pts_max_last_10_opp', -0.273012743556067),
 ('ortg_last_10', 0.24805179705851774),
 ('drtg_allowed_last_10', 0.24805179705851774),
 ('ortg_last_10_opp', -0.2480517970472533),
 ('drtg_allowed_last_10_opp', -0.2480517970472533),
 ('blk_allowed_last_10', -0.24150777495595968),
 ('blk_allowed_last_10_opp', 0.2415077749489685),
 ('pts_allowed_last_10_opp', 0.2345626843628455),
 ('pts_allowed_last_10', -0.23456268435310515),
 ('drtg_max_allowed_last_10', 0.22293520706123043),
 ('drtg_max_allowed_last_10_opp', -0.222935207050671),
 ('+/-_max_allowed_last_10', -0.22098909939487196),
 ('+/-_max_allowed_last_10_opp', 0.22098909938917563),
 ('ft_max_last_10_opp', -0.21480966160932802),
 ('ft_max_last_10', 0.2148096616079338),
 ('fta_max_last_10_opp', -0.2144639161052756),
 ('fta_max_last_10', 0.21

In [52]:
top_features = [coef[0] for coef in filtered_coefficients]
top_features

['home_opp',
 'home',
 '+/-_max_last_10_opp',
 '+/-_max_last_10',
 'pts_max_last_10',
 'pts_max_last_10_opp',
 'ortg_last_10',
 'drtg_allowed_last_10',
 'ortg_last_10_opp',
 'drtg_allowed_last_10_opp',
 'blk_allowed_last_10',
 'blk_allowed_last_10_opp',
 'pts_allowed_last_10_opp',
 'pts_allowed_last_10',
 'drtg_max_allowed_last_10',
 'drtg_max_allowed_last_10_opp',
 '+/-_max_allowed_last_10',
 '+/-_max_allowed_last_10_opp',
 'ft_max_last_10_opp',
 'ft_max_last_10',
 'fta_max_last_10_opp',
 'fta_max_last_10',
 'ts%_last_10',
 'ts%_last_10_opp',
 'drtg_last_10_opp',
 'ortg_allowed_last_10_opp',
 'drtg_last_10',
 'ortg_allowed_last_10',
 'fg_allowed_last_10_opp',
 'fg_allowed_last_10',
 'efg%_last_10',
 'efg%_last_10_opp',
 'blk%_allowed_last_10',
 'blk%_allowed_last_10_opp',
 'fg_max_last_10',
 'fg_max_last_10_opp',
 '3par_max_last_10',
 '3par_max_last_10_opp',
 'ast_max_last_10',
 'ast_max_last_10_opp',
 'fg%_last_10',
 'fg%_last_10_opp',
 'tov_last_10',
 'tov_last_10_opp',
 'ast_last_1

## Training the model on the filtered, non_elo features

In [53]:
# Set up a new model to train
non_elo_log_model = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')

In [54]:
X = scaled_features_df[top_features]
Y = label

In [55]:
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(non_elo_log_model, X, Y, cv=kf, scoring="accuracy")

# Prining CV scores
print("CV scores:", cv_scores)

# Averaging CV scores
print("Average score:", cv_scores.mean())

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CV scores: [0.61397059 0.62775735 0.66727941 0.65992647 0.63235294 0.62775735
 0.625      0.61580882 0.640625   0.63419118 0.62132353 0.62408088
 0.64246324 0.64522059 0.64338235 0.64154412 0.62281509 0.63201472
 0.61361546 0.6448942 ]
Average score: 0.6338011644975378


## Let's try combining the elo features and the filtered features

In [56]:
X = scaled_features_df[elo_features + top_features]
Y = label

In [57]:
# Set up a new model to train
lgr = LogisticRegression(max_iter=1000, verbose=2, random_state=42, C=0.01, penalty='l2', solver='liblinear')
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(lgr, X, Y, cv=kf, scoring="accuracy")

# Prining CV scores
print("CV scores:", cv_scores)

# Averaging CV scores
print("Average score:", cv_scores.mean())

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CV scores: [0.66268382 0.66452206 0.67371324 0.66819853 0.63970588 0.65441176
 0.63602941 0.62775735 0.65533088 0.65165441 0.62224265 0.65165441
 0.64889706 0.69577206 0.66176471 0.65900735 0.64397424 0.63937443
 0.64213431 0.65593376]
Average score: 0.652738116578278
