# Predicting the Outcome of NBA Games using Recent Team Performance and Elo

In this notebook, we will use the dataset we processed from the feature engineering notebook to train various models on predicting the outcome of games.

The data includes a team's and opponent's elo rating and performance from the last 10 games.

In [2]:
import pandas as pd

## Read the data

In [3]:
path_to_data = "../../data/processed/processed_team_dataset.csv"

In [7]:
stats_df = pd.read_csv(path_to_data, index_col=0)

In [8]:
stats_df

Unnamed: 0,date,season,team,team_opp,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,2015-10-29,2016,NYK,ATL,1356.009483,1541.442237,0.379472,-3.051170,1,0,...,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0,106.0
1,2015-10-29,2016,ATL,NYK,1541.442237,1356.009483,0.620528,3.051170,0,1,...,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0,97.0
2,2015-10-29,2016,LAC,DAL,1652.435518,1559.753058,0.751974,6.881516,1,0,...,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0,95.0
3,2015-10-29,2016,IND,MEM,1499.317744,1557.619147,0.559723,1.489236,1,0,...,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0,106.0
4,2015-10-29,2016,MEM,IND,1557.619147,1499.317744,0.440277,-1.489236,0,1,...,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,2024-02-15,2024,MIL,MEM,1539.547927,1410.179450,0.542164,1.048874,0,1,...,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0,114.4
21752,2024-02-15,2024,MEM,MIL,1410.179450,1539.547927,0.457836,-1.048874,1,0,...,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7,113.4
21753,2024-02-15,2024,MIN,POR,1676.024592,1333.586609,0.801482,8.658499,0,1,...,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0,114.5
21754,2024-02-15,2024,POR,MIN,1333.586609,1676.024592,0.198518,-8.658499,1,0,...,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7,103.6


In [9]:
# let's see what features we're dealing with
stats_df.columns.tolist()

['date',
 'season',
 'team',
 'team_opp',
 'team_elo_before',
 'team_opp_elo_before',
 'team_expected_win_probability',
 'team_point_diff_proj',
 'home',
 'home_opp',
 'won',
 'fg_last_10',
 'fga_last_10',
 'fg%_last_10',
 '3p_last_10',
 '3pa_last_10',
 '3p%_last_10',
 'ft_last_10',
 'fta_last_10',
 'ft%_last_10',
 'orb_last_10',
 'drb_last_10',
 'trb_last_10',
 'ast_last_10',
 'stl_last_10',
 'blk_last_10',
 'tov_last_10',
 'pf_last_10',
 'pts_last_10',
 'ts%_last_10',
 'efg%_last_10',
 '3par_last_10',
 'ftr_last_10',
 'orb%_last_10',
 'drb%_last_10',
 'trb%_last_10',
 'ast%_last_10',
 'stl%_last_10',
 'blk%_last_10',
 'tov%_last_10',
 'ortg_last_10',
 'drtg_last_10',
 'fg_max_last_10',
 'fga_max_last_10',
 'fg%_max_last_10',
 '3p_max_last_10',
 '3pa_max_last_10',
 '3p%_max_last_10',
 'ft_max_last_10',
 'fta_max_last_10',
 'orb_max_last_10',
 'drb_max_last_10',
 'trb_max_last_10',
 'ast_max_last_10',
 'stl_max_last_10',
 'blk_max_last_10',
 'tov_max_last_10',
 'pf_max_last_10',
 'pts_

In [10]:
# drop these columns (most of the are non_numeric or are not useful for machine learning)
drop_columns = ["date", "season", "team", "team_opp", "won"] # should we also remove home and home_opp?

selected_columns = stats_df.columns[~stats_df.columns.isin(drop_columns)]

In [12]:
features_df = stats_df[selected_columns]
features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,1356.009483,1541.442237,0.379472,-3.051170,1,0,42.0,93.0,0.4520,9.0,...,41.20,24.80,35.60,3.20,4.70,33.30,23.60,132.0,104.0,106.0
1,1541.442237,1356.009483,0.620528,3.051170,0,1,37.0,82.0,0.4510,8.0,...,20.50,20.30,34.90,3.40,6.80,50.00,32.50,140.0,137.0,97.0
2,1652.435518,1559.753058,0.751974,6.881516,1,0,42.0,80.0,0.5250,6.0,...,38.50,28.10,32.20,4.20,4.40,66.70,27.90,162.0,114.0,95.0
3,1499.317744,1557.619147,0.559723,1.489236,1,0,32.0,86.0,0.3720,9.0,...,34.50,23.50,30.60,8.40,12.20,100.00,28.00,149.0,89.0,106.0
4,1557.619147,1499.317744,0.440277,-1.489236,0,1,29.0,82.0,0.3540,2.0,...,42.50,24.80,30.20,2.60,4.80,100.00,29.70,134.0,105.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,1539.547927,1410.179450,0.542164,1.048874,0,1,42.0,87.1,0.4819,15.0,...,35.17,26.01,46.02,5.42,9.31,36.66,30.13,176.2,115.0,114.4
21752,1410.179450,1539.547927,0.457836,-1.048874,1,0,37.4,85.4,0.4387,14.3,...,40.78,23.44,43.28,5.62,7.04,37.04,43.73,175.3,122.7,113.4
21753,1676.024592,1333.586609,0.801482,8.658499,0,1,42.0,83.8,0.5023,13.9,...,33.23,22.67,39.22,13.52,8.59,36.08,38.80,180.1,124.0,114.5
21754,1333.586609,1676.024592,0.198518,-8.658499,1,0,40.4,86.0,0.4683,10.4,...,38.21,23.52,44.58,4.41,9.39,45.89,35.88,191.6,129.7,103.6


In [15]:
label = stats_df["won"]
label

0        False
1         True
2         True
3        False
4         True
         ...  
21751    False
21752     True
21753     True
21754    False
21755    False
Name: won, Length: 21756, dtype: bool

In [18]:
print(label.value_counts()) # Ensure that we have the same amount of wins and losses

won
False    10878
True     10878
Name: count, dtype: int64


## Let's see how good Elo is in predicting winner

Before we do any machine learning, let's see how accurate pure elo values are in determining the winner of an NBA game is. We already calculated a projected point spread which computes the difference in elo values and also accounts for home court advantage


In [23]:
projected_point_spread = stats_df["team_point_diff_proj"]
projected_point_spread

0       -3.051170
1        3.051170
2        6.881516
3        1.489236
4       -1.489236
           ...   
21751    1.048874
21752   -1.048874
21753    8.658499
21754   -8.658499
21755    0.310949
Name: team_point_diff_proj, Length: 21756, dtype: float64

In [24]:
projected_win = projected_point_spread > 0
projected_win

0        False
1         True
2         True
3         True
4        False
         ...  
21751     True
21752    False
21753     True
21754    False
21755     True
Name: team_point_diff_proj, Length: 21756, dtype: bool

In [27]:
# Now let's compute the accuracy score

from sklearn import metrics

print(metrics.accuracy_score(label, projected_win))

0.6485567199852914


From pure elo, we achieve a prediction accuracy of around 64.86% - not bad.

## Split Data into Test and Train

Now we have our features dataframe and the label we want to predict (whether that team won or not). We will test a variety of models and machine learning methods using our dataset. First, we must split the data into test and train.

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing

## Logistic Regression Model

Following Weiner's project, we will test a simple, non-parameterized Logistic Regression model.

In [22]:
from sklearn.linear_model import LogisticRegression 

# create a simple, non-parameterized Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

0.6546652367090547
Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Specify parameters of model
Weiner tried experimenting with different parameters and tried max_iter=131 and verbose=2.

In [29]:
# Logistic Regression with max_iter=200 
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train, y_train)
y_pred_log = log_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_log))

0.6574230121035698


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Let's try scaling the data with MinMaxScaler

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_features_df = pd.DataFrame(scaler.fit_transform(features_df), columns=features_df.columns)

In [40]:
scaled_features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,0.240976,0.517230,0.374891,0.438248,1.0,0.0,0.548387,0.454545,0.562044,0.391304,...,0.264080,0.135632,0.182741,0.062500,0.105856,0.218970,0.000000,0.120419,0.303571,0.447761
1,0.517230,0.240976,0.625109,0.561752,0.0,1.0,0.387097,0.204545,0.558394,0.347826,...,0.005006,0.083908,0.173858,0.071429,0.153153,0.414520,0.116492,0.162304,0.892857,0.313433
2,0.682585,0.544509,0.761551,0.639273,1.0,0.0,0.548387,0.159091,0.828467,0.260870,...,0.230288,0.173563,0.139594,0.107143,0.099099,0.610070,0.056283,0.277487,0.482143,0.283582
3,0.454473,0.541330,0.561992,0.530140,1.0,0.0,0.225806,0.295455,0.270073,0.391304,...,0.180225,0.120690,0.119289,0.294643,0.274775,1.000000,0.057592,0.209424,0.035714,0.447761
4,0.541330,0.454473,0.438008,0.469860,0.0,1.0,0.129032,0.204545,0.204380,0.086957,...,0.280350,0.135632,0.114213,0.035714,0.108108,1.000000,0.079843,0.130890,0.321429,0.447761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,0.514407,0.321677,0.543767,0.521228,0.0,1.0,0.548387,0.320455,0.671168,0.652174,...,0.188611,0.149540,0.314975,0.161607,0.209685,0.258314,0.085471,0.351832,0.500000,0.573134
21752,0.321677,0.514407,0.456233,0.478772,1.0,0.0,0.400000,0.281818,0.513504,0.621739,...,0.258824,0.120000,0.280203,0.170536,0.158559,0.262763,0.263482,0.347120,0.637500,0.558209
21753,0.717727,0.207571,0.812940,0.675237,0.0,1.0,0.548387,0.245455,0.745620,0.604348,...,0.164330,0.111149,0.228680,0.523214,0.193468,0.251522,0.198953,0.372251,0.660714,0.574627
21754,0.207571,0.717727,0.187060,0.324763,1.0,0.0,0.496774,0.295455,0.621533,0.452174,...,0.226658,0.120920,0.296701,0.116518,0.211486,0.366393,0.160733,0.432461,0.762500,0.411940


In [50]:
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing

x_train_scaled

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
14195,0.494303,0.605207,0.538108,0.518473,1.0,0.0,0.435484,0.302273,0.542336,0.621739,...,0.220651,0.162874,0.277792,0.535714,0.405405,0.380562,0.171990,0.315183,0.469643,0.389552
21000,0.057012,0.439222,0.280777,0.386840,1.0,0.0,0.503226,0.295455,0.636861,0.382609,...,0.255069,0.174483,0.312183,0.174554,0.197973,0.417330,0.161387,0.454450,0.683929,0.632836
18195,0.479762,0.426918,0.692700,0.597920,1.0,0.0,0.658065,0.365909,0.772263,0.560870,...,0.203755,0.115517,0.255076,0.089732,0.153378,0.309368,0.141099,0.386387,0.537500,0.617910
15397,0.665823,0.415814,0.836011,0.693581,1.0,0.0,0.522581,0.295455,0.666058,0.639130,...,0.212891,0.111724,0.272589,0.136161,0.197523,0.141218,0.104843,0.319372,0.526786,0.556716
21043,0.689161,0.629544,0.698527,0.601206,1.0,0.0,0.651613,0.393182,0.739051,0.578261,...,0.299499,0.185862,0.278807,0.129911,0.121396,0.371546,0.103927,0.432461,0.742857,0.623881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4154,0.458660,0.568577,0.539092,0.518951,1.0,0.0,0.467742,0.370455,0.528467,0.413043,...,0.221026,0.167011,0.228807,0.129464,0.147523,0.187705,0.144372,0.475393,0.605357,0.500000
2316,0.889263,0.852172,0.389525,0.445714,0.0,1.0,0.409677,0.136364,0.655474,0.295652,...,0.247184,0.135862,0.265609,0.123661,0.236036,0.298126,0.126571,0.327749,0.576786,0.429851
5434,0.502042,0.316950,0.536153,0.517522,0.0,1.0,0.333333,0.174242,0.515815,0.444444,...,0.211514,0.130268,0.217005,0.238591,0.176176,0.276477,0.132926,0.325771,0.398810,0.475954
3391,0.484336,0.447588,0.678627,0.590111,1.0,0.0,0.383871,0.188636,0.575547,0.369565,...,0.207009,0.132529,0.270685,0.137054,0.124324,0.247307,0.117539,0.374346,0.594643,0.501493


In [42]:
# create a simple, non-parameterized Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(x_train_scaled, y_train_scaled)

y_pred = model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred))

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

0.6482304274551861
Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


In [43]:
# Logistic Regression with max_iter=200 
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train_scaled, y_train_scaled)
y_pred_log = log_model.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred_log))

0.6482304274551861


### Remove the "max" features
Looks like scaling didn't help. Let's try removing all the "max" features from our dataset and see if that improves accuracy.

In [48]:
stats_without_max_features_df = features_df.loc[:, ~features_df.columns.str.contains('max', case=False)]
stats_without_max_features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_allowed_last_10_opp,drb%_allowed_last_10_opp,trb%_allowed_last_10_opp,ast%_allowed_last_10_opp,stl%_allowed_last_10_opp,blk%_allowed_last_10_opp,tov%_allowed_last_10_opp,ortg_allowed_last_10_opp,drtg_allowed_last_10_opp,total_allowed_last_10_opp
0,1356.009483,1541.442237,0.379472,-3.051170,1,0,42.0,93.0,0.4520,9.0,...,41.10,83.70,59.60,62.20,5.20,5.50,12.30,111.20,98.60,106.0
1,1541.442237,1356.009483,0.620528,3.051170,0,1,37.0,82.0,0.4510,8.0,...,37.50,60.40,49.00,56.70,7.40,5.70,15.30,101.90,128.20,97.0
2,1652.435518,1559.753058,0.751974,6.881516,1,0,42.0,80.0,0.5250,6.0,...,23.60,87.00,52.50,44.10,2.90,4.70,15.10,93.20,108.90,95.0
3,1499.317744,1557.619147,0.559723,1.489236,1,0,32.0,86.0,0.3720,9.0,...,29.30,79.20,57.40,70.70,7.30,3.00,17.20,110.80,79.40,106.0
4,1557.619147,1499.317744,0.440277,-1.489236,0,1,29.0,82.0,0.3540,2.0,...,22.00,83.70,55.60,52.80,7.70,3.20,17.10,102.30,95.50,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,1539.547927,1410.179450,0.542164,1.048874,0,1,42.0,87.1,0.4819,15.0,...,25.77,80.44,54.95,61.29,9.03,10.48,12.65,118.76,107.38,114.4
21752,1410.179450,1539.547927,0.457836,-1.048874,1,0,37.4,85.4,0.4387,14.3,...,22.36,82.16,52.77,61.08,6.91,5.83,11.84,115.39,115.85,113.4
21753,1676.024592,1333.586609,0.801482,8.658499,0,1,42.0,83.8,0.5023,13.9,...,28.06,73.05,50.22,62.97,7.49,11.82,10.37,120.42,116.34,114.5
21754,1333.586609,1676.024592,0.198518,-8.658499,1,0,40.4,86.0,0.4683,10.4,...,23.50,76.08,47.79,64.94,7.36,10.89,12.73,109.52,121.46,103.6


In [49]:
print(stats_without_max_features_df.columns.tolist())

['team_elo_before', 'team_opp_elo_before', 'team_expected_win_probability', 'team_point_diff_proj', 'home', 'home_opp', 'fg_last_10', 'fga_last_10', 'fg%_last_10', '3p_last_10', '3pa_last_10', '3p%_last_10', 'ft_last_10', 'fta_last_10', 'ft%_last_10', 'orb_last_10', 'drb_last_10', 'trb_last_10', 'ast_last_10', 'stl_last_10', 'blk_last_10', 'tov_last_10', 'pf_last_10', 'pts_last_10', 'ts%_last_10', 'efg%_last_10', '3par_last_10', 'ftr_last_10', 'orb%_last_10', 'drb%_last_10', 'trb%_last_10', 'ast%_last_10', 'stl%_last_10', 'blk%_last_10', 'tov%_last_10', 'ortg_last_10', 'drtg_last_10', 'total_last_10', 'fg_allowed_last_10', 'fga_allowed_last_10', 'fg%_allowed_last_10', '3p_allowed_last_10', '3pa_allowed_last_10', '3p%_allowed_last_10', 'ft_allowed_last_10', 'fta_allowed_last_10', 'ft%_allowed_last_10', 'orb_allowed_last_10', 'drb_allowed_last_10', 'trb_allowed_last_10', 'ast_allowed_last_10', 'stl_allowed_last_10', 'blk_allowed_last_10', 'tov_allowed_last_10', 'pf_allowed_last_10', 'pts

In [51]:
# Split up the filtered dataset
x_train_no_max, x_test_no_max, y_train_no_max, y_test_no_max = train_test_split(stats_without_max_features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing
x_test_no_max

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_allowed_last_10_opp,drb%_allowed_last_10_opp,trb%_allowed_last_10_opp,ast%_allowed_last_10_opp,stl%_allowed_last_10_opp,blk%_allowed_last_10_opp,tov%_allowed_last_10_opp,ortg_allowed_last_10_opp,drtg_allowed_last_10_opp,total_allowed_last_10_opp
8411,1508.887230,1622.546099,0.480353,-0.487817,1,0,40.8,90.9,0.4504,12.5,...,22.50,75.38,48.75,58.13,7.84,6.72,9.65,113.08,109.82,112.4
15448,1681.619369,1516.006304,0.593318,2.343324,0,1,43.4,90.2,0.4814,12.0,...,24.70,77.77,50.79,59.56,6.77,9.06,11.40,107.81,107.21,103.2
18890,1526.418362,1592.851755,0.548156,1.198807,1,0,44.8,89.6,0.5008,14.5,...,21.11,77.89,50.06,59.69,5.46,9.73,12.37,116.76,120.95,115.7
8326,1566.146535,1452.528555,0.519588,0.486356,0,1,41.0,85.8,0.4795,10.9,...,23.62,72.19,49.85,53.63,7.03,8.45,12.13,111.56,110.26,110.2
6918,1409.195148,1596.140537,0.160867,-10.248050,0,1,37.4,87.0,0.4303,12.4,...,16.50,76.35,44.20,53.22,8.75,9.27,11.87,103.05,115.59,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13965,1497.889887,1410.305085,0.482141,-0.443400,0,1,37.6,83.3,0.4519,14.2,...,22.52,81.93,51.07,56.47,9.29,8.64,11.57,113.92,109.58,119.4
21489,1619.591580,1649.518346,0.321275,-4.640242,0,1,41.7,83.0,0.5054,13.0,...,25.32,76.62,50.56,66.58,6.48,12.77,13.54,109.18,119.11,106.9
19009,1441.398426,1608.552939,0.176846,-9.541233,0,1,40.8,92.5,0.4442,12.0,...,25.55,71.42,49.15,70.16,7.11,9.24,13.99,113.00,114.33,117.4
486,1560.769338,1591.627197,0.598210,2.469362,1,0,36.6,81.3,0.4515,9.6,...,24.58,71.23,46.72,55.68,10.16,7.12,13.27,102.72,110.24,99.3


In [52]:
# Do the Logistic Regression
model = LogisticRegression(random_state=42)
model.fit(x_train_no_max, y_train_no_max)

y_pred = model.predict(x_test_no_max)
print(metrics.accuracy_score(y_test_no_max, y_pred))

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

0.6468515397579286
Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# Logistic Regression with max_iter=200 
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train_no_max, y_train_no_max)
y_pred_log = log_model.predict(x_test_no_max)
print(metrics.accuracy_score(y_test_no_max, y_pred_log))

0.6514478320821204


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Tune hyperparameters

Above, we found that just using our dataset without scaling or removing max features yields the best accuracy. Now we need to tune the hyperparameters to optimize the Logistic Regression Model. Weiner and his team conducted a GridSearchCV to optimize the parameters, which we will now use.

These were the optimal parameters they found:

{'C': 1.0,
 'l1_ratio': 0,
 'max_iter': 90,
 'multi_class': 'auto',
 'penalty': 'l2',
 'solver': 'lbfgs',
 'verbose': 0}
 
I found that ~200 max iterations yielded a slightly higher accuracy. However, this is a marginal improvement over the default values.

In [81]:
log_model = LogisticRegression(C=1.0, l1_ratio=0, max_iter=200, multi_class='auto', penalty='l2', solver='lbfgs', verbose=0, random_state=42)
log_model.fit(x_train, y_train)
y_pred_log = log_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_log))



0.6557377049180327


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest Classifier

Now we will try a Random Forest Classifier as our model.

In [68]:
# Let's redo the test train split to start fresh
x_train, x_test, y_train, y_test = train_test_split(features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing

In [69]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_rf))

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

0.643787344875134
Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


Not as good as Logistic Regression... Weiner tried to compute some optimal hyperparameter. Let's use them now.

{'bootstrap': True,
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 200}

In [71]:
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_rf))

0.651141412593841


In [82]:
# Try with scaled data
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
rf.fit(x_train_scaled, y_train_scaled)
y_pred_rf = rf.predict(x_test_scaled)
print(metrics.accuracy_score(y_test_scaled, y_pred_rf))

0.6542056074766355


In [83]:
# Try without max features
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
rf.fit(x_train_no_max, y_train_no_max)
y_pred_rf = rf.predict(x_test_no_max)
print(metrics.accuracy_score(y_test_no_max, y_pred_rf))

0.6428680864102957


In [84]:
# Try without max features AND scaled

scaled_features_without_max_features_df = scaled_features_df.loc[:, ~scaled_features_df.columns.str.contains('max', case=False)]
scaled_features_without_max_features_df

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_allowed_last_10_opp,drb%_allowed_last_10_opp,trb%_allowed_last_10_opp,ast%_allowed_last_10_opp,stl%_allowed_last_10_opp,blk%_allowed_last_10_opp,tov%_allowed_last_10_opp,ortg_allowed_last_10_opp,drtg_allowed_last_10_opp,total_allowed_last_10_opp
0,0.240976,0.517230,0.374891,0.438248,1.0,0.0,0.548387,0.454545,0.562044,0.391304,...,0.928760,0.738693,0.780702,0.586354,0.220779,0.231092,0.410714,0.603175,0.397497,0.447761
1,0.517230,0.240976,0.625109,0.561752,0.0,1.0,0.387097,0.204545,0.558394,0.347826,...,0.833773,0.153266,0.470760,0.469083,0.363636,0.239496,0.589286,0.439153,0.942216,0.313433
2,0.682585,0.544509,0.761551,0.639273,1.0,0.0,0.548387,0.159091,0.828467,0.260870,...,0.467018,0.821608,0.573099,0.200426,0.071429,0.197479,0.577381,0.285714,0.587045,0.283582
3,0.454473,0.541330,0.561992,0.530140,1.0,0.0,0.225806,0.295455,0.270073,0.391304,...,0.617414,0.625628,0.716374,0.767591,0.357143,0.126050,0.702381,0.596120,0.044166,0.447761
4,0.541330,0.454473,0.438008,0.469860,0.0,1.0,0.129032,0.204545,0.204380,0.086957,...,0.424802,0.738693,0.663743,0.385928,0.383117,0.134454,0.696429,0.446208,0.340449,0.447761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21751,0.514407,0.321677,0.543767,0.521228,0.0,1.0,0.548387,0.320455,0.671168,0.652174,...,0.524274,0.656784,0.644737,0.566951,0.469481,0.440336,0.431548,0.736508,0.559073,0.573134
21752,0.321677,0.514407,0.456233,0.478772,1.0,0.0,0.400000,0.281818,0.513504,0.621739,...,0.434301,0.700000,0.580994,0.562473,0.331818,0.244958,0.383333,0.677072,0.714943,0.558209
21753,0.717727,0.207571,0.812940,0.675237,0.0,1.0,0.548387,0.245455,0.745620,0.604348,...,0.584697,0.471106,0.506433,0.602772,0.369481,0.496639,0.295833,0.765785,0.723960,0.574627
21754,0.207571,0.717727,0.187060,0.324763,1.0,0.0,0.496774,0.295455,0.621533,0.452174,...,0.464380,0.547236,0.435380,0.644776,0.361039,0.457563,0.436310,0.573545,0.818182,0.411940


In [85]:
x_train_scaled_no_max, x_test_scaled_no_max, y_train_scaled_no_max, y_test_scaled_no_max = train_test_split(scaled_features_without_max_features_df, label, test_size = 0.3) # 70% data is training and 30% is for testing
x_train_scaled_no_max

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,orb%_allowed_last_10_opp,drb%_allowed_last_10_opp,trb%_allowed_last_10_opp,ast%_allowed_last_10_opp,stl%_allowed_last_10_opp,blk%_allowed_last_10_opp,tov%_allowed_last_10_opp,ortg_allowed_last_10_opp,drtg_allowed_last_10_opp,total_allowed_last_10_opp
20158,0.440861,0.406120,0.387279,0.444575,0.0,1.0,0.548387,0.250000,0.737226,0.391304,...,0.385224,0.482412,0.453216,0.788913,0.370130,0.403361,0.636905,0.613757,0.761870,0.641791
774,0.553848,0.353847,0.805044,0.669318,1.0,0.0,0.345161,0.168182,0.552555,0.326087,...,0.601319,0.528894,0.563158,0.633049,0.448052,0.292017,0.588690,0.557143,0.530548,0.432836
10810,0.496193,0.687393,0.200790,0.334952,0.0,1.0,0.600000,0.386364,0.671533,0.652174,...,0.491293,0.551508,0.484211,0.521535,0.407792,0.377311,0.451786,0.464903,0.666544,0.443284
2637,0.267683,0.705824,0.236874,0.359703,1.0,0.0,0.451613,0.284091,0.582117,0.195652,...,0.726913,0.663317,0.662281,0.485075,0.551948,0.287815,0.309524,0.732804,0.561281,0.679104
197,0.309684,0.487376,0.471238,0.486069,1.0,0.0,0.471774,0.380682,0.516880,0.342391,...,0.383340,0.390524,0.391395,0.400244,0.480519,0.289916,0.585034,0.399597,0.470056,0.300640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18797,0.688890,0.307195,0.718799,0.612910,0.0,1.0,0.574194,0.352273,0.679197,0.608696,...,0.421636,0.553769,0.505848,0.581023,0.396753,0.375630,0.378571,0.719753,0.657343,0.661194
15827,0.762264,0.197492,0.957095,0.846298,1.0,0.0,0.548387,0.370455,0.633212,0.500000,...,0.516359,0.603518,0.567836,0.587633,0.483766,0.462605,0.351786,0.666843,0.520611,0.516418
1130,0.913246,0.325778,0.857895,0.712747,0.0,1.0,0.532258,0.275000,0.687226,0.543478,...,0.484169,0.416080,0.459357,0.565458,0.433766,0.335714,0.348214,0.550617,0.496688,0.361194
9209,0.613834,0.736695,0.526166,0.512672,1.0,0.0,0.564516,0.363636,0.656204,0.539130,...,0.400000,0.509548,0.386550,0.487633,0.326623,0.216387,0.298214,0.629982,0.941480,0.582090


In [86]:
# Try without max features and scaled
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
rf.fit(x_train_scaled_no_max, y_train_scaled_no_max)
y_pred_rf = rf.predict(x_test_scaled_no_max)
print(metrics.accuracy_score(y_test_scaled_no_max, y_pred_rf))

0.6470047495020683


## Ridge Classifier

Now, we will test a ridge classifier which was used in the DataQuest Tutorial

**Reminder**

*features_df* : original dataset - split into (x_train, y_train, x_test, y_test)

*scaled_features_df* : scaled dataset - split into (x_train_scaled, y_train_scaled, x_test_scaled, y_test_scaled)

*stats_without_max_features_df* : dataset minus the "max" features - split into (x_train_no_max, y_train_no_max, x_test_no_max, y_test_no_max)

*scaled_features_without_max_features_df* : scaled dataset minus the "max" features - split into (x_train_scaled_no_max, y_train_scaled_no_max, x_test_scaled_no_max, y_test_scaled_no_max)

In [88]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)

# Try 30 features at first
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward")

In [90]:
sfs.fit(features_df, label)

In [91]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['team_expected_win_probability',
 'team_point_diff_proj',
 'fg_last_10',
 'fga_last_10',
 'fg%_last_10',
 'ts%_last_10',
 'fg%_max_last_10',
 '3p%_max_last_10',
 'orb_max_last_10',
 'ftr_max_last_10',
 'ft_max_allowed_last_10',
 'fta_max_allowed_last_10',
 'trb_max_allowed_last_10',
 'stl%_max_allowed_last_10',
 'usg%_max_allowed_last_10',
 'fga_last_10_opp',
 'pf_last_10_opp',
 'ts%_last_10_opp',
 'efg%_last_10_opp',
 'fg%_max_last_10_opp',
 'orb_max_last_10_opp',
 'ast_max_last_10_opp',
 'ortg_max_last_10_opp',
 '3pa_max_allowed_last_10_opp',
 'drb_max_allowed_last_10_opp',
 'pf_max_allowed_last_10_opp',
 'pts_max_allowed_last_10_opp',
 'efg%_max_allowed_last_10_opp',
 'stl%_max_allowed_last_10_opp',
 'usg%_max_allowed_last_10_opp']

In [94]:
# Train model with all features
rr.fit(x_train, y_train)
y_pred_rr = rr.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_rr))

0.6473111689903478


In [96]:
# Train model with selected features
rr.fit(x_train[predictors], y_train)
y_pred_rr = rr.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_rr))

0.6557377049180327


Looks like the feature selector increased accuracy! Let's try using the feature selector with 50, 100, 150, and 200 features.

In [97]:
# 50 features
sfs = SequentialFeatureSelector(rr, n_features_to_select=50, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

['team_expected_win_probability',
 'team_point_diff_proj',
 'fg_last_10',
 'fga_last_10',
 'fg%_last_10',
 '3p_last_10',
 'ts%_last_10',
 'efg%_last_10',
 'fg%_max_last_10',
 '3p%_max_last_10',
 'orb_max_last_10',
 'ts%_max_last_10',
 'efg%_max_last_10',
 'ftr_max_last_10',
 'orb%_max_last_10',
 'ortg_max_last_10',
 'ftr_allowed_last_10',
 'ft_max_allowed_last_10',
 'fta_max_allowed_last_10',
 'trb_max_allowed_last_10',
 'ts%_max_allowed_last_10',
 'efg%_max_allowed_last_10',
 'stl%_max_allowed_last_10',
 'usg%_max_allowed_last_10',
 'fga_last_10_opp',
 'fta_last_10_opp',
 'ast_last_10_opp',
 'pf_last_10_opp',
 'pts_last_10_opp',
 'ts%_last_10_opp',
 'efg%_last_10_opp',
 'ftr_last_10_opp',
 'fg%_max_last_10_opp',
 'orb_max_last_10_opp',
 'ast_max_last_10_opp',
 'tov_max_last_10_opp',
 'ftr_max_last_10_opp',
 'orb%_max_last_10_opp',
 'ortg_max_last_10_opp',
 'total_last_10_opp',
 '3pa_allowed_last_10_opp',
 'efg%_allowed_last_10_opp',
 '3pa_max_allowed_last_10_opp',
 '3p%_max_allowed_la

In [98]:
rr.fit(x_train[predictors], y_train)
y_pred_rr = rr.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_rr))

0.6560441244063122


In [99]:
# 100 features
sfs = SequentialFeatureSelector(rr, n_features_to_select=100, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

['team_expected_win_probability',
 'team_point_diff_proj',
 'fg_last_10',
 'fga_last_10',
 'fg%_last_10',
 '3p_last_10',
 'ft%_last_10',
 'tov_last_10',
 'pf_last_10',
 'ts%_last_10',
 'efg%_last_10',
 'ast%_last_10',
 'fg%_max_last_10',
 '3p%_max_last_10',
 'orb_max_last_10',
 'drb_max_last_10',
 'blk_max_last_10',
 'pf_max_last_10',
 'ts%_max_last_10',
 'efg%_max_last_10',
 'ftr_max_last_10',
 'orb%_max_last_10',
 'trb%_max_last_10',
 'blk%_max_last_10',
 'ortg_max_last_10',
 'fg%_allowed_last_10',
 '3pa_allowed_last_10',
 'ft_allowed_last_10',
 'fta_allowed_last_10',
 'ft%_allowed_last_10',
 'stl_allowed_last_10',
 'ts%_allowed_last_10',
 'efg%_allowed_last_10',
 '3par_allowed_last_10',
 'ftr_allowed_last_10',
 '3pa_max_allowed_last_10',
 '3p%_max_allowed_last_10',
 'ft_max_allowed_last_10',
 'fta_max_allowed_last_10',
 'drb_max_allowed_last_10',
 'trb_max_allowed_last_10',
 'ast_max_allowed_last_10',
 'stl_max_allowed_last_10',
 'blk_max_allowed_last_10',
 'pf_max_allowed_last_10',

In [100]:
rr.fit(x_train[predictors], y_train)
y_pred_rr = rr.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_rr))

0.6584954803125479


In [102]:
# 150 features
sfs = SequentialFeatureSelector(rr, n_features_to_select=150, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

['team_expected_win_probability',
 'team_point_diff_proj',
 'fg_last_10',
 'fga_last_10',
 'fg%_last_10',
 '3p_last_10',
 'ft%_last_10',
 'ast_last_10',
 'stl_last_10',
 'blk_last_10',
 'tov_last_10',
 'pf_last_10',
 'ts%_last_10',
 'efg%_last_10',
 'ast%_last_10',
 'stl%_last_10',
 'blk%_last_10',
 'fg%_max_last_10',
 '3p_max_last_10',
 '3p%_max_last_10',
 'orb_max_last_10',
 'drb_max_last_10',
 'trb_max_last_10',
 'stl_max_last_10',
 'blk_max_last_10',
 'pf_max_last_10',
 'ts%_max_last_10',
 'efg%_max_last_10',
 'ftr_max_last_10',
 'orb%_max_last_10',
 'drb%_max_last_10',
 'trb%_max_last_10',
 'blk%_max_last_10',
 'ortg_max_last_10',
 'drtg_max_last_10',
 'fga_allowed_last_10',
 'fg%_allowed_last_10',
 '3pa_allowed_last_10',
 'ft_allowed_last_10',
 'fta_allowed_last_10',
 'ft%_allowed_last_10',
 'orb_allowed_last_10',
 'stl_allowed_last_10',
 'ts%_allowed_last_10',
 'efg%_allowed_last_10',
 '3par_allowed_last_10',
 'ftr_allowed_last_10',
 'ast%_allowed_last_10',
 'stl%_allowed_last_1

In [103]:
rr.fit(x_train[predictors], y_train)
y_pred_rr = rr.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_rr))

0.651141412593841


### Sequential Feature Selector with Random Forest

I didn't finish these because they took WAY too long. I only expect a marginal increase in accuracy, if any.

In [None]:
# 30 features
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
sfs = SequentialFeatureSelector(rf, n_features_to_select=30, direction="forward")
sfs.fit(scaled_features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
rf.fit(x_train_scaled[predictors], y_train_scaled)
y_pred_rf = rf.predict(x_test_scaled[predictors])
print(metrics.accuracy_score(y_test_scaled, y_pred_rf))

In [None]:
# 50 features
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
sfs = SequentialFeatureSelector(rf, n_features_to_select=50, direction="forward")
sfs.fit(scaled_features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
rf.fit(x_train_scaled[predictors], y_train_scaled)
y_pred_rf = rf.predict(x_test_scaled[predictors])
print(metrics.accuracy_score(y_test_scaled, y_pred_rf))

In [None]:
# 100 features
rf = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=60, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, n_estimators=200)
sfs = SequentialFeatureSelector(rf, n_features_to_select=100, direction="forward")
sfs.fit(scaled_features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
rf.fit(x_train_scaled[predictors], y_train_scaled)
y_pred_rf = rf.predict(x_test_scaled[predictors])
print(metrics.accuracy_score(y_test_scaled, y_pred_rf))

### Feature Selector with Log Model

Same thing here. The SFS took forever to run. The 30 feature sfs took over two hours to complete and it didn't even raise the accuracy by much.

In [None]:
# 30 features
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
sfs = SequentialFeatureSelector(log_model, n_features_to_select=30, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

Cleared the output in the cell above for the sfs on log_model with 30 features. Here are the features.

['team_expected_win_probability',
 'team_point_diff_proj',
 'fg%_last_10',
 'ast_last_10',
 'pf_last_10',
 'orb%_max_last_10',
 '3pa_allowed_last_10',
 'efg%_allowed_last_10',
 'ftr_allowed_last_10',
 'fg%_max_allowed_last_10',
 'stl_max_allowed_last_10',
 'pf_max_allowed_last_10',
 'stl%_max_allowed_last_10',
 'ortg_max_allowed_last_10',
 '3p%_last_10_opp',
 'ftr_last_10_opp',
 'orb%_last_10_opp',
 'tov%_last_10_opp',
 'pf_max_last_10_opp',
 'fg%_allowed_last_10_opp',
 'ft%_allowed_last_10_opp',
 'ts%_allowed_last_10_opp',
 'efg%_allowed_last_10_opp',
 '3par_allowed_last_10_opp',
 'ftr_allowed_last_10_opp',
 '3p%_max_allowed_last_10_opp',
 'ft_max_allowed_last_10_opp',
 'pf_max_allowed_last_10_opp',
 'efg%_max_allowed_last_10_opp',
 'stl%_max_allowed_last_10_opp']

In [106]:
log_model.fit(x_train[predictors], y_train)
y_pred_log_model = log_model.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_log_model))

0.6505285736172821


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# 50 features
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
sfs = SequentialFeatureSelector(log_model, n_features_to_select=50, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
log_model.fit(x_train[predictors], y_train)
y_pred_log_model = log_model.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_log_model))

In [None]:
# 100 features
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
sfs = SequentialFeatureSelector(log_model, n_features_to_select=100, direction="forward")
sfs.fit(features_df, label)
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
log_model.fit(x_train[predictors], y_train)
y_pred_log_model = log_model.predict(x_test[predictors])
print(metrics.accuracy_score(y_test, y_pred_log_model))

### Trying other models

Let's try out more models from sklearn


## Support Vector Machines

More specifically, support vector classifiers (SVCs)

In [175]:
from sklearn import svm

# Simple SVC
svm_model = svm.SVC(probability=True)
svm_model.fit(x_train, y_train)
y_pred_svm = svm_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_svm))
y_pred_svm

0.635973647924008


array([ True,  True, False, ..., False,  True,  True])

In [109]:
# Predict with a probability for each class
y_pred_prob_svm = svm_model.predict_proba(x_test)
y_pred_prob_svm

array([[0.33179662, 0.66820338],
       [0.42257925, 0.57742075],
       [0.6471903 , 0.3528097 ],
       ...,
       [0.64086886, 0.35913114],
       [0.43927236, 0.56072764],
       [0.38644143, 0.61355857]])

In [144]:
print(type(y_pred_prob_svm))
y_pred_prob_svm.shape

<class 'numpy.ndarray'>


(6527, 2)

The code above gives us the probability that a game is either a win or loss. We will test the accuracy of predictions with a "high" probability of classification.

In [167]:
prob_threshold = 0.7

# Identify predictions where the probability of either class 0 or class 1 meets or exceeds the threshold
high_confidence_indices = np.where((y_pred_prob_svm[:, 0] >= prob_threshold) | (y_pred_prob_svm[:, 1] >= prob_threshold))

high_confidence_indices = list(high_confidence_indices[0])
print(len(high_confidence_indices))

1684


In [168]:
# Extract the predictions and actual labels for these high confidence predictions
high_conf_predictions = np.argmax(y_pred_prob_svm[high_confidence_indices], axis=1)
high_conf_actual = y_test.reset_index(drop=True)
high_conf_actual

0        True
1       False
2       False
3        True
4       False
        ...  
6522    False
6523    False
6524    False
6525    False
6526    False
Name: won, Length: 6527, dtype: bool

In [169]:
y_test

11521     True
12676    False
859      False
11974     True
14389    False
         ...  
2084     False
5708     False
21217    False
8818     False
18791    False
Name: won, Length: 6527, dtype: bool

In [170]:
high_conf_actual = high_conf_actual[high_confidence_indices]

In [171]:
print(metrics.accuracy_score(high_conf_actual, high_conf_predictions))

0.7868171021377672


In [174]:
## What about the accuracy of those outside of the "high_conf_predictions"?

low_confidence_indices = [i for i in range(len(y_pred_prob_svm)) if i not in high_confidence_indices]
print(len(low_confidence_indices))
assert len(low_confidence_indices) + len(high_confidence_indices) == len(y_pred_prob_svm)

4843


In [179]:
# Extract the predictions and actual labels for these "low" confidence predictions
low_conf_predictions = np.argmax(y_pred_prob_svm[low_confidence_indices], axis=1)
low_conf_actual = y_test.reset_index(drop=True)
low_conf_actual = low_conf_actual[low_confidence_indices]
low_conf_actual

0        True
1       False
2       False
3        True
5       False
        ...  
6522    False
6523    False
6524    False
6525    False
6526    False
Name: won, Length: 4843, dtype: bool

In [182]:
print(metrics.accuracy_score(low_conf_actual, low_conf_predictions)) # A little better than coin flip...

0.5862068965517241


Let's compare if Elo performs better for those "lower" confidence predictions

In [191]:
x_test_low_confidence = x_test.reset_index(drop=True)
x_test_low_confidence = x_test_low_confidence.loc[low_confidence_indices]
assert(x_test_low_confidence.index.values.tolist() == low_conf_actual.index.values.tolist())
x_test_low_confidence

Unnamed: 0,team_elo_before,team_opp_elo_before,team_expected_win_probability,team_point_diff_proj,home,home_opp,fg_last_10,fga_last_10,fg%_last_10,3p_last_10,...,drb%_max_allowed_last_10_opp,trb%_max_allowed_last_10_opp,ast%_max_allowed_last_10_opp,stl%_max_allowed_last_10_opp,blk%_max_allowed_last_10_opp,tov%_max_allowed_last_10_opp,usg%_max_allowed_last_10_opp,ortg_max_allowed_last_10_opp,drtg_max_allowed_last_10_opp,total_allowed_last_10_opp
0,1576.324864,1431.513971,0.564133,1.600389,0,1,40.1,89.500000,0.4481,9.700000,...,40.28,26.620000,44.72,4.65,15.090000,44.270000,35.170000,177.9,111.000000,107.9
1,1385.889070,1343.599383,0.694042,5.081775,1,0,44.0,92.333333,0.4760,11.333333,...,35.40,34.566667,46.20,12.80,10.233333,40.266667,42.833333,180.0,114.666667,126.0
2,1423.404577,1549.811052,0.462071,-0.943088,1,0,40.5,85.000000,0.4758,4.100000,...,37.02,23.590000,39.28,13.97,9.940000,34.900000,36.650000,164.7,115.500000,97.0
3,1578.871866,1478.439669,0.760201,7.158293,1,0,39.1,86.100000,0.4555,12.300000,...,32.96,21.560000,35.34,6.98,9.230000,28.770000,30.870000,157.2,124.900000,106.7
5,1603.488833,1654.041261,0.295952,-5.376872,0,1,39.6,89.200000,0.4463,9.400000,...,34.47,22.080000,39.54,6.00,6.290000,33.760000,33.090000,167.6,127.800000,109.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6522,1411.645256,1368.685404,0.418642,-2.037148,0,1,38.6,84.100000,0.4601,7.100000,...,36.94,24.780000,35.87,5.50,9.220000,45.250000,30.460000,156.7,117.100000,109.8
6523,1483.148810,1516.669624,0.316780,-4.768600,0,1,39.0,84.100000,0.4633,9.100000,...,32.88,22.180000,36.09,5.43,9.060000,50.530000,37.170000,160.4,114.400000,103.6
6524,1493.192483,1609.084434,0.477146,-0.567570,1,0,44.3,91.700000,0.4853,14.700000,...,39.65,21.650000,48.86,5.56,10.530000,35.890000,34.700000,195.0,129.000000,105.7
6525,1465.325736,1404.566776,0.443767,-1.401466,0,1,38.5,86.600000,0.4449,11.600000,...,34.04,22.920000,37.39,5.87,10.640000,39.990000,33.540000,166.0,107.900000,106.0


In [193]:
elo_proj_low_confidence = x_test_low_confidence["team_point_diff_proj"] > 0
elo_proj_low_confidence

0        True
1        True
2       False
3        True
5       False
        ...  
6522    False
6523    False
6524    False
6525    False
6526    False
Name: team_point_diff_proj, Length: 4843, dtype: bool

In [194]:
print(metrics.accuracy_score(low_conf_actual, elo_proj_low_confidence))

0.6054098699153417


### Elo does better at projecting "closer", "lower confidence" games! Interesting

## Now, I am just gonna try a bunch of different models...

In [196]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Initialize and train decision tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_dt = dt_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_dt))

0.566876053316991


In [197]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

# Initialize and train gradient boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_gb = gb_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_gb))

0.6514478320821204


In [198]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Initialize and train KNN model
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_knn = knn_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_knn))

0.5875593687758541


In [199]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# Initialize and train neural network model
nn_model = MLPClassifier()
nn_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_nn = nn_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_nn))

0.6385782135743834


In [200]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

# Initialize and train AdaBoost model
ada_model = AdaBoostClassifier()
ada_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_ada = ada_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_ada))



0.6471579592462081


In [201]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Initialize and train Gaussian Naive Bayes model
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_gnb = gnb_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_gnb))

0.6385782135743834


In [202]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import metrics

# Initialize and train QDA model
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_qda = qda_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_qda))



0.5537000153209745


In [203]:
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# Initialize and train SGD model
sgd_model = SGDClassifier()
sgd_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_sgd = sgd_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_sgd))

0.6399571012716408


In [204]:
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics

# Initialize and train Bagging model
bagging_model = BaggingClassifier()
bagging_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_bagging = bagging_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_bagging))

0.607323425769879


In [205]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics

# Initialize and train Extra Trees model
et_model = ExtraTreesClassifier()
et_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_et = et_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_et))

0.6373525356212655


In [206]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Initialize individual models
log_clf = LogisticRegression()
svc_clf = SVC(probability=True)
dt_clf = DecisionTreeClassifier()

# Create Voting Classifier
voting_model = VotingClassifier(estimators=[('lr', log_clf), ('svc', svc_clf), ('dt', dt_clf)], voting='soft')
voting_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_voting = voting_model.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_voting))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5832694959399418


In [207]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize and train XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(x_test)
print(accuracy_score(y_test, y_pred_xgb))

0.620805883254175


In [208]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Initialize and train LightGBM model
lgbm_model = LGBMClassifier()
lgbm_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_lgbm = lgbm_model.predict(x_test)
print(accuracy_score(y_test, y_pred_lgbm))

[LightGBM] [Info] Number of positive: 7600, number of negative: 7629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58236
[LightGBM] [Info] Number of data points in the train set: 15229, number of used features: 262
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499048 -> initscore=-0.003809
[LightGBM] [Info] Start training from score -0.003809
0.6390378428068025


In [209]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize and train CatBoost model
cat_model = CatBoostClassifier(verbose=0) # `verbose=0` to keep the output clean
cat_model.fit(x_train, y_train)

# Predict and evaluate
y_pred_cat = cat_model.predict(x_test)
print(accuracy_score(y_test, y_pred_cat))

0.6489964761758847
