# Predict NBA Games Tutorial

The following is an exploratory notebook for predicting the results of historial NBA games. This notebook follows the tutorial by Vikas Paruchuri of Dataquest, which can be found [here](https://www.youtube.com/watch?v=egTylm6C2is) and in [this repo](https://github.com/dataquestio/project-walkthroughs/tree/master/nba_games).

The dataset for this notebook can be found [here](https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbmpENGkySmZQVzk3Z0VRMkdnOXhJLWw2eUFCd3xBQ3Jtc0trU0RGdUFyd3dQazNjaXJPOVd4NW9xVzhpYUdHLUZ6UEQ2dGUtSnd6S3ZtVC13bnhuZWx4em9QeTZKVm5kNkVUX2F6U2IzTzNCalBoSml5WGt5b09YTlhSN1o0WWZOVThsNGV1NnNwZl9BVW41RWpJdw&q=https%3A%2F%2Fdrive.google.com%2Fuc%3Fexport%3Ddownload%26id%3D1YyNpERG0jqPlpxZvvELaNcMHTiKVpfWe&v=egTylm6C2is) and should be placed in the "../data/raw/" directory.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("nba_games_runtime.csv", index_col=0) # read NBA data, first col is our pandas index
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,240.0,240.0,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


In [3]:
df = df.sort_values("date")
df = df.reset_index(drop=True) # drop old index
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,...,25.0,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False
22040,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22041,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22042,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False


In [4]:
# Remove extraneous columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,0.750,...,25.0,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False
22040,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22041,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22042,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,0.700,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False


In [5]:
# This adds a "target" column to our data frame, which is what we should predict as the outcome of the subsequent game
def add_target(team):
    new_team_df = team.copy()
    new_team_df["target"] = new_team_df["won"].shift(-1)
    return new_team_df

df = df.groupby("team", group_keys=False).apply(add_target)

  df = df.groupby("team", group_keys=False).apply(add_target)


In [6]:
df["team"]

0        DET
1        ATL
2        CLE
3        CHI
4        NOP
        ... 
22039    MIL
22040    MEM
22041    MIN
22042    POR
22043    UTA
Name: team, Length: 22044, dtype: object

In [7]:
df[df["team"] == "TOR"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
16,240.0,36.0,80.0,0.450,7.0,18.0,0.389,27.0,39.0,0.692,...,31.1,127.0,110.0,IND,99,0,2016,2015-10-28,True,True
58,240.0,36.0,82.0,0.439,11.0,26.0,0.423,30.0,35.0,0.857,...,32.7,120.0,116.0,BOS,103,1,2016,2015-10-30,True,True
77,240.0,41.0,90.0,0.456,11.0,25.0,0.440,13.0,20.0,0.650,...,53.8,150.0,130.0,MIL,87,0,2016,2015-11-01,True,True
115,240.0,37.0,82.0,0.451,5.0,17.0,0.294,23.0,26.0,0.885,...,100.0,136.0,114.0,DAL,91,1,2016,2015-11-03,True,True
125,240.0,33.0,79.0,0.418,5.0,14.0,0.357,32.0,39.0,0.821,...,39.0,139.0,119.0,OKC,98,1,2016,2015-11-04,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21920,240.0,43.0,81.0,0.531,17.0,33.0,0.515,20.0,24.0,0.833,...,36.6,209.0,137.0,CHO,117,1,2024,2024-02-07,True,True
21942,240.0,40.0,95.0,0.421,10.0,33.0,0.303,17.0,19.0,0.895,...,30.9,300.0,118.0,HOU,104,0,2024,2024-02-09,True,False
21960,240.0,35.0,87.0,0.402,9.0,31.0,0.290,16.0,23.0,0.696,...,27.2,229.0,113.0,CLE,119,0,2024,2024-02-10,False,False
21981,240.0,39.0,98.0,0.398,8.0,31.0,0.258,13.0,16.0,0.813,...,34.3,155.0,107.0,SAS,122,0,2024,2024-02-12,False,False


In [8]:
df.loc[pd.isnull(df["target"]), "target"] = 2

In [9]:
df["target"] = df["target"].astype(int, errors="ignore")

In [10]:
# Check our dataset is balanced
df["won"].value_counts()

won
True     11022
False    11022
Name: count, dtype: int64

In [11]:
df["target"].value_counts()

target
1    11008
0    11006
2       30
Name: count, dtype: int64

In [12]:
nulls = pd.isnull(df)

In [13]:
nulls = nulls.sum()

In [14]:
nulls = nulls[nulls > 0]
nulls

+/-             22044
mp_max          22044
mp_max.1        22044
+/-_max            17
+/-_opp         22044
mp_max_opp      22044
mp_max_opp.1    22044
+/-_max_opp        17
dtype: int64

In [15]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=140)

In [16]:
df = df[valid_columns].copy()

In [17]:
df  # should have 142 columns

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
1,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
2,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
3,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
4,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,0.750,...,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False,2
22040,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,...,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True,2
22041,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,...,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True,2
22042,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,0.700,...,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False,2


In [18]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

In [19]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

In [20]:
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [21]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [22]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,0.151282,0.800948,0.517647,ATL,0.267857,1.0,2016,2015-10-27,True,1
1,0.0,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.020513,0.203791,0.317647,DET,0.375000,0.0,2016,2015-10-27,False,1
2,0.0,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.161538,0.345972,0.317647,CHI,0.294643,1.0,2016,2015-10-27,False,1
3,0.0,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.089744,0.232227,0.329412,CLE,0.276786,0.0,2016,2015-10-27,True,1
4,0.0,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.278205,0.554502,0.317647,GSW,0.419643,1.0,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.0,0.500000,0.441176,0.473684,0.379310,0.606061,0.296912,0.325581,0.301587,0.708285,...,0.128205,0.341232,0.482353,MEM,0.437500,1.0,2024,2024-02-15,False,2
22040,0.0,0.543478,0.338235,0.624402,0.448276,0.333333,0.593824,0.255814,0.285714,0.570595,...,0.160256,0.383886,0.517647,MIL,0.410714,0.0,2024,2024-02-15,True,2
22041,0.0,0.586957,0.426471,0.593301,0.379310,0.393939,0.435867,0.558140,0.412698,0.913652,...,0.121795,0.526066,0.705882,POR,0.241071,1.0,2024,2024-02-15,True,2
22042,0.0,0.260870,0.338235,0.248804,0.275862,0.378788,0.327791,0.465116,0.460317,0.649942,...,0.141026,0.436019,0.282353,MIN,0.571429,0.0,2024,2024-02-15,False,2


In [23]:
sfs.fit(df[selected_columns], df["target"])

In [24]:
predictors = list(selected_columns[sfs.get_support()])

In [25]:
predictors

['orb',
 'drb',
 'trb',
 'pf',
 'ftr',
 'usg%',
 'ft_max',
 'fta_max',
 'drb%_max',
 'trb%_max',
 'tov%_max',
 'usg%_max',
 'fg%_opp',
 'ft%_opp',
 'ast_opp',
 'pf_opp',
 'efg%_opp',
 'blk%_opp',
 'usg%_opp',
 '3p_max_opp',
 'ft%_max_opp',
 'orb_max_opp',
 'tov_max_opp',
 'pf_max_opp',
 'drb%_max_opp',
 'ast%_max_opp',
 'stl%_max_opp',
 'blk%_max_opp',
 'tov%_max_opp',
 'usg%_max_opp']

In [26]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds , index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

In [27]:
predictions = backtest(df, rr, predictors)

In [28]:
predictions

Unnamed: 0,actual,prediction
5250,0,0
5251,1,0
5252,1,1
5253,1,0
5254,1,1
...,...,...
22039,2,0
22040,2,1
22041,2,1
22042,2,0


In [29]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.5419350990217132

In [30]:
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

  df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])


home
0.0    0.428053
1.0    0.571947
dtype: float64

In [31]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [32]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,0.071,0.550314,0.151282,0.800948,0.517647,0.267857,1.0,True,DET,2016
1,0.0,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.047,0.300839,0.020513,0.203791,0.317647,0.375000,0.0,False,ATL,2016
2,0.0,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.140,0.509434,0.161538,0.345972,0.317647,0.294643,1.0,False,CLE,2016
3,0.0,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.185,0.270440,0.089744,0.232227,0.329412,0.276786,0.0,True,CHI,2016
4,0.0,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.079,0.679245,0.278205,0.554502,0.317647,0.419643,1.0,False,NOP,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.0,0.500000,0.441176,0.473684,0.379310,0.606061,0.296912,0.325581,0.301587,0.708285,...,0.045,0.213836,0.128205,0.341232,0.482353,0.437500,1.0,False,MIL,2024
22040,0.0,0.543478,0.338235,0.624402,0.448276,0.333333,0.593824,0.255814,0.285714,0.570595,...,0.097,0.371069,0.160256,0.383886,0.517647,0.410714,0.0,True,MEM,2024
22041,0.0,0.586957,0.426471,0.593301,0.379310,0.393939,0.435867,0.558140,0.412698,0.913652,...,0.089,0.222222,0.121795,0.526066,0.705882,0.241071,1.0,True,MIN,2024
22042,0.0,0.260870,0.338235,0.248804,0.275862,0.378788,0.327791,0.465116,0.460317,0.649942,...,0.068,0.300839,0.141026,0.436019,0.282353,0.571429,0.0,False,POR,2024


In [33]:
def find_team_averages(team):
    # Calculate rolling mean for numeric columns only
    numeric_cols = team.select_dtypes(include='number')
    rolling_numeric = numeric_cols.rolling(10).mean()
#     # Ensure the order of rows is preserved by combining the numeric rolling means with non-numeric data
#     team[numeric_cols.columns] = rolling_numeric
    return rolling_numeric

# Compute a rolling average of past 10 games, grouped by team and season
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


In [34]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.000,0.480435,0.397059,0.487081,0.486207,0.542424,0.428029,0.318605,0.312698,0.674329,...,0.341170,0.0558,0.0691,0.296331,0.276282,0.401896,0.510588,0.437500,0.6,2024.0
22040,0.000,0.413043,0.375000,0.422010,0.500000,0.530303,0.450713,0.283721,0.279365,0.661960,...,0.426376,0.0553,0.0962,0.349790,0.115513,0.412322,0.445882,0.444643,0.3,2024.0
22041,0.025,0.519565,0.370588,0.563876,0.482759,0.448485,0.496200,0.397674,0.347619,0.752275,...,0.379472,0.0476,0.0978,0.429874,0.163974,0.501422,0.647059,0.350893,0.7,2024.0
22042,0.025,0.423913,0.360294,0.444019,0.334483,0.384848,0.395487,0.430233,0.358730,0.807351,...,0.317775,0.1409,0.0868,0.343082,0.212949,0.422275,0.504706,0.448214,0.3,2024.0


In [35]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [36]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,season_10
0,0.0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,,,,,,,,,,
1,0.0,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,,,,,,,,,,
2,0.0,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,,,,,,,,,,
3,0.0,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,,,,,,,,,,
4,0.0,0.347826,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.0,0.500000,0.441176,0.473684,0.379310,0.606061,0.296912,0.325581,0.301587,0.708285,...,0.341170,0.0558,0.0691,0.296331,0.276282,0.401896,0.510588,0.437500,0.6,2024.0
22040,0.0,0.543478,0.338235,0.624402,0.448276,0.333333,0.593824,0.255814,0.285714,0.570595,...,0.426376,0.0553,0.0962,0.349790,0.115513,0.412322,0.445882,0.444643,0.3,2024.0
22041,0.0,0.586957,0.426471,0.593301,0.379310,0.393939,0.435867,0.558140,0.412698,0.913652,...,0.379472,0.0476,0.0978,0.429874,0.163974,0.501422,0.647059,0.350893,0.7,2024.0
22042,0.0,0.260870,0.338235,0.248804,0.275862,0.378788,0.327791,0.465116,0.460317,0.649942,...,0.317775,0.1409,0.0868,0.343082,0.212949,0.422275,0.504706,0.448214,0.3,2024.0


In [37]:
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,season_10
243,0.0,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.311927,0.0628,0.0679,0.413522,0.125256,0.361611,0.449412,0.322321,0.4,2016.0
249,0.0,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.412271,0.0613,0.0772,0.469497,0.220641,0.394787,0.531765,0.300893,0.5,2016.0
254,0.0,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.331537,0.0657,0.1032,0.437212,0.126026,0.404739,0.408235,0.398214,0.2,2016.0
256,0.0,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.627907,0.476190,0.886814,...,0.398280,0.0747,0.0742,0.303564,0.131667,0.387678,0.410588,0.325893,0.5,2016.0
258,0.0,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.458257,0.0699,0.1072,0.380294,0.274359,0.270616,0.478824,0.286607,0.6,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.0,0.500000,0.441176,0.473684,0.379310,0.606061,0.296912,0.325581,0.301587,0.708285,...,0.341170,0.0558,0.0691,0.296331,0.276282,0.401896,0.510588,0.437500,0.6,2024.0
22040,0.0,0.543478,0.338235,0.624402,0.448276,0.333333,0.593824,0.255814,0.285714,0.570595,...,0.426376,0.0553,0.0962,0.349790,0.115513,0.412322,0.445882,0.444643,0.3,2024.0
22041,0.0,0.586957,0.426471,0.593301,0.379310,0.393939,0.435867,0.558140,0.412698,0.913652,...,0.379472,0.0476,0.0978,0.429874,0.163974,0.501422,0.647059,0.350893,0.7,2024.0
22042,0.0,0.260870,0.338235,0.248804,0.275862,0.378788,0.327791,0.465116,0.460317,0.649942,...,0.317775,0.1409,0.0868,0.343082,0.212949,0.422275,0.504706,0.448214,0.3,2024.0


In [38]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# This will add information about the next future game (if we are the home team, who is the opp, and the date)

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["home_next"] = add_col(df, "home")
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["team_opp_next"] = add_col(df, "team_opp")
  return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See 

In [39]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,season_10,home_next,team_opp_next,date_next
243,0.0,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.413522,0.125256,0.361611,0.449412,0.322321,0.4,2016.0,0.0,BOS,2015-11-13
249,0.0,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.469497,0.220641,0.394787,0.531765,0.300893,0.5,2016.0,1.0,BRK,2015-11-14
254,0.0,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.437212,0.126026,0.404739,0.408235,0.398214,0.2,2016.0,1.0,TOR,2015-11-15
256,0.0,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.627907,0.476190,0.886814,...,0.303564,0.131667,0.387678,0.410588,0.325893,0.5,2016.0,0.0,WAS,2015-11-14
258,0.0,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.380294,0.274359,0.270616,0.478824,0.286607,0.6,2016.0,0.0,SAC,2015-11-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,0.0,0.500000,0.441176,0.473684,0.379310,0.606061,0.296912,0.325581,0.301587,0.708285,...,0.296331,0.276282,0.401896,0.510588,0.437500,0.6,2024.0,,,
22040,0.0,0.543478,0.338235,0.624402,0.448276,0.333333,0.593824,0.255814,0.285714,0.570595,...,0.349790,0.115513,0.412322,0.445882,0.444643,0.3,2024.0,,,
22041,0.0,0.586957,0.426471,0.593301,0.379310,0.393939,0.435867,0.558140,0.412698,0.913652,...,0.429874,0.163974,0.501422,0.647059,0.350893,0.7,2024.0,,,
22042,0.0,0.260870,0.338235,0.248804,0.275862,0.378788,0.327791,0.465116,0.460317,0.649942,...,0.343082,0.212949,0.422275,0.504706,0.448214,0.3,2024.0,,,


In [40]:
full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]], 
    left_on=["team", "date_next"], 
    right_on=["team_opp_next", "date_next"]
)

In [41]:
full

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp_10_y,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.1072,0.380294,0.274359,0.270616,0.478824,0.286607,0.6,2016.0,SAC,TOR
1,0.0,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.1032,0.437212,0.126026,0.404739,0.408235,0.398214,0.2,2016.0,TOR,SAC
2,0.0,0.326087,0.558824,0.186603,0.206897,0.469697,0.203088,0.139535,0.111111,0.854142,...,0.0766,0.413732,0.157821,0.470142,0.391765,0.405357,0.6,2016.0,DEN,NOP
3,0.0,0.413043,0.397059,0.401914,0.137931,0.212121,0.263658,0.418605,0.365079,0.757293,...,0.0726,0.468868,0.124231,0.332227,0.407059,0.343750,0.5,2016.0,ORL,MIN
4,0.0,0.304348,0.323529,0.318182,0.275862,0.272727,0.432304,0.186047,0.158730,0.787631,...,0.0839,0.310377,0.128205,0.312796,0.410588,0.328571,0.5,2016.0,PHI,DAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19481,0.0,0.565217,0.705882,0.354067,0.275862,0.363636,0.339667,0.279070,0.253968,0.725788,...,0.1444,0.422956,0.139744,0.538389,0.601176,0.474107,0.7,2024.0,DET,PHO
19482,0.0,0.434783,0.279412,0.538278,0.517241,0.439394,0.540380,0.372093,0.396825,0.596266,...,0.0672,0.239413,0.253846,0.517062,0.630588,0.435714,0.3,2024.0,BRK,BOS
19483,0.0,0.652174,0.602941,0.516746,0.482759,0.621212,0.369359,0.209302,0.190476,0.730455,...,0.0824,0.330608,0.149359,0.439336,0.607059,0.458929,0.6,2024.0,UTA,GSW
19484,0.0,0.434783,0.279412,0.538278,0.517241,0.469697,0.509501,0.627907,0.634921,0.630105,...,0.0704,0.340042,0.278590,0.409005,0.537647,0.441071,0.5,2024.0,MEM,MIL


In [42]:
# Check if the merge worked properly
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,DEN,NOP,NOP,DEN,2015-11-17
3,ORL,MIN,MIN,ORL,2015-11-18
4,PHI,DAL,DAL,PHI,2015-11-16
...,...,...,...,...,...
19481,DET,PHO,PHO,DET,2024-02-14
19482,BRK,BOS,BOS,BRK,2024-02-14
19483,UTA,GSW,GSW,UTA,2024-02-15
19484,MEM,MIL,MIL,MEM,2024-02-15


In [43]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [44]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [45]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [46]:
sfs.fit(full[selected_columns], full["target"])

In [47]:
predictors = list(selected_columns[sfs.get_support()])

In [48]:
predictions = backtest(full, rr, predictors)

In [49]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6360996691201296