<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Feature Selection using vtreat</h4>
    <p style="font-size: 20px;">NBA API Data (2022-2024)</p>
</div>

<a name="Feature-Selection"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

**[1. vtreat for Total Points](#1.-vtreat-for-Total-Points)**

**[2. vtreat for Plus Minus](#2.-vtreat-for-Plus_Minus)**

**[3. vtreat for Game Winner](#3.-vtreat-for-Game-Winner)**

# Setup

[Return to top](#Feature-Selection)

In [1]:
import sys
from pathlib import Path
# get current working directory
cwd = %pwd
# add shared_code directory to Python sys.path
sys.path.append(str(Path(cwd).parent / "shared_code"))
# import all libraries in shared_code directory 'imports.py' file
from imports import *
%matplotlib inline

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Data

[Return to top](#Feature-Selection)

In [2]:
# Load the rolling box-score matchups and build one scaled frame per target.
# The arguments request the three listed seasons, '2021-22' as the training
# season, min-max scaling of the 'ROLL_'-prefixed features, and unscaled
# targets (exact semantics live in utl.load_and_scale_data).
load_options = dict(
    file_path='../../data/processed/nba_team_matchups_rolling_box_scores_2022_2024_r05.csv',
    seasons_to_keep=['2021-22', '2022-23', '2023-24'],
    training_season='2021-22',
    feature_prefix='ROLL_',
    scaler_type='minmax',
    scale_target=False,
)
pts_scaled_df, pm_scaled_df, res_scaled_df, test_set_obs = utl.load_and_scale_data(**load_options)

Season 2021-22: 1186 games
Season 2022-23: 1181 games
Season 2023-24: 692 games
Total number of games across sampled seasons: 3059 games


In [3]:
# Cutoff date used to split every target frame chronologically: rows up to this
# label form the training set (`.loc[:end_of_training]`) and rows from it onward
# form the test set (`.loc[end_of_training:]`).
# NOTE(review): both label slices are inclusive, so a game dated exactly on the
# cutoff would land in both splits — presumably no games fall on this date; confirm.
end_of_training = '2022-05-01'

<a name="1.-vtreat-for-Total-Points"></a>
# 1. vtreat for Total Points

[Return to top](#Feature-Selection)

In [4]:
# vtreat treatment plan for the regression target TOTAL_PTS
treatment_pts = vtreat.NumericOutcomeTreatment(
    outcome_name='TOTAL_PTS',
    cols_to_copy=None,  # no pass-through columns requested
    params=dict(
        filter_to_recommended=True,
        indicator_min_fraction=0.01,
    ),
)

In [5]:
# chronological split on the GAME_DATE index at the training cutoff
Xy_train = pts_scaled_df.loc[:end_of_training]
Xy_test = pts_scaled_df.loc[end_of_training:]

# learn the treatment plan on the training rows only, then reuse it on the test rows
Xy_train_processed = treatment_pts.fit_transform(Xy_train)
Xy_test_processed = treatment_pts.transform(Xy_test)

In [6]:
# re-stack the train and test rows (the untreated splits, not the *_processed frames)
pts_scaled_df_processed = pd.concat([Xy_train, Xy_test], axis=0)

# names of the variables the fitted treatment flags as recommended
pts_to_keep = (
    treatment_pts.score_frame_
    .loc[lambda scores: scores['recommended'], 'variable']
    .to_list()
)

# keep only those features plus the target column
pts_scaled_df_processed_rec = pts_scaled_df_processed[[*pts_to_keep, 'TOTAL_PTS']]

pts_scaled_df_processed_rec.head()

Unnamed: 0_level_0,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_HOME_FTA,ROLL_HOME_FTM,ROLL_HOME_PF,ROLL_AWAY_FGA,ROLL_AWAY_TOV,ROLL_AWAY_PTS,ROLL_AWAY_FG_PCT,ROLL_HOME_FG_PCT,ROLL_HOME_AST,ROLL_HOME_FG3A,ROLL_HOME_FGM,ROLL_HOME_PTS,ROLL_HOME_FG3_PCT,ROLL_AWAY_AST,ROLL_AWAY_FGM,ROLL_AWAY_DREB,TOTAL_PTS
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-10-23,0.336,0.285,0.878,0.805,0.661,0.202,0.391,0.577,0.704,0.753,0.612,0.58,0.522,0.745,0.731,0.5,0.586,0.369,185
2021-10-23,0.294,0.163,0.534,0.466,0.576,0.362,0.348,0.096,0.0,0.0,0.0,0.412,0.0,0.0,0.0,0.083,0.017,0.685,198
2021-10-23,0.672,0.772,0.534,0.593,0.661,0.176,0.174,0.635,0.728,0.758,0.561,0.454,0.652,0.691,0.466,0.708,0.586,0.685,239
2021-10-23,0.588,0.813,0.382,0.297,0.661,0.122,0.348,0.25,0.225,0.827,0.918,0.244,0.826,0.727,0.772,0.208,0.069,0.369,232
2021-10-24,0.504,0.569,0.229,0.254,0.322,1.0,0.478,1.0,0.362,0.848,0.765,0.58,0.783,0.745,0.82,0.833,0.897,0.73,204


In [7]:
# examine the TOTAL_PTS treatment's score frame: one row per variable with its
# PearsonR, R2, significance and the boolean `recommended` flag
treatment_pts.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,ROLL_HOME_REB,ROLL_HOME_REB,clean_copy,False,True,-0.05,0.002492,0.08569,36.0,0.028,False
1,ROLL_AWAY_FTM,ROLL_AWAY_FTM,clean_copy,False,True,0.1,0.01008,0.0005347,36.0,0.028,True
2,ROLL_AWAY_FTA,ROLL_AWAY_FTA,clean_copy,False,True,0.116,0.01347,6.181e-05,36.0,0.028,True
3,ROLL_HOME_FGA,ROLL_HOME_FGA,clean_copy,False,True,0.032,0.00101,0.2742,36.0,0.028,False
4,ROLL_AWAY_FG3M,ROLL_AWAY_FG3M,clean_copy,False,True,0.056,0.003117,0.05459,36.0,0.028,False
5,ROLL_HOME_FTA,ROLL_HOME_FTA,clean_copy,False,True,0.125,0.01566,1.542e-05,36.0,0.028,True
6,ROLL_HOME_FT_PCT,ROLL_HOME_FT_PCT,clean_copy,False,True,0.02,0.0004106,0.4857,36.0,0.028,False
7,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FG3_PCT,clean_copy,False,True,0.055,0.003014,0.05873,36.0,0.028,False
8,ROLL_HOME_FTM,ROLL_HOME_FTM,clean_copy,False,True,0.121,0.0147,2.83e-05,36.0,0.028,True
9,ROLL_HOME_PF,ROLL_HOME_PF,clean_copy,False,True,0.064,0.004123,0.02701,36.0,0.028,True


In [8]:
# variables flagged as recommended for TOTAL_PTS
treatment_pts.score_frame_.loc[lambda scores: scores['recommended'], ['variable']]

Unnamed: 0,variable
1,ROLL_AWAY_FTM
2,ROLL_AWAY_FTA
5,ROLL_HOME_FTA
8,ROLL_HOME_FTM
9,ROLL_HOME_PF
11,ROLL_AWAY_FGA
15,ROLL_AWAY_TOV
18,ROLL_AWAY_PTS
19,ROLL_AWAY_FG_PCT
21,ROLL_HOME_FG_PCT


In [9]:
# variables NOT flagged as recommended for TOTAL_PTS
treatment_pts.score_frame_.loc[lambda scores: ~scores['recommended'], ['variable']]

Unnamed: 0,variable
0,ROLL_HOME_REB
3,ROLL_HOME_FGA
4,ROLL_AWAY_FG3M
6,ROLL_HOME_FT_PCT
7,ROLL_AWAY_FG3_PCT
10,ROLL_AWAY_BLK
12,ROLL_HOME_DREB
13,ROLL_AWAY_REB
14,ROLL_HOME_OREB
16,ROLL_AWAY_STL


<a name="2.-vtreat-for-Plus_Minus"></a>
# 2. vtreat for Plus Minus

[Return to top](#Feature-Selection)

In [10]:
# vtreat treatment plan for the regression target PLUS_MINUS
treatment_pm = vtreat.NumericOutcomeTreatment(
    outcome_name='PLUS_MINUS',
    cols_to_copy=None,  # no pass-through columns requested
    params=dict(
        filter_to_recommended=True,
        indicator_min_fraction=0.01,
    ),
)

In [11]:
# Fit the PLUS_MINUS treatment plan on the training rows (up to the cutoff label).
Xy_train = pm_scaled_df.loc[:end_of_training]
Xy_train_processed = treatment_pm.fit_transform(Xy_train)

# Apply the same PLUS_MINUS plan to the test rows. This previously called
# treatment_pts.transform, i.e. the plan fitted for TOTAL_PTS, so the test
# frame was treated with the wrong target's variable selection.
# NOTE(review): both .loc label slices are inclusive, so a row dated exactly
# end_of_training would appear in both splits — confirm no such row exists.
Xy_test = pm_scaled_df.loc[end_of_training:]
Xy_test_processed = treatment_pm.transform(Xy_test)

In [12]:
# re-stack the train and test rows (the untreated splits, not the *_processed frames)
pm_scaled_df_processed = pd.concat([Xy_train, Xy_test], axis=0)

# names of the variables the fitted treatment flags as recommended
pm_to_keep = (
    treatment_pm.score_frame_
    .loc[lambda scores: scores['recommended'], 'variable']
    .to_list()
)

# keep only those features plus the target column
pm_scaled_df_processed_rec = pm_scaled_df_processed[[*pm_to_keep, 'PLUS_MINUS']]

pm_scaled_df_processed_rec.head()

Unnamed: 0_level_0,ROLL_HOME_DREB,ROLL_AWAY_PTS,ROLL_AWAY_FG_PCT,ROLL_HOME_FG_PCT,ROLL_HOME_AST,ROLL_HOME_FG3A,ROLL_HOME_FG3M,ROLL_HOME_FGM,ROLL_AWAY_FT_PCT,ROLL_HOME_PTS,ROLL_HOME_FG3_PCT,ROLL_AWAY_AST,ROLL_AWAY_FGM,ROLL_AWAY_DREB,PLUS_MINUS
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-10-23,0.292,0.577,0.704,0.753,0.612,0.58,0.758,0.522,0.603,0.745,0.731,0.5,0.586,0.369,7.0
2021-10-23,0.381,0.096,0.0,0.0,0.0,0.412,0.076,0.0,0.837,0.0,0.0,0.083,0.017,0.685,-8.0
2021-10-23,0.602,0.635,0.728,0.758,0.561,0.454,0.455,0.652,0.469,0.691,0.466,0.708,0.586,0.685,29.0
2021-10-23,0.159,0.25,0.225,0.827,0.918,0.244,0.53,0.826,0.268,0.727,0.772,0.208,0.069,0.369,-10.0
2021-10-24,0.779,1.0,0.362,0.848,0.765,0.58,0.833,0.783,0.446,0.745,0.82,0.833,0.897,0.73,-10.0


In [13]:
# examine the PLUS_MINUS treatment's score frame: one row per variable with its
# PearsonR, R2, significance and the boolean `recommended` flag
treatment_pm.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,ROLL_HOME_REB,ROLL_HOME_REB,clean_copy,False,True,0.061,0.003694,0.03635,36.0,0.028,False
1,ROLL_AWAY_FTM,ROLL_AWAY_FTM,clean_copy,False,True,-0.031,0.0009783,0.2818,36.0,0.028,False
2,ROLL_AWAY_FTA,ROLL_AWAY_FTA,clean_copy,False,True,0.01,0.0001015,0.7288,36.0,0.028,False
3,ROLL_HOME_FGA,ROLL_HOME_FGA,clean_copy,False,True,-0.037,0.001388,0.1997,36.0,0.028,False
4,ROLL_AWAY_FG3M,ROLL_AWAY_FG3M,clean_copy,False,True,-0.027,0.000704,0.3613,36.0,0.028,False
5,ROLL_HOME_FTA,ROLL_HOME_FTA,clean_copy,False,True,-0.013,0.000158,0.6654,36.0,0.028,False
6,ROLL_HOME_FT_PCT,ROLL_HOME_FT_PCT,clean_copy,False,True,-0.008,6.278e-05,0.7852,36.0,0.028,False
7,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FG3_PCT,clean_copy,False,True,-0.041,0.001694,0.1567,36.0,0.028,False
8,ROLL_HOME_FTM,ROLL_HOME_FTM,clean_copy,False,True,-0.011,0.0001176,0.7091,36.0,0.028,False
9,ROLL_HOME_PF,ROLL_HOME_PF,clean_copy,False,True,-0.019,0.0003648,0.5111,36.0,0.028,False


In [14]:
# variables flagged as recommended for PLUS_MINUS
treatment_pm.score_frame_.loc[lambda scores: scores['recommended'], ['variable']]

Unnamed: 0,variable
12,ROLL_HOME_DREB
18,ROLL_AWAY_PTS
19,ROLL_AWAY_FG_PCT
21,ROLL_HOME_FG_PCT
22,ROLL_HOME_AST
23,ROLL_HOME_FG3A
24,ROLL_HOME_FG3M
25,ROLL_HOME_FGM
27,ROLL_AWAY_FT_PCT
28,ROLL_HOME_PTS


In [15]:
# variables NOT flagged as recommended for PLUS_MINUS
treatment_pm.score_frame_.loc[lambda scores: ~scores['recommended'], ['variable']]

Unnamed: 0,variable
0,ROLL_HOME_REB
1,ROLL_AWAY_FTM
2,ROLL_AWAY_FTA
3,ROLL_HOME_FGA
4,ROLL_AWAY_FG3M
5,ROLL_HOME_FTA
6,ROLL_HOME_FT_PCT
7,ROLL_AWAY_FG3_PCT
8,ROLL_HOME_FTM
9,ROLL_HOME_PF


<a name="3.-vtreat-for-Game-Winner"></a>
# 3. vtreat for Game Winner

[Return to top](#Feature-Selection)

In [16]:
# vtreat treatment plan for the binary classification target GAME_RESULT
treatment_res = vtreat.BinomialOutcomeTreatment(
    outcome_name='GAME_RESULT',
    outcome_target=1,   # outcome value treated as the positive class
    cols_to_copy=None,  # no pass-through columns requested
    params=dict(
        filter_to_recommended=True,
        indicator_min_fraction=0.01,
    ),
)

In [17]:
# chronological split on the GAME_DATE index at the training cutoff
Xy_train = res_scaled_df.loc[:end_of_training]
Xy_test = res_scaled_df.loc[end_of_training:]

# learn the treatment plan on the training rows only, then reuse it on the test rows
Xy_train_processed = treatment_res.fit_transform(Xy_train)
Xy_test_processed = treatment_res.transform(Xy_test)

In [18]:
# re-stack the train and test rows (the untreated splits, not the *_processed frames)
res_scaled_df_processed = pd.concat([Xy_train, Xy_test], axis=0)

# names of the variables the fitted treatment flags as recommended
res_to_keep = (
    treatment_res.score_frame_
    .loc[lambda scores: scores['recommended'], 'variable']
    .to_list()
)

# keep only those features plus the target column
res_scaled_df_processed_rec = res_scaled_df_processed[[*res_to_keep, 'GAME_RESULT']]

res_scaled_df_processed_rec.head()

Unnamed: 0_level_0,ROLL_HOME_REB,ROLL_HOME_DREB,ROLL_AWAY_TOV,ROLL_AWAY_STL,ROLL_AWAY_PTS,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FGM,ROLL_AWAY_FT_PCT,ROLL_HOME_PTS,ROLL_AWAY_AST,ROLL_AWAY_FGM,GAME_RESULT
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-10-23,0.478,0.292,0.391,0.28,0.577,0.753,0.758,0.522,0.603,0.745,0.5,0.586,1
2021-10-23,0.826,0.381,0.348,0.28,0.096,0.0,0.076,0.0,0.837,0.0,0.083,0.017,0
2021-10-23,0.609,0.602,0.174,0.36,0.635,0.758,0.455,0.652,0.469,0.691,0.708,0.586,1
2021-10-23,0.348,0.159,0.348,0.2,0.25,0.827,0.53,0.826,0.268,0.727,0.208,0.069,0
2021-10-24,0.826,0.779,0.478,0.76,1.0,0.848,0.833,0.783,0.446,0.745,0.833,0.897,0


In [19]:
# examine the GAME_RESULT treatment's score frame: one row per variable with its
# PearsonR, R2, significance and the boolean `recommended` flag
treatment_res.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,ROLL_HOME_REB,ROLL_HOME_REB,clean_copy,False,True,0.07381,0.003967,0.01087,36.0,0.028,True
1,ROLL_AWAY_FTM,ROLL_AWAY_FTM,clean_copy,False,True,-0.01869,0.0002533,0.5199,36.0,0.028,False
2,ROLL_AWAY_FTA,ROLL_AWAY_FTA,clean_copy,False,True,0.009199,6.139e-05,0.7514,36.0,0.028,False
3,ROLL_HOME_FGA,ROLL_HOME_FGA,clean_copy,False,True,-0.03112,0.0007022,0.2839,36.0,0.028,False
4,ROLL_AWAY_FG3M,ROLL_AWAY_FG3M,clean_copy,False,True,-0.01946,0.0002746,0.5029,36.0,0.028,False
5,ROLL_HOME_FTA,ROLL_HOME_FTA,clean_copy,False,True,0.01467,0.0001561,0.6134,36.0,0.028,False
6,ROLL_HOME_FT_PCT,ROLL_HOME_FT_PCT,clean_copy,False,True,0.01153,9.643e-05,0.6913,36.0,0.028,False
7,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FG3_PCT,clean_copy,False,True,-0.02717,0.0005358,0.3493,36.0,0.028,False
8,ROLL_HOME_FTM,ROLL_HOME_FTM,clean_copy,False,True,0.02081,0.0003144,0.4734,36.0,0.028,False
9,ROLL_HOME_PF,ROLL_HOME_PF,clean_copy,False,True,-0.04246,0.001308,0.1436,36.0,0.028,False


In [20]:
# variables flagged as recommended for GAME_RESULT
treatment_res.score_frame_.loc[lambda scores: scores['recommended'], ['variable']]

Unnamed: 0,variable
0,ROLL_HOME_REB
12,ROLL_HOME_DREB
15,ROLL_AWAY_TOV
16,ROLL_AWAY_STL
18,ROLL_AWAY_PTS
21,ROLL_HOME_FG_PCT
24,ROLL_HOME_FG3M
25,ROLL_HOME_FGM
27,ROLL_AWAY_FT_PCT
28,ROLL_HOME_PTS


In [21]:
# variables NOT flagged as recommended for GAME_RESULT
treatment_res.score_frame_.loc[lambda scores: ~scores['recommended'], ['variable']]

Unnamed: 0,variable
1,ROLL_AWAY_FTM
2,ROLL_AWAY_FTA
3,ROLL_HOME_FGA
4,ROLL_AWAY_FG3M
5,ROLL_HOME_FTA
6,ROLL_HOME_FT_PCT
7,ROLL_AWAY_FG3_PCT
8,ROLL_HOME_FTM
9,ROLL_HOME_PF
10,ROLL_AWAY_BLK
