<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Feature Selection using Filter Methods</h4>
    <p style="font-size: 20px;">NBA API Data (2022-2024)</p>
</div>

<a name="Feature-Selection"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

**[1. Filter Methods for Total Points](#1.-Filter-Methods-for-Total-Points)**

- [1.1. Correlation Based](#1.1.-Correlation-Based)

- [1.2. `vtreat` Library](#1.2.-vtreat-Library)

**[2. Filter Methods for Plus Minus](#2.-Filter-Methods-for-Plus-Minus)**

- [2.1. Correlation Based](#2.1.-Correlation-Based)

- [2.2. `vtreat` Library](#2.2.-vtreat-Library)

**[3. Filter Methods for Game Winner](#3.-Filter-Methods-for-Game-Winner)**

- [3.1. Correlation Based](#3.1.-Correlation-Based)

- [3.2. `vtreat` Library](#3.2.-vtreat-Library)

# Setup

[Return to top](#Feature-Selection)

In [1]:
import sys
from pathlib import Path
# get current working directory
cwd = %pwd
# add shared_code directory to Python sys.path
sys.path.append(str(Path(cwd).parent / "shared_code"))
# import all libraries in shared_code directory 'imports.py' file
from imports import *
%matplotlib inline

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Data

[Return to top](#Feature-Selection)

In [2]:
# load all the rolling data: one DataFrame per processed CSV, bound in the
# same order as the paths listed below
# NOTE(review): the first two file names carry '2022_2024' while the others
# carry '2021_2024' - confirm that all files cover the same seasons
boxscores, opponent, advanced, hustle, miscbox, factor, scoring, track = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/nba_team_matchups_rolling_box_scores_2022_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_opponent_impact_box_scores_2022_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_adv_stats_2021_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_hustle_stats_2021_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_misc_box_stats_2021_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_four_factor_stats_2021_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_scoring_box_stats_2021_2024_r05.csv',
        '../../data/processed/nba_team_matchups_rolling_player_track_box_stats_2021_2024_r05.csv',
    )
)

In [3]:
# merge all the features
# These columns are removed from every right-hand table before merging, so
# they are contributed by `boxscores` only.
cols_to_drop = ['GAME_RESULT', 'TOTAL_PTS', 'PLUS_MINUS', 'HOME_TEAM_NAME',
                'SEASON_ID', 'GAME_DATE', 'AWAY_TEAM_NAME', 'ROLL_HOME_PTS', 'ROLL_AWAY_PTS']

# Left-join each stat table onto the box scores by GAME_ID, in a fixed order.
# NOTE(review): columns that occur in more than one table and are not listed
# in cols_to_drop receive pandas' default '_x'/'_y' suffixes (for example
# ROLL_HOME_FG_PCT_x / ROLL_HOME_FG_PCT_y in the output below) - check whether
# these are duplicated features.
features_df = boxscores
for stat_table in (opponent, advanced, hustle, miscbox, factor, scoring, track):
    features_df = features_df.merge(
        stat_table.drop(columns=cols_to_drop, errors='ignore'),
        how='left',
        on='GAME_ID',
    )

print(features_df.shape)
features_df.head()

(3172, 264)


Unnamed: 0,GAME_ID,GAME_RESULT,TOTAL_PTS,PLUS_MINUS,HOME_TEAM_NAME,SEASON_ID,GAME_DATE,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT_x,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST_x,ROLL_HOME_STL,ROLL_HOME_BLK_x,ROLL_HOME_TOV,ROLL_HOME_PF_x,...,ROLL_HOME_UFGA,ROLL_HOME_UFG_PCT,ROLL_HOME_FG_PCT_y,ROLL_HOME_DFGM,ROLL_HOME_DFGA,ROLL_HOME_DFG_PCT,ROLL_AWAY_DIST,ROLL_AWAY_ORBC,ROLL_AWAY_DRBC,ROLL_AWAY_RBC,ROLL_AWAY_TCHS,ROLL_AWAY_SAST,ROLL_AWAY_FTAST,ROLL_AWAY_PASS,ROLL_AWAY_AST_y,ROLL_AWAY_CFGM,ROLL_AWAY_CFGA,ROLL_AWAY_CFG_PCT,ROLL_AWAY_UFGM,ROLL_AWAY_UFGA,ROLL_AWAY_UFG_PCT,ROLL_AWAY_FG_PCT_y,ROLL_AWAY_DFGM,ROLL_AWAY_DFGA,ROLL_AWAY_DFG_PCT
0,22100002,0,235,-7.0,Los Angeles Lakers,2021-22,2021-10-19,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,22100001,1,231,23.0,Milwaukee Bucks,2021-22,2021-10-19,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,22100007,1,253,11.0,Memphis Grizzlies,2021-22,2021-10-20,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,22100008,1,230,18.0,Minnesota Timberwolves,2021-22,2021-10-20,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,22100009,0,214,-20.0,New Orleans Pelicans,2021-22,2021-10-20,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# load, filter (by time) and scale data
# The helper returns one modelling frame per target (total points, plus/minus,
# game result) plus `test_set_obs`; its printed output reports the number of
# games kept per season.
# NOTE(review): presumably the scaler is fit on `training_season` only and
# then applied to the later seasons - confirm in utl.load_and_scale_data to
# rule out leakage from the test seasons.
pts_features, pm_features, res_features, test_set_obs = utl.load_and_scale_data(
    input_data=features_df,
    seasons_to_keep=['2021-22', '2022-23', '2023-24'],
    training_season='2021-22',
    # only columns starting with these prefixes are treated as features
    # (assumed from the argument name - TODO confirm)
    feature_prefixes=['ROLL_', 'ROLLDIFF_'],
    scaler_type='minmax', 
    # targets are left in their original units
    scale_target=False
)

Season 2021-22: 1186 games
Season 2022-23: 1181 games
Season 2023-24: 691 games
Total number of games across sampled seasons: 3058 games


In [5]:
# preview the total-points modelling frame returned by utl.load_and_scale_data;
# with scaler_type='minmax' and scale_target=False the feature columns are
# expected to lie in [0, 1] while TOTAL_PTS stays in raw points
pts_features.head()

Unnamed: 0_level_0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT_x,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST_x,ROLL_HOME_STL,ROLL_HOME_BLK_x,ROLL_HOME_TOV,ROLL_HOME_PF_x,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT_x,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,...,ROLL_HOME_UFG_PCT,ROLL_HOME_FG_PCT_y,ROLL_HOME_DFGM,ROLL_HOME_DFGA,ROLL_HOME_DFG_PCT,ROLL_AWAY_DIST,ROLL_AWAY_ORBC,ROLL_AWAY_DRBC,ROLL_AWAY_RBC,ROLL_AWAY_TCHS,ROLL_AWAY_SAST,ROLL_AWAY_FTAST,ROLL_AWAY_PASS,ROLL_AWAY_AST_y,ROLL_AWAY_CFGM,ROLL_AWAY_CFGA,ROLL_AWAY_CFG_PCT,ROLL_AWAY_UFGM,ROLL_AWAY_UFGA,ROLL_AWAY_UFG_PCT,ROLL_AWAY_FG_PCT_y,ROLL_AWAY_DFGM,ROLL_AWAY_DFGA,ROLL_AWAY_DFG_PCT,TOTAL_PTS
GAME_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
2021-10-23,0.745,0.522,0.296,0.753,0.758,0.58,0.731,0.805,0.878,0.535,0.571,0.292,0.478,0.612,1.0,1.0,0.6,0.661,0.577,0.586,0.202,0.704,0.526,0.176,1.0,...,0.761,0.753,0.909,1.0,0.352,0.665,0.613,0.119,0.252,0.53,0.25,0.714,0.531,0.5,0.595,0.704,0.564,0.409,0.19,0.738,0.704,0.62,0.423,1.0,185
2021-10-23,0.0,0.0,0.648,0.0,0.076,0.412,0.0,0.466,0.534,0.438,1.0,0.381,0.826,0.0,0.42,0.273,0.657,0.576,0.096,0.017,0.362,0.0,0.421,0.588,0.364,...,0.0,0.0,0.909,0.655,0.762,0.619,0.387,0.489,0.409,0.579,0.125,0.179,0.517,0.083,0.0,0.458,0.0,0.364,0.453,0.301,0.0,0.185,0.269,0.453,198
2021-10-23,0.691,0.652,0.507,0.758,0.455,0.454,0.466,0.593,0.534,0.72,0.286,0.602,0.609,0.561,0.058,0.364,0.257,0.661,0.635,0.586,0.176,0.728,0.263,0.265,0.396,...,0.676,0.758,0.398,0.276,0.72,0.591,0.032,0.744,0.37,0.66,0.375,0.357,0.638,0.708,0.238,0.246,0.581,0.682,0.433,0.749,0.728,0.076,0.423,0.023,239
2021-10-23,0.727,0.826,0.683,0.827,0.53,0.244,0.772,0.297,0.382,0.315,0.571,0.159,0.348,0.918,0.275,0.182,0.029,0.661,0.25,0.069,0.122,0.225,0.368,0.559,0.317,...,0.712,0.827,0.455,0.586,0.291,0.715,0.452,0.46,0.409,0.46,0.375,0.179,0.403,0.208,0.357,0.282,0.735,0.136,0.372,0.075,0.225,0.511,0.346,0.965,232
2021-10-24,0.745,0.783,0.577,0.848,0.833,0.58,0.82,0.254,0.229,0.56,0.357,0.779,0.826,0.765,0.565,0.818,0.543,0.322,1.0,0.897,1.0,0.362,0.842,1.0,0.559,...,0.85,0.848,0.341,0.655,0.075,1.0,0.903,0.46,0.665,1.0,0.875,0.536,0.859,0.833,0.179,0.352,0.355,1.0,1.0,0.47,0.362,0.783,0.731,0.732,204


<a name="1.-Filter-Methods-for-Total-Points"></a>
# 1. Filter Methods for Total Points

[Return to top](#Feature-Selection)

<a name="1.1.-Correlation-Based"></a>
## 1.1. Correlation Based

[Return to top](#Feature-Selection)

In [6]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# filter-based feature selection for the TOTAL_PTS target
# The result gets its own name because `pts_selection` is rebound by the
# vtreat cell further down, which would otherwise discard this result.
pts_filter_selection = utl.filter_feature_selection(
    df=pts_features,
    outcome_name='TOTAL_PTS'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

# keep the original name bound as well, for backward compatibility
pts_selection = pts_filter_selection

{
    "outcome_correlation": [
        "ROLL_HOME_PTS",
        "ROLL_HOME_FGM",
        "ROLL_HOME_FG_PCT_x",
        "ROLL_HOME_FTM",
        "ROLL_HOME_FTA",
        "ROLL_HOME_AST_x",
        "ROLL_AWAY_PTS",
        "ROLL_AWAY_FGM",
        "ROLL_AWAY_FGA",
        "ROLL_AWAY_FG_PCT_x",
        "ROLL_AWAY_FTM",
        "ROLL_AWAY_FTA",
        "ROLL_AWAY_AST_x",
        "ROLL_HOME_estimatedOffensiveRating",
        "ROLL_HOME_offensiveRating",
        "ROLL_HOME_estimatedDefensiveRating",
        "ROLL_HOME_defensiveRating",
        "ROLL_HOME_assistToTurnover",
        "ROLL_HOME_assistRatio",
        "ROLL_HOME_effectiveFieldGoalPercentage",
        "ROLL_HOME_trueShootingPercentage",
        "ROLL_HOME_estimatedPace",
        "ROLL_AWAY_estimatedOffensiveRating",
        "ROLL_AWAY_offensiveRating",
        "ROLL_AWAY_estimatedDefensiveRating",
        "ROLL_AWAY_defensiveRating",
        "ROLL_AWAY_assistToTurnover",
        "ROLL_AWAY_estimatedTeamTurnoverPercentage",
       

<a name="1.2.-vtreat-Library"></a>
## 1.2. `vtreat` Library

[Return to top](#Feature-Selection)

In [7]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# automated feature selection and preprocessing for the TOTAL_PTS target
# Returns the reduced frame and the list of selected feature names.
# NOTE(review): if utl.vtreat_feature_selection relies on randomized
# cross-validation internally, a fixed seed is needed for reproducible
# selections - confirm in the helper.
pts_features_selected, pts_selection = utl.vtreat_feature_selection(
    df=pts_features,
    outcome_name='TOTAL_PTS'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

There were 83 features selected out of 256 original features

Total time taken: 0.81 seconds


In [8]:
# preview the frame returned by the vtreat step: the selected feature columns
# followed by TOTAL_PTS
# NOTE(review): the output shows a default integer index here, whereas
# pts_features was indexed by GAME_DATE - confirm that dropping the date
# index is intended
pts_features_selected.head()

Unnamed: 0,ROLL_HOME_PCT_AST_2PM,ROLL_AWAY_effectiveFieldGoalPercentage,ROLL_AWAY_offensiveRating,ROLL_HOME_CHARGES_DRAWN,ROLL_HOME_CFG_PCT,ROLL_HOME_pacePer40,ROLL_HOME_DFGA,ROLL_AWAY_possessions,ROLL_HOME_CFGM,ROLL_HOME_offensiveRating,ROLL_AWAY_BOX_OUT_PLAYER_TEAM_REBS,ROLL_AWAY_CFG_PCT,ROLL_HOME_FTA,ROLL_HOME_OFF_LOOSE_BALLS_RECOVERED,ROLL_HOME_FTM,ROLL_HOME_estimatedOffensiveRating,ROLL_HOME_trueShootingPercentage,ROLL_HOME_FTA_RATE,ROLL_AWAY_TM_TOV_PCT,ROLL_AWAY_PTS_OFF_TOV,ROLL_AWAY_FGM,ROLL_AWAY_EFG_PCT,ROLL_AWAY_DFG_PCT,ROLL_HOME_PCT_UAST_2PM,ROLL_AWAY_UFGM,...,ROLL_HOME_UFG_PCT,ROLL_AWAY_estimatedDefensiveRating,ROLL_AWAY_trueShootingPercentage,ROLL_HOME_defensiveRating,ROLL_HOME_assistRatio,ROLL_AWAY_estimatedOffensiveRating,ROLL_HOME_AST_x,ROLL_HOME_DFG_PCT,ROLL_HOME_estimatedPace,ROLL_HOME_estimatedDefensiveRating,ROLL_HOME_AST_y,ROLL_HOME_PCT_PTS_OFF_TOV,ROLL_AWAY_FTM,ROLL_AWAY_estimatedTeamTurnoverPercentage,ROLL_HOME_OPP_EFG_PCT,ROLL_AWAY_PTS,ROLL_AWAY_AST_y,ROLL_AWAY_AST_x,ROLL_AWAY_defensiveRating,ROLL_AWAY_pace,ROLL_AWAY_DFGA,ROLL_AWAY_DEF_BOXOUTS,ROLL_HOME_DFGM,ROLL_HOME_OPP_TOV_PCT,TOTAL_PTS
0,0.807,0.712,0.611,0.667,0.511,1.0,1.0,0.379,0.257,0.727,0.149,0.564,0.878,0.6,0.805,0.762,0.808,0.812,0.521,0.604,0.586,0.712,1.0,0.193,0.409,...,0.761,0.881,0.714,0.315,0.597,0.672,0.612,0.352,0.763,0.298,0.612,1.0,0.336,0.519,0.487,0.577,0.5,0.5,0.875,0.515,0.423,0.208,0.909,0.84,185
1,0.184,0.0,0.152,0.0,0.138,0.909,0.655,0.408,0.527,0.0,0.468,0.0,0.534,0.3,0.466,0.0,0.0,0.414,0.406,0.34,0.017,0.0,0.453,0.816,0.364,...,0.0,0.566,0.035,0.192,0.0,0.183,0.0,0.762,0.732,0.176,0.0,0.602,0.294,0.404,0.308,0.096,0.083,0.083,0.583,0.557,0.269,0.307,0.909,0.762,198
2,0.291,0.618,0.684,0.0,0.601,0.879,0.276,0.349,0.932,0.715,0.149,0.581,0.534,0.45,0.593,0.685,0.712,0.444,0.255,0.377,0.586,0.618,0.023,0.709,0.682,...,0.676,0.462,0.67,0.728,0.582,0.7,0.561,0.72,0.74,0.717,0.561,0.087,0.672,0.256,0.624,0.635,0.708,0.708,0.513,0.43,0.423,0.208,0.398,0.144,239
3,0.604,0.21,0.277,0.0,0.746,0.242,0.586,0.438,0.662,0.987,0.149,0.735,0.382,0.6,0.297,0.849,0.683,0.281,0.385,0.0,0.069,0.21,0.965,0.396,0.136,...,0.712,0.894,0.256,0.402,0.888,0.279,0.918,0.291,0.367,0.328,0.918,0.469,0.588,0.383,0.239,0.25,0.208,0.208,0.944,0.641,0.346,0.109,0.455,0.519,232
4,0.65,0.416,0.6,0.333,0.566,0.758,0.655,1.0,0.865,0.791,0.415,0.355,0.229,0.45,0.254,0.708,0.797,0.167,0.37,0.453,0.897,0.416,0.732,0.35,1.0,...,0.85,0.555,0.414,0.101,0.724,0.543,0.765,0.075,0.812,0.002,0.765,0.461,0.504,0.371,0.205,1.0,0.833,0.833,0.614,0.516,0.731,0.455,0.341,0.591,204


In [9]:
# display the feature names returned by utl.vtreat_feature_selection for TOTAL_PTS
pts_selection

['ROLL_HOME_PCT_AST_2PM',
 'ROLL_AWAY_effectiveFieldGoalPercentage',
 'ROLL_AWAY_offensiveRating',
 'ROLL_HOME_CHARGES_DRAWN',
 'ROLL_HOME_CFG_PCT',
 'ROLL_HOME_pacePer40',
 'ROLL_HOME_DFGA',
 'ROLL_AWAY_possessions',
 'ROLL_HOME_CFGM',
 'ROLL_HOME_offensiveRating',
 'ROLL_AWAY_BOX_OUT_PLAYER_TEAM_REBS',
 'ROLL_AWAY_CFG_PCT',
 'ROLL_HOME_FTA',
 'ROLL_HOME_OFF_LOOSE_BALLS_RECOVERED',
 'ROLL_HOME_FTM',
 'ROLL_HOME_estimatedOffensiveRating',
 'ROLL_HOME_trueShootingPercentage',
 'ROLL_HOME_FTA_RATE',
 'ROLL_AWAY_TM_TOV_PCT',
 'ROLL_AWAY_PTS_OFF_TOV',
 'ROLL_AWAY_FGM',
 'ROLL_AWAY_EFG_PCT',
 'ROLL_AWAY_DFG_PCT',
 'ROLL_HOME_PCT_UAST_2PM',
 'ROLL_AWAY_UFGM',
 'ROLL_AWAY_FGA',
 'ROLL_HOME_FG_PCT_y',
 'ROLL_AWAY_OFF_LOOSE_BALLS_RECOVERED',
 'ROLL_HOME_PTS_PAINT',
 'ROLL_AWAY_assistToTurnover',
 'ROLL_AWAY_BOX_OUTS',
 'ROLL_AWAY_CFGM',
 'ROLL_AWAY_DFGM',
 'ROLL_HOME_FGM',
 'ROLL_AWAY_FTA',
 'ROLL_HOME_OPP_PTS_PAINT',
 'ROLL_HOME_FTAST',
 'ROLL_HOME_pace',
 'ROLL_HOME_PTS',
 'ROLL_HOME_effectiv

<a name="2.-Filter-Methods-for-Plus-Minus"></a>
# 2. Filter Methods for Plus Minus

[Return to top](#Feature-Selection)

<a name="2.1.-Correlation-Based"></a>
## 2.1. Correlation Based

[Return to top](#Feature-Selection)

In [10]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# filter-based feature selection for the PLUS_MINUS target
# The result gets its own name because `pm_selection` is rebound by the
# vtreat cell further down, which would otherwise discard this result.
pm_filter_selection = utl.filter_feature_selection(
    df=pm_features,
    outcome_name='PLUS_MINUS'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

# keep the original name bound as well, for backward compatibility
pm_selection = pm_filter_selection

{
    "outcome_correlation": [
        "ROLL_HOME_PTS",
        "ROLL_HOME_FG_PCT_x",
        "ROLL_AWAY_FT_PCT",
        "ROLLDIFF_HOME_PTS",
        "ROLLDIFF_HOME_FGM",
        "ROLLDIFF_HOME_FG_PCT",
        "ROLLDIFF_HOME_DREB",
        "ROLLDIFF_HOME_REB",
        "ROLLDIFF_HOME_AST",
        "ROLLDIFF_HOME_BLK",
        "ROLLDIFF_AWAY_PTS",
        "ROLLDIFF_AWAY_FGM",
        "ROLLDIFF_AWAY_FG_PCT",
        "ROLLDIFF_AWAY_FT_PCT",
        "ROLLDIFF_AWAY_DREB",
        "ROLLDIFF_AWAY_AST",
        "ROLL_HOME_estimatedOffensiveRating",
        "ROLL_HOME_offensiveRating",
        "ROLL_HOME_estimatedNetRating",
        "ROLL_HOME_netRating",
        "ROLL_HOME_effectiveFieldGoalPercentage",
        "ROLL_HOME_trueShootingPercentage",
        "ROLL_HOME_PIE",
        "ROLL_AWAY_offensiveRating",
        "ROLL_AWAY_estimatedDefensiveRating",
        "ROLL_AWAY_defensiveRating",
        "ROLL_AWAY_estimatedNetRating",
        "ROLL_AWAY_netRating",
        "ROLL_AWAY_PIE",
        "

<a name="2.2.-vtreat-Library"></a>
## 2.2. `vtreat` Library

[Return to top](#Feature-Selection)

In [11]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# automated feature selection and preprocessing for the PLUS_MINUS target
# Returns the reduced frame and the list of selected feature names.
# NOTE(review): if utl.vtreat_feature_selection relies on randomized
# cross-validation internally, a fixed seed is needed for reproducible
# selections - confirm in the helper.
pm_features_selected, pm_selection = utl.vtreat_feature_selection(
    df=pm_features,
    outcome_name='PLUS_MINUS'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

There were 63 features selected out of 256 original features

Total time taken: 0.66 seconds


In [12]:
# preview the frame returned by the vtreat step: the selected feature columns
# followed by PLUS_MINUS
pm_features_selected.head()

Unnamed: 0,ROLL_AWAY_offensiveRating,ROLLDIFF_AWAY_FG_PCT,ROLLDIFF_AWAY_TOV,ROLLDIFF_HOME_AST,ROLL_AWAY_SAST,ROLL_HOME_CFG_PCT,ROLL_HOME_FG3M,ROLLDIFF_AWAY_AST,ROLL_HOME_offensiveRating,ROLL_HOME_netRating,ROLLDIFF_HOME_FGM,ROLL_HOME_estimatedOffensiveRating,ROLL_HOME_trueShootingPercentage,ROLL_HOME_UFGA,ROLLDIFF_HOME_FG_PCT,ROLL_AWAY_FGM,ROLL_HOME_SCREEN_ASSISTS,ROLL_HOME_FG_PCT_y,ROLLDIFF_AWAY_STL,ROLL_HOME_PIE,ROLL_AWAY_PIE,ROLL_AWAY_DFGM,ROLL_HOME_FGM,ROLLDIFF_HOME_DREB,ROLLDIFF_HOME_PTS,...,ROLL_HOME_DREB,ROLL_AWAY_netRating,ROLL_HOME_FG_PCT_x,ROLL_HOME_UFG_PCT,ROLL_AWAY_estimatedDefensiveRating,ROLL_AWAY_trueShootingPercentage,ROLL_HOME_defensiveRating,ROLL_HOME_assistRatio,ROLL_AWAY_estimatedOffensiveRating,ROLLDIFF_AWAY_FT_PCT,ROLL_HOME_BLKA,ROLL_HOME_estimatedDefensiveRating,ROLLDIFF_AWAY_FGM,ROLL_AWAY_FT_PCT,ROLLDIFF_AWAY_PTS,ROLL_HOME_OPP_EFG_PCT,ROLL_AWAY_PTS,ROLLDIFF_AWAY_REB,ROLL_AWAY_defensiveRating,ROLL_AWAY_DFGA,ROLL_HOME_SCREEN_AST_PTS,ROLL_HOME_DFGM,ROLLDIFF_HOME_FG3M,ROLL_HOME_estimatedNetRating,PLUS_MINUS
0,0.611,0.428,0.28,0.519,0.25,0.511,0.758,0.381,0.727,0.698,0.485,0.762,0.808,0.679,0.488,0.586,0.209,0.753,0.533,0.78,0.33,0.62,0.522,0.585,0.641,...,0.292,0.319,0.753,0.761,0.881,0.714,0.315,0.597,0.672,0.519,0.3,0.298,0.509,0.603,0.482,0.487,0.577,0.437,0.875,0.423,0.248,0.909,0.686,0.779,7.0
1,0.152,0.464,0.402,0.331,0.125,0.138,0.076,0.467,0.0,0.242,0.288,0.0,0.0,0.359,0.13,0.017,0.209,0.0,0.3,0.0,0.06,0.185,0.0,0.455,0.286,...,0.381,0.182,0.0,0.0,0.566,0.035,0.192,0.0,0.183,0.39,0.8,0.176,0.491,0.837,0.472,0.308,0.096,0.364,0.583,0.269,0.208,0.909,0.605,0.286,-8.0
2,0.684,0.399,0.366,0.459,0.375,0.601,0.455,0.171,0.715,0.417,0.523,0.685,0.712,0.295,0.569,0.586,0.86,0.758,0.6,0.502,0.746,0.076,0.652,0.675,0.484,...,0.602,0.654,0.758,0.676,0.462,0.67,0.728,0.582,0.7,0.523,0.4,0.717,0.473,0.469,0.365,0.624,0.635,0.331,0.513,0.423,0.861,0.398,0.256,0.457,29.0
3,0.277,0.188,0.585,0.504,0.375,0.746,0.53,0.314,0.987,0.833,0.538,0.849,0.683,0.808,0.346,0.069,0.488,0.827,0.35,0.787,0.0,0.511,0.826,0.317,0.508,...,0.159,0.0,0.827,0.712,0.894,0.256,0.402,0.888,0.279,0.627,0.6,0.328,0.161,0.268,0.149,0.239,0.25,0.245,0.944,0.346,0.406,0.455,0.593,0.827,-10.0
4,0.6,0.232,0.463,0.624,0.875,0.566,0.833,0.571,0.791,0.886,0.75,0.708,0.797,0.391,0.715,0.897,0.349,0.848,0.4,0.978,0.471,0.783,0.783,0.634,0.738,...,0.779,0.509,0.848,0.85,0.555,0.414,0.101,0.724,0.543,0.415,0.4,0.002,0.268,0.446,0.309,0.205,1.0,0.318,0.614,0.731,0.366,0.341,0.698,0.931,-10.0


In [13]:
# display the feature names returned by utl.vtreat_feature_selection for PLUS_MINUS
pm_selection

['ROLL_AWAY_offensiveRating',
 'ROLLDIFF_AWAY_FG_PCT',
 'ROLLDIFF_AWAY_TOV',
 'ROLLDIFF_HOME_AST',
 'ROLL_AWAY_SAST',
 'ROLL_HOME_CFG_PCT',
 'ROLL_HOME_FG3M',
 'ROLLDIFF_AWAY_AST',
 'ROLL_HOME_offensiveRating',
 'ROLL_HOME_netRating',
 'ROLLDIFF_HOME_FGM',
 'ROLL_HOME_estimatedOffensiveRating',
 'ROLL_HOME_trueShootingPercentage',
 'ROLL_HOME_UFGA',
 'ROLLDIFF_HOME_FG_PCT',
 'ROLL_AWAY_FGM',
 'ROLL_HOME_SCREEN_ASSISTS',
 'ROLL_HOME_FG_PCT_y',
 'ROLLDIFF_AWAY_STL',
 'ROLL_HOME_PIE',
 'ROLL_AWAY_PIE',
 'ROLL_AWAY_DFGM',
 'ROLL_HOME_FGM',
 'ROLLDIFF_HOME_DREB',
 'ROLLDIFF_HOME_PTS',
 'ROLL_HOME_OPP_PTS_PAINT',
 'ROLLDIFF_AWAY_DREB',
 'ROLL_HOME_PTS',
 'ROLL_AWAY_estimatedNetRating',
 'ROLLDIFF_HOME_REB',
 'ROLL_HOME_effectiveFieldGoalPercentage',
 'ROLL_AWAY_OPP_PTS_PAINT',
 'ROLL_HOME_EFG_PCT',
 'ROLL_AWAY_OPP_EFG_PCT',
 'ROLL_HOME_CFGA',
 'ROLL_AWAY_OPP_PTS_OFF_TOV',
 'ROLLDIFF_HOME_BLK',
 'ROLL_HOME_SAST',
 'ROLL_HOME_UFGM',
 'ROLL_HOME_DREB',
 'ROLL_AWAY_netRating',
 'ROLL_HOME_FG_PCT

<a name="3.-Filter-Methods-for-Game-Winner"></a>
# 3. Filter Methods for Game Winner

[Return to top](#Feature-Selection)

<a name="3.1.-Correlation-Based"></a>
## 3.1. Correlation Based

[Return to top](#Feature-Selection)

In [14]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# filter-based feature selection for the GAME_RESULT target
# The result gets its own name because `res_selection` is rebound by the
# vtreat cell further down, which would otherwise discard this result.
res_filter_selection = utl.filter_feature_selection(
    df=res_features,
    outcome_name='GAME_RESULT'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

# keep the original name bound as well, for backward compatibility
res_selection = res_filter_selection

{
    "outcome_correlation": [
        "ROLL_HOME_FG_PCT_x",
        "ROLL_AWAY_STL",
        "ROLLDIFF_HOME_PTS",
        "ROLLDIFF_HOME_FGM",
        "ROLLDIFF_HOME_FG_PCT",
        "ROLLDIFF_HOME_DREB",
        "ROLLDIFF_HOME_REB",
        "ROLLDIFF_HOME_BLK",
        "ROLLDIFF_AWAY_PTS",
        "ROLLDIFF_AWAY_FGM",
        "ROLLDIFF_AWAY_FG_PCT",
        "ROLLDIFF_AWAY_AST",
        "ROLLDIFF_AWAY_STL",
        "ROLLDIFF_AWAY_TOV",
        "ROLL_HOME_estimatedOffensiveRating",
        "ROLL_HOME_offensiveRating",
        "ROLL_HOME_estimatedDefensiveRating",
        "ROLL_HOME_estimatedNetRating",
        "ROLL_HOME_netRating",
        "ROLL_HOME_effectiveFieldGoalPercentage",
        "ROLL_HOME_trueShootingPercentage",
        "ROLL_HOME_PIE",
        "ROLL_AWAY_estimatedDefensiveRating",
        "ROLL_AWAY_defensiveRating",
        "ROLL_AWAY_estimatedNetRating",
        "ROLL_AWAY_netRating",
        "ROLL_AWAY_PIE",
        "ROLL_AWAY_DEFLECTIONS",
        "ROLL_HOME_OPP_PTS_P

<a name="3.2.-vtreat-Library"></a>
## 3.2. `vtreat` Library

[Return to top](#Feature-Selection)

In [15]:
# time.perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is wall-clock time and can jump if the system clock is adjusted
start_time = time.perf_counter()

# automated feature selection and preprocessing for the GAME_RESULT target
# Returns the reduced frame and the list of selected feature names.
# NOTE(review): if utl.vtreat_feature_selection relies on randomized
# cross-validation internally, a fixed seed is needed for reproducible
# selections - confirm in the helper.
res_features_selected, res_selection = utl.vtreat_feature_selection(
    df=res_features,
    outcome_name='GAME_RESULT'
)

end_time = time.perf_counter()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

There were 54 features selected out of 256 original features

Total time taken: 1.06 seconds


In [16]:
# preview the frame returned by the vtreat step: the selected feature columns
# followed by GAME_RESULT
res_features_selected.head()

Unnamed: 0,ROLLDIFF_AWAY_FG_PCT,ROLLDIFF_AWAY_TOV,ROLLDIFF_HOME_AST,ROLL_AWAY_DEFLECTIONS,ROLL_HOME_CFG_PCT,ROLL_HOME_DFGA,ROLLDIFF_AWAY_AST,ROLL_HOME_offensiveRating,ROLL_HOME_netRating,ROLLDIFF_HOME_FGM,ROLL_HOME_estimatedOffensiveRating,ROLL_HOME_trueShootingPercentage,ROLL_HOME_UFGA,ROLLDIFF_HOME_FG_PCT,ROLL_AWAY_STL,ROLL_AWAY_PTS_OFF_TOV,ROLL_HOME_SCREEN_ASSISTS,ROLL_HOME_FG_PCT_y,ROLLDIFF_AWAY_STL,ROLL_HOME_PIE,ROLL_AWAY_PIE,ROLL_AWAY_DFGM,ROLLDIFF_HOME_DREB,ROLLDIFF_HOME_PTS,ROLL_HOME_OPP_PTS_PAINT,...,ROLL_HOME_EFG_PCT,ROLL_AWAY_OPP_EFG_PCT,ROLL_HOME_CFGA,ROLL_AWAY_OPP_PTS_OFF_TOV,ROLLDIFF_HOME_BLK,ROLL_AWAY_assistRatio,ROLL_HOME_UFGM,ROLL_HOME_DREB,ROLL_AWAY_netRating,ROLL_HOME_FG_PCT_x,ROLL_AWAY_estimatedDefensiveRating,ROLL_HOME_defensiveRating,ROLL_HOME_BLKA,ROLL_HOME_estimatedDefensiveRating,ROLLDIFF_AWAY_FGM,ROLLDIFF_AWAY_PTS,ROLL_HOME_OPP_EFG_PCT,ROLL_AWAY_AST_y,ROLL_AWAY_AST_x,ROLL_AWAY_defensiveRating,ROLL_AWAY_OPP_TOV_PCT,ROLL_HOME_SCREEN_AST_PTS,ROLL_HOME_DFGM,ROLL_HOME_estimatedNetRating,GAME_RESULT
0,0.428,0.28,0.519,0.363,0.511,1.0,0.381,0.727,0.698,0.485,0.762,0.808,0.679,0.488,0.28,0.604,0.209,0.753,0.533,0.78,0.33,0.62,0.585,0.641,0.808,...,0.769,0.9,0.225,0.875,0.552,0.571,0.696,0.292,0.319,0.753,0.881,0.315,0.3,0.298,0.509,0.482,0.487,0.5,0.5,0.875,0.377,0.248,0.909,0.779,1
1,0.464,0.402,0.331,0.265,0.138,0.655,0.467,0.0,0.242,0.288,0.0,0.0,0.359,0.13,0.28,0.34,0.209,0.0,0.3,0.0,0.06,0.185,0.455,0.286,0.758,...,0.0,0.735,1.0,0.125,0.621,0.116,0.0,0.381,0.182,0.0,0.566,0.192,0.8,0.176,0.491,0.472,0.308,0.083,0.083,0.583,0.309,0.208,0.909,0.286,0
2,0.399,0.366,0.459,0.314,0.601,0.276,0.171,0.715,0.417,0.523,0.685,0.712,0.295,0.569,0.36,0.377,0.86,0.758,0.6,0.502,0.746,0.076,0.675,0.484,0.505,...,0.679,0.474,0.922,0.542,0.534,0.769,0.391,0.602,0.654,0.758,0.462,0.728,0.4,0.717,0.473,0.365,0.624,0.708,0.708,0.513,0.491,0.861,0.398,0.457,1
3,0.188,0.585,0.504,0.314,0.746,0.586,0.314,0.987,0.833,0.538,0.849,0.683,0.808,0.346,0.2,0.0,0.488,0.827,0.35,0.787,0.0,0.511,0.317,0.508,0.606,...,0.739,0.907,0.496,0.625,0.069,0.265,0.739,0.159,0.0,0.827,0.894,0.402,0.6,0.328,0.161,0.149,0.239,0.208,0.208,0.944,0.171,0.406,0.455,0.827,0
4,0.232,0.463,0.624,0.559,0.566,0.655,0.571,0.791,0.886,0.75,0.708,0.797,0.391,0.715,0.76,0.453,0.349,0.848,0.4,0.978,0.471,0.783,0.634,0.738,0.202,...,0.836,0.763,0.884,0.458,0.707,0.571,0.565,0.779,0.509,0.848,0.555,0.101,0.4,0.002,0.268,0.309,0.205,0.833,0.833,0.614,0.44,0.366,0.341,0.931,0


In [17]:
# display the feature names returned by utl.vtreat_feature_selection for GAME_RESULT
res_selection

['ROLLDIFF_AWAY_FG_PCT',
 'ROLLDIFF_AWAY_TOV',
 'ROLLDIFF_HOME_AST',
 'ROLL_AWAY_DEFLECTIONS',
 'ROLL_HOME_CFG_PCT',
 'ROLL_HOME_DFGA',
 'ROLLDIFF_AWAY_AST',
 'ROLL_HOME_offensiveRating',
 'ROLL_HOME_netRating',
 'ROLLDIFF_HOME_FGM',
 'ROLL_HOME_estimatedOffensiveRating',
 'ROLL_HOME_trueShootingPercentage',
 'ROLL_HOME_UFGA',
 'ROLLDIFF_HOME_FG_PCT',
 'ROLL_AWAY_STL',
 'ROLL_AWAY_PTS_OFF_TOV',
 'ROLL_HOME_SCREEN_ASSISTS',
 'ROLL_HOME_FG_PCT_y',
 'ROLLDIFF_AWAY_STL',
 'ROLL_HOME_PIE',
 'ROLL_AWAY_PIE',
 'ROLL_AWAY_DFGM',
 'ROLLDIFF_HOME_DREB',
 'ROLLDIFF_HOME_PTS',
 'ROLL_HOME_OPP_PTS_PAINT',
 'ROLL_HOME_PTS',
 'ROLL_AWAY_estimatedNetRating',
 'ROLLDIFF_HOME_REB',
 'ROLL_HOME_effectiveFieldGoalPercentage',
 'ROLL_AWAY_OPP_PTS_PAINT',
 'ROLL_HOME_EFG_PCT',
 'ROLL_AWAY_OPP_EFG_PCT',
 'ROLL_HOME_CFGA',
 'ROLL_AWAY_OPP_PTS_OFF_TOV',
 'ROLLDIFF_HOME_BLK',
 'ROLL_AWAY_assistRatio',
 'ROLL_HOME_UFGM',
 'ROLL_HOME_DREB',
 'ROLL_AWAY_netRating',
 'ROLL_HOME_FG_PCT_x',
 'ROLL_AWAY_estimatedDefen