<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Feature Engineering</h4>
    <p style="font-size: 20px;">NBA API Data (2022-2024)</p>
</div>

<a name="Feature-Engineering"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

**[1. Create Team Matchups and Targets](#1.-Create-Team-Matchups-and-Targets)**

- [1.1. Clean Game Data](#1.1.-Clean-Game-Data)

- [1.2. Reshape to Game Matchups](#1.2.-Reshape-to-Game-Matchups)

- [1.3. Create Target Variables](#1.3.-Create-Target-Variables)

**[2. Create Rolling Window Statistics](#2.-Create-Rolling-Window-Statistics)**

# Setup

[Return to top](#Feature-Engineering)

In [2]:
import sys
from pathlib import Path
# Capture the notebook's working directory via the IPython %pwd magic
# (IPython-only syntax, so this cell runs only inside a Jupyter/IPython kernel).
cwd = %pwd
# Make the sibling `shared_code` directory (one level above the working
# directory) importable by appending it to sys.path.
sys.path.append(str(Path(cwd).parent / "shared_code"))
# Star-import everything exposed by shared_code/imports.py.
# NOTE(review): the `pd` and `utl` names used throughout this notebook are
# presumably provided by this star import — confirm against imports.py.
from imports import *
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data

[Return to top](#Feature-Engineering)

In [45]:
# Load the raw per-team, per-game "four factors" statistics and preview the
# first five rows.
four_factor_stats_df = pd.read_csv(
    filepath_or_buffer='../../data/original/nba_four_factors_statistics_2021_2024.csv'
)
four_factor_stats_df.head(n=5)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22101221,1610612745,Rockets,HOU,Houston,240.000000:00,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326
1,22101221,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216
2,22101207,1610612748,Heat,MIA,Miami,240.000000:00,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354
3,22101207,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167
4,22101192,1610612764,Wizards,WAS,Washington,240.000000:00,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2


In [46]:
# Load the raw per-team, per-game box scores (one row per team per game).
box_score_df = pd.read_csv(
    filepath_or_buffer='../../data/original/nba_games_box_scores_2022_2024.csv'
)

In [47]:
# Preview the last five box-score rows.
box_score_df.tail(n=5)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
7520,22023,1610612764,WAS,Washington Wizards,22300642,2024-01-27,WAS @ DET,W,240,118,45,100,0.45,11,34.0,0.324,17,21,0.81,16.0,34.0,50.0,26,10.0,4,9,19,14.0
7521,22023,1610612764,WAS,Washington Wizards,22300665,2024-01-29,WAS @ SAS,W,240,118,46,86,0.535,9,25.0,0.36,17,24,0.708,14.0,31.0,45.0,32,9.0,8,18,15,5.0
7522,22023,1610612764,WAS,Washington Wizards,22300676,2024-01-31,WAS vs. LAC,L,239,109,45,97,0.464,9,29.0,0.31,10,15,0.667,12.0,33.0,45.0,19,4.0,10,13,19,-16.0
7523,22023,1610612764,WAS,Washington Wizards,22300689,2024-02-02,WAS vs. MIA,L,239,102,37,90,0.411,11,42.0,0.262,17,21,0.81,6.0,37.0,43.0,28,5.0,4,8,25,-8.0
7524,22023,1610612764,WAS,Washington Wizards,22300705,2024-02-04,WAS vs. PHX,L,240,112,47,96,0.49,7,32.0,0.219,11,17,0.647,13.0,22.0,35.0,32,11.0,4,18,19,-28.0


In [48]:
# Normalise the raw four-factors frame so it lines up with box_score_df.
# The cell avoids in-place mutation and is guarded so that re-running it on an
# already-formatted frame leaves the frame unchanged instead of raising.

# MIN arrives as text such as '240.000000:00'; keep only the leading whole
# minutes. A regex replaces the fixed three-character slice so values with
# fewer or more than three digits still parse, and `astype(str)` lets the same
# expression run when the column has already been converted to integers.
four_factor_stats_df = four_factor_stats_df.assign(
    MIN=(
        four_factor_stats_df['MIN']
        .astype(str)
        .str.extract(r'^(\d+)', expand=False)
        .astype(int)
    )
)

# Build the full team name (city + nickname) so TEAM_NAME matches the format
# used in box_score_df, then discard the now-redundant city column.
# Skipped when TEAM_CITY is already gone (i.e. the cell has been run before).
if 'TEAM_CITY' in four_factor_stats_df.columns:
    four_factor_stats_df = four_factor_stats_df.assign(
        TEAM_NAME=four_factor_stats_df['TEAM_CITY'] + " " + four_factor_stats_df['TEAM_NAME']
    ).drop(columns=['TEAM_CITY'])

four_factor_stats_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22101221,1610612745,Houston Rockets,HOU,240,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326
1,22101221,1610612737,Atlanta Hawks,ATL,240,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216
2,22101207,1610612748,Miami Heat,MIA,240,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354
3,22101207,1610612737,Atlanta Hawks,ATL,240,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167
4,22101192,1610612764,Washington Wizards,WAS,240,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2


In [49]:
# Attach the game context (SEASON_ID, GAME_DATE, MATCHUP) from the box scores
# to each four-factors row, matching on game and team.
# Because this cell overwrites its own input, any context columns already on
# the left frame are dropped first; otherwise re-running the cell would yield
# `_x` / `_y` suffixed duplicates and break the cells that follow.
# The join is an explicit inner join: rows with no box-score partner are
# dropped, which is also what pandas does by default.
four_factor_stats_df = pd.merge(
    four_factor_stats_df.drop(columns=['SEASON_ID', 'GAME_DATE', 'MATCHUP'], errors='ignore'),
    box_score_df[['SEASON_ID', 'GAME_DATE', 'MATCHUP', 'GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION']],
    on=['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION'],
    how='inner',
)

four_factor_stats_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT,SEASON_ID,GAME_DATE,MATCHUP
0,22101221,1610612745,Houston Rockets,HOU,240,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326,22021,2022-04-10,HOU vs. ATL
1,22101221,1610612737,Atlanta Hawks,ATL,240,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216,22021,2022-04-10,ATL @ HOU
2,22101207,1610612748,Miami Heat,MIA,240,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354,22021,2022-04-08,MIA vs. ATL
3,22101207,1610612737,Atlanta Hawks,ATL,240,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167,22021,2022-04-08,ATL @ MIA
4,22101192,1610612764,Washington Wizards,WAS,240,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2,22021,2022-04-06,WAS @ ATL


<a name="1.-Create-Team-Matchups-and-Targets"></a>
# 1. Create Team Matchups and Targets

[Return to top](#Feature-Engineering)

<a name="1.1.-Clean-Game-Data"></a>
## 1.1. Clean Game Data

[Return to top](#Feature-Engineering)

We need to do three key things to clean the data:

1. Remove games with team aggregated game times of less than 238 minutes (which will remove exhibition matches).
2. Retain only games that are part of the regular season.
3. Remove any orphans (i.e., game IDs that do not have a partner) when reshaping to matchups.

Start and end dates of the last three NBA regular seasons:

- 2021-22 season: 2021-10-19 to 2022-04-10
- 2022-23 season: 2022-10-18 to 2023-04-09
- 2023-24 season: 2023-10-24 to 2024-04-14

In [50]:
# Regular-season windows for the last three seasons, written one season per
# row as (label, first day, last day) and then transposed into the three
# parallel lists that the cleaning helper is called with.
season_labels, season_start_dates, season_end_dates = map(list, zip(
    ('2021-22', '2021-10-19', '2022-04-10'),
    ('2022-23', '2022-10-18', '2023-04-09'),
    ('2023-24', '2023-10-24', '2024-04-14'),
))

In [51]:
# Run the shared cleaning helper on the four-factors frame, using the season
# windows and labels defined above. The helper prints a per-season game count.
four_factor_stats_df_cleaned = utl.clean_team_bs_data(
    four_factor_stats_df,
    season_labels=season_labels,
    season_start_dates=season_start_dates,
    season_end_dates=season_end_dates,
)

Season 2021-22: 1230 games
Season 2022-23: 1230 games
Season 2023-24: 739 games


In [52]:
# Run the same cleaning helper on the box-score frame so both frames are
# restricted with identical season windows and labels.
box_score_df_cleaned = utl.clean_team_bs_data(
    box_score_df,
    season_labels=season_labels,
    season_start_dates=season_start_dates,
    season_end_dates=season_end_dates,
)

Season 2021-22: 1230 games
Season 2022-23: 1230 games
Season 2023-24: 736 games


In [53]:
# Attach each team's result (WL), points scored (PTS) and PLUS_MINUS from
# box_score_df_cleaned to the cleaned four-factors rows, matching on game and
# team.
# Because this cell overwrites its own input, any result columns already on
# the left frame are dropped first; otherwise re-running the cell would yield
# `_x` / `_y` suffixed duplicates. The inner join (pandas' default, made
# explicit here) keeps only rows present in both cleaned frames.
four_factor_stats_df_cleaned = pd.merge(
    four_factor_stats_df_cleaned.drop(columns=['WL', 'PTS', 'PLUS_MINUS'], errors='ignore'),
    box_score_df_cleaned[['WL', 'PTS', 'PLUS_MINUS', 'GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION']],
    on=['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION'],
    how='inner',
)

# Eyeball the extremes: rows ordered from lowest to highest points scored.
four_factor_stats_df_cleaned.sort_values(by=['PTS'])

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT,SEASON_ID,GAME_DATE,MATCHUP,WL,PTS,PLUS_MINUS
890,22100075,1610612742,Dallas Mavericks,DAL,240,0.346,0.346,0.152,0.105,0.584,0.157,0.193,0.238,2021-22,2021-10-29,DAL @ DEN,L,75,-31.0
242,22100595,1610612752,New York Knicks,NYK,240,0.437,0.190,0.158,0.204,0.553,0.224,0.139,0.286,2021-22,2022-01-08,NYK @ BOS,L,75,-24.0
224,22100717,1610612758,Sacramento Kings,SAC,240,0.337,0.158,0.122,0.217,0.592,0.068,0.110,0.404,2021-22,2022-01-25,SAC @ BOS,L,75,-53.0
6222,22300529,1610612757,Portland Trail Blazers,POR,240,0.332,0.158,0.095,0.176,0.645,0.237,0.121,0.310,2023-24,2024-01-11,POR @ OKC,L,77,-62.0
727,22100257,1610612741,Chicago Bulls,CHI,240,0.400,0.176,0.094,0.088,0.517,0.200,0.130,0.333,2021-22,2021-11-22,CHI vs. IND,L,77,-32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3481,22201230,1610612744,Golden State Warriors,GSW,240,0.745,0.167,0.145,0.237,0.447,0.253,0.123,0.180,2022-23,2023-04-09,GSW @ POR,W,157,56.0
4975,22300039,1610612754,Indiana Pacers,IND,240,0.723,0.298,0.124,0.175,0.679,0.295,0.136,0.350,2023-24,2023-11-21,IND @ ATL,W,157,5.0
2049,22100723,1610612766,Charlotte Hornets,CHA,240,0.710,0.355,0.116,0.190,0.533,0.370,0.182,0.423,2021-22,2022-01-26,CHA @ IND,W,158,32.0
3751,22200902,1610612746,LA Clippers,LAC,290,0.735,0.367,0.190,0.167,0.667,0.315,0.116,0.280,2022-23,2023-02-24,LAC vs. SAC,L,175,-1.0


<a name="1.2.-Reshape-to-Game-Matchups"></a>
## 1.2. Reshape to Game Matchups

[Return to top](#Feature-Engineering)

In [54]:
# Columns that describe the game itself rather than one team's statistics.
non_stats_cols = [
    'SEASON_ID',
    'GAME_ID',
    'GAME_DATE',
    'MATCHUP',
]

# Go from one row per team per game to one row per game (wide format), using
# the shared reshaping helper. The helper prints a per-season game count.
four_factor_stats_matchups_df = utl.reshape_team_bs_to_matchups(
    four_factor_stats_df_cleaned,
    non_stats_cols,
)

Season 2021-22: 1222 games
Season 2022-23: 1221 games
Season 2023-24: 728 games


In [55]:
# Preview the first five matchup rows.
four_factor_stats_matchups_df.head(n=5)

Unnamed: 0,GAME_ID,HOME_TEAM_ID,HOME_TEAM_NAME,HOME_TEAM_ABBREVIATION,HOME_MIN,HOME_EFG_PCT,HOME_FTA_RATE,HOME_TM_TOV_PCT,HOME_OREB_PCT,HOME_OPP_EFG_PCT,HOME_OPP_FTA_RATE,HOME_OPP_TOV_PCT,HOME_OPP_OREB_PCT,SEASON_ID,GAME_DATE,HOME_WL,HOME_PTS,HOME_PLUS_MINUS,AWAY_TEAM_ID,AWAY_TEAM_NAME,AWAY_TEAM_ABBREVIATION,AWAY_MIN,AWAY_EFG_PCT,AWAY_FTA_RATE,AWAY_TM_TOV_PCT,AWAY_OREB_PCT,AWAY_OPP_EFG_PCT,AWAY_OPP_FTA_RATE,AWAY_OPP_TOV_PCT,AWAY_OPP_OREB_PCT,AWAY_WL,AWAY_PTS,AWAY_PLUS_MINUS
0,22101221,1610612745,Houston Rockets,HOU,240,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326,2021-22,2022-04-10,L,114,-16.0,1610612737,Atlanta Hawks,ATL,240,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216,W,130,16.0
1,22101207,1610612748,Miami Heat,MIA,240,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354,2021-22,2022-04-08,W,113,4.0,1610612737,Atlanta Hawks,ATL,240,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167,L,109,-4.0
2,22101192,1610612737,Atlanta Hawks,ATL,240,0.554,0.207,0.06,0.14,0.547,0.128,0.142,0.227,2021-22,2022-04-06,W,118,15.0,1610612764,Washington Wizards,WAS,240,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2,L,103,-15.0
3,22101182,1610612761,Toronto Raptors,TOR,240,0.48,0.307,0.078,0.328,0.532,0.128,0.073,0.204,2021-22,2022-04-05,W,118,10.0,1610612737,Atlanta Hawks,ATL,240,0.532,0.128,0.073,0.185,0.48,0.307,0.078,0.41,L,108,-10.0
4,22101163,1610612737,Atlanta Hawks,ATL,240,0.494,0.57,0.067,0.179,0.51,0.192,0.118,0.254,2021-22,2022-04-02,W,122,7.0,1610612751,Brooklyn Nets,BKN,240,0.51,0.192,0.118,0.169,0.494,0.57,0.067,0.214,L,115,-7.0


<a name="1.3.-Create-Target-Variables"></a>
## 1.3. Create Target Variables

[Return to top](#Feature-Engineering)

There are three targets of interest:

1. **Total Game Points (over / under):** This can be calculated as the sum `HOME_PTS + AWAY_PTS`.
2. **Difference in Game Points (plus / minus):** This can be calculated in relation to the home team as the following difference: `HOME_PTS - AWAY_PTS`.
3. **Game Winner (moneyline):** This can be defined in relation to the home team using the `HOME_WL` column, where a win for the home team is equal to 1 and a loss for the home team is equal to 0. We will create a new column called `GAME_RESULT` for this indicator.

In [56]:
# Add the three target columns described above to the matchup frame, passing
# the home win/loss column and the home and away points columns to the helper.
four_factor_stats_matchups_df = utl.create_target_variables(
    four_factor_stats_matchups_df,
    'HOME_WL',
    'HOME_PTS',
    'AWAY_PTS',
)

In [57]:
# Spot-check the last five rows: the targets next to the scores and teams.
four_factor_stats_matchups_df[[
    'GAME_DATE', 'GAME_ID',
    'HOME_TEAM_NAME', 'AWAY_TEAM_NAME',
    'HOME_PTS', 'AWAY_PTS',
    'GAME_RESULT', 'TOTAL_PTS', 'PLUS_MINUS',
]].tail(n=5)

Unnamed: 0,GAME_DATE,GAME_ID,HOME_TEAM_NAME,AWAY_TEAM_NAME,HOME_PTS,AWAY_PTS,GAME_RESULT,TOTAL_PTS,PLUS_MINUS
3166,2023-11-22,22300225,Charlotte Hornets,Washington Wizards,117,114,1,231,3.0
3167,2023-11-10,22300009,Washington Wizards,Charlotte Hornets,117,124,0,241,-7.0
3168,2023-11-08,22300157,Charlotte Hornets,Washington Wizards,116,132,0,248,-16.0
3169,2024-01-24,22300619,Detroit Pistons,Charlotte Hornets,113,106,1,219,7.0
3170,2023-10-27,22300077,Charlotte Hornets,Detroit Pistons,99,111,0,210,-12.0


<a name="2.-Create-Rolling-Window-Statistics"></a>
# 2. Create Rolling Window Statistics

[Return to top](#Feature-Engineering)

Here we compute, for each team, the average of each statistic over a rolling window of its previous $n$ games.

In [58]:
# Identifier / descriptive columns of the matchup frame that are not treated
# as statistics.
non_stats_cols = [
    'SEASON_ID', 'GAME_ID', 'GAME_DATE',
    'HOME_TEAM_ID', 'AWAY_TEAM_ID',
    'HOME_TEAM_NAME', 'AWAY_TEAM_NAME',
    'HOME_WL', 'AWAY_WL',
    'HOME_MIN', 'AWAY_MIN',
    'HOME_TEAM_ABBREVIATION', 'AWAY_TEAM_ABBREVIATION',
]

# Every remaining column, in its original frame order, counts as a statistic.
stats_cols = [
    column
    for column in four_factor_stats_matchups_df.columns
    if column not in non_stats_cols
]

In [73]:
# Compute rolling averages of every statistic and add them to the matchup
# frame via the shared helper.
# NOTE(review): the saved output of the tail() cell that follows shows a
# PLUS_MINUS target of -18.6 next to TOTAL_PTS 214; a single game's point
# difference should be a whole number, so confirm how the target columns are
# produced and carried through `utl.create_target_variables` /
# `utl.process_rolling_stats`.
four_factor_stats_matchups_roll_df = utl.process_rolling_stats(
    four_factor_stats_matchups_df,
    stats_cols,
    target_cols=['GAME_RESULT', 'TOTAL_PTS', 'PLUS_MINUS'],
    # number of games included in the rolling window
    window_size=5,
    # minimum number of observations within the window needed to yield a value
    min_obs=1,
    # number of initial games to exclude from the rolling averages
    # (optionally by season)
    exclude_initial_games=0,
    # True: reset the rolling calculations at the start of each new season;
    # False: keep them contiguous across seasons
    stratify_by_season=True,
)

In [74]:
# Preview the last five rows of the rolling-feature frame.
four_factor_stats_matchups_roll_df.tail(n=5)

Unnamed: 0,GAME_ID,GAME_RESULT,TOTAL_PTS,PLUS_MINUS,HOME_TEAM_NAME,SEASON_ID,GAME_DATE,ROLL_HOME_EFG_PCT,ROLL_HOME_FTA_RATE,ROLL_HOME_TM_TOV_PCT,ROLL_HOME_OREB_PCT,ROLL_HOME_OPP_EFG_PCT,ROLL_HOME_OPP_FTA_RATE,ROLL_HOME_OPP_TOV_PCT,ROLL_HOME_OPP_OREB_PCT,ROLL_HOME_PTS,AWAY_TEAM_NAME,ROLL_AWAY_EFG_PCT,ROLL_AWAY_FTA_RATE,ROLL_AWAY_TM_TOV_PCT,ROLL_AWAY_OREB_PCT,ROLL_AWAY_OPP_EFG_PCT,ROLL_AWAY_OPP_FTA_RATE,ROLL_AWAY_OPP_TOV_PCT,ROLL_AWAY_OPP_OREB_PCT,ROLL_AWAY_PTS
2540,22300703,0,218,-16.0,San Antonio Spurs,2023-24,2024-02-03,0.541,0.227,0.168,0.27,0.552,0.234,0.144,0.27,110.6,Cleveland Cavaliers,0.582,0.235,0.132,0.186,0.501,0.268,0.137,0.244,115.6
3077,22300705,0,252,-28.0,Washington Wizards,2023-24,2024-02-04,0.506,0.247,0.13,0.164,0.517,0.27,0.111,0.321,106.0,Phoenix Suns,0.624,0.247,0.164,0.199,0.549,0.209,0.111,0.313,120.6
3017,22300704,0,210,-12.0,Detroit Pistons,2023-24,2024-02-04,0.547,0.269,0.125,0.203,0.545,0.227,0.101,0.269,115.0,Orlando Magic,0.534,0.3,0.136,0.239,0.575,0.32,0.175,0.243,111.0
3036,22300707,0,214,-18.6,Charlotte Hornets,2023-24,2024-02-04,0.514,0.226,0.128,0.157,0.583,0.251,0.115,0.263,103.4,Indiana Pacers,0.543,0.202,0.111,0.185,0.565,0.302,0.12,0.291,116.0
2492,22300706,1,222,40.0,Boston Celtics,2023-24,2024-02-04,0.564,0.186,0.11,0.203,0.522,0.178,0.099,0.298,113.8,Memphis Grizzlies,0.534,0.25,0.159,0.198,0.548,0.252,0.146,0.301,104.4


In [75]:
# Persist the matchups with their rolling features, without the DataFrame index.
# NOTE(review): the `r05` suffix in the file name presumably encodes the
# 5-game window used above — keep the two in sync if the window size changes.
four_factor_stats_matchups_roll_df.to_csv(
    path_or_buf='../../data/processed/nba_team_matchups_rolling_four_factor_stats_2021_2024_r05.csv',
    index=False,
)