<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Feature Engineering</h4>
    <p style="font-size: 20px;">NBA API Data (2022-2024)</p>
</div>

<a name="Feature-Engineering"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

**[1. Create Team Matchups and Targets](#1.-Create-Team-Matchups-and-Targets)**

- [1.1. Clean Game Data](#1.1.-Clean-Game-Data)

- [1.2. Reshape to Game Matchups](#1.2.-Reshape-to-Game-Matchups)

- [1.3. Create Target Variables](#1.3.-Create-Target-Variables)

**[2. Create Rolling Window Statistics](#2.-Create-Rolling-Window-Statistics)**

# Setup

[Return to top](#Feature-Engineering)

In [1]:
import sys
from pathlib import Path
# get current working directory
cwd = %pwd
# add shared_code directory to Python sys.path
sys.path.append(str(Path(cwd).parent / "shared_code"))
# import all libraries in shared_code directory 'imports.py' file
from imports import *
%matplotlib inline

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [216]:
# import other libraries
import numpy as np
import time
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import playergamelogs

from nba_api.stats.endpoints import CommonAllPlayers

from nba_api.stats.endpoints import CommonPlayerInfo
from nba_api.stats.endpoints import CommonTeamRoster



In [34]:
# Other endpoints don't work for our purposes
# For instance, boxscoreplayertrackv2 is deprecated for 2021-24 seasons, cannot be pulled
# CommonAllPlayers is deprecated, returns only 100-120 players for older seasons
# playergamelog doesn't have Team ID
# CommonTeamRoster only shows the end-of-season team roster, but players change teams throughout the season
# playercareerstats only has season stats, not per game


# To get player stats per game, for each season, we:
# Loop through each team for each season, pull rosters from CommonTeamRoster
# Aggregate and pull unique player_ids for each season
# Then we loop through playergamelogs (a different endpoint than playergamelog)
 


In [122]:
# Team-level box scores act as the reference data set for validating later pulls
team_bs_df = pd.read_csv('../../data/original/nba_games_box_scores_2022_2024.csv')

# Keep just the identifier columns (season, game, team);
# unique games are checked later to make sure the pull is legitimate
id_columns = ['SEASON_ID', 'GAME_ID', 'TEAM_ID']
id_df = team_bs_df[id_columns]

# Same identifiers as a plain list of [season_id, game_id, team_id] rows
id_list = id_df.to_numpy().tolist()

In [134]:
# Distinct TEAM_ID values present in the box-score identifiers
team_id_list = id_df.loc[:, 'TEAM_ID'].unique()
print(team_id_list)

[1610612737 1610612738 1610612751 1610612766 1610612741 1610612739
 1610612742 1610612743 1610612765 1610612744 1610612745 1610612754
 1610612746 1610612747 1610612763 1610612748 1610612749 1610612750
 1610612740 1610612752 1610612760 1610612753 1610612755 1610612756
 1610612757 1610612758 1610612759 1610612761 1610612762 1610612764]


In [140]:
# Example: request the roster of one team for one season
example_roster_params = {
    'team_id': '1610612737',
    'season': '2019-20',          # change year(s) if needed
    'league_id_nullable': '00',   # nba 00, g_league 20, wnba 10
}
player_roster = CommonTeamRoster(**example_roster_params)

# The first returned frame is the roster table displayed below
df_player_roster = player_roster.get_data_frames()[0]
df_player_roster

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
0,1610612737,2019,0,Jeff Teague,Jeff,jeff-teague,0,G,6-3,195,"JUN 10, 1988",32.0,10,Wake Forest,201952,
1,1610612737,2019,0,Brandon Goodwin,Brandon,brandon-goodwin,0,G,6-0,180,"OCT 02, 1995",24.0,1,Florida Gulf Coast,1629164,
2,1610612737,2019,0,Treveon Graham,Treveon,treveon-graham,2,G-F,6-5,219,"OCT 28, 1993",26.0,3,Va Commonwealth,1626203,
3,1610612737,2019,0,Kevin Huerter,Kevin,kevin-huerter,3,G-F,6-7,190,"AUG 27, 1998",21.0,1,Maryland,1628989,
4,1610612737,2019,0,Charlie Brown Jr.,Charlie,charlie-brown-jr,4,G,6-6,199,"FEB 02, 1997",23.0,R,St. Joseph's (PA),1629718,
5,1610612737,2019,0,Skal Labissiere,Skal,skal-labissiere,7,F-C,6-10,235,"MAR 18, 1996",24.0,3,Kentucky,1627746,
6,1610612737,2019,0,Trae Young,Trae,trae-young,11,G,6-1,180,"SEP 19, 1998",21.0,1,Oklahoma,1629027,Draft Rights Traded from DAL on 06/21/18
7,1610612737,2019,0,De'Andre Hunter,De'Andre,deandre-hunter,12,F-G,6-7,225,"DEC 02, 1997",22.0,R,Virginia,1629631,Draft Rights Traded from NOP on 07/07/19
8,1610612737,2019,0,Dewayne Dedmon,Dewayne,dewayne-dedmon,14,C,7-0,245,"AUG 12, 1989",30.0,6,USC,203473,
9,1610612737,2019,0,Vince Carter,Vince,vince-carter,15,G-F,6-6,220,"JAN 26, 1977",43.0,21,North Carolina,1713,


In [227]:
# specify seasons to pull
seasons = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# One roster frame per (season, team) request. The frames are collected in a
# list and concatenated once after the loop: calling pd.concat inside the loop
# re-copies every previously collected row on each iteration.
season_roster_frames = []

# Loop through each roster for each season
for season in seasons:
    for team_id in team_id_list:
        player_roster = CommonTeamRoster(
            team_id=team_id,
            season=season,  # change year(s) if needed
            league_id_nullable='00'  # nba 00, g_league 20, wnba 10
        )

        # get values into df
        df_player_roster = player_roster.get_data_frames()[0]

        # tag every row with the season string used for the request
        df_player_roster['SEASON_YEAR'] = season

        season_roster_frames.append(df_player_roster)

# Single concatenation; fall back to an empty frame if nothing was requested
all_player_roster_df = (
    pd.concat(season_roster_frames, ignore_index=True)
    if season_roster_frames
    else pd.DataFrame()
)

In [228]:
# Display the combined roster table built from every season/team request above
all_player_roster_df

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED,SEASON_YEAR
0,1610612737,2019,00,Jeff Teague,Jeff,jeff-teague,00,G,6-3,195,"JUN 10, 1988",32.0,10,Wake Forest,201952,,2019-20
1,1610612737,2019,00,Brandon Goodwin,Brandon,brandon-goodwin,0,G,6-0,180,"OCT 02, 1995",24.0,1,Florida Gulf Coast,1629164,,2019-20
2,1610612737,2019,00,Treveon Graham,Treveon,treveon-graham,2,G-F,6-5,219,"OCT 28, 1993",26.0,3,Va Commonwealth,1626203,,2019-20
3,1610612737,2019,00,Kevin Huerter,Kevin,kevin-huerter,3,G-F,6-7,190,"AUG 27, 1998",21.0,1,Maryland,1628989,,2019-20
4,1610612737,2019,00,Charlie Brown Jr.,Charlie,charlie-brown-jr,4,G,6-6,199,"FEB 02, 1997",23.0,R,St. Joseph's (PA),1629718,,2019-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2554,1610612764,2023,00,Corey Kispert,Corey,corey-kispert,24,F,6-6,224,"MAR 03, 1999",25.0,2,Gonzaga,1630557,#15 Pick in 2021 Draft,2023-24
2555,1610612764,2023,00,Kyle Kuzma,Kyle,kyle-kuzma,33,F,6-9,221,"JUL 24, 1995",28.0,6,Utah,1628398,Traded from LAL on 08/06/21,2023-24
2556,1610612764,2023,00,Marvin Bagley III,Marvin,marvin-bagley-iii,35,F,6-10,235,"MAR 14, 1999",24.0,5,Duke,1628963,Traded from DET on 01/14/24,2023-24
2557,1610612764,2023,00,Eugene Omoruyi,Eugene,eugene-omoruyi,97,F,6-6,235,"FEB 14, 1997",27.0,2,Oregon,1630647,Signed on 07/12/23,2023-24


In [276]:
# Check that playergamelogs is pulling correctly for a single player
# because looping through all players and seasons will take 1 hour

# Player used for the spot check
CHECK_PLAYER_ID = '203500'

# List of seasons to loop through (update this list as needed)
seasons = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# Fetch the regular-season game logs for each season. Frames are collected in a
# list and concatenated once, instead of repeatedly concatenating onto an
# initially empty DataFrame.
check_season_frames = []
for season in seasons:
    player_logs = playergamelogs.PlayerGameLogs(
        player_id_nullable=CHECK_PLAYER_ID,
        season_nullable=season,
        season_type_nullable="Regular Season",
    )
    season_logs_df = player_logs.get_data_frames()[0]
    check_season_frames.append(season_logs_df)

check_all_seasons_logs_df = pd.concat(check_season_frames, ignore_index=True)

check_all_seasons_logs_df

Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,...,FGA_RANK,FG_PCT_RANK,FG3M_RANK,FG3A_RANK,FG3_PCT_RANK,FTM_RANK,FTA_RANK,FT_PCT_RANK,OREB_RANK,DREB_RANK,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,AVAILABLE_FLAG
0,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21901317,2020-08-14T00:00:00,OKC @ LAC,L,6.367,0,2,0.0,0,0,0.0,0,0,0.0,2,2,4,0,...,59,61,2,4,2,43,52,43,40,57,60,57,24,31,37,1,1,61,61,29,62,23,1,62,1
1,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21901306,2020-08-12T00:00:00,OKC vs. MIA,W,19.95,4,7,0.571,0,0,0.0,0,2,0.0,2,6,8,0,...,31,36,2,4,2,43,34,43,40,26,38,57,55,14,37,1,1,39,42,55,50,23,1,50,1
2,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21901265,2020-08-05T00:00:00,OKC @ LAL,W,28.25,7,10,0.7,0,0,0.0,4,7,0.571,1,6,7,2,...,8,18,2,4,2,7,4,22,51,26,45,28,24,31,37,41,29,3,8,10,34,23,1,30,1
3,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21901251,2020-08-03T00:00:00,OKC vs. DEN,L,34.03,3,6,0.5,0,0,0.0,3,4,0.75,3,7,10,1,...,43,39,2,4,2,16,16,13,31,17,23,46,12,31,37,41,63,5,37,62,51,23,1,46,1
4,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21901240,2020-08-01T00:00:00,OKC vs. UTA,W,27.51,7,10,0.7,0,0,0.0,2,5,0.4,3,8,11,2,...,8,18,2,4,2,21,14,38,31,10,13,28,2,31,8,1,46,18,11,5,21,1,1,16,1
5,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21900949,2020-03-08T00:00:00,OKC @ BOS,W,26.44,1,2,0.5,0,0,0.0,4,8,0.5,0,6,6,1,...,59,39,2,4,2,7,1,24,57,26,50,46,24,3,22,1,13,8,51,26,39,23,1,41,1
6,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21900932,2020-03-06T00:00:00,OKC @ NYK,W,23.517,5,8,0.625,0,0,0.0,2,2,1.0,6,5,11,1,...,21,30,2,4,2,21,34,1,4,36,13,46,55,31,8,41,1,39,23,7,24,1,1,26,1
7,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21900918,2020-03-04T00:00:00,OKC @ DET,W,32.057,4,5,0.8,0,0,0.0,0,3,0.0,0,7,7,3,...,51,10,2,4,2,43,31,43,57,17,45,14,1,3,22,1,13,51,42,29,36,23,1,33,1
8,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21900912,2020-03-03T00:00:00,OKC vs. LAC,L,27.278,2,6,0.333,0,0,0.0,0,2,0.0,5,5,10,2,...,43,51,2,4,2,43,34,43,10,36,23,28,24,31,37,1,59,39,56,61,56,23,1,58,1
9,2019-20,203500,Steven Adams,Steven,1610612760,OKC,Oklahoma City Thunder,21900882,2020-02-28T00:00:00,OKC @ MIL,L,16.967,4,5,0.8,0,0,0.0,0,0,0.0,1,6,7,3,...,51,10,2,4,2,43,52,43,51,26,45,14,24,14,37,1,13,51,42,52,44,23,1,46,1


In [275]:
##### Warning - this cell will take more than 1 hour to run
# make sure code is running properly before running this cell

# Loop through all seasons for all players

MAX_FETCH_ATTEMPTS = 3      # tries per (player, season) request before giving up
RETRY_BACKOFF_SECONDS = 5   # base wait between tries; multiplied by the attempt number


def fetch_player_season_logs(player_id, season,
                             max_attempts=MAX_FETCH_ATTEMPTS,
                             backoff=RETRY_BACKOFF_SECONDS):
    """Return one player's regular-season game logs for one season.

    The request is retried a bounded number of times so that a single
    transient network failure does not abort the whole (hour-long) pull.

    Parameters
    ----------
    player_id : player identifier passed as ``player_id_nullable``
    season : season string such as '2022-23', passed as ``season_nullable``
    max_attempts : total number of tries before the error is re-raised
    backoff : base number of seconds to wait between tries

    Returns
    -------
    The first data frame returned by ``PlayerGameLogs``.

    Raises
    ------
    OSError
        Re-raised unchanged once ``max_attempts`` tries have failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            player_logs = playergamelogs.PlayerGameLogs(
                player_id_nullable=player_id,
                season_nullable=season,
                season_type_nullable="Regular Season")
            return player_logs.get_data_frames()[0]
        except OSError:
            # NOTE(review): assumes the HTTP layer raises requests' exceptions,
            # which derive from OSError (timeouts, dropped connections) — confirm
            if attempt == max_attempts:
                raise
            time.sleep(backoff * attempt)


# List of seasons to loop through (update this list as needed)
seasons = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# Per-request frames are collected here and concatenated once at the end;
# concatenating inside the loop would re-copy all accumulated rows every time.
season_log_frames = []

for season in seasons:
    # Get unique ids in specific season
    unique_player_ids = all_player_roster_df.loc[
        all_player_roster_df['SEASON_YEAR'] == season, 'PLAYER_ID'
    ].unique()

    # Loop through unique IDs
    for unique_id in unique_player_ids:
        # get player game data into df
        season_logs_df = fetch_player_season_logs(unique_id, season)
        # frames without rows add nothing to the result, so leave them out
        if not season_logs_df.empty:
            season_log_frames.append(season_logs_df)

# master df of every player's game logs across all seasons
all_seasons_logs_df = (
    pd.concat(season_log_frames, ignore_index=True)
    if season_log_frames
    else pd.DataFrame()
)

In [277]:
# total number of rows collected in the combined game-log frame
all_seasons_logs_df.shape[0]

114335

In [284]:
# Check that a season has pulled the correct number of games (should be 1230).
# Only 2022-23 is counted here; change the season string to check another season.
# 2019-20 will have fewer games because of COVID
len(all_seasons_logs_df[all_seasons_logs_df['SEASON_YEAR'] == '2022-23']['GAME_ID'].unique())


1230

In [283]:
# Preview the first rows of the combined player game logs
all_seasons_logs_df.head(20)

Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,...,FGA_RANK,FG_PCT_RANK,FG3M_RANK,FG3A_RANK,FG3_PCT_RANK,FTM_RANK,FTA_RANK,FT_PCT_RANK,OREB_RANK,DREB_RANK,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,AVAILABLE_FLAG
0,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900969,2020-03-11T00:00:00,ATL vs. NYK,L,18.317,5,7,0.714,1,1,1.0,4,4,1.0,1,2,3,3,...,34,4,9,33,1,13,14,1,4,20,14,43,22,30,20,33,26,8,12,29,30,4,1,26,1
1,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900957,2020-03-09T00:00:00,ATL vs. CHA,W,14.417,3,4,0.75,1,1,1.0,0,0,0.0,0,1,1,3,...,49,2,9,33,1,51,51,51,25,37,42,43,53,30,20,1,1,41,39,14,49,4,1,50,1
2,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900943,2020-03-07T00:00:00,ATL @ MEM,L,19.533,3,8,0.375,1,3,0.333,3,3,1.0,0,2,2,3,...,23,35,9,10,25,23,27,1,25,20,28,43,53,30,1,33,1,30,28,47,34,4,1,36,1
3,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900930,2020-03-06T00:00:00,ATL @ WAS,L,28.425,4,9,0.444,0,1,0.0,1,2,0.5,0,2,2,5,...,17,27,32,33,32,42,32,45,25,20,28,26,33,11,20,1,46,41,30,11,31,4,1,36,1
4,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900905,2020-03-02T00:00:00,ATL vs. MEM,L,16.733,5,12,0.417,1,2,0.5,1,1,1.0,1,2,3,1,...,11,32,9,19,15,42,47,1,4,20,14,55,10,30,1,33,10,41,21,29,43,4,1,32,1
5,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900888,2020-02-29T00:00:00,ATL vs. POR,W,17.283,1,4,0.25,0,1,0.0,1,1,1.0,0,2,2,3,...,49,44,32,33,32,42,47,1,25,20,28,43,10,30,20,1,1,30,55,18,55,4,1,55,1
6,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900878,2020-02-28T00:00:00,ATL vs. BKN,W,19.6,3,6,0.5,0,1,0.0,0,0,0.0,2,0,2,3,...,41,16,32,33,32,51,51,51,1,49,28,43,33,4,20,33,1,41,43,37,40,4,1,43,1
7,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900867,2020-02-26T00:00:00,ATL vs. ORL,L,22.717,3,7,0.429,1,3,0.333,5,6,0.833,1,2,3,6,...,34,28,9,10,25,8,8,34,4,20,14,15,53,11,1,53,10,8,21,29,12,4,1,15,1
8,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900850,2020-02-24T00:00:00,ATL @ PHI,L,17.733,1,6,0.167,0,1,0.0,2,2,1.0,1,3,4,2,...,41,53,32,33,32,29,32,1,4,9,5,50,33,30,20,1,26,41,51,39,52,4,1,53,1
9,2019-20,201952,Jeff Teague,Jeff,1610612737,ATL,Atlanta Hawks,21900837,2020-02-22T00:00:00,ATL vs. DAL,W,13.05,3,8,0.375,0,2,0.0,1,2,0.5,0,0,0,0,...,23,35,32,19,32,42,32,45,25,49,55,58,33,30,20,33,26,30,39,29,56,4,1,56,1


In [268]:
# List the distinct SEASON_ID values present in nba_games_box_scores_2022_2024.csv
# (id_df holds the identifier columns of that file)
id_df['SEASON_ID'].unique()

array([12021, 22021, 52021, 42021, 22022, 12022, 52022, 42022, 22023,
       12023, 62023], dtype=int64)

In [274]:
# number of rows in id_df whose SEASON_ID is 22022
int((id_df['SEASON_ID'] == 22022).sum())

2636

In [286]:
# number of distinct GAME_ID values in the box-score identifiers
id_df['GAME_ID'].unique().size

3767

## Unused Player API Functions

In [155]:
## Doesn't work because CommonAllPlayers only pulls 100-120 players for 2019-2023 seasons

# seasons to request
seasons = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# one entry per season, in the order of `seasons`: that season's table as a list of rows
player_list = []
for season in seasons:
    common_all_players = CommonAllPlayers(
        season=season,               # change year(s) if needed
        league_id='00',              # nba 00, g_league 20, wnba 10
        is_only_current_season=1,    # 1 active, 0 not active
    )

    df_common_players = common_all_players.get_data_frames()[0]
    player_list += [df_common_players.values.tolist()]

In [156]:
# number of rows returned for the fourth requested season (seasons[3])
len(player_list[3])

126

In [93]:
# Show example of player box scores for a given game id
# The endpoint module is imported in this cell because no other visible cell
# imports it, so the cell would otherwise depend on leftover kernel state.
from nba_api.stats.endpoints import boxscoreplayertrackv2

player_boxscores = boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id = '0022200021')
df_player_boxscores = player_boxscores.get_data_frames()[0]
df_player_boxscores


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,START_POSITION,COMMENT,MIN,SPD,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,AST,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22200021,1610612761,TOR,Toronto,1628384,OG Anunoby,F,,37:15,4.11,2.73,3,8,11,48,1,0,33,1,1,4,0.25,2,5,0.4,0.333,1,2,0.5
1,22200021,1610612761,TOR,Toronto,1630567,Scottie Barnes,F,,35:53,4.19,2.7,4,9,11,61,0,1,43,3,4,7,0.571,3,7,0.429,0.5,1,3,0.333
2,22200021,1610612761,TOR,Toronto,1627783,Pascal Siakam,C,,36:48,4.11,2.7,6,12,15,90,0,0,60,11,10,11,0.909,5,10,0.5,0.713,3,3,1.0
3,22200021,1610612761,TOR,Toronto,1629018,Gary Trent Jr.,G,,35:58,4.05,2.62,0,1,1,33,0,0,15,0,4,9,0.444,2,7,0.286,0.375,1,1,1.0
4,22200021,1610612761,TOR,Toronto,1627832,Fred VanVleet,G,,37:56,4.14,2.81,1,9,10,82,1,0,65,9,1,2,0.5,6,9,0.667,0.636,3,3,1.0
5,22200021,1610612761,TOR,Toronto,1630173,Precious Achiuwa,,,17:29,4.33,1.37,1,6,7,20,0,0,10,0,0,2,0.0,1,4,0.25,0.167,3,4,0.75
6,22200021,1610612761,TOR,Toronto,1631132,Christian Koloko,,,19:45,4.29,1.51,5,2,7,11,0,0,8,0,0,1,0.0,0,1,0.0,0.0,1,3,0.333
7,22200021,1610612761,TOR,Toronto,1630625,Dalano Banton,,,12:57,4.56,1.06,0,0,0,25,0,0,17,0,1,2,0.5,1,2,0.5,0.5,0,1,0.0
8,22200021,1610612761,TOR,Toronto,201152,Thaddeus Young,,,5:59,4.44,0.47,0,1,1,12,0,0,10,1,0,0,0.0,0,1,0.0,0.0,0,0,0.0
9,22200021,1610612761,TOR,Toronto,203920,Khem Birch,,DNP - Coach's Decision,0:00,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0.0


In [15]:
from nba_api.stats.endpoints import playercareerstats

# Career statistics for the chosen player, looked up by player ID
career_query = {'player_id': '203500'}
player_career = playercareerstats.PlayerCareerStats(**career_query)
player_career_df = player_career.get_data_frames()[0]

# Distinct SEASON_ID values found in the player's career table
seasons_played = player_career_df.loc[:, 'SEASON_ID'].unique()
print(seasons_played.tolist())

['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23']


In [92]:
# Show example of the per-game log returned for a single player and season
game_log_query = {'player_id': '203925', 'season': 2023}
player_boxscores = playergamelog.PlayerGameLog(**game_log_query)
df_player_boxscores = player_boxscores.get_data_frames()[0]
df_player_boxscores

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22023,203925,22300732,"FEB 07, 2024",DET @ SAC,W,12,1,3,0.333,0,1,0.0,0,0,0.0,0,0,0,0,0,0,0,1,2,-5,1
1,22023,203925,22300558,"JAN 15, 2024",DET @ WAS,W,5,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,1,0,0,1,0,0,-10,1
2,22023,203925,22300519,"JAN 10, 2024",DET vs. SAS,L,6,0,0,0.0,0,0,0.0,0,0,0.0,0,1,1,0,0,0,0,1,0,3,1
3,22023,203925,22300512,"JAN 09, 2024",DET vs. SAC,L,9,3,5,0.6,3,5,0.6,0,0,0.0,0,1,1,0,0,0,0,2,9,2,1
4,22023,203925,22300394,"DEC 23, 2023",DET @ BKN,L,7,0,0,0.0,0,0,0.0,0,0,0.0,0,1,1,0,0,0,0,0,0,-7,1
5,22023,203925,22300352,"DEC 18, 2023",DET @ ATL,L,12,0,1,0.0,0,1,0.0,1,2,0.5,1,1,2,2,1,1,0,1,1,1,1
6,22023,203925,22300334,"DEC 16, 2023",DET @ MIL,L,7,0,1,0.0,0,1,0.0,0,0,0.0,0,0,0,0,0,0,0,1,0,-4,1
7,22023,203925,22300327,"DEC 15, 2023",DET @ PHI,L,11,1,3,0.333,1,3,0.333,0,0,0.0,0,1,1,1,0,0,0,1,3,-8,1
8,22023,203925,22301217,"DEC 08, 2023",DET @ ORL,L,4,0,1,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,1,0,0,0,-7,1
9,22023,203925,22300140,"NOV 05, 2023",DET vs. PHX,L,13,0,2,0.0,0,2,0.0,0,0,0.0,0,1,1,1,0,0,0,1,0,-5,1


In [18]:
from nba_api.stats.endpoints import CommonTeamRoster

# Roster request for a single team and season
roster_query = dict(
    team_id='1610612752',         # input team id
    league_id_nullable='00',      # nba 00, g_league 20, wnba 10
    season='2023-24',
)
common_team_roster = CommonTeamRoster(**roster_query)

# first returned frame is the roster table shown below
df_common_team_roster = common_team_roster.get_data_frames()[0]
df_common_team_roster

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
0,1610612752,2023,0,Donte DiVincenzo,Donte,donte-divincenzo,0,G,6-4,203,"JAN 31, 1997",27.0,5,Villanova,1628978,Signed on 07/08/23
1,1610612752,2023,0,Jacob Toppin,Jacob,jacob-toppin,0,F,6-8,200,"MAY 08, 2000",23.0,R,Kentucky,1631210,Signed on 07/06/23
2,1610612752,2023,0,Duane Washington Jr.,Duane,duane-washington-jr,1,G,6-2,197,"MAR 24, 2000",23.0,2,Ohio State,1630613,Signed on 02/28/23
3,1610612752,2023,0,Miles McBride,Miles,miles-mcbride,2,G,6-1,195,"SEP 08, 2000",23.0,2,West Virginia,1630540,Draft Rights Traded from OKC on 07/30/21
4,1610612752,2023,0,Josh Hart,Josh,josh-hart,3,G,6-4,215,"MAR 06, 1995",29.0,6,Villanova,1628404,Traded from POR on 02/09/23
5,1610612752,2023,0,Charlie Brown Jr.,Charlie,charlie-brown-jr,4,G,6-6,199,"FEB 02, 1997",27.0,3,St. Joseph's (PA),1629718,Signed on 09/07/23
6,1610612752,2023,0,Precious Achiuwa,Precious,precious-achiuwa,5,F,6-8,243,"SEP 19, 1999",24.0,3,Memphis,1630173,Traded from TOR on 12/30/23
7,1610612752,2023,0,Shake Milton,Shake,shake-milton,5,G-F,6-5,205,"SEP 26, 1996",27.0,5,Southern Methodist,1629003,Signed on 03/05/24
8,1610612752,2023,0,OG Anunoby,OG,og-anunoby,8,F-G,6-7,240,"JUL 17, 1997",26.0,6,Indiana,1628384,Traded from TOR on 12/30/23
9,1610612752,2023,0,Jalen Brunson,Jalen,jalen-brunson,11,G,6-2,190,"AUG 31, 1996",27.0,5,Villanova,1628973,Signed on 07/12/22


# Data

[Return to top](#Feature-Engineering)

In [3]:
# Summarise the team box-score frame: columns, non-null counts and dtypes
team_bs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7525 entries, 0 to 7524
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SEASON_ID          7525 non-null   int64  
 1   TEAM_ID            7525 non-null   int64  
 2   TEAM_ABBREVIATION  7525 non-null   object 
 3   TEAM_NAME          7525 non-null   object 
 4   GAME_ID            7525 non-null   int64  
 5   GAME_DATE          7525 non-null   object 
 6   MATCHUP            7525 non-null   object 
 7   WL                 7514 non-null   object 
 8   MIN                7525 non-null   int64  
 9   PTS                7525 non-null   int64  
 10  FGM                7525 non-null   int64  
 11  FGA                7525 non-null   int64  
 12  FG_PCT             7523 non-null   float64
 13  FG3M               7525 non-null   int64  
 14  FG3A               7525 non-null   float64
 15  FG3_PCT            7523 non-null   float64
 16  FTM                7525 

<a name="1.-Create-Team-Matchups-and-Targets"></a>
# 1. Create Team Matchups and Targets

[Return to top](#Feature-Engineering)

<a name="1.1.-Clean-Game-Data"></a>
## 1.1. Clean Game Data

[Return to top](#Feature-Engineering)

We need to do three key things to clean the data:

1. Remove games with team aggregated game times of less than 238 minutes (which will remove exhibition matches).
2. Retain only games that are part of the regular season.
3. Remove any orphans (i.e., game IDs that do not have a partner) when reshaping to matchups.

Last 3 NBA regular seasons start and end dates:

- 2021-22 season: 2021-10-19 to 2022-04-10
- 2022-23 season: 2022-10-18 to 2023-04-09
- 2023-24 season: 2023-10-24 to 2024-04-14

In [4]:
# (label, first day, last day) for each of the last 3 regular seasons
season_windows = [
    ('2021-22', '2021-10-19', '2022-04-10'),
    ('2022-23', '2022-10-18', '2023-04-09'),
    ('2023-24', '2023-10-24', '2024-04-14'),
]

# split into the three parallel lists used by the cleaning step
season_labels      = [label for label, first_day, last_day in season_windows]
season_start_dates = [first_day for label, first_day, last_day in season_windows]
season_end_dates   = [last_day for label, first_day, last_day in season_windows]

In [5]:
# run the shared cleaning helper, passing the regular-season windows and labels defined earlier
season_bounds = dict(
    season_start_dates=season_start_dates,
    season_end_dates=season_end_dates,
    season_labels=season_labels,
)
team_bs_df_cleaned = utl.clean_team_bs_data(team_bs_df, **season_bounds)

Season 2021-22: 1230 games
Season 2022-23: 1230 games
Season 2023-24: 736 games


<a name="1.2.-Reshape-to-Game-Matchups"></a>
## 1.2. Reshape to Game Matchups

[Return to top](#Feature-Engineering)

In [6]:
# columns treated as identifiers rather than statistics
non_stats_cols = [
    'SEASON_ID',
    'GAME_ID',
    'GAME_DATE',
    'MATCHUP',
]

# pivot the cleaned team box scores to wide format: one row per game matchup
team_bs_matchups_df = utl.reshape_team_bs_to_matchups(
    team_bs_df_cleaned,
    non_stats_cols,
)

Season 2021-22: 1222 games
Season 2022-23: 1221 games
Season 2023-24: 729 games


In [7]:
# Preview the first matchup rows (HOME_* and AWAY_* columns side by side)
team_bs_matchups_df.head()

Unnamed: 0,SEASON_ID,HOME_TEAM_ID,HOME_TEAM_ABBREVIATION,HOME_TEAM_NAME,GAME_ID,GAME_DATE,HOME_WL,HOME_MIN,HOME_PTS,HOME_FGM,HOME_FGA,HOME_FG_PCT,HOME_FG3M,HOME_FG3A,HOME_FG3_PCT,HOME_FTM,HOME_FTA,HOME_FT_PCT,HOME_OREB,HOME_DREB,HOME_REB,HOME_AST,HOME_STL,HOME_BLK,HOME_TOV,...,HOME_PLUS_MINUS,AWAY_TEAM_ID,AWAY_TEAM_ABBREVIATION,AWAY_TEAM_NAME,AWAY_WL,AWAY_MIN,AWAY_PTS,AWAY_FGM,AWAY_FGA,AWAY_FG_PCT,AWAY_FG3M,AWAY_FG3A,AWAY_FG3_PCT,AWAY_FTM,AWAY_FTA,AWAY_FT_PCT,AWAY_OREB,AWAY_DREB,AWAY_REB,AWAY_AST,AWAY_STL,AWAY_BLK,AWAY_TOV,AWAY_PF,AWAY_PLUS_MINUS
0,2021-22,1610612737,ATL,Atlanta Hawks,22100014,2021-10-21,W,242,113,45,94,0.479,15,35.0,0.429,8,9,0.889,6.0,49.0,55.0,31,8.0,9,13,...,26.0,1610612742,DAL,Dallas Mavericks,L,240,87,31,93,0.333,13,43.0,0.302,12,13,0.923,10.0,40.0,50.0,16,7.0,3,15,21,-26.0
1,2021-22,1610612737,ATL,Atlanta Hawks,22100043,2021-10-25,W,238,122,46,90,0.511,12,32.0,0.375,18,21,0.857,10.0,39.0,49.0,24,11.0,3,13,...,18.0,1610612765,DET,Detroit Pistons,L,239,104,40,91,0.44,9,33.0,0.273,15,18,0.833,11.0,25.0,36.0,26,7.0,6,14,15,-18.0
2,2021-22,1610612737,ATL,Atlanta Hawks,22100097,2021-11-01,W,240,118,38,83,0.458,13,34.0,0.382,29,29,1.0,13.0,34.0,47.0,24,9.0,5,11,...,7.0,1610612764,WAS,Washington Wizards,L,240,111,41,86,0.477,13,39.0,0.333,16,16,1.0,7.0,29.0,36.0,27,7.0,4,12,24,-7.0
3,2021-22,1610612737,ATL,Atlanta Hawks,22100120,2021-11-04,L,240,98,35,82,0.427,7,28.0,0.25,21,26,0.808,5.0,27.0,32.0,18,11.0,4,9,...,-18.0,1610612762,UTA,Utah Jazz,W,240,116,41,81,0.506,17,41.0,0.415,17,23,0.739,8.0,38.0,46.0,30,6.0,4,14,20,18.0
4,2021-22,1610612737,ATL,Atlanta Hawks,22100193,2021-11-14,W,241,120,47,97,0.485,15,35.0,0.429,11,13,0.846,15.0,36.0,51.0,21,6.0,1,12,...,20.0,1610612749,MIL,Milwaukee Bucks,L,240,100,38,84,0.452,14,41.0,0.341,10,16,0.625,4.0,26.0,30.0,24,8.0,3,11,17,-20.0


<a name="1.3.-Create-Target-Variables"></a>
## 1.3. Create Target Variables

[Return to top](#Feature-Engineering)

There are three targets of interest:

1. **Total Game Points (over / under):** This can be calculated as the sum of `HOME_PTS + AWAY_PTS`.
2. **Difference in Game Points (plus / minus):** This can be calculated in relation to the home team as the following difference: `HOME_PTS - AWAY_PTS`.
3. **Game Winner (moneyline):** This can be defined in relation to the home team using the `HOME_WL` column, where a win for the home team is equal to 1 and a loss for the home team equal to 0. We will create a new column called `GAME_RESULT` for this indicator.

In [8]:
# create the above three target variables
# NOTE(review): the result is assigned back to team_bs_matchups_df, so re-running this
# cell passes the already-augmented frame to the helper again — confirm that is harmless
team_bs_matchups_df = utl.create_target_variables(team_bs_matchups_df, 'HOME_WL', 'HOME_PTS', 'AWAY_PTS')

In [9]:
# show the targets next to the game identifiers and scores for the last rows
target_preview_cols = [
    'GAME_DATE', 'GAME_ID', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME',
    'HOME_PTS', 'AWAY_PTS', 'GAME_RESULT', 'TOTAL_PTS', 'PLUS_MINUS',
]
team_bs_matchups_df[target_preview_cols].tail()

Unnamed: 0,GAME_DATE,GAME_ID,HOME_TEAM_NAME,AWAY_TEAM_NAME,HOME_PTS,AWAY_PTS,GAME_RESULT,TOTAL_PTS,PLUS_MINUS
3167,2024-01-24,22300620,Washington Wizards,Minnesota Timberwolves,107,118,0,225,-11.0
3168,2024-01-25,22300628,Washington Wizards,Utah Jazz,108,123,0,231,-15.0
3169,2024-01-31,22300676,Washington Wizards,LA Clippers,109,125,0,234,-16.0
3170,2024-02-02,22300689,Washington Wizards,Miami Heat,102,110,0,212,-8.0
3171,2024-02-04,22300705,Washington Wizards,Phoenix Suns,112,140,0,252,-28.0


<a name="2.-Create-Rolling-Window-Statistics"></a>
# 2. Create Rolling Window Statistics

[Return to top](#Feature-Engineering)

Here we create the average difference in box scores between teams over a rolling window of the previous $n$ games.

In [10]:
# columns of the matchup frame that are NOT treated as statistics
non_stats_cols = [
    'SEASON_ID', 'GAME_ID', 'GAME_DATE',
    'HOME_TEAM_ID', 'AWAY_TEAM_ID',
    'HOME_TEAM_NAME', 'AWAY_TEAM_NAME',
    'HOME_WL', 'AWAY_WL',
    'HOME_MIN', 'AWAY_MIN',
    'HOME_TEAM_ABBREVIATION', 'AWAY_TEAM_ABBREVIATION',
]

# every remaining column, in its original order, counts as a statistic column
stats_cols = list(
    filter(lambda column: column not in non_stats_cols, team_bs_matchups_df.columns)
)

In [18]:
# options for the rolling-average calculation
rolling_options = dict(
    target_cols=['GAME_RESULT', 'TOTAL_PTS', 'PLUS_MINUS'],
    window_size=5,             # number of games included in the rolling window
    min_obs=1,                 # minimum observations within the window needed to yield an aggregate value
    stratify_by_season=True,   # reset the rolling calculations at the start of each season (vs. contiguous across seasons)
    exclude_initial_games=0,   # number of initial games to exclude from the rolling averages (optionally by season)
)

# compute the rolling averages for each statistic and attach them to the matchups
team_bs_matchups_rolling_diff = utl.process_rolling_diff_stats(
    team_bs_matchups_df,
    stats_cols,
    **rolling_options,
)

In [19]:
# Preview the last rows of the rolling-difference feature table
team_bs_matchups_rolling_diff.tail()

Unnamed: 0,GAME_ID,GAME_RESULT,TOTAL_PTS,PLUS_MINUS,HOME_TEAM_NAME,SEASON_ID,GAME_DATE,ROLLDIFF_HOME_PTS,ROLLDIFF_HOME_FGM,ROLLDIFF_HOME_FGA,ROLLDIFF_HOME_FG_PCT,ROLLDIFF_HOME_FG3M,ROLLDIFF_HOME_FG3A,ROLLDIFF_HOME_FG3_PCT,ROLLDIFF_HOME_FTM,ROLLDIFF_HOME_FTA,ROLLDIFF_HOME_FT_PCT,ROLLDIFF_HOME_OREB,ROLLDIFF_HOME_DREB,ROLLDIFF_HOME_REB,ROLLDIFF_HOME_AST,ROLLDIFF_HOME_STL,ROLLDIFF_HOME_BLK,ROLLDIFF_HOME_TOV,ROLLDIFF_HOME_PF,AWAY_TEAM_NAME,ROLLDIFF_AWAY_PTS,ROLLDIFF_AWAY_FGM,ROLLDIFF_AWAY_FGA,ROLLDIFF_AWAY_FG_PCT,ROLLDIFF_AWAY_FG3M,ROLLDIFF_AWAY_FG3A,ROLLDIFF_AWAY_FG3_PCT,ROLLDIFF_AWAY_FTM,ROLLDIFF_AWAY_FTA,ROLLDIFF_AWAY_FT_PCT,ROLLDIFF_AWAY_OREB,ROLLDIFF_AWAY_DREB,ROLLDIFF_AWAY_REB,ROLLDIFF_AWAY_AST,ROLLDIFF_AWAY_STL,ROLLDIFF_AWAY_BLK,ROLLDIFF_AWAY_TOV,ROLLDIFF_AWAY_PF
2467,22300699,1,275,7.0,Atlanta Hawks,2023-24,2024-02-03,0.0,0.8,8.8,-0.041,-2.2,0.4,-0.076,0.6,0.4,-0.002,5.4,-2.6,2.8,-0.6,-0.8,-1.2,-3.4,-2.6,Golden State Warriors,0.6,3.6,3.4,0.024,-1.8,-4.4,-0.012,-4.8,-6.0,0.001,-0.6,2.0,1.4,0.4,-0.6,-0.2,-1.2,2.4
2540,22300707,0,214,-18.6,Charlotte Hornets,2023-24,2024-02-04,-16.4,-6.6,-1.0,-0.07,0.0,2.2,-0.012,-3.2,-2.4,-0.082,-2.2,-5.6,-7.8,-4.0,-1.2,0.0,1.6,4.0,Indiana Pacers,-2.8,0.0,6.8,-0.04,4.0,7.0,0.047,-6.8,-7.4,-0.068,-0.4,-4.4,-4.8,7.8,-0.2,-2.2,-0.2,3.4
2492,22300706,1,222,40.0,Boston Celtics,2023-24,2024-02-04,3.8,-0.8,-3.8,0.011,5.2,7.4,0.076,0.2,-0.2,0.038,-2.6,1.8,-0.8,-1.0,0.0,3.6,1.0,2.2,Memphis Grizzlies,-6.6,-3.0,-1.4,-0.027,2.0,4.6,0.009,-2.6,-0.8,-0.117,-0.6,-4.2,-4.8,-3.0,-0.8,-1.6,1.2,0.4
2666,22300704,0,210,-12.0,Detroit Pistons,2023-24,2024-02-04,-2.2,-2.0,-3.6,-0.005,0.8,-0.4,0.023,1.0,2.8,-0.044,-0.6,-0.2,-0.8,0.2,-4.0,-1.0,3.0,-2.2,Orlando Magic,1.4,0.8,6.2,-0.031,-0.8,-2.0,-0.008,0.6,1.2,-0.008,4.8,-4.0,0.8,-1.4,2.4,-0.6,-4.4,0.8
3171,22300705,0,252,-28.0,Washington Wizards,2023-24,2024-02-04,-11.8,-3.8,-6.0,-0.01,-0.8,1.2,-0.034,-3.4,-3.8,-0.034,-6.2,-3.6,-9.8,0.8,-1.6,1.0,2.2,2.4,Phoenix Suns,0.6,1.2,-10.2,0.078,-1.6,-6.4,0.024,-0.2,1.0,-0.021,-4.0,4.4,0.4,-0.4,-2.4,1.8,6.0,-2.4


In [21]:
# save the matchups with their rolling features to the processed-data folder (index not written)
rolling_diff_csv = '../../data/processed/nba_team_matchups_rolling_diff_box_scores_2022_2024_r05.csv'
team_bs_matchups_rolling_diff.to_csv(rolling_diff_csv, index=False)