<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">NBA API</h4>
    <p style="font-size: 20px;">Data Gathering</p>
</div>

<a name="NBA"></a>

# Setup

In [15]:
import pandas as pd
from datetime import datetime, timedelta
import time

In [55]:
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import (
    scoreboard, leaguegamefinder, playercareerstats,
    boxscorematchupsv3, boxscoreadvancedv2, boxscoreadvancedv3,
    teamestimatedmetrics, teamgamelogs, TeamGameLogs, TeamEstimatedMetrics,
    leaguedashteamstats, hustlestatsboxscore, boxscoremiscv2,
    boxscorefourfactorsv2, boxscorescoringv2, boxscoreusagev2,
    boxscoreplayertrackv2,
)

# Team Data

In [17]:
# teams.get_teams() hands back a list of dictionaries, one per NBA team
nba_teams = teams.get_teams()
nba_teams_df = pd.DataFrame(nba_teams)
print("Number of teams fetched: {}".format(len(nba_teams)))
nba_teams_df.head()

Number of teams fetched: 30


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


# Players

In [18]:
# players.get_players() hands back a list of dictionaries, one per player
nba_players = players.get_players()
nba_players_df = pd.DataFrame(nba_players)
print("Number of players fetched: {}".format(len(nba_players)))
nba_players_df.head()

Number of players fetched: 4900


Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False


# Scoreboard

In [None]:
# Today's Score Board
# ScoreBoard (capital B) with get_json()/get_dict() belongs to the live-data
# package, not to the nba_api.stats.endpoints.scoreboard module imported at
# the top of the notebook, so the live module is imported under its own name.
from nba_api.live.nba.endpoints import scoreboard as live_scoreboard

games = live_scoreboard.ScoreBoard()

# raw JSON text of the response
games_json = games.get_json()

# same payload as a dictionary; left as the last expression so the cell shows it
games.get_dict()

# League Game Finder

In [19]:
# get game data: one LeagueGameFinder request per team id
team_ids = nba_teams_df['id'].tolist()

games_list = []

# The loop variable is `team_id` rather than `id`, so the builtin id() is not
# shadowed at notebook scope for every later cell.
for team_id in team_ids:
    print(team_id)
    # query for this team's games
    gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
    # we want the first DataFrame of those returned
    games_list.append(gamefinder.get_data_frames()[0])
    # add time delay between requests
    time.sleep(3)

1610612737
1610612738
1610612739
1610612740
1610612741
1610612742
1610612743
1610612744
1610612745
1610612746
1610612747
1610612748
1610612749
1610612750
1610612751
1610612752
1610612753
1610612754
1610612755
1610612756
1610612757
1610612758
1610612759
1610612760
1610612761
1610612762
1610612763
1610612764
1610612765
1610612766


In [20]:
# Stack the per-team frames into one table. No ignore_index is passed, so each
# frame's own row labels are kept and repeat across teams.
games_df = pd.concat(games_list)
print(games_df.shape)
games_df.head()

(104986, 28)


Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22023,1610612737,ATL,Atlanta Hawks,22300866,2024-03-02,ATL @ BKN,L,241,102,...,0.739,12.0,30.0,42.0,23,4.0,4,11,19,-12.0
1,22023,1610612737,ATL,Atlanta Hawks,22300851,2024-02-29,ATL @ BKN,L,241,97,...,0.714,8.0,31.0,39.0,18,3.0,8,11,14,-27.0
2,22023,1610612737,ATL,Atlanta Hawks,22300835,2024-02-27,ATL vs. UTA,W,239,124,...,0.87,15.0,40.0,55.0,27,8.0,8,17,15,27.0
3,22023,1610612737,ATL,Atlanta Hawks,22300821,2024-02-25,ATL vs. ORL,W,241,109,...,0.882,9.0,35.0,44.0,31,10.0,3,9,17,17.0
4,22023,1610612737,ATL,Atlanta Hawks,22300804,2024-02-23,ATL vs. TOR,L,239,121,...,0.708,20.0,34.0,54.0,27,5.0,4,9,11,-2.0


In [21]:
# Parse GAME_DATE into datetimes so the dates compare chronologically
games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])

# get the latest date (named latest_date because .max() is the most recent game)
latest_date = games_df['GAME_DATE'].max()
print(latest_date)

2024-03-03 00:00:00


In [None]:
# Save the combined game log; index=False leaves the row labels out of the file
games_df.to_csv('../data/original/nba_games_box_scores_1984_2024.csv', index=False)

# League Dash Team Stats

In [None]:
# Single trial request: team dashboard stats for season '2023-24' with the month filter set to 5
ldts = leaguedashteamstats.LeagueDashTeamStats(month=5, season='2023-24')

In [None]:
# Preview the first DataFrame of the trial response
ldts.get_data_frames()[0].head()

In [None]:
# Season labels in "YYYY-YY" form for every start year in range(1996, 2024)
seasons = [f"{year}-{str(year + 1)[-2:]}" for year in range(1996, 2024)]
months = range(1, 13)
ldts_list = []

for season in seasons:
    for month in months:
        print(f"Querying season {season}, month {month}")
        ldts = leaguedashteamstats.LeagueDashTeamStats(month=month, season=season)

        # keep the first DataFrame and tag it with the query that produced it
        df = ldts.get_data_frames()[0]
        df['SEASON'] = season
        df['MONTH'] = month
        ldts_list.append(df)

        # pause between requests
        time.sleep(3)

# one frame holding every season/month result, renumbered from zero
ldts_df = pd.concat(ldts_list, ignore_index=True)

In [None]:
# Preview the combined season/month table
ldts_df.head()

In [None]:
# Save the combined dashboard stats; index=False leaves the row labels out of the file
ldts_df.to_csv('../data/original/nba_dash_team_stats_1997_2024.csv', index=False)

In [None]:
# Number of rows per TEAM_NAME in the combined table
ldts_df.value_counts('TEAM_NAME')

In [None]:
# Rows for the Boston Celtics in the '2022-23' season
is_celtics = ldts_df['TEAM_NAME'] == 'Boston Celtics'
is_2022_23 = ldts_df['SEASON'] == '2022-23'
BC_2023 = ldts_df[is_celtics & is_2022_23]
BC_2023.head()

# Player Career Statistics

In [None]:
# get player data: one PlayerCareerStats request per player id
player_ids = nba_players_df['id'].tolist()

players_stats_list = []

# The loop variable is `player_id` rather than `id`, so the builtin id() is
# not shadowed at notebook scope for every later cell.
for player_id in player_ids:
    print(player_id)
    # query for this player's career statistics
    career = playercareerstats.PlayerCareerStats(player_id=player_id)
    # we want the first DataFrame of those returned
    players_stats_list.append(career.get_data_frames()[0])
    # add time delay between requests
    time.sleep(1)

In [None]:
# Stack the per-player frames into one table. No ignore_index is passed, so
# each frame's own row labels are kept.
players_stats_df = pd.concat(players_stats_list)
print(players_stats_df.shape)
players_stats_df.head()

In [None]:
# get the earliest season: the minimum SEASON_ID in the table
earliest_date = players_stats_df['SEASON_ID'].min()
print(earliest_date)

In [None]:
# Save the combined career statistics; index=False leaves the row labels out of the file
players_stats_df.to_csv('../data/original/nba_players_statistics_1946_2024.csv', index=False)

# Box Score Matchups V3

In [None]:
# Reload the saved game log. The two ID columns are read as text: left to
# type inference they become integers, which strips the leading zeros from
# GAME_ID and makes comparisons against string season codes such as '22023'
# match nothing.
games_df = pd.read_csv(
    '../data/original/nba_games_box_scores_1984_2024.csv',
    dtype={'SEASON_ID': str, 'GAME_ID': str},
)
# every GAME_ID in the log, in file order
game_ids = games_df['GAME_ID'].tolist()

In [None]:
# Spot-check one entry of the id list
game_ids[5000]

In [None]:
# Game ids are passed as 10-character strings; the integer 21000400 drops the
# two leading zeros of '0021000400'.
bs_matchups = boxscorematchupsv3.BoxScoreMatchupsV3(game_id='0021000400')
bs_matchups.get_data_frames()[0]

# Box Score Advanced V3

In [None]:
# Game ids are passed as 10-character strings; the integer 21000400 drops the
# two leading zeros of '0021000400'.
bs_adv = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id='0021000400')
# second DataFrame of the response (index 1)
check = bs_adv.get_data_frames()[1]
check

In [22]:
# function to get game_ids
def get_game_ids(season_id, games=None):
    """Return the GAME_ID values of all rows whose SEASON_ID equals season_id.

    Both sides of the comparison are converted to text, so the lookup works
    whether SEASON_ID holds strings or integers (the latter is what a CSV
    reload produces by default) and whether season_id is '22023' or 22023.

    Parameters
    ----------
    season_id : str or int
        Season code to select, e.g. '22023'.
    games : pandas.DataFrame, optional
        Table with SEASON_ID and GAME_ID columns. Defaults to the notebook's
        global games_df.

    Returns
    -------
    list
        GAME_ID values of the matching rows, in table order. Empty when no
        row matches.
    """
    if games is None:
        games = games_df
    in_season = games['SEASON_ID'].astype(str) == str(season_id)
    return games.loc[in_season, 'GAME_ID'].tolist()

# game ids for the 2021 - 2022, 2022 - 2023 and 2023 - 2024 seasons
# (season ids '22021', '22022' and '22023' respectively)
game_ids_2021_2022, game_ids_2022_2023, game_ids_2023_2024 = (
    get_game_ids(season_id) for season_id in ('22021', '22022', '22023')
)

In [None]:
# function to get team advanced stats per game for a given season
def get_adv_stats_df(game_id_list, delay=3):
    """Download advanced box-score rows for a list of games.

    One BoxScoreAdvancedV3 request is made per distinct game id and the
    DataFrame at index 1 of each response is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request (default 3, the original pause).

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    adv_games_stats_list = []
    for game_id in unique_ids:
        print(game_id)
        # query for this game
        box_score = boxscoreadvancedv3.BoxScoreAdvancedV3(game_id=game_id)
        adv_games_stats_list.append(box_score.get_data_frames()[1])
        if delay:
            time.sleep(delay)
    adv_stats_df = pd.concat(adv_games_stats_list, ignore_index=True)
    return adv_stats_df.drop_duplicates()

In [None]:
# get advanced stats df for 2021 - 2022 season (one request per id in game_ids_2021_2022)
adv_stats_df_2021_2022 = get_adv_stats_df(game_ids_2021_2022)
adv_stats_df_2021_2022.head()

In [None]:
# get advanced stats df for 2022 - 2023 season (one request per id in game_ids_2022_2023)
adv_stats_df_2022_2023 = get_adv_stats_df(game_ids_2022_2023)
adv_stats_df_2022_2023.head()

In [None]:
# get advanced stats df for 2023 - 2024 season (one request per id in game_ids_2023_2024)
adv_stats_df_2023_2024 = get_adv_stats_df(game_ids_2023_2024)
adv_stats_df_2023_2024.head()

In [None]:
# combine the three season frames into one combined dataframe
adv_stats_frames = [adv_stats_df_2021_2022, adv_stats_df_2022_2023, adv_stats_df_2023_2024]
# concat must use the list defined on the line above; the earlier spelling
# `adv_stat_frames` was an undefined name
adv_stats_df = pd.concat(adv_stats_frames)
adv_stats_df.head()

In [None]:
# Bring the column names in line with games_df: build TEAM_NAME from city and
# nickname, rename the id columns, then drop the columns that are now redundant.
games_df_names = {'gameId':'GAME_ID','teamId':'TEAM_ID', 'teamTricode': 'TEAM_ABBREVIATION'}
adv_stats_df = (
    adv_stats_df
    .assign(TEAM_NAME=adv_stats_df['teamCity'] + " " + adv_stats_df['teamName'])
    .rename(columns=games_df_names)
    .drop(['teamCity', 'teamName', 'teamSlug'], axis=1)
)

adv_stats_df.head()

In [None]:
# Attach SEASON_ID, GAME_DATE and MATCHUP from games_df, matching rows on
# GAME_ID, TEAM_ID and TEAM_ABBREVIATION. No `how` is passed, so pandas uses
# its default join type.
adv_stats_df = pd.merge(adv_stats_df, games_df[['SEASON_ID','GAME_DATE','MATCHUP', 'GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION']], on=['GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION'])

adv_stats_df.head()

In [None]:
# export the merged advanced stats to csv; index=False leaves the row labels out of the file
adv_stats_df.to_csv('../data/original/nba_advanced_statistics_2021_2024.csv', index=False)

# Team Game Logs

In [None]:
mav_id = '1610612742'
# `DataSet(data=)` was an unfinished call and a syntax error. Request the game
# logs for this team id and one season through the endpoint class instead.
logs = teamgamelogs.TeamGameLogs(team_id_nullable=mav_id, season_nullable='2023-24')
logs.get_data_frames()[0].head()

# Team Estimated Metrics

In [None]:
# `DataSet(data=)` was an unfinished call and a syntax error. Request the
# estimated metrics for one season through the endpoint class instead.
team_metrics = teamestimatedmetrics.TeamEstimatedMetrics(season='2023-24')
team_metrics.get_data_frames()[0].head()

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
from nba_api.stats.endpoints import commonplayerinfo

# Basic Request: common player info for player id 2544
player_info = commonplayerinfo.CommonPlayerInfo(player_id=2544)

In [None]:
# Show the available_seasons data set of the response as a DataFrame
player_info.available_seasons.get_data_frame()

# Team Hustle Stats

In [12]:
# Trial request for a single game; the DataFrame at index 2 holds one row per team
hustle = hustlestatsboxscore.HustleStatsBoxScore(game_id = "0022100209")
hustle.get_data_frames()[2]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MINUTES,PTS,CONTESTED_SHOTS,CONTESTED_SHOTS_2PT,CONTESTED_SHOTS_3PT,...,SCREEN_ASSISTS,SCREEN_AST_PTS,OFF_LOOSE_BALLS_RECOVERED,DEF_LOOSE_BALLS_RECOVERED,LOOSE_BALLS_RECOVERED,OFF_BOXOUTS,DEF_BOXOUTS,BOX_OUT_PLAYER_TEAM_REBS,BOX_OUT_PLAYER_REBS,BOX_OUTS
0,22100209,1610612741,Bulls,CHI,Chicago,240.000000:00,121,45,21,24,...,9,20,2,3,5,1,2,3,0,3
1,22100209,1610612747,Lakers,LAL,Los Angeles,240.000000:00,103,53,30,23,...,6,13,1,4,5,1,1,2,2,2


In [24]:
# function to get team hustle stats per game for a given season
def get_hustle_stats_df(game_id_list, delay=1):
    """Download team hustle box-score rows for a list of games.

    One HustleStatsBoxScore request is made per distinct game id and the
    DataFrame at index 2 of each response (one row per team) is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request (default 1, the original pause).

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    hustle_games_stats_list = []
    for game_id in unique_ids:
        print(game_id)
        # query for this game
        hustle = hustlestatsboxscore.HustleStatsBoxScore(game_id=game_id)
        hustle_games_stats_list.append(hustle.get_data_frames()[2])
        if delay:
            time.sleep(delay)
    hustle_stats_df = pd.concat(hustle_games_stats_list, ignore_index=True)
    return hustle_stats_df.drop_duplicates()

In [None]:
# get hustle stats df for 2021 - 2022 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'hustle_stats_df_2021_2022' not in globals():
    hustle_stats_df_2021_2022 = get_hustle_stats_df(game_ids_2021_2022)
hustle_stats_df_2021_2022.head()

In [None]:
# get hustle stats df for 2022 - 2023 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'hustle_stats_df_2022_2023' not in globals():
    hustle_stats_df_2022_2023 = get_hustle_stats_df(game_ids_2022_2023)
hustle_stats_df_2022_2023.head()

In [None]:
# get hustle stats df for 2023 - 2024 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'hustle_stats_df_2023_2024' not in globals():
    hustle_stats_df_2023_2024 = get_hustle_stats_df(game_ids_2023_2024)
hustle_stats_df_2023_2024.head()

In [None]:
# combine the three season hustle frames into one dataframe (row labels of each frame are kept)
hustle_stats_frames = [hustle_stats_df_2021_2022, hustle_stats_df_2022_2023, hustle_stats_df_2023_2024]
hustle_stats_df = pd.concat(hustle_stats_frames)
hustle_stats_df.head()

In [None]:
# Rewrite TEAM_NAME as "<TEAM_CITY> <TEAM_NAME>" to match games_df, then drop
# the TEAM_CITY column, which is redundant afterwards.
full_team_name = hustle_stats_df['TEAM_CITY'] + " " + hustle_stats_df['TEAM_NAME']
hustle_stats_df = hustle_stats_df.assign(TEAM_NAME=full_team_name).drop(['TEAM_CITY'], axis=1)

hustle_stats_df.head()

In [None]:
# Attach SEASON_ID, GAME_DATE and MATCHUP from games_df, matching rows on
# GAME_ID, TEAM_ID and TEAM_ABBREVIATION. No `how` is passed, so pandas uses
# its default join type.
hustle_stats_df = pd.merge(hustle_stats_df, games_df[['SEASON_ID','GAME_DATE','MATCHUP', 'GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION']], on=['GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION'])

hustle_stats_df.head()

In [None]:
# export the merged hustle stats to csv; index=False leaves the row labels out of the file
hustle_stats_df.to_csv('../data/original/nba_hustle_statistics_2021_2024.csv', index=False)

# Box Score Misc

In [None]:
# Trial request for a single game; show the DataFrame at index 1 of the response
misc = boxscoremiscv2.BoxScoreMiscV2(game_id = "0022100209")
misc.get_data_frames()[1]

In [26]:
# function to get team miscellaneous box-score stats per game for a given season
def get_misc_stats_df(game_id_list, delay=1):
    """Download miscellaneous box-score rows for a list of games.

    One BoxScoreMiscV2 request is made per distinct game id and the DataFrame
    at index 1 of each response is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request (default 1, the original pause).

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    misc_games_stats_list = []
    for game_id in unique_ids:
        print(game_id)
        # query for this game
        misc = boxscoremiscv2.BoxScoreMiscV2(game_id=game_id)
        misc_games_stats_list.append(misc.get_data_frames()[1])
        if delay:
            time.sleep(delay)
    misc_stats_df = pd.concat(misc_games_stats_list, ignore_index=True)
    return misc_stats_df.drop_duplicates()

In [None]:
# get miscellaneous stats df for 2021 - 2022 season (one request per id in game_ids_2021_2022)
misc_stats_df_2021_2022 = get_misc_stats_df(game_ids_2021_2022)
misc_stats_df_2021_2022.head()

In [None]:
# get miscellaneous stats df for 2022 - 2023 season (one request per id in game_ids_2022_2023)
misc_stats_df_2022_2023 = get_misc_stats_df(game_ids_2022_2023)
misc_stats_df_2022_2023.head()

In [None]:
# get miscellaneous stats df for 2023 - 2024 season (one request per id in game_ids_2023_2024)
misc_stats_df_2023_2024 = get_misc_stats_df(game_ids_2023_2024)
misc_stats_df_2023_2024.head()

In [None]:
# combine the three season miscellaneous stat frames into one dataframe (row labels of each frame are kept)
misc_stats_frames = [misc_stats_df_2021_2022, misc_stats_df_2022_2023, misc_stats_df_2023_2024]
misc_stats_df = pd.concat(misc_stats_frames)
misc_stats_df.head()

In [None]:
# Rewrite TEAM_NAME as "<TEAM_CITY> <TEAM_NAME>" to match games_df, then drop
# the TEAM_CITY column, which is redundant afterwards.
full_team_name = misc_stats_df['TEAM_CITY'] + " " + misc_stats_df['TEAM_NAME']
misc_stats_df = misc_stats_df.assign(TEAM_NAME=full_team_name).drop(['TEAM_CITY'], axis=1)

misc_stats_df.head()

In [None]:
# Attach SEASON_ID, GAME_DATE and MATCHUP from games_df, matching rows on
# GAME_ID, TEAM_ID and TEAM_ABBREVIATION. No `how` is passed, so pandas uses
# its default join type.
misc_stats_df = pd.merge(misc_stats_df, games_df[['SEASON_ID','GAME_DATE','MATCHUP', 'GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION']], on=['GAME_ID','TEAM_ID', 'TEAM_ABBREVIATION'])

misc_stats_df.head()

In [None]:
# export the merged miscellaneous stats to csv; index=False leaves the row labels out of the file
misc_stats_df.to_csv('../data/original/nba_misc_boxscore_statistics_2021_2024.csv', index=False)

# Four Factors

In [8]:
# Trial request for a single game; the DataFrame at index 1 holds one row per team
test = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id = "0022100209")
test.get_data_frames()[1]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22100209,1610612747,Lakers,LAL,Los Angeles,240.000000:00,0.494,0.392,0.183,0.182,0.635,0.124,0.159,0.119
1,22100209,1610612741,Bulls,CHI,Chicago,240.000000:00,0.635,0.124,0.159,0.095,0.494,0.392,0.183,0.227


In [27]:
# function to get team four factor stats per game for a given season
def get_factor_stats_df(game_id_list, delay=0):
    """Download team four-factors box-score rows for a list of games.

    One BoxScoreFourFactorsV2 request is made per distinct game id and the
    DataFrame at index 1 of each response (one row per team) is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request. Defaults to 0 because the
        original had its pause commented out; pass a positive value to
        throttle the requests.

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    factor_games_stats_list = []
    for game_id in unique_ids:
        print(game_id)
        # query for this game
        box_score = boxscorefourfactorsv2.BoxScoreFourFactorsV2(game_id=game_id)
        factor_games_stats_list.append(box_score.get_data_frames()[1])
        if delay:
            time.sleep(delay)
    factor_stats_df = pd.concat(factor_games_stats_list, ignore_index=True)
    return factor_stats_df.drop_duplicates()

In [28]:
# get four factor stats df for 2021 - 2022 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'factor_stats_df_2021_2022' not in globals():
    factor_stats_df_2021_2022 = get_factor_stats_df(game_ids_2021_2022)
factor_stats_df_2021_2022.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22101221,1610612745,Rockets,HOU,Houston,240.000000:00,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326
1,22101221,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216
2,22101207,1610612748,Heat,MIA,Miami,240.000000:00,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354
3,22101207,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167
4,22101192,1610612764,Wizards,WAS,Washington,240.000000:00,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2


In [78]:
# get four factor stats df for 2022 - 2023 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'factor_stats_df_2022_2023' not in globals():
    factor_stats_df_2022_2023 = get_factor_stats_df(game_ids_2022_2023)
factor_stats_df_2022_2023.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22201216,1610612738,Celtics,BOS,Boston,240.000000:00,0.612,0.157,0.153,0.25,0.51,0.196,0.092,0.315
1,22201216,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.51,0.196,0.092,0.296,0.612,0.157,0.153,0.292
2,22201205,1610612755,76ers,PHI,Philadelphia,265.000000:00,0.554,0.284,0.164,0.241,0.609,0.304,0.173,0.128
3,22201205,1610612737,Hawks,ATL,Atlanta,265.000000:00,0.609,0.304,0.173,0.085,0.554,0.284,0.164,0.315
4,22201191,1610612764,Wizards,WAS,Washington,240.000000:00,0.511,0.277,0.13,0.226,0.589,0.476,0.159,0.333


In [79]:
# get four factor stats df for 2023 - 2024 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'factor_stats_df_2023_2024' not in globals():
    factor_stats_df_2023_2024 = get_factor_stats_df(game_ids_2023_2024)
factor_stats_df_2023_2024.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22300866,1610612751,Nets,BKN,Brooklyn,240.000000:00,0.551,0.25,0.114,0.245,0.489,0.264,0.124,0.314
1,22300866,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.489,0.264,0.124,0.235,0.551,0.25,0.114,0.306
2,22300851,1610612751,Nets,BKN,Brooklyn,240.000000:00,0.633,0.144,0.102,0.17,0.477,0.244,0.138,0.296
3,22300851,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.477,0.244,0.138,0.148,0.633,0.144,0.102,0.319
4,22300835,1610612762,Jazz,UTA,Utah,240.000000:00,0.431,0.202,0.166,0.203,0.559,0.247,0.17,0.354


In [31]:
# combine the three season four factor stat frames into one dataframe (row labels of each frame are kept)
factor_stats_frames = [factor_stats_df_2021_2022, factor_stats_df_2022_2023, factor_stats_df_2023_2024]
factor_stats_df = pd.concat(factor_stats_frames)
factor_stats_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,22101221,1610612745,Rockets,HOU,Houston,240.000000:00,0.556,0.225,0.08,0.118,0.638,0.276,0.133,0.326
1,22101221,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.638,0.276,0.133,0.283,0.556,0.225,0.08,0.216
2,22101207,1610612748,Heat,MIA,Miami,240.000000:00,0.632,0.389,0.175,0.111,0.523,0.216,0.153,0.354
3,22101207,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.523,0.216,0.153,0.271,0.632,0.389,0.175,0.167
4,22101192,1610612764,Wizards,WAS,Washington,240.000000:00,0.547,0.128,0.142,0.136,0.554,0.207,0.06,0.2


In [36]:
# Debugging aid: shows which directory the relative '../data/...' paths resolve against.
# NOTE(review): the saved output of this cell contains a local user path; consider clearing it before sharing.
import os

# Get the current working directory of the Python interpreter
cwd = os.getcwd()

# Print the current working directory
print(cwd)

C:\Users\smcne\Documents\School\Capstone\sports_betting\code\data_processing


In [None]:
# export the combined four factor stats to csv; index=False leaves the row labels out of the file
factor_stats_df.to_csv('../data/original/nba_four_factors_statistics_2021_2024.csv', index=False)

# Box Score Scoring

In [13]:
# Trial request for a single game; the DataFrame at index 1 holds one row per team
test = boxscorescoringv2.BoxScoreScoringV2(game_id = "0022100209")
test.get_data_frames()[1]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,...,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
0,22100209,1610612747,Lakers,LAL,Los Angeles,240.000000:00,0.595,0.405,0.583,0.194,...,0.117,0.243,0.194,0.388,0.433,0.567,1.0,0.0,0.528,0.472
1,22100209,1610612741,Bulls,CHI,Chicago,240.000000:00,0.618,0.382,0.562,0.116,...,0.165,0.066,0.157,0.446,0.559,0.441,0.8,0.2,0.633,0.367


In [51]:
# function to get team scoring stats per game for a given season
def get_scoring_stats_df(game_id_list, delay=0):
    """Download team scoring box-score rows for a list of games.

    One BoxScoreScoringV2 request is made per distinct game id and the
    DataFrame at index 1 of each response (one row per team) is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request. Defaults to 0 because the
        original had its pause commented out; pass a positive value to
        throttle the requests.

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    scoring_games_stats_list = []
    for game_id in unique_ids:
        print(game_id)
        # query for this game
        box_score = boxscorescoringv2.BoxScoreScoringV2(game_id=game_id)
        scoring_games_stats_list.append(box_score.get_data_frames()[1])
        if delay:
            time.sleep(delay)
    scoring_stats_df = pd.concat(scoring_games_stats_list, ignore_index=True)
    return scoring_stats_df.drop_duplicates()

In [58]:
# get scoring stats df for 2021 - 2022 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'scoring_stats_df_2021_2022' not in globals():
    scoring_stats_df_2021_2022 = get_scoring_stats_df(game_ids_2021_2022)
scoring_stats_df_2021_2022.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,...,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
0,22101221,1610612745,Rockets,HOU,Houston,240.000000:00,0.483,0.517,0.421,0.018,...,0.114,0.132,0.132,0.404,0.5,0.5,0.706,0.294,0.585,0.415
1,22101221,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.563,0.437,0.369,0.062,...,0.108,0.146,0.123,0.308,0.5,0.5,0.81,0.19,0.644,0.356
2,22101207,1610612748,Heat,MIA,Miami,240.000000:00,0.639,0.361,0.513,0.088,...,0.133,0.195,0.142,0.425,0.448,0.552,0.636,0.364,0.5,0.5
3,22101207,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.614,0.386,0.624,0.239,...,0.11,0.156,0.202,0.385,0.324,0.676,0.75,0.25,0.405,0.595
4,22101192,1610612764,Wizards,WAS,Washington,240.000000:00,0.593,0.407,0.621,0.194,...,0.126,0.087,0.078,0.427,0.5,0.5,1.0,0.0,0.619,0.381


In [59]:
# get scoring stats df for 2022 - 2023 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'scoring_stats_df_2022_2023' not in globals():
    scoring_stats_df_2022_2023 = get_scoring_stats_df(game_ids_2022_2023)
scoring_stats_df_2022_2023.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,...,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
0,22201216,1610612738,Celtics,BOS,Boston,240.000000:00,0.393,0.607,0.283,0.033,...,0.033,0.092,0.1,0.25,0.706,0.294,0.84,0.16,0.786,0.214
1,22201216,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.598,0.402,0.579,0.053,...,0.202,0.132,0.184,0.526,0.545,0.455,0.818,0.182,0.614,0.386
2,22201205,1610612755,76ers,PHI,Philadelphia,265.000000:00,0.784,0.216,0.588,0.015,...,0.132,0.169,0.154,0.574,0.575,0.425,0.909,0.091,0.647,0.353
3,22201205,1610612737,Hawks,ATL,Atlanta,265.000000:00,0.641,0.359,0.626,0.107,...,0.137,0.145,0.153,0.519,0.585,0.415,0.8,0.2,0.627,0.373
4,22201191,1610612764,Wizards,WAS,Washington,240.000000:00,0.702,0.298,0.672,0.103,...,0.017,0.172,0.164,0.569,0.436,0.564,1.0,0.0,0.511,0.489


In [63]:
# get scoring stats df for 2023 - 2024 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'scoring_stats_df_2023_2024' not in globals():
    scoring_stats_df_2023_2024 = get_scoring_stats_df(game_ids_2023_2024)
scoring_stats_df_2023_2024.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,...,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
0,22300866,1610612751,Nets,BKN,Brooklyn,240.000000:00,0.636,0.364,0.456,0.053,...,0.132,0.149,0.158,0.404,0.538,0.462,0.933,0.067,0.683,0.317
1,22300866,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.609,0.391,0.51,0.039,...,0.0,0.167,0.069,0.471,0.538,0.462,0.818,0.182,0.622,0.378
2,22300851,1610612751,Nets,BKN,Brooklyn,240.000000:00,0.489,0.511,0.387,0.065,...,0.097,0.081,0.202,0.323,0.5,0.5,0.818,0.182,0.652,0.348
3,22300851,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.674,0.326,0.598,0.103,...,0.144,0.155,0.144,0.495,0.448,0.552,0.625,0.375,0.486,0.514
4,22300835,1610612762,Jazz,UTA,Utah,240.000000:00,0.596,0.404,0.495,0.0,...,0.186,0.165,0.175,0.495,0.625,0.375,0.818,0.182,0.686,0.314


In [70]:
# combine the three season scoring stat frames into one dataframe (row labels of each frame are kept)
scoring_stats_frames = [scoring_stats_df_2021_2022, scoring_stats_df_2022_2023, scoring_stats_df_2023_2024]
scoring_stats_df = pd.concat(scoring_stats_frames)
scoring_stats_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,...,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
0,22101221,1610612745,Rockets,HOU,Houston,240.000000:00,0.483,0.517,0.421,0.018,...,0.114,0.132,0.132,0.404,0.5,0.5,0.706,0.294,0.585,0.415
1,22101221,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.563,0.437,0.369,0.062,...,0.108,0.146,0.123,0.308,0.5,0.5,0.81,0.19,0.644,0.356
2,22101207,1610612748,Heat,MIA,Miami,240.000000:00,0.639,0.361,0.513,0.088,...,0.133,0.195,0.142,0.425,0.448,0.552,0.636,0.364,0.5,0.5
3,22101207,1610612737,Hawks,ATL,Atlanta,240.000000:00,0.614,0.386,0.624,0.239,...,0.11,0.156,0.202,0.385,0.324,0.676,0.75,0.25,0.405,0.595
4,22101192,1610612764,Wizards,WAS,Washington,240.000000:00,0.593,0.407,0.621,0.194,...,0.126,0.087,0.078,0.427,0.5,0.5,1.0,0.0,0.619,0.381


In [72]:
# export the combined scoring stats to csv; index=False leaves the row labels out of the file
scoring_stats_df.to_csv('../data/original/nba_scoring_statistics_2021_2024.csv', index=False)

# Box Score Player Track

In [56]:
# Trial request for a single game; the DataFrame at index 1 holds one row per team
test = boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id = "0022100209")
test.get_data_frames()[1]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,DIST,ORBC,DRBC,RBC,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22100209,1610612741,Bulls,CHI,Chicago,240:00,18.12,13,48,60,...,21,35,0.6,28,54,0.519,0.551,18,26,0.692
1,22100209,1610612747,Lakers,LAL,Los Angeles,240:00,17.51,24,41,63,...,14,27,0.519,22,52,0.423,0.456,20,26,0.769


In [66]:
# function to get team player-tracking stats per game for a given season
def get_track_stats_df(game_id_list, delay=0):
    """Download team player-tracking box-score rows for a list of games.

    One BoxScorePlayerTrackV2 request is made per distinct game id and the
    DataFrame at index 1 of each response (one row per team) is collected.

    Parameters
    ----------
    game_id_list : iterable of str or int
        Game ids. Each is converted to text and left-padded with zeros to 10
        characters. Repeated ids are requested only once; the original code
        requested them again and relied on drop_duplicates afterwards.
    delay : float, optional
        Seconds to wait after each request. Defaults to 0 because the
        original had its pause commented out; pass a positive value to
        throttle the requests.

    Returns
    -------
    pandas.DataFrame
        All collected rows with exact duplicate rows removed.

    Raises
    ------
    ValueError
        If game_id_list contains no ids.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated ids
    unique_ids = list(dict.fromkeys(str(game_id).zfill(10) for game_id in game_id_list))
    if not unique_ids:
        raise ValueError("game_id_list is empty; there are no box scores to download")

    track_games_stats_list = []
    for game_id in unique_ids:
        # progress: current id and how many games have been collected so far
        print(game_id, len(track_games_stats_list))
        # query for this game
        box_score = boxscoreplayertrackv2.BoxScorePlayerTrackV2(game_id=game_id)
        track_games_stats_list.append(box_score.get_data_frames()[1])
        if delay:
            time.sleep(delay)
    track_stats_df = pd.concat(track_games_stats_list, ignore_index=True)
    return track_stats_df.drop_duplicates()

In [75]:
# get player track stats df for 2021 - 2022 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'track_stats_df_2021_2022' not in globals():
    track_stats_df_2021_2022 = get_track_stats_df(game_ids_2021_2022)
track_stats_df_2021_2022.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,DIST,ORBC,DRBC,RBC,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22101221,1610612737,Hawks,ATL,Atlanta,240:00,16.86,27,52,77,...,13,30,0.433,32,57,0.561,0.517,16,24,0.667
1,22101221,1610612745,Rockets,HOU,Houston,240:00,17.14,20,53,69,...,15,26,0.577,26,63,0.413,0.461,16,29,0.552
2,22101207,1610612737,Hawks,ATL,Atlanta,240:00,17.36,25,40,61,...,20,30,0.667,22,58,0.379,0.477,16,18,0.889
3,22101207,1610612748,Heat,MIA,Miami,240:00,17.67,11,56,65,...,14,21,0.667,26,51,0.51,0.556,13,21,0.619
4,22101192,1610612764,Wizards,WAS,Washington,240:00,18.05,16,56,71,...,20,31,0.645,22,55,0.4,0.488,16,25,0.64


In [76]:
# get player track stats df for 2022 - 2023 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'track_stats_df_2022_2023' not in globals():
    track_stats_df_2022_2023 = get_track_stats_df(game_ids_2022_2023)
track_stats_df_2022_2023.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,DIST,ORBC,DRBC,RBC,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22201216,1610612737,Hawks,ATL,Atlanta,240:00,17.58,32,56,87,...,19,32,0.594,25,65,0.385,0.454,11,22,0.5
1,22201216,1610612738,Celtics,BOS,Boston,240:00,18.05,22,56,77,...,13,26,0.5,29,63,0.46,0.472,21,28,0.75
2,22201205,1610612755,76ers,PHI,Philadelphia,265:00,20.33,34,62,90,...,25,51,0.49,26,51,0.51,0.5,27,34,0.794
3,22201205,1610612737,Hawks,ATL,Atlanta,265:00,19.45,19,50,66,...,20,32,0.625,31,60,0.517,0.554,26,42,0.619
4,22201191,1610612764,Wizards,WAS,Washington,240:00,18.93,21,49,69,...,28,48,0.583,17,46,0.37,0.479,18,27,0.667


In [77]:
# get player track stats df for 2023 - 2024 season
# Download only when the frame is not already in this kernel session: a fresh
# kernel can run the cell, and a re-run skips the per-game requests.
if 'track_stats_df_2023_2024' not in globals():
    track_stats_df_2023_2024 = get_track_stats_df(game_ids_2023_2024)
track_stats_df_2023_2024.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,DIST,ORBC,DRBC,RBC,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22300866,1610612737,Hawks,ATL,Atlanta,240:00,17.88,27,51,76,...,19,44,0.432,18,43,0.419,0.425,16,28,0.571
1,22300866,1610612751,Nets,BKN,Brooklyn,240:00,18.08,24,54,77,...,17,36,0.472,24,52,0.462,0.466,20,33,0.606
2,22300851,1610612737,Hawks,ATL,Atlanta,240:00,17.45,28,53,79,...,20,41,0.488,17,45,0.378,0.43,13,22,0.591
3,22300851,1610612751,Nets,BKN,Brooklyn,240:00,17.79,32,63,89,...,14,27,0.519,32,63,0.508,0.511,15,23,0.652
4,22300835,1610612762,Jazz,UTA,Utah,240:00,18.31,28,48,75,...,14,34,0.412,21,60,0.35,0.372,19,30,0.633


In [73]:
# combine the three season player track stat frames into one dataframe (row labels of each frame are kept)
track_stats_frames = [track_stats_df_2021_2022, track_stats_df_2022_2023, track_stats_df_2023_2024]
track_stats_df = pd.concat(track_stats_frames)
track_stats_df.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,DIST,ORBC,DRBC,RBC,...,CFGM,CFGA,CFG_PCT,UFGM,UFGA,UFG_PCT,FG_PCT,DFGM,DFGA,DFG_PCT
0,22101221,1610612737,Hawks,ATL,Atlanta,240:00,16.86,27,52,77,...,13,30,0.433,32,57,0.561,0.517,16,24,0.667
1,22101221,1610612745,Rockets,HOU,Houston,240:00,17.14,20,53,69,...,15,26,0.577,26,63,0.413,0.461,16,29,0.552
2,22101207,1610612737,Hawks,ATL,Atlanta,240:00,17.36,25,40,61,...,20,30,0.667,22,58,0.379,0.477,16,18,0.889
3,22101207,1610612748,Heat,MIA,Miami,240:00,17.67,11,56,65,...,14,21,0.667,26,51,0.51,0.556,13,21,0.619
4,22101192,1610612764,Wizards,WAS,Washington,240:00,18.05,16,56,71,...,20,31,0.645,22,55,0.4,0.488,16,25,0.64


In [74]:
# export the combined player track stats to csv; index=False leaves the row labels out of the file
track_stats_df.to_csv('../data/original/nba_track_statistics_2021_2024.csv', index=False)