# Find Game IDs (and scrape team data?)

The objective of this notebook is to take the DataFrame containing the team stats for every game and attach the unique "game_id" to each game using nba_api.

The way this notebook is written, we could actually scrape team data via nba_api by specifying a season range.

Note: This notebook is not fully implemented, since we do not need the game IDs yet. It can serve as a reference for sending many requests to nba_api at a reasonable rate (e.g. one request every 30 seconds). It can also serve as an alternative way of scraping additional data, since it pulls game logs directly from nba.com/stats.

In [1]:
import datetime
import time

import numpy as np
import pandas as pd
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.static import teams

In [2]:
# Relative path to the CSV of per-game team stats that is read into `df` below
PATH_TO_TEAM_DATA = "../../data/raw/nba_games_runtime.csv"

In [3]:
# Load the team stats; the first CSV column is used as the row index
df = pd.read_csv(filepath_or_buffer=PATH_TO_TEAM_DATA, index_col=0)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,240.0,240.0,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


In [4]:
# Create a "game_id" column and move to the front.
# The guard keeps this cell re-runnable: DataFrame.insert raises a ValueError
# when a column with the same name already exists.
if 'game_id' not in df.columns:
    df.insert(0, 'game_id', None)
df

Unnamed: 0,game_id,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,,240.0,240.0,49.0,92.0,0.533,20.0,42.0,0.476,22.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


In [5]:
# Collect every distinct code found in the "team" column and show them
all_team_codes = df["team"].unique()
print(all_team_codes.size)  # 30 teams are expected
all_team_codes

30


array(['DET', 'ATL', 'CLE', 'CHI', 'NOP', 'GSW', 'PHI', 'BOS', 'BRK',
       'UTA', 'DEN', 'HOU', 'MIN', 'LAL', 'MEM', 'CHO', 'MIA', 'NYK',
       'MIL', 'SAS', 'OKC', 'WAS', 'ORL', 'DAL', 'PHO', 'POR', 'LAC',
       'SAC', 'IND', 'TOR'], dtype=object)

In [6]:
# Some codes in the data do not match the NBA team codes; map each one
# to its NBA equivalent (data code -> NBA code).
nba_code_by_data_code = {
    "BRK": "BKN",
    "CHO": "CHA",
    "PHO": "PHX",
}

df = df.replace(nba_code_by_data_code)

# Sorted unique codes seen as the team and as the opponent
all_team_codes = np.sort(df["team"].unique())
all_opp_team_codes = np.sort(df["team_opp"].unique())
for sorted_codes in (all_team_codes, all_opp_team_codes):
    print(sorted_codes)

# Sanity check: both columns must contain exactly the same set of codes
assert np.array_equal(all_team_codes, all_opp_team_codes)

['ATL' 'BKN' 'BOS' 'CHA' 'CHI' 'CLE' 'DAL' 'DEN' 'DET' 'GSW' 'HOU' 'IND'
 'LAC' 'LAL' 'MEM' 'MIA' 'MIL' 'MIN' 'NOP' 'NYK' 'OKC' 'ORL' 'PHI' 'PHX'
 'POR' 'SAC' 'SAS' 'TOR' 'UTA' 'WAS']
['ATL' 'BKN' 'BOS' 'CHA' 'CHI' 'CLE' 'DAL' 'DEN' 'DET' 'GSW' 'HOU' 'IND'
 'LAC' 'LAL' 'MEM' 'MIA' 'MIL' 'MIN' 'NOP' 'NYK' 'OKC' 'ORL' 'PHI' 'PHX'
 'POR' 'SAC' 'SAS' 'TOR' 'UTA' 'WAS']


In [7]:
# Create a python dict to store team ids (team code -> nba_api team id)
team_ids = dict()

# Get all the team ids using nba api
for code in all_team_codes:
    team_info = teams.find_team_by_abbreviation(code)
    # An abbreviation nba_api does not know yields None; fail with a message
    # that names the offending code instead of an opaque TypeError.
    if team_info is None:
        raise ValueError(f"nba_api does not recognise the team code {code!r}")
    t_id = team_info['id']
    print(f"{code}: {t_id}")
    team_ids[code] = t_id

ATL: 1610612737
BKN: 1610612751
BOS: 1610612738
CHA: 1610612766
CHI: 1610612741
CLE: 1610612739
DAL: 1610612742
DEN: 1610612743
DET: 1610612765
GSW: 1610612744
HOU: 1610612745
IND: 1610612754
LAC: 1610612746
LAL: 1610612747
MEM: 1610612763
MIA: 1610612748
MIL: 1610612749
MIN: 1610612750
NOP: 1610612740
NYK: 1610612752
OKC: 1610612760
ORL: 1610612753
PHI: 1610612755
PHX: 1610612756
POR: 1610612757
SAC: 1610612758
SAS: 1610612759
TOR: 1610612761
UTA: 1610612762
WAS: 1610612764


In [8]:
# Store all game logs in a python dict (filled in by the fetch loop below)
game_logs = dict()

# Get starting and end dates - format them.
# Take the earliest and latest values rather than the first and last rows, so
# the result does not depend on the frame being sorted by date. The dates are
# "YYYY-MM-DD" strings, which order chronologically when compared as text.
starting_date = df["date"].min()
last_date = df["date"].max()

# Reformat from YYYY-MM-DD to MM/DD/YYYY
fmt_starting_date = datetime.datetime.strptime(starting_date, "%Y-%m-%d").strftime("%m/%d/%Y")
fmt_last_date = datetime.datetime.strptime(last_date, "%Y-%m-%d").strftime("%m/%d/%Y")
print(fmt_starting_date, fmt_last_date)

# The calendar year is the last four characters of MM/DD/YYYY
start_season = fmt_starting_date[-4:]
last_season = fmt_last_date[-4:]

10/27/2015 02/15/2024


In [9]:
# Calculate season range
#
# A season is identified by the calendar year in which it starts (the "2023"
# game log contains games played in 2024). A date from October onwards is
# treated as belonging to the season starting that year; an earlier date is
# treated as belonging to the season that started the previous year. The same
# rule is applied to both ends of the range.
SEASON_START_MONTH = 10

# fmt_* dates are MM/DD/YYYY, so the month is the first two characters
first_month = int(fmt_starting_date[:2])
last_month = int(fmt_last_date[:2])

first_season = int(start_season) if first_month >= SEASON_START_MONTH else int(start_season) - 1
final_season = int(last_season) if last_month >= SEASON_START_MONTH else int(last_season) - 1

# range() excludes its stop value, hence the + 1
season_range = range(first_season, final_season + 1)

print(list(season_range))

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]


In [None]:
# Fetch the game logs (add a 30 second delay in between requests)
#
# For every team code, request one regular-season game log per season in
# `season_range` and store the combined frame in `game_logs[code]`.

# Seconds to pause after each request
REQUEST_DELAY_SECONDS = 30
# Timeout value handed to each nba_api request
REQUEST_TIMEOUT = 100

for code in all_team_codes:

    curr_team_id = team_ids[code]
    season_game_logs = []

    for season in season_range:

        gamelog = teamgamelog.TeamGameLog(
            season=season,
            season_type_all_star='Regular Season',
            team_id=curr_team_id,
            timeout=REQUEST_TIMEOUT,
        )
        season_game_logs.append(gamelog.team_game_log.get_data_frame())

        print(f"Gamelog fetched for {code} for the {season} season")

        # Add a 30 second delay in-between requests
        time.sleep(REQUEST_DELAY_SECONDS)

    # Concatenate once per team instead of growing a frame inside the loop.
    # The list is reversed so the most recent season comes first in the result.
    if season_game_logs:
        combined_team_game_log_df = pd.concat(season_game_logs[::-1], ignore_index=True)
    else:
        # pd.concat rejects an empty list; an empty season range gives an empty frame
        combined_team_game_log_df = pd.DataFrame()

    game_logs[code] = combined_team_game_log_df
    print(combined_team_game_log_df.head(2))
    print(combined_team_game_log_df.tail(2))
    print("-" * 30)

Gamelog fetched for ATL for the 2015 season
Gamelog fetched for ATL for the 2016 season
Gamelog fetched for ATL for the 2017 season
Gamelog fetched for ATL for the 2018 season
Gamelog fetched for ATL for the 2019 season
Gamelog fetched for ATL for the 2020 season
Gamelog fetched for ATL for the 2021 season
Gamelog fetched for ATL for the 2022 season
Gamelog fetched for ATL for the 2023 season
      Team_ID     Game_ID     GAME_DATE      MATCHUP    WL     W     L  W_PCT   
0  1610612737  0022300925  MAR 10, 2024  ATL vs. NOP  None   NaN   NaN    NaN  \
1  1610612737  0022300913  MAR 08, 2024    ATL @ MEM     W  29.0  34.0   0.46   

   MIN  FGM  ...  FT_PCT  OREB  DREB  REB  AST  STL  BLK  TOV  PF  PTS  
0   60   15  ...   0.750     7    13   20   12    4    2    5   3   39  
1  240   37  ...   0.667    15    32   47   21    9    5   14  20   99  

[2 rows x 27 columns]
        Team_ID     Game_ID     GAME_DATE      MATCHUP WL    W    L  W_PCT   
693  1610612737  0021500019  OCT 29, 201