In [1]:
import numpy as np
import pandas as pd
from icecream import ic

from tqdm import tqdm
import time
from nba_api.stats.endpoints import (
    boxscoreadvancedv3,
    boxscoretraditionalv2,
    boxscorefourfactorsv3,
    boxscorehustlev2,
    boxscoremiscv3,
    boxscoreplayertrackv3,
    boxscorescoringv3,
    leaguegamelog,
    leaguedashplayerstats,
    leaguedashteamstats,
)
from tenacity import retry, stop_after_attempt, wait_fixed

# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment =  None

# Directory holding the per-season box-score parquet files read and written below.
box_DIR = "../../data/box/"
# Not referenced by any cell visible in this part of the notebook.
shiny_DIR = "../../data/shiny/"

In [2]:
import requests
from nba_api.stats.library.http import NBAStatsHTTP

In [3]:
def get_gameids(season, name):
    """Work out which games of a season still need to be downloaded.

    The full list of game ids comes from the "Base" team parquet file of the
    season. If cached team/player files for the box-score type ``name``
    exist, the games already present in the team file are removed from the
    list.

    Parameters
    ----------
    season : str
        Season label used in the parquet file names (e.g. ``"1996"``).
    name : str
        Box-score type label used in the parquet file names (e.g. ``"Adv"``).

    Returns
    -------
    tuple
        ``(game_ids, dfr1, dfr2)`` where ``game_ids`` are the ids still to be
        fetched, ``dfr1`` is the cached team frame and ``dfr2`` the cached
        player frame. When the cache cannot be read, all ids of the season are
        returned together with two empty DataFrames.
    """
    base = pd.read_parquet(
        box_DIR + "NBA_Box_T_" + "Base" + "_" + season + ".parquet"
    )
    game_ids1 = np.unique(base["GAME_ID"].tolist())
    try:
        # The cache is written with a ".parquet" suffix; reading ".parquet1"
        # could never succeed and forced a full re-download on every run.
        dfr1 = pd.read_parquet(
            box_DIR + "NBA_Box_T_" + name + "_" + season + ".parquet"
        )
        dfr2 = pd.read_parquet(
            box_DIR + "NBA_Box_P_" + name + "_" + season + ".parquet"
        )
        # Cached ids are stored as integers (leading zeros lost); pad them back
        # to the 10-character form used by the base file before comparing.
        done = {str(s).zfill(10) for s in np.unique(dfr1["gameId"].tolist())}
        # Sorted so the download order is reproducible between runs.
        game_ids = sorted(set(game_ids1).difference(done))
    except Exception as error:
        # Best effort: any problem with the cache means "fetch everything".
        ic(error)
        game_ids = game_ids1
        dfr1 = pd.DataFrame()
        dfr2 = pd.DataFrame()
    return game_ids, dfr1, dfr2
def get_game_box(game_id, fun, min_interval=0.6):
    """Fetch the data frames of one game and pace successive requests.

    Parameters
    ----------
    game_id : str
        Game identifier passed to the endpoint as ``game_id=``.
    fun : callable
        Endpoint class/callable; it is called as ``fun(game_id=game_id)`` and
        the result must offer ``get_data_frames()``.
    min_interval : float, optional
        Minimum number of seconds the call should take. If the request
        finished faster, the function sleeps for the remainder. Defaults to
        ``0.6``, the value that was previously hard-coded.

    Returns
    -------
    Whatever ``get_data_frames()`` returns for the endpoint.

    Notes
    -----
    When the endpoint raises, the exception propagates immediately and no
    pacing sleep is performed.
    """
    started = time.perf_counter()
    frames = fun(game_id=game_id).get_data_frames()
    remaining = min_interval - (time.perf_counter() - started)
    if remaining > 0:
        # Throttle: pad fast responses up to the minimum interval.
        time.sleep(remaining)
    return frames

def _reset_nba_session():
    """Install a fresh requests session with a browser-like User-Agent on nba_api."""
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    NBAStatsHTTP.set_session(session)


def get_games_box(game_ids, fun, max_attempts=5):
    """Download box-score frames for many games, retrying failed requests.

    Parameters
    ----------
    game_ids : iterable
        Game identifiers to fetch.
    fun : callable
        Endpoint passed through to ``get_game_box``.
    max_attempts : int, optional
        Number of tries per game before giving up (default ``5``).

    Returns
    -------
    tuple of list
        ``(df_ap1, df_ap2)``: ``df_ap1`` collects element ``[1]`` and
        ``df_ap2`` element ``[0]`` of each game's frame list.

    Notes
    -----
    Games that fail every attempt are left out of both lists; their ids are
    logged once at the end so that the omission is visible.
    """
    df_ap1, df_ap2 = [], []
    failed_ids = []
    for game_id in tqdm(game_ids):
        for attempt in range(max_attempts):
            try:
                frames = get_game_box(game_id, fun)
                # Index both frames before appending so the two lists can
                # never get out of step with each other.
                second, first = frames[1], frames[0]
            except Exception as error:
                # Only the first failure of a game is logged to limit noise.
                if attempt == 0:
                    ic(game_id)
                    ic(error)
                # Start over with a new HTTP session before the next try.
                _reset_nba_session()
            else:
                df_ap1.append(second)
                df_ap2.append(first)
                break
        else:
            # Every attempt failed: remember the game instead of dropping it silently.
            failed_ids.append(game_id)
    if failed_ids:
        ic(failed_ids)
    return df_ap1, df_ap2



In [4]:
# Registry of box-score types: each entry pairs the short label used in the
# parquet file names ("name") with the nba_api endpoint class ("fun").
boxscores = [
    {"name": label, "fun": endpoint}
    for label, endpoint in (
        ("Trad", boxscoretraditionalv2.BoxScoreTraditionalV2),
        ("Adv", boxscoreadvancedv3.BoxScoreAdvancedV3),
        ("4Factor", boxscorefourfactorsv3.BoxScoreFourFactorsV3),
        ("Misc", boxscoremiscv3.BoxScoreMiscV3),
        ("Scoring", boxscorescoringv3.BoxScoreScoringV3),
        ("Track", boxscoreplayertrackv3.BoxScorePlayerTrackV3),
        ("Hustle", boxscorehustlev2.BoxScoreHustleV2),
    )
]

In [5]:
# Picks the entry at index 1 of `boxscores` (the "Adv" entry). The download
# loop in the following cell rebinds `boxscore` as its loop variable.
boxscore = boxscores[1]

In [None]:
def _merge_box_frames(cached, fetched, name):
    """Append newly fetched frames to the cached frame and sort by integer gameId."""
    merged = pd.concat([cached, pd.concat(fetched)])
    if name == "Trad":
        # "Trad" frames carry the id in GAME_ID; mirror it into gameId so the
        # steps below are the same for every box-score type.
        merged["gameId"] = merged["GAME_ID"]
        merged["MIN"] = merged["MIN"].astype("str")
    merged["gameId"] = merged["gameId"].astype(int)
    return merged.sort_values(by=["gameId"]).reset_index(drop=True)


# For each selected box-score type and season: fetch the games that are not
# cached yet, merge them with the cache and rewrite the team (T) and player (P)
# parquet files.
for boxscore in boxscores[1:5]:
    for year in range(1996, 2025):
        try:
            ic(year)
            # Fresh HTTP session with a browser-like User-Agent for every season.
            session = requests.Session()
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            NBAStatsHTTP.set_session(session)
            season = str(year)
            name = boxscore["name"]
            fun = boxscore["fun"]
            game_ids, dfr1, dfr2 = get_gameids(season, name)
            df_ap1, df_ap2 = get_games_box(game_ids, fun)
            if not df_ap1 or not df_ap2:
                # pd.concat([]) raises ValueError; with nothing new there is
                # also nothing to rewrite, so leave the cached files alone.
                ic(season, name, "no new box scores fetched")
                continue
            # Prepare both frames before writing either file, so that a failure
            # on the player side cannot leave the team file ahead of it.
            df3 = _merge_box_frames(dfr1, df_ap1, name)
            df4 = _merge_box_frames(dfr2, df_ap2, name)
            df3.to_parquet(box_DIR + "NBA_Box_T_" + name + "_" + season + ".parquet")
            df4.to_parquet(box_DIR + "NBA_Box_P_" + name + "_" + season + ".parquet")
        except Exception as error:
            # Best effort: log the problem and move on to the next season.
            ic(error)
            continue

ic| year: 1996
| year: 1996
ic| error: FileNotFoundError(2, 'No such file or directory')
 28%|██▊       | 331/1189 [03:19<08:36,  1.66it/s]ic| game_id: '0029600332'
ic| error: AttributeError("'NoneType' object has no attribute 'keys'")
 31%|███       | 369/1189 [03:42<08:13,  1.66it/s]ic| game_id: '0029600370'
ic| error: AttributeError("'NoneType' object has no attribute 'keys'")
 82%|████████▏ | 972/1189 [09:46<02:10,  1.66it/s]ic| game_id: '0029600973'
ic| error: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)"))
100%|██████████| 1189/1189 [12:26<00:00,  1.59it/s]
ic| year: 1997
ic| error: FileNotFoundError(2, 'No such file or directory')
 51%|█████     | 602/1189 [06:02<05:53,  1.66it/s]ic| game_id: '0029700603'
ic| error: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)"))
100%|██████████| 1189/1189 [12:26<00:00,  1.59it/s] 
ic| year: 1998
ic| err