In [None]:
from nba_api.stats.endpoints import leaguegamelog, gamerotation, playbyplayv3
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

In [None]:
# from nba_api.stats.static import teams

# nba_teams = teams.get_teams()
# # Select the dictionary for the Celtics, which contains their team ID
# celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]
# celtics_id = celtics['id']

# Get Games in a Season

In [None]:
# from nba_api.stats.endpoints import leaguegamefinder

# # Query for games where the Celtics were playing
# gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=celtics_id)
# # The first DataFrame of those returned is what we want.
# games = gamefinder.get_data_frames()[0]
# games.head()

In [None]:
# Team-level game log for the 2023-24 regular season; only the first
# DataFrame returned by the endpoint is kept.
season_log = leaguegamelog.LeagueGameLog(
    player_or_team_abbreviation='T',
    season='2023-24',
    season_type_all_star='Regular Season',
)
games_df = season_log.get_data_frames()[0]
games_df.head()

# Get Rotation for a Given Game

In [None]:
# Fetch the rotation once and reuse the response for both teams (the same
# request used to be issued twice). Frame 1 is labelled home ('H') and
# frame 0 away ('A'), exactly as before.
rotation_frames = gamerotation.GameRotation(game_id = games_df.GAME_ID[0]).get_data_frames()
ht_rotation = rotation_frames[1]
ht_rotation['home_away_flg'] = 'H'
at_rotation = rotation_frames[0]
at_rotation['home_away_flg'] = 'A'
rotation_df = pd.concat([ht_rotation, at_rotation])
rotation_df.head()

In [None]:
# Order the rotation rows chronologically and rescale both time columns
# (divide by 10, round up).
# NOTE: this cell rebinds rotation_df from itself, so running it a second time
# without re-running the fetch cell divides the times by 10 again.
rotation_df = (
    rotation_df
    .sort_values(by=['IN_TIME_REAL', 'OUT_TIME_REAL'], ascending=[True, True])
    .reset_index(drop=True)
)
for time_col in ('IN_TIME_REAL', 'OUT_TIME_REAL'):
    rotation_df[time_col] = np.ceil(rotation_df[time_col] / 10)
rotation_df.head()

# Get PBP Stats for a Given Game

In [None]:
# Fetch play-by-play (V3 endpoint) for the first game in games_df and keep the
# first DataFrame the endpoint returns.
pbp = playbyplayv3.PlayByPlayV3(game_id = games_df.GAME_ID[0]).get_data_frames()[0]
pbp.head()

## Calculate Game Breakpoints

In [None]:
# Every distinct check-in or check-out time is a breakpoint; consecutive
# breakpoints delimit one stint.
in_times = set(rotation_df['IN_TIME_REAL'])
out_times = set(rotation_df['OUT_TIME_REAL'])
breakpoints = sorted(in_times | out_times)
breakpoints

In [None]:
# Number of breakpoints; the stint loops below iterate len(breakpoints) - 1 times.
len(breakpoints)

## Calculate Stints and Point Differential

In [None]:
# Get Actual Point Differential
#
# Convert (period, clock) into elapsed game seconds. Periods 1-4 are 12 minutes
# long; any later period is treated as a 5-minute overtime so that overtime
# events line up with the rotation breakpoints instead of being offset by the
# extra 7 minutes a 12-minute period would imply.
REGULATION_PERIODS = 4
REGULATION_PERIOD_SECS = 12 * 60
OVERTIME_PERIOD_SECS = 5 * 60

period_start = (
    np.minimum(pbp['period'] - 1, REGULATION_PERIODS) * REGULATION_PERIOD_SECS
    + np.maximum(pbp['period'] - 1 - REGULATION_PERIODS, 0) * OVERTIME_PERIOD_SECS
)
period_length = np.where(pbp['period'] <= REGULATION_PERIODS, REGULATION_PERIOD_SECS, OVERTIME_PERIOD_SECS)
# Slices [2:4] and [5:7] pick minutes and whole seconds out of the clock string
# (assumes a 'PT12M00.00S'-shaped value -- confirm against the endpoint output).
clock_minutes = pbp['clock'].str[2:4].astype(float)
clock_seconds = pbp['clock'].str[5:7].astype(float)
pbp['time_completed'] = period_start + (period_length - clock_minutes * 60 - clock_seconds)

pbp_formatted = pbp[['clock', 'time_completed', 'scoreHome', 'scoreAway', 'description', 'actionType']].copy()

# Blank scores mean "unchanged": carry the last known score forward. Assigning
# the result back (instead of chained inplace= calls) keeps this working under
# pandas copy-on-write; leading blanks become 0 so the int cast cannot fail.
for score_col in ('scoreHome', 'scoreAway'):
    pbp_formatted[score_col] = (
        pd.to_numeric(pbp_formatted[score_col], errors='coerce')
        .ffill()
        .fillna(0)
        .astype(int)
    )

# Each event "starts" when the previous event completed (0 for the first row).
pbp_formatted['STINT_START_TIME'] = pbp_formatted['time_completed'].shift(1).fillna(0)
pbp_formatted = pbp_formatted.rename(columns = {'time_completed': 'STINT_END_TIME'})
pbp_formatted = pbp_formatted.loc[pbp_formatted['STINT_END_TIME'].isin(breakpoints)]

stint_rows = []
for t, t_plus_1 in zip(breakpoints[:-1], breakpoints[1:]):
    temp = pbp_formatted.loc[(pbp_formatted['STINT_START_TIME'] >= t) & (pbp_formatted['STINT_END_TIME'] <= t_plus_1)]

    if t == 0:
        # Opening stint: both teams start from 0, so the max score is the gain.
        diff = np.max(temp['scoreHome']) - np.max(temp['scoreAway'])
    else:
        # Score at the first substitution/period event that closes the stint,
        # minus the lowest score seen in the window.
        closing = temp.loc[(temp['STINT_END_TIME'] == t_plus_1) & (temp['actionType'].isin(['Substitution', 'period']))]
        end_home_score = closing['scoreHome'].values[0]
        end_away_score = closing['scoreAway'].values[0]
        diff = (end_home_score - np.min(temp['scoreHome'])) - (end_away_score - np.min(temp['scoreAway']))

    stint_rows.append({'STINT_START_TIME': t, 'STINT_END_TIME': t_plus_1, 'PT_DIFF': diff})

# Build the frame once instead of concatenating inside the loop.
pt_diff_df = pd.DataFrame(stint_rows, columns = ['STINT_START_TIME', 'STINT_END_TIME', 'PT_DIFF'])

pt_diff_df

# Get In-Time and Out-Time for each Player

In [None]:
# Unique player IDs present in the rotation data. The list is built from a set,
# so its order is whatever the set iteration yields.
players = list(set(rotation_df.PERSON_ID))
# players = ['PERSON_ID_'+str(i) for i in players]
players

In [None]:
# One-hot encode PERSON_ID, then total the indicator columns for every distinct
# (IN_TIME_REAL, OUT_TIME_REAL) pair.
player_id_cols = ['PERSON_ID_' + pid for pid in map(str, players)]
one_hot = pd.get_dummies(rotation_df, columns=['PERSON_ID'], dtype=int)
by_interval = one_hot.groupby(['IN_TIME_REAL', 'OUT_TIME_REAL'])
player_flags_df = by_interval[player_id_cols].sum().reset_index()
player_flags_df

# Final Stint-wise Point Differential DataFrame

In [None]:
# Build one row per stint: +1 for home players on court, -1 for away players on
# court, 0 for everyone else, plus the stint's point differential.
stint_columns = ['STINT_START_TIME', 'STINT_END_TIME'] + players + ['PT_DIFF']
stint_records = []

for i in tqdm(range(len(breakpoints) - 1)):
    t = breakpoints[i]
    t_plus_1 = breakpoints[i+1]

    # Players whose rotation interval covers the start of this stint.
    on_court = rotation_df.loc[(rotation_df['IN_TIME_REAL'] <= t) & (rotation_df['OUT_TIME_REAL'] > t)]
    # Map player -> 'H'/'A'. The previous version compared a nested list
    # (['H']) against 'H', which is never equal, so home players were all
    # coded -1.
    side_by_player = dict(zip(on_court['PERSON_ID'], on_court['home_away_flg']))

    pt_diff = pt_diff_df.loc[(pt_diff_df['STINT_START_TIME'] == t) & (pt_diff_df['STINT_END_TIME'] == t_plus_1)]['PT_DIFF'].values[0]

    record = {'STINT_START_TIME': t, 'STINT_END_TIME': t_plus_1, 'PT_DIFF': pt_diff}
    for player in players:
        side = side_by_player.get(player)
        if side is None:
            record[player] = 0
        elif side == 'H':
            record[player] = 1
        else:
            record[player] = -1
    stint_records.append(record)

# Single construction instead of a concat per stint.
final_stints_df = pd.DataFrame(stint_records, columns = stint_columns)
final_stints_df

In [None]:
# Total of the per-stint point differentials across the whole game.
final_stints_df['PT_DIFF'].sum()

# Scraping Definition

## GameLog to GameID

In [None]:
def gamelog_gameid(szn:str, season_type:str = 'Regular Season'):
    """Fetch the team-level game log for a season and its set of game IDs.

    Args:
        szn: Season string passed to the endpoint as ``season`` (e.g. '2024-25').
        season_type: Value passed as ``season_type_all_star``; defaults to
            'Regular Season', which was previously hard-coded.

    Returns:
        Tuple ``(game_log, game_ids)`` where ``game_log`` is the first
        DataFrame returned by LeagueGameLog and ``game_ids`` is the set of
        values in its GAME_ID column.
    """
    game_log = leaguegamelog.LeagueGameLog(
        player_or_team_abbreviation = 'T',
        season = szn,
        season_type_all_star = season_type,
    ).get_data_frames()[0]
    return game_log, set(game_log['GAME_ID'])

In [None]:
# Quick check of gamelog_gameid: displays the returned (game log, game-ID set) tuple.
gamelog_gameid('2024-25')

## Rotation Based on GameID

In [None]:
def get_rotation(game_id:str):
    """Fetch both teams' rotations for a game and derive the stint breakpoints.

    Args:
        game_id: Game identifier passed to the GameRotation endpoint.

    Returns:
        Tuple ``(rotation_df, breakpoints)``:
        - ``rotation_df``: home and away rotation rows combined, tagged with
          ``home_away_flg`` ('H' / 'A'), sorted by IN_TIME_REAL then
          OUT_TIME_REAL, with both time columns divided by 10 and rounded up.
        - ``breakpoints``: sorted list of every distinct rescaled in/out time.
    """
    # One request serves both teams; the endpoint used to be called twice per
    # game for identical data.
    rotation_frames = gamerotation.GameRotation(game_id = game_id).get_data_frames()
    ht_rotation = rotation_frames[1]
    ht_rotation['home_away_flg'] = 'H'
    at_rotation = rotation_frames[0]
    at_rotation['home_away_flg'] = 'A'

    rotation_df = (
        pd.concat([ht_rotation, at_rotation])
        .sort_values(by = ['IN_TIME_REAL', 'OUT_TIME_REAL'], ascending = [True, True])
        .reset_index(drop = True)
    )
    # Presumably converts tenths of a second to whole seconds -- confirm units
    # against the endpoint documentation.
    for time_col in ('IN_TIME_REAL', 'OUT_TIME_REAL'):
        rotation_df[time_col] = np.ceil(rotation_df[time_col]/10)

    #Calc. Breakpoints
    breakpoints = sorted(set(rotation_df['IN_TIME_REAL']) | set(rotation_df['OUT_TIME_REAL']))

    return rotation_df, breakpoints

In [None]:
# Try get_rotation on one hard-coded game ID. This rebinds the notebook-level
# rotation_df and breakpoints that earlier cells created for a different game.
rotation_df, breakpoints = get_rotation('0022400061')

In [None]:
# Display the breakpoints returned by get_rotation for the sample game.
breakpoints

## Stints and Point Differential from PBP Data

In [None]:
def get_stints_pt_diff(game_id:str, breakpoints:list):
    """Compute the home-minus-away point differential for each stint of a game.

    Fetches the game's play-by-play (V3), converts each event's period and
    clock into elapsed game seconds, and for every consecutive pair of
    ``breakpoints`` measures how the score margin moved inside that window.

    Args:
        game_id: Game identifier passed to the PlayByPlayV3 endpoint.
        breakpoints: Sorted elapsed-time values delimiting stints (the second
            value returned by ``get_rotation``).

    Returns:
        DataFrame with columns STINT_START_TIME, STINT_END_TIME and PT_DIFF,
        one row per consecutive breakpoint pair.

    Raises:
        IndexError: if a stint that does not start at 0 has no 'Substitution'
            or 'period' event at its end time.
    """
    regulation_periods = 4
    regulation_period_secs = 12 * 60
    overtime_period_secs = 5 * 60

    pbp = playbyplayv3.PlayByPlayV3(game_id = game_id).get_data_frames()[0]

    #Get Actual Point Differential
    # Periods 1-4 last 12 minutes; any later period is treated as a 5-minute
    # overtime. Treating overtime as 12 minutes shifted its events by 7 minutes
    # relative to the rotation breakpoints.
    period = pbp['period']
    period_start = (
        np.minimum(period - 1, regulation_periods) * regulation_period_secs
        + np.maximum(period - 1 - regulation_periods, 0) * overtime_period_secs
    )
    period_length = np.where(period <= regulation_periods, regulation_period_secs, overtime_period_secs)
    # Slices [2:4] and [5:7] pick minutes and whole seconds out of the clock
    # string (assumes a 'PT12M00.00S'-shaped value -- confirm against the
    # endpoint output).
    clock_minutes = pbp['clock'].str[2:4].astype(float)
    clock_seconds = pbp['clock'].str[5:7].astype(float)
    pbp['time_completed'] = period_start + (period_length - clock_minutes * 60 - clock_seconds)

    pbp_formatted = pbp[['clock', 'time_completed', 'scoreHome', 'scoreAway', 'description', 'actionType']].copy()

    # Blank scores mean "unchanged": carry the last known score forward.
    # Assigning the result back (rather than chained inplace= calls, which no
    # longer update the frame under pandas copy-on-write) keeps this working
    # across pandas versions; leading blanks become 0 so the int cast is safe.
    for score_col in ('scoreHome', 'scoreAway'):
        pbp_formatted[score_col] = (
            pd.to_numeric(pbp_formatted[score_col], errors = 'coerce')
            .ffill()
            .fillna(0)
            .astype(int)
        )

    # Each event "starts" when the previous event completed (0 for the first row).
    pbp_formatted['STINT_START_TIME'] = pbp_formatted['time_completed'].shift(1).fillna(0)
    pbp_formatted = pbp_formatted.rename(columns = {'time_completed': 'STINT_END_TIME'})
    pbp_formatted = pbp_formatted.loc[pbp_formatted['STINT_END_TIME'].isin(breakpoints)]

    stint_rows = []
    for t, t_plus_1 in zip(breakpoints[:-1], breakpoints[1:]):
        temp = pbp_formatted.loc[(pbp_formatted['STINT_START_TIME'] >= t) & (pbp_formatted['STINT_END_TIME'] <= t_plus_1)]

        if t == 0:
            # Opening stint: both teams start from 0, so the max score is the gain.
            diff = np.max(temp['scoreHome']) - np.max(temp['scoreAway'])
        else:
            # Score at the first substitution/period event that closes the
            # stint, minus the lowest score seen in the window.
            closing = temp.loc[(temp['STINT_END_TIME'] == t_plus_1) & (temp['actionType'].isin(['Substitution', 'period']))]
            end_home_score = closing['scoreHome'].values[0]
            end_away_score = closing['scoreAway'].values[0]
            diff = (end_home_score - np.min(temp['scoreHome'])) - (end_away_score - np.min(temp['scoreAway']))

        stint_rows.append({'STINT_START_TIME': t, 'STINT_END_TIME': t_plus_1, 'PT_DIFF': diff})

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(stint_rows, columns = ['STINT_START_TIME', 'STINT_END_TIME', 'PT_DIFF'])

In [None]:
# Try get_stints_pt_diff on the same hard-coded sample game, using the
# breakpoints returned by get_rotation above.
pt_diff_df = get_stints_pt_diff('0022400061', breakpoints)
pt_diff_df

## PlayerID In-Time Out-Time

In [None]:
def get_player_id_in_out_time(rotation_df:pd.DataFrame):
    """List the players in a rotation frame and flag them per in/out interval.

    Returns a tuple ``(players, player_flags_df)``: the unique PERSON_ID values
    as a list, and a frame with one row per distinct
    (IN_TIME_REAL, OUT_TIME_REAL) pair holding the summed one-hot
    ``PERSON_ID_<id>`` indicator columns.
    """
    unique_ids = set(rotation_df['PERSON_ID'])
    players = list(unique_ids)

    indicator_cols = ['PERSON_ID_' + pid for pid in map(str, players)]
    one_hot = pd.get_dummies(rotation_df, columns=['PERSON_ID'], dtype=int)
    by_interval = one_hot.groupby(['IN_TIME_REAL', 'OUT_TIME_REAL'])
    player_flags_df = by_interval[indicator_cols].sum().reset_index()
    return players, player_flags_df

In [None]:
# Only the player list is needed here; the per-interval flag frame is discarded.
players, _ = get_player_id_in_out_time(rotation_df)
players

## Final Stint-wise Point Differential Function

In [None]:
def final_stint_pt_diff(players:list, breakpoints:list, rotation_df:pd.DataFrame, pt_diff_df:pd.DataFrame, game_id:str, season:str):
    """Assemble the per-stint player-indicator table for one game.

    For each consecutive pair of breakpoints, every player in ``players`` is
    coded +1 if on court for the home side, -1 if on court for the away side,
    and 0 otherwise, alongside the stint's point differential.

    Args:
        players: Player IDs; each becomes one output column.
        breakpoints: Sorted stint boundary times.
        rotation_df: Rotation rows with PERSON_ID, IN_TIME_REAL, OUT_TIME_REAL
            and home_away_flg ('H' / 'A') columns.
        pt_diff_df: Frame with STINT_START_TIME, STINT_END_TIME and PT_DIFF.
        game_id: Written to the GAME_ID column of every row.
        season: Written to the SEASON column of every row.

    Returns:
        DataFrame with columns SEASON, GAME_ID, STINT_START_TIME,
        STINT_END_TIME, one column per player, and PT_DIFF.

    Raises:
        IndexError: if ``pt_diff_df`` has no row for a breakpoint pair.
    """
    columns = ['SEASON', 'GAME_ID', 'STINT_START_TIME', 'STINT_END_TIME'] + players + ['PT_DIFF']
    records = []

    for i in tqdm(range(len(breakpoints) - 1)):
        t = breakpoints[i]
        t_plus_1 = breakpoints[i+1]

        # Players whose rotation interval covers the start of this stint.
        on_court = rotation_df.loc[(rotation_df['IN_TIME_REAL'] <= t) & (rotation_df['OUT_TIME_REAL'] > t)]
        # Map player -> 'H'/'A'. The earlier implementation compared a nested
        # list (['H']) to 'H', which is never equal, so every on-court player
        # -- home included -- was coded -1.
        side_by_player = dict(zip(on_court['PERSON_ID'], on_court['home_away_flg']))

        pt_diff = pt_diff_df.loc[(pt_diff_df['STINT_START_TIME'] == t) & (pt_diff_df['STINT_END_TIME'] == t_plus_1)]['PT_DIFF'].values[0]

        record = {
            'SEASON': season,
            'GAME_ID': game_id,
            'STINT_START_TIME': t,
            'STINT_END_TIME': t_plus_1,
            'PT_DIFF': pt_diff,
        }
        for player in players:
            side = side_by_player.get(player)
            if side is None:
                record[player] = 0
            elif side == 'H':
                record[player] = 1
            else:
                record[player] = -1
        records.append(record)

    # Build the frame once; passing columns keeps the layout (and an empty but
    # well-formed frame when there are fewer than two breakpoints).
    return pd.DataFrame(records, columns = columns)

In [None]:
# Try final_stint_pt_diff on the sample game using the objects built in the cells above.
final_stint_pt_diff(players, breakpoints, rotation_df, pt_diff_df, '0022400061', '2024-25')

## Process Game

In [None]:
def process_game(game_id: str, szn: str) -> pd.DataFrame:
    """Compute one game's stint-wise point differential dataframe.

    Chains the per-game helpers: rotation and breakpoints, per-stint point
    differential, player list, then the final player-indicator table.

    Args:
        game_id: Game identifier forwarded to the rotation and play-by-play
            helpers.
        szn: Season label (a string such as '2023-24') written to the SEASON
            column of the result.

    Returns:
        The DataFrame produced by ``final_stint_pt_diff`` for this game.
    """
    # If you still need to throttle a bit to be polite to an API, uncomment:
    # time.sleep(0.10)

    rotation_df, breakpoints = get_rotation(game_id)                             # Get Rotation and Breakpoints
    pt_diff_df = get_stints_pt_diff(game_id, breakpoints)                        # Get Point Differential for each Stint
    players, _ = get_player_id_in_out_time(rotation_df)                          # Get List of Players
    game_res_df = final_stint_pt_diff(players, breakpoints, rotation_df,
                                      pt_diff_df, game_id, szn)                  # Final per-game DF
    return game_res_df

## Pulling All Functions together to Scrape for Multiple Games

In [None]:
# Empty accumulator with the non-player columns. Only the commented-out serial
# loop appends to it; the parallel cell rebinds final_df at its end.
final_df = pd.DataFrame(columns = ['SEASON', 'GAME_ID', 'STINT_START_TIME', 'STINT_END_TIME', 'PT_DIFF'])

In [None]:
# Seasons to scrape; each string is passed to gamelog_gameid as the season argument.
SEASONS = ['2022-23', '2023-24', '2024-25']

### Serial Processing

In [None]:
# for szn in SEASONS:
#     games, game_ids = gamelog_gameid(szn)

#     print(f"Now Running Season: {szn}")

#     for game_id in tqdm(game_ids):
#         rotation_df, breakpoints = get_rotation(game_id) #Get Rotation and Breakpoints
#         pt_diff_df = get_stints_pt_diff(game_id, breakpoints) #Get Point Differential for each Stint
#         players, _ = get_player_id_in_out_time(rotation_df) #Get List of Players

#         game_res_df = final_stint_pt_diff(players, breakpoints, rotation_df, pt_diff_df, game_id, szn) #Final Stint-Wise Point Differential for the Game with Player Flags

#         final_df = pd.concat([final_df, game_res_df], axis = 0, ignore_index=True)


#         time.sleep(1) #Sleep 1 second in between games

#     time.sleep(5) #Sleep 5 Seconds in-between seasons

### Parallel Processing

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import warnings

# NOTE: this silences every warning for the rest of the kernel session, not
# just the ones raised while scraping.
warnings.filterwarnings("ignore")

# Tune this based on your API limits / machine
MAX_WORKERS = 8

final_df_list = []  # collect per-season concatenations here
failed_games = {}   # season -> [(game_id, exception), ...], kept so failures can be inspected or retried

for szn in SEASONS:
    print(f"\nNow Running Season: {szn}")
    _, game_ids = gamelog_gameid(szn)

    season_frames = []
    failures = []

    # Parallelize over games in this season
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(process_game, gid, szn): gid for gid in game_ids}

        # Progress bar advances as futures complete (order-agnostic)
        for fut in tqdm(as_completed(futures), total=len(futures), desc=f"Season {szn}"):
            gid = futures[fut]
            try:
                season_frames.append(fut.result())
            except Exception as e:
                # Best-effort scrape: record the failure and keep going.
                failures.append((gid, e))

    # Concatenate once per season for speed
    if season_frames:
        season_df = pd.concat(season_frames, axis=0, ignore_index=True)
        final_df_list.append(season_df)

    if failures:
        # Keep the full list (previously only the first five were printed and
        # the rest were lost) so the games can be retried later.
        failed_games[szn] = failures
        print(f"[WARN] {len(failures)} games failed in {szn}. Examples:")
        for gid, err in failures[:5]:
            print(f"  - {gid}: {err}")

    # If you still want a short pause between seasons, keep this:
    time.sleep(5)

# Single concat at the very end. pd.concat raises ValueError on an empty list,
# so fall back to an empty frame when no season produced any data.
# NOTE(review): players absent from a game end up as NaN (not 0) in that game's
# rows after concatenation -- confirm whether downstream code expects 0.
if final_df_list:
    final_df = pd.concat(final_df_list, axis=0, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=['SEASON', 'GAME_ID', 'STINT_START_TIME', 'STINT_END_TIME', 'PT_DIFF'])
