In [32]:
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from collections import defaultdict
from nba_api.stats.endpoints import (PlayByPlayV2,
                                     BoxScoreSummaryV2,
                                     BoxScoreTraditionalV2,
                                     LeagueGameFinder,
                                     GameRotation)

In [33]:
gamefinder = LeagueGameFinder(season_nullable='2023-24', season_type_nullable='Regular Season')
games_df = gamefinder.get_data_frames()[0]
all_game_ids = games_df['GAME_ID'].unique().tolist()

In [34]:
def seconds_to_pctimestring(seconds):
    m, s = divmod(int(seconds), 60)
    return f"{m}:{s:02}"

def get_period_and_clock(seconds):
    period = int(seconds // 720) + 1
    sec_into_period = seconds % 720
    return period, seconds_to_pctimestring(720 - sec_into_period)

In [35]:
def pctimestring_to_seconds(t):
    m, s = map(int, t.split(":"))
    return m * 60 + s

def convert_to_game_time(period, pctimestring):
    return (period - 1) * 720 + (720 - pctimestring_to_seconds(pctimestring))

def find_lineup(row, lineup_df):
    t = row['GAME_CLOCK_SEC']
    is_sub = row['EVENTMSGTYPE'] == 8
    for _, seg in lineup_df.iterrows():
        if (seg['start_time'] <= t < seg['end_time']) if is_sub else (seg['start_time'] < t <= seg['end_time']):
            return pd.Series([seg['home_lineup'], seg['away_lineup']])
    return pd.Series([None, None])


In [36]:
game_ids = all_game_ids[0:50] # next is [50:100]

In [37]:
previous_lineup_vs_lineup_df = pd.DataFrame()
previous_player_stats_df = pd.DataFrame()

In [38]:
previous_lineup_vs_lineup_df = pd.read_csv('updated_lineup_vs_lineup.csv', dtype={'game_id': 'object'})
previous_player_stats_df = pd.read_csv('updated_player_stats.csv', dtype={'game_id': 'object'})

In [39]:
# All missing lineup game_ids

all_game_ids_series = pd.Series(all_game_ids)

missing_game_ids = all_game_ids_series[~all_game_ids_series.isin(previous_lineup_vs_lineup_df['game_id'].unique())].tolist()
print(f"Missing game_ids: {missing_game_ids}")

Missing game_ids: ['0022301188', '0022301191', '0022301197', '0022301199', '0022301186', '0022301195', '0022301187', '0022301198', '0022301194', '0022301192', '0022301190', '0022301189', '0022301200', '0022301193', '0022301196', '0022301175', '0022301184', '0022301182', '0022301174', '0022301178', '0022301180', '0022301173', '0022301176', '0022301171', '0022301185', '0022301183', '0022301177', '0022301179', '0022301181', '0022301172', '0022301168', '0022301166', '0022301167', '0022301169', '0022301170', '0022301160', '0022301158', '0022301159', '0022301162', '0022301163', '0022301161', '0022301164', '0022301165', '0022301146', '0022301152', '0022301150', '0022301148', '0022301147', '0022301155', '0022301145', '0022301157', '0022301153', '0022301149', '0022301151', '0022301156', '0022301154', '0022301144', '0022301135', '0022301139', '0022301137', '0022301132', '0022301133', '0022301134', '0022301143', '0022301131', '0022301138', '0022301142', '0022301141', '0022301136', '0022301140', '

In [40]:
# Store all results
lineup_vs_lineup_data = []
player_stats_data = []
nr = 1
for game_id in tqdm(missing_game_ids):
    try:
        # --- Load game data ---
        print(f"----------------GAME nr.{nr}-----------\n")
        nr += 1
        # Load play-by-play data
        pbp = PlayByPlayV2(game_id=game_id).get_data_frames()[0]

        # Load player rotation data
        rotation = GameRotation(game_id=game_id)
        home_df = rotation.home_team.get_data_frame()
        away_df = rotation.away_team.get_data_frame()

        # Add team labels
        home_df['TEAM_SIDE'] = 'home'
        away_df['TEAM_SIDE'] = 'away'
        rotation_df = pd.concat([home_df, away_df], ignore_index=True)

        # Create a timeline of substitution events
        events = []
        for _, row in rotation_df.iterrows():
            player = f"{row['PLAYER_FIRST']} {row['PLAYER_LAST']}"
            team = row['TEAM_SIDE']
            events.append({'time': row['IN_TIME_REAL'], 'player': player, 'team': team, 'action': 'in'})
            events.append({'time': row['OUT_TIME_REAL'], 'player': player, 'team': team, 'action': 'out'})
        events = sorted(events, key=lambda x: x['time'])

        lineup_segments = []
        current_lineups = {'home': set(), 'away': set()}
        prev_time = 0

        for event in events:
            current_time = event['time']

            if all(len(lineup) == 5 for lineup in current_lineups.values()):
                lineup_segments.append({
                    'start_time': prev_time,
                    'end_time': current_time,
                    'home_lineup': tuple(sorted(current_lineups['home'])),
                    'away_lineup': tuple(sorted(current_lineups['away'])),
                    'duration': current_time - prev_time
                })

            team = event['team']
            player = event['player']
            if event['action'] == 'in':
                current_lineups[team].add(player)
            else:
                current_lineups[team].discard(player)

            prev_time = current_time

        lineup_df = pd.DataFrame(lineup_segments)
        lineup_df[['start_time', 'end_time', 'duration']] = lineup_df[['start_time', 'end_time', 'duration']] / 10
        lineup_df[['period', 'start_pctimestring']] = lineup_df['start_time'].apply(lambda x: pd.Series(get_period_and_clock(x)))
        lineup_df[['end_period', 'end_pctimestring']] = lineup_df['end_time'].apply(lambda x: pd.Series(get_period_and_clock(x)))

        pbp['GAME_CLOCK_SEC'] = pbp.apply(lambda row: convert_to_game_time(row['PERIOD'], row['PCTIMESTRING']), axis=1)
        pbp[['HOME_LINEUP', 'AWAY_LINEUP']] = pbp.apply(lambda row: find_lineup(row, lineup_df), axis=1)

        important_event_types = list(range(1, 14))
        lineup_timeline = []

        for _, row in pbp.iterrows():
            if row['EVENTMSGTYPE'] in important_event_types and isinstance(row['HOME_LINEUP'], tuple):
                lineup_timeline.append({
                    'EVENTNUM': row['EVENTNUM'],
                    'PERIOD': row['PERIOD'],
                    'TIME': row['PCTIMESTRING'],
                    'EVENT_TYPE': row['EVENTMSGTYPE'],
                    'SCORE': row['SCORE'],
                    'HOME_DESCRIPTION': str(row['HOMEDESCRIPTION']) if pd.notna(row['HOMEDESCRIPTION']) else '',
                    'AWAY_DESCRIPTION': str(row['VISITORDESCRIPTION']) if pd.notna(row['VISITORDESCRIPTION']) else '',
                    'HOME_LINEUP': row['HOME_LINEUP'],
                    'AWAY_LINEUP': row['AWAY_LINEUP'],
                    'PLAYER1_NAME': row['PLAYER1_NAME'],
                    'PLAYER2_NAME': row['PLAYER2_NAME'],
                    'PLAYER3_NAME': row['PLAYER3_NAME']
                })

        lineup_event_df = pd.DataFrame(lineup_timeline)
        lineup_event_df.head()

        lineup_segments = []
        segment = None
        prev_home = None
        prev_away = None
        last_shot_team = None

        SKIP_KEYWORDS = ['SUB', 'Jump Ball', 'Delay', 'Offensive', 'Timeout', 'Rebound']

        for _, row in lineup_event_df.iterrows():
            home_lineup = tuple(sorted(row['HOME_LINEUP']))
            away_lineup = tuple(sorted(row['AWAY_LINEUP']))

            # Start new segment if lineups changed
            if segment is None or home_lineup != prev_home or away_lineup != prev_away:
                if segment:
                    segment['end_event'] = row['EVENTNUM']
                    segment['end_time'] = row['TIME']
                    lineup_segments.append(segment)

                segment = {
                    'home_lineup': home_lineup,
                    'away_lineup': away_lineup,
                    'start_event': row['EVENTNUM'],
                    'start_time': row['TIME'],
                    'period': row['PERIOD'],
                    'team_stats': {'home': defaultdict(int), 'away': defaultdict(int)},
                    'player_stats': defaultdict(lambda: defaultdict(int)),
                    'events': []
                }

                prev_home = home_lineup
                prev_away = away_lineup

            segment['events'].append({
                'EVENTNUM': row['EVENTNUM'],
                'TIME': row['TIME'],
                'HOME_DESCRIPTION': row['HOME_DESCRIPTION'],
                'AWAY_DESCRIPTION': row['AWAY_DESCRIPTION']
            })

            # Process event descriptions
            for team_side, desc in [('home', row['HOME_DESCRIPTION']), ('away', row['AWAY_DESCRIPTION'])]:
                if not desc or any(skip in desc for skip in SKIP_KEYWORDS):
                    continue

                lineup = home_lineup if team_side == 'home' else away_lineup
                players_on_court = set(lineup)

                def inc(stat, players, value=1):
                    segment['team_stats'][team_side][stat] += value
                    for p in players:
                        if p in players_on_court:
                            segment['player_stats'][p][stat] += value

                # Detect player involvement
                involved_players = [row['PLAYER1_NAME'], row['PLAYER2_NAME'], row['PLAYER3_NAME']]
                involved_players = [p for p in involved_players if isinstance(p, str)]

                # Scoring
                if '3PT' in desc and 'MISS' not in desc:
                    inc('3pt_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], 3)
                elif 'Free Throw' in desc and 'MISS' not in desc:
                    inc('ft_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], 1)
                elif 'MISS' not in desc and any(kw in desc for kw in ['Fadeaway', 'Dunk', 'Layup', 'Jump Shot', 'Hook Shot']):
                    inc('2pt_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], 2)
                elif 'MISS' in desc:
                    last_shot_team = team_side

                # Other actions
                if 'AST' in desc:
                    inc('assists', [row['PLAYER2_NAME']])
                if 'Turnover' in desc:
                    inc('turnovers', [row['PLAYER1_NAME']])
                if 'STL' in desc:
                    inc('steals', [row['PLAYER2_NAME']])
                if 'BLK' in desc:
                    inc('blocks', [row['PLAYER2_NAME']])
                if '.FOUL' in desc:
                    inc('fouls', [row['PLAYER1_NAME']])

                # Rebounds
                if 'REBOUND' in desc:
                    rebound_team = team_side
                    rebounder = row['PLAYER1_NAME']
                    if last_shot_team:
                        if rebound_team == last_shot_team:
                            inc('off_rebounds', [rebounder])
                        else:
                            inc('def_rebounds', [rebounder])
                    last_shot_team = None

        # Final flush
        if segment:
            segment['end_event'] = row['EVENTNUM']
            segment['end_time'] = row['TIME']
            lineup_segments.append(segment)

        

        for seg in lineup_segments:
            row = {
                'period': seg['period'],
                'start_event': seg['start_event'],
                'end_event': seg['end_event'],
                'start_time': seg['start_time'],
                'end_time': seg['end_time'],
                'home_lineup': seg['home_lineup'],
                'away_lineup': seg['away_lineup']
            }

            for team in ['home', 'away']:
                for stat, val in seg['team_stats'][team].items():
                    row[f'{team}_{stat}'] = val

            lineup_vs_lineup_data.append(row)

        # lineup_vs_lineup_df = pd.DataFrame(lineup_vs_lineup_data)

        

        for seg in lineup_segments:
            for player, stats in seg['player_stats'].items():
                row = {
                    'player': player,
                    'period': seg['period'],
                    'start_event': seg['start_event'],
                    'end_event': seg['end_event'],
                    'start_time': seg['start_time'],
                    'end_time': seg['end_time'],
                    'home_lineup': seg['home_lineup'],
                    'away_lineup': seg['away_lineup']
                }

                for stat, val in stats.items():
                    row[stat] = val

                player_stats_data.append(row)

        player_stats_df = pd.DataFrame(player_stats_data)

        print(f"Finished game: {game_id}")

        
                # Convert to DataFrame
        lineup_vs_lineup_df = pd.DataFrame(lineup_vs_lineup_data)
        player_stats_df = pd.DataFrame(player_stats_data)

        # Concatenate with previous data
        lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
        player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)

        # Save to CSV
        lineup_vs_lineup_df.to_csv('updated_lineup_vs_lineup.csv', index=False)
        player_stats_df.to_csv('updated_player_stats.csv', index=False)
        print("Data saved to CSV files.")

        # Respect rate limits
        time.sleep(1)

    except Exception as e:
        print(f"Failed on game {game_id}: {e}")
        continue


  0%|          | 0/1757 [00:00<?, ?it/s]

----------------GAME nr.1-----------

Finished game: 0022301188
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 1/1757 [00:01<48:39,  1.66s/it]

----------------GAME nr.2-----------

Finished game: 0022301191
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 2/1757 [00:03<48:38,  1.66s/it]

----------------GAME nr.3-----------

Finished game: 0022301197
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 3/1757 [00:04<46:22,  1.59s/it]

----------------GAME nr.4-----------

Finished game: 0022301199
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 4/1757 [00:06<45:24,  1.55s/it]

----------------GAME nr.5-----------

Finished game: 0022301186
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 5/1757 [00:07<44:59,  1.54s/it]

----------------GAME nr.6-----------

Finished game: 0022301195
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 6/1757 [00:09<45:27,  1.56s/it]

----------------GAME nr.7-----------

Finished game: 0022301187
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 7/1757 [00:10<44:03,  1.51s/it]

----------------GAME nr.8-----------

Finished game: 0022301198
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  0%|          | 8/1757 [00:12<43:58,  1.51s/it]

----------------GAME nr.9-----------

Finished game: 0022301194
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 9/1757 [00:13<44:02,  1.51s/it]

----------------GAME nr.10-----------

Finished game: 0022301192
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 10/1757 [00:15<44:10,  1.52s/it]

----------------GAME nr.11-----------

Finished game: 0022301190
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 11/1757 [00:17<45:04,  1.55s/it]

----------------GAME nr.12-----------

Finished game: 0022301189
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 12/1757 [00:18<45:15,  1.56s/it]

----------------GAME nr.13-----------

Finished game: 0022301200
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 13/1757 [00:20<45:45,  1.57s/it]

----------------GAME nr.14-----------

Finished game: 0022301193
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 14/1757 [00:21<45:37,  1.57s/it]

----------------GAME nr.15-----------

Finished game: 0022301196
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 15/1757 [00:23<48:34,  1.67s/it]

----------------GAME nr.16-----------

Finished game: 0022301175
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 16/1757 [00:25<50:08,  1.73s/it]

----------------GAME nr.17-----------

Finished game: 0022301184
Data saved to CSV files.


  lineup_vs_lineup_df = pd.concat([previous_lineup_vs_lineup_df, lineup_vs_lineup_df], ignore_index=True)
  player_stats_df = pd.concat([previous_player_stats_df, player_stats_df], ignore_index=True)
  1%|          | 17/1757 [00:28<57:54,  2.00s/it]

----------------GAME nr.18-----------



  1%|          | 18/1757 [00:58<5:02:20, 10.43s/it]

Failed on game 0022301182: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
----------------GAME nr.19-----------



  1%|          | 19/1757 [01:28<7:53:18, 16.34s/it]

Failed on game 0022301174: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
----------------GAME nr.20-----------



  1%|          | 19/1757 [01:58<3:00:49,  6.24s/it]


KeyboardInterrupt: 