In [None]:
!pip install --no-cache-dir --force-reinstall numpy==1.26.4 pandas==2.2.2

In [1]:
!pip install nba_api --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/284.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.9/284.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from collections import defaultdict
from nba_api.stats.endpoints import (PlayByPlayV2,
                                     BoxScoreSummaryV2,
                                     BoxScoreTraditionalV2,
                                     LeagueGameFinder)

In [3]:
# Fetch the 2023-24 regular-season game log and collect the distinct game IDs.
# league_id_nullable='00' restricts the query to the NBA so that games from other
# leagues served by the same endpoint cannot end up in a batch.
gamefinder = LeagueGameFinder(
    season_nullable='2023-24',
    season_type_nullable='Regular Season',
    league_id_nullable='00',
)
games_df = gamefinder.get_data_frames()[0]
# The first returned frame is used; unique() de-duplicates GAME_ID in
# first-seen order.
all_game_ids = games_df['GAME_ID'].unique().tolist()

In [4]:
# Helper to convert clock time string to seconds
def time_str_to_sec(t):
    """Convert an "MM:SS" game-clock string into a number of seconds.

    Surrounding whitespace is ignored. Any non-string input (e.g. None or NaN)
    yields 0. A string that is not two integers separated by ":" raises
    ValueError.
    """
    if isinstance(t, str):
        m, s = map(int, t.strip().split(":"))
        return m * 60 + s
    return 0

# Helper to compute duration between lineup segments
def compute_duration(row, period_sec=720, overtime_sec=300, regulation_periods=4):
    """Return the length in seconds of one lineup segment.

    Parameters
    ----------
    row : mapping
        Must provide 'start_time' and 'end_time' ("MM:SS" countdown clock
        strings) and 'period' (the period in which the segment started).
    period_sec : int
        Length of a regulation period in seconds (default 720 = 12 minutes).
    overtime_sec : int
        Length of an overtime period in seconds (default 300 = 5 minutes).
    regulation_periods : int
        Number of regulation periods; later periods are treated as overtime.

    Notes
    -----
    The clock counts down, so an end clock greater than the start clock is taken
    to mean the segment ran into the following period. The elapsed time is then
    what was left in the starting period plus what has elapsed in the next one,
    which depends on the *next* period's length (overtime is shorter than a
    regulation period). A segment that crosses a period boundary but ends at a
    clock value not greater than its start cannot be detected from `row` alone
    and is treated as staying within one period.
    """
    start_sec = time_str_to_sec(row['start_time'])
    end_sec = time_str_to_sec(row['end_time'])

    # Same period: plain difference on the countdown clock.
    if end_sec <= start_sec:
        return start_sec - end_sec

    # Crossed into the next period: use that period's length, not a fixed 720.
    next_period = row['period'] + 1
    next_period_len = period_sec if next_period <= regulation_periods else overtime_sec
    return start_sec + next_period_len - end_sec

In [5]:
# Games processed in this run: the first 50 IDs of the season list.
game_ids = all_game_ids[:50]  # next is [50:100]

In [6]:
# Store all results
all_lineup_vs_lineup = []
all_player_stats = []

# Constants shared by every game, defined once rather than on each iteration.
# Only events whose EVENTMSGTYPE is in important_event_types enter the timeline;
# descriptions containing any SKIP_KEYWORDS entry are ignored when tallying stats.
important_event_types = list(range(1, 14))
SKIP_KEYWORDS = ['SUB', 'Jump Ball', 'Delay', 'Offensive', 'Timeout', 'Rebound']

nr = 1
for game_id in tqdm(game_ids):
    try:
        # --- Load game data ---
        print(f"-----------------GAME nr.{nr}-----------\n")
        nr += 1

        pbp = PlayByPlayV2(game_id=game_id).get_data_frames()[0]
        box = BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0]
        summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()[0]

        # Positional access: take the first summary row regardless of index labels.
        home_team_id = int(summary['HOME_TEAM_ID'].iloc[0])
        away_team_id = int(summary['VISITOR_TEAM_ID'].iloc[0])

        # Starters are the box-score rows with a non-empty START_POSITION.
        starters_df = box[box['START_POSITION'] != '']
        starters_by_team = starters_df.groupby('TEAM_ID')['PLAYER_NAME'].apply(list).to_dict()

        # NOTE(review): lineups are only updated from substitution events (type 8)
        # below; confirm whether lineup changes made between periods show up as
        # such events, otherwise a lineup can drift away from five players.
        current_lineups = {
            home_team_id: list(starters_by_team.get(home_team_id, [])),
            away_team_id: list(starters_by_team.get(away_team_id, [])),
        }

        # --- Build lineup_df ---
        # One record per kept event, carrying a snapshot of both lineups as they
        # stand *after* the event has been applied.
        lineup_timeline = []

        for _, row in pbp.iterrows():
            event_type = row['EVENTMSGTYPE']
            home_desc = str(row['HOMEDESCRIPTION']) if pd.notna(row['HOMEDESCRIPTION']) else ''
            away_desc = str(row['VISITORDESCRIPTION']) if pd.notna(row['VISITORDESCRIPTION']) else ''

            if event_type == 8:
                # Substitution: PLAYER1 leaves the court, PLAYER2 enters.
                out_player = row['PLAYER1_NAME']
                in_player = row['PLAYER2_NAME']
                team_id = row['PLAYER1_TEAM_ID'] if pd.notna(row['PLAYER1_TEAM_ID']) else row['PLAYER2_TEAM_ID']
                team_id = int(team_id) if pd.notna(team_id) else None

                if team_id in current_lineups:
                    if out_player in current_lineups[team_id]:
                        current_lineups[team_id].remove(out_player)
                    if in_player not in current_lineups[team_id]:
                        current_lineups[team_id].append(in_player)

            if event_type in important_event_types:
                lineup_timeline.append({
                    'EVENTNUM': row['EVENTNUM'],
                    'PERIOD': row['PERIOD'],
                    'TIME': row['PCTIMESTRING'],
                    'EVENT_TYPE': event_type,
                    'SCORE': row['SCORE'],
                    'HOME_DESCRIPTION': home_desc,
                    'AWAY_DESCRIPTION': away_desc,
                    # Copies, so later substitutions do not rewrite earlier snapshots.
                    'HOME_LINEUP': current_lineups[home_team_id][:],
                    'AWAY_LINEUP': current_lineups[away_team_id][:],
                    'PLAYER1_NAME': row['PLAYER1_NAME'],
                    'PLAYER2_NAME': row['PLAYER2_NAME'],
                    'PLAYER3_NAME': row['PLAYER3_NAME']
                })

        lineup_df = pd.DataFrame(lineup_timeline)

        # --- Build lineup segments ---
        # A segment is a maximal run of events during which both lineups are unchanged.
        lineup_segments = []
        segment = None
        prev_home = None
        prev_away = None
        last_shot_team = None  # side ('home'/'away') of the most recent missed shot

        for _, row in lineup_df.iterrows():
            home_lineup = tuple(sorted(row['HOME_LINEUP']))
            away_lineup = tuple(sorted(row['AWAY_LINEUP']))

            if segment is None or home_lineup != prev_home or away_lineup != prev_away:
                # Lineup changed: close the running segment at this event and open a new one.
                if segment is not None:
                    segment['end_event'] = row['EVENTNUM']
                    segment['end_time'] = row['TIME']
                    lineup_segments.append(segment)
                segment = {
                    'home_lineup': home_lineup,
                    'away_lineup': away_lineup,
                    'start_event': row['EVENTNUM'],
                    'start_time': row['TIME'],
                    'period': row['PERIOD'],
                    'team_stats': {'home': defaultdict(int), 'away': defaultdict(int)},
                    'player_stats': defaultdict(lambda: defaultdict(int)),
                    'events': []
                }
                prev_home = home_lineup
                prev_away = away_lineup

            segment['events'].append({
                'EVENTNUM': row['EVENTNUM'],
                'TIME': row['TIME'],
                'HOME_DESCRIPTION': row['HOME_DESCRIPTION'],
                'AWAY_DESCRIPTION': row['AWAY_DESCRIPTION']
            })

            # Each side's description is parsed independently and credited to that side.
            for team_side, desc in [('home', row['HOME_DESCRIPTION']), ('away', row['AWAY_DESCRIPTION'])]:
                if not desc or any(skip_word in desc for skip_word in SKIP_KEYWORDS):
                    continue

                players_on_court = set(segment[f'{team_side}_lineup'])

                def inc(stat, players, value=1):
                    """Add `value` to the side's team stat and to each listed player on court."""
                    segment['team_stats'][team_side][stat] += value
                    for p in players:
                        if p in players_on_court:
                            segment['player_stats'][p][stat] += value

                # Scoring: exactly one branch applies; any miss records which side shot last.
                if '3PT' in desc and 'MISS' not in desc:
                    inc('3pt_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], value=3)
                elif 'Free Throw' in desc and 'MISS' not in desc:
                    inc('ft_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], value=1)
                elif 'MISS' not in desc and any(kw in desc for kw in ['Fadeaway', 'Dunk', 'Layup', 'Jump Shot', 'Hook Shot']):
                    inc('2pt_made', [row['PLAYER1_NAME']])
                    inc('points', [row['PLAYER1_NAME']], value=2)
                elif 'MISS' in desc:
                    last_shot_team = team_side

                if 'AST' in desc: inc('assists', [row['PLAYER2_NAME']])
                if 'Turnover' in desc: inc('turnovers', [row['PLAYER1_NAME']])
                if 'STL' in desc: inc('steals', [row['PLAYER2_NAME']])
                # The blocker is carried in PLAYER3 (PLAYER2 is the assist slot on
                # shot events), so crediting PLAYER2 never recorded a player block.
                if 'BLK' in desc: inc('blocks', [row['PLAYER3_NAME']])
                if '.FOUL' in desc: inc('fouls', [row['PLAYER1_NAME']])

                if 'REBOUND' in desc:
                    rebounder = row['PLAYER1_NAME']
                    if last_shot_team:
                        # Same side as the last miss -> offensive, otherwise defensive.
                        if team_side == last_shot_team:
                            inc('off_rebounds', [rebounder])
                        else:
                            inc('def_rebounds', [rebounder])
                        last_shot_team = None

        # Close the segment still open at the end of the game using the last event.
        if segment is not None:
            final_row = lineup_df.iloc[-1]
            segment['end_event'] = final_row['EVENTNUM']
            segment['end_time'] = final_row['TIME']
            lineup_segments.append(segment)

        # Rows for this game are staged locally and published only once the whole
        # game has been processed, so a failure part-way through cannot leave
        # partial rows for the game in the global result lists.
        game_lineup_rows = []
        game_player_rows = []

        # --- Build lineup_vs_lineup_df ---
        for seg in lineup_segments:
            seg_row = {
                'game_id': game_id,
                'period': seg['period'],
                'start_event': seg['start_event'],
                'end_event': seg['end_event'],
                'start_time': seg['start_time'],
                'end_time': seg['end_time'],
                'home_lineup': seg['home_lineup'],
                'away_lineup': seg['away_lineup'],
            }
            for team in ['home', 'away']:
                for stat, value in seg['team_stats'][team].items():
                    seg_row[f'{team}_{stat}'] = value
            seg_row['duration'] = compute_duration(seg_row)
            game_lineup_rows.append(seg_row)

        # --- Build player_stats_df ---
        for seg in lineup_segments:
            for player_name, stats in seg['player_stats'].items():
                player_row = {
                    'game_id': game_id,
                    'player': player_name,
                    'period': seg['period'],
                    'start_event': seg['start_event'],
                    'end_event': seg['end_event'],
                    'start_time': seg['start_time'],
                    'end_time': seg['end_time'],
                    'home_lineup': seg['home_lineup'],
                    'away_lineup': seg['away_lineup'],
                }
                for stat, value in stats.items():
                    player_row[stat] = value
                game_player_rows.append(player_row)

        all_lineup_vs_lineup.extend(game_lineup_rows)
        all_player_stats.extend(game_player_rows)

        print(f"Finished game: {game_id}")
        # Respect rate limits
        time.sleep(1)

    except Exception as e:
        print(f"Failed on game {game_id}: {e}")
        # Pause after a failed game as well, so the next request is not fired
        # immediately after an error.
        time.sleep(1)
        continue

  0%|          | 0/50 [00:00<?, ?it/s]

-----------------GAME nr.1-----------

Finished game: 0022301191


  2%|▏         | 1/50 [00:01<01:08,  1.41s/it]

-----------------GAME nr.2-----------

Finished game: 0022301195


  4%|▍         | 2/50 [00:02<01:05,  1.36s/it]

-----------------GAME nr.3-----------

Finished game: 0022301192


  6%|▌         | 3/50 [00:04<01:02,  1.34s/it]

-----------------GAME nr.4-----------

Finished game: 0022301196


  8%|▊         | 4/50 [00:05<01:00,  1.32s/it]

-----------------GAME nr.5-----------

Finished game: 0022301187


 10%|█         | 5/50 [00:06<00:58,  1.31s/it]

-----------------GAME nr.6-----------

Finished game: 0022301190


 12%|█▏        | 6/50 [00:07<00:56,  1.28s/it]

-----------------GAME nr.7-----------

Finished game: 0022301189


 14%|█▍        | 7/50 [00:09<00:54,  1.26s/it]

-----------------GAME nr.8-----------

Finished game: 0022301186


 16%|█▌        | 8/50 [00:10<00:52,  1.24s/it]

-----------------GAME nr.9-----------

Finished game: 0022301188


 18%|█▊        | 9/50 [00:11<00:50,  1.23s/it]

-----------------GAME nr.10-----------

Finished game: 0022301200


 20%|██        | 10/50 [00:12<00:49,  1.23s/it]

-----------------GAME nr.11-----------

Finished game: 0022301197


 22%|██▏       | 11/50 [00:13<00:47,  1.23s/it]

-----------------GAME nr.12-----------

Finished game: 0022301194


 24%|██▍       | 12/50 [00:15<00:46,  1.23s/it]

-----------------GAME nr.13-----------

Finished game: 0022301193


 26%|██▌       | 13/50 [00:16<00:45,  1.23s/it]

-----------------GAME nr.14-----------

Finished game: 0022301199


 28%|██▊       | 14/50 [00:17<00:44,  1.25s/it]

-----------------GAME nr.15-----------

Finished game: 0022301198


 30%|███       | 15/50 [00:18<00:43,  1.24s/it]

-----------------GAME nr.16-----------

Finished game: 0022301172


 32%|███▏      | 16/50 [00:20<00:42,  1.25s/it]

-----------------GAME nr.17-----------

Finished game: 0022301177


 34%|███▍      | 17/50 [00:21<00:40,  1.24s/it]

-----------------GAME nr.18-----------

Finished game: 0022301171


 36%|███▌      | 18/50 [00:22<00:39,  1.23s/it]

-----------------GAME nr.19-----------

Finished game: 0022301182


 38%|███▊      | 19/50 [00:23<00:38,  1.24s/it]

-----------------GAME nr.20-----------

Finished game: 0022301173


 40%|████      | 20/50 [00:25<00:37,  1.23s/it]

-----------------GAME nr.21-----------

Finished game: 0022301181


 42%|████▏     | 21/50 [00:26<00:35,  1.22s/it]

-----------------GAME nr.22-----------

Finished game: 0022301179


 44%|████▍     | 22/50 [00:27<00:34,  1.22s/it]

-----------------GAME nr.23-----------

Finished game: 0022301184


 46%|████▌     | 23/50 [00:28<00:33,  1.24s/it]

-----------------GAME nr.24-----------

Finished game: 0022301175


 48%|████▊     | 24/50 [00:30<00:32,  1.25s/it]

-----------------GAME nr.25-----------

Finished game: 0022301183


 50%|█████     | 25/50 [00:31<00:31,  1.25s/it]

-----------------GAME nr.26-----------

Finished game: 0022301178


 52%|█████▏    | 26/50 [00:32<00:29,  1.24s/it]

-----------------GAME nr.27-----------

Finished game: 0022301176


 54%|█████▍    | 27/50 [00:33<00:28,  1.26s/it]

-----------------GAME nr.28-----------

Finished game: 0022301180


 56%|█████▌    | 28/50 [00:35<00:27,  1.25s/it]

-----------------GAME nr.29-----------

Finished game: 0022301174


 58%|█████▊    | 29/50 [00:36<00:26,  1.24s/it]

-----------------GAME nr.30-----------

Finished game: 0022301185


 60%|██████    | 30/50 [00:37<00:24,  1.23s/it]

-----------------GAME nr.31-----------

Finished game: 0022301170


 62%|██████▏   | 31/50 [00:38<00:23,  1.23s/it]

-----------------GAME nr.32-----------

Finished game: 0022301166


 64%|██████▍   | 32/50 [00:39<00:22,  1.22s/it]

-----------------GAME nr.33-----------

Finished game: 0022301169


 66%|██████▌   | 33/50 [00:41<00:21,  1.24s/it]

-----------------GAME nr.34-----------

Finished game: 0022301168


 68%|██████▊   | 34/50 [00:42<00:20,  1.25s/it]

-----------------GAME nr.35-----------

Finished game: 0022301167


 70%|███████   | 35/50 [00:43<00:18,  1.24s/it]

-----------------GAME nr.36-----------

Finished game: 0022301161


 72%|███████▏  | 36/50 [00:44<00:17,  1.23s/it]

-----------------GAME nr.37-----------

Finished game: 0022301163


 74%|███████▍  | 37/50 [00:46<00:15,  1.23s/it]

-----------------GAME nr.38-----------

Finished game: 0022301159


 76%|███████▌  | 38/50 [00:47<00:14,  1.23s/it]

-----------------GAME nr.39-----------

Finished game: 0022301165


 78%|███████▊  | 39/50 [00:48<00:13,  1.23s/it]

-----------------GAME nr.40-----------

Finished game: 0022301158


 80%|████████  | 40/50 [00:49<00:12,  1.23s/it]

-----------------GAME nr.41-----------

Finished game: 0022301162


 82%|████████▏ | 41/50 [00:50<00:10,  1.22s/it]

-----------------GAME nr.42-----------

Finished game: 0022301164


 84%|████████▍ | 42/50 [00:52<00:09,  1.24s/it]

-----------------GAME nr.43-----------

Finished game: 0022301160


 86%|████████▌ | 43/50 [00:53<00:08,  1.26s/it]

-----------------GAME nr.44-----------

Finished game: 0022301153


 88%|████████▊ | 44/50 [00:54<00:07,  1.27s/it]

-----------------GAME nr.45-----------

Finished game: 0022301152


 90%|█████████ | 45/50 [00:56<00:06,  1.27s/it]

-----------------GAME nr.46-----------

Finished game: 0022301144


 92%|█████████▏| 46/50 [00:57<00:05,  1.26s/it]

-----------------GAME nr.47-----------

Finished game: 0022301148


 94%|█████████▍| 47/50 [00:58<00:03,  1.24s/it]

-----------------GAME nr.48-----------

Finished game: 0022301154


 96%|█████████▌| 48/50 [00:59<00:02,  1.23s/it]

-----------------GAME nr.49-----------

Finished game: 0022301155


 98%|█████████▊| 49/50 [01:00<00:01,  1.22s/it]

-----------------GAME nr.50-----------

Finished game: 0022301147


100%|██████████| 50/50 [01:02<00:00,  1.24s/it]


In [7]:
batch_index = 1  # Increment after every run

# Lineup-vs-lineup segments of this batch: build the frame, then write it to CSV
lineup_vs_lineup_df = pd.DataFrame(data=all_lineup_vs_lineup)
lineup_vs_lineup_df.to_csv(f'lineup_vs_lineup_batch_{batch_index}.csv', index=False)

# Per-player segment stats of this batch: build the frame, then write it to CSV
player_stats_df = pd.DataFrame(data=all_player_stats)
player_stats_df.to_csv(f'player_stats_batch_{batch_index}.csv', index=False)

# Report how many rows went into each file
print(f"Saved batch {batch_index} with {len(lineup_vs_lineup_df)} lineup segments and {len(player_stats_df)} player segments.")

Saved batch 1 with 2048 lineup segments and 6125 player segments.
