# CBBD Play-by-Play Batch Pipeline

End-to-end pipeline: enter a date (or date range), fetch all games + plays, run the full per-game analysis (possessions, shots, lineups, player stats), and save combined CSVs.

**Output DataFrames** (all carry `gameId`, except `all_ff_df`, which stores the game identifier in a `game_id` column):

| Variable | Description |
|---|---|
| `all_possessions_df` | Play-level possession tracking |
| `all_poss_enriched_df` | Possession-level summary (type, prev ender, refined outcome) |
| `all_shots_df` | Shots with x/y, lineups on court |
| `all_lineup_stints_df` | Lineup stints with +/- |
| `all_players_df` | Player box scores |
| `all_pbp_flat_df` | Full PBP flattened (no nested objects) |
| `all_ff_df` | Four Factors per team-game |

## 1. Setup

In [None]:
import cbbd
import pandas as pd
import numpy as np
import re
import os
import time
import getpass
from datetime import datetime, timedelta
from cbbd.rest import ApiException

## 2. Configuration

In [None]:
# API key (hidden input)
print("Enter your CollegeBasketballData.com API key")
# Strip once, up front: the same cleaned value is validated AND sent to the
# API, so a key pasted with a trailing space/newline does not fail auth.
api_key = getpass.getpass("API Key: ").strip()
if not api_key:
    raise ValueError("API key is required.")

configuration = cbbd.Configuration(
    host="https://api.collegebasketballdata.com",
    access_token=api_key,
)

# Season: use 2026 for the 2025-26 season
season = 2026

print(f"API configured.  Season: {season-1}/{season}")

## 3. Enter Date(s)

In [None]:
# Enter a single date or a date range.
# Single date  -> "2026-02-14"
# Date range   -> "2026-02-14 to 2026-02-16"
# Blank input  -> today
date_input = input("Date or date range (YYYY-MM-DD [to YYYY-MM-DD]): ").strip()

if ' to ' in date_input:
    start_str, end_str = [s.strip() for s in date_input.split(' to ', 1)]
    start_date = datetime.strptime(start_str, '%Y-%m-%d')
    end_date   = datetime.strptime(end_str,   '%Y-%m-%d')
elif date_input:
    start_date = datetime.strptime(date_input, '%Y-%m-%d')
    end_date   = start_date
else:
    # Default to today at midnight so the value has the same shape as a
    # parsed 'YYYY-MM-DD' date (strptime yields a midnight datetime) rather
    # than carrying the current time-of-day into the API date filter.
    start_date = datetime.combine(datetime.now().date(), datetime.min.time())
    end_date   = start_date

# Fail early with a clear message instead of silently querying an empty range.
if end_date < start_date:
    raise ValueError(
        f"End date {end_date.date()} is before start date {start_date.date()}."
    )

print(f"Date range: {start_date.date()} -> {end_date.date()}")

## 4. Fetch Games & Play-by-Play

In [None]:
with cbbd.ApiClient(configuration) as api_client:
    games_api = cbbd.GamesApi(api_client)
    plays_api = cbbd.PlaysApi(api_client)

    # Get games in date range
    games = games_api.get_games(
        start_date_range=start_date,
        end_date_range=end_date,
        season=season,
    )
    games_df = pd.DataFrame([g.to_dict() for g in games])
    print(f"Games found: {len(games_df)}")

    # Fetch PBP for each game.  An empty games_df has no 'id' column, so
    # fall back to an empty list instead of raising KeyError.
    all_plays = []
    _game_ids = games_df['id'].tolist() if 'id' in games_df.columns else []
    for gid in _game_ids:
        try:
            plays = plays_api.get_plays(game_id=gid)
        except ApiException as e:
            # Best effort: report the failure and continue with other games.
            print(f"  Game {gid}: PBP fetch failed ({e})")
        else:
            all_plays.extend(p.to_dict() for p in plays)
        # Pause between requests (rate limiting).
        time.sleep(0.5)

    plays_df = pd.DataFrame(all_plays)

# A DataFrame built from an empty list has no columns; guard the lookup so
# "no plays" is reported instead of crashing with KeyError: 'gameId'.
_n_games_with_plays = plays_df['gameId'].nunique() if 'gameId' in plays_df.columns else 0
print(f"Total plays collected: {len(plays_df):,} across {_n_games_with_plays} games")
if _n_games_with_plays == 0:
    print("No play-by-play data was collected for this date range.")

## 5. Analysis Functions

In [None]:
# ---------------------------------------------------------------------------
# Substitution parser
# ---------------------------------------------------------------------------
def parse_substitution(play_text):
    """Parse a substitution description of the form
    '<player> subbing in|out for <team>'.

    Args:
        play_text: Play description. Non-string values (None, NaN, ...) are
            treated as "no substitution" instead of raising TypeError.

    Returns:
        dict with keys 'player' (stripped name), 'action' ('in' or 'out',
        lower-cased) and 'team' (stripped), or None when the text does not
        match the pattern.
    """
    if not isinstance(play_text, str):
        return None
    match = re.search(r'(.+?)\s+subbing\s+(in|out)\s+for\s+(.+?)$', play_text, re.IGNORECASE)
    if match is None:
        return None
    player, action, team = match.groups()
    return {'player': player.strip(),
            'action': action.lower(),
            'team':   team.strip()}


# ---------------------------------------------------------------------------
# Lineup tracker (uses real starters from API)
# ---------------------------------------------------------------------------
def _home_away_order(game_df, teams):
    """Return `teams` as a list with the home team first when it can be identified.

    Uses the `isHomeTeam` column when game_df has one; otherwise (or when no
    row is flagged as home) keeps the order of first appearance.
    """
    ordered = list(teams)
    if 'isHomeTeam' not in game_df.columns:
        return ordered
    home_rows = game_df[game_df['team'].notna() & (game_df['isHomeTeam'] == True)]
    if home_rows.empty:
        return ordered
    home_team = home_rows['team'].iloc[0]
    return [home_team] + [t for t in ordered if t != home_team]


def track_lineups_with_real_starters(game_df, starters_by_team):
    """Track which players are on court for each team throughout the game.

    Plays are ordered by period (ascending) and secondsRemaining (descending)
    and processed in batches sharing the same (period, clock).  Within a
    batch every substitution is applied first, then the resulting lineup is
    logged for each play of the batch.

    Args:
        game_df: Play-by-play rows for one game.  Reads the columns 'team',
            'period', 'secondsRemaining', 'clock', 'playType', 'playText' and
            'id'; 'isHomeTeam' is used when present.
        starters_by_team: Mapping team -> iterable of starter names.  Teams
            missing from the mapping start with an empty lineup.

    Returns:
        (lineup_df, lineups): lineup_df has one row per play with the lineup
        state after that batch's substitutions; lineups maps each team to the
        set of players on court after the last play.

    Notes:
        The 'home_*' columns describe the team flagged by `isHomeTeam` when
        that column exists; without it they describe the first team to
        appear in game_df (which is not necessarily the home team).
        Substitution rows with non-string text or a team that is not one of
        the game's teams are ignored rather than raising.
    """
    teams = game_df[game_df['team'].notna()]['team'].unique()
    lineups = {team: set(starters_by_team.get(team, [])) for team in teams}

    ordered_teams = _home_away_order(game_df, teams)
    home_team = ordered_teams[0] if len(ordered_teams) > 0 else None
    away_team = ordered_teams[1] if len(ordered_teams) > 1 else None

    game_df_sorted = game_df.sort_values(['period', 'secondsRemaining'], ascending=[True, False])

    lineup_log = []

    def flush(batch):
        """Apply the batch's substitutions, then log the lineup for each play."""
        for play in batch:
            if play.get('playType') != 'Substitution':
                continue
            team = play.get('team')
            text = play.get('playText')
            # Skip rows we cannot interpret (NaN text, NaN/unknown team).
            if team not in lineups or not isinstance(text, str):
                continue
            sub = parse_substitution(text)
            if not sub:
                continue
            if sub['action'] == 'in':
                lineups[team].add(sub['player'])
            else:
                lineups[team].discard(sub['player'])

        home_players = lineups.get(home_team, set())
        away_players = lineups.get(away_team, set())
        for play in batch:
            lineup_log.append({
                'play_id':           play.get('id'),
                'period':            play.get('period'),
                'clock':             play.get('clock'),
                'seconds_remaining': play.get('secondsRemaining'),
                'play_type':         play.get('playType'),
                'team':              play.get('team'),
                # Fresh list per row so rows never share a mutable object.
                'home_lineup':       list(home_players),
                'away_lineup':       list(away_players),
                'home_lineup_size':  len(home_players),
                'away_lineup_size':  len(away_players),
            })

    batch = []
    batch_key = (None, None)
    for play in game_df_sorted.to_dict('records'):
        key = (play.get('period'), play.get('clock'))
        if key != batch_key:
            flush(batch)
            batch = []
            batch_key = key
        batch.append(play)
    flush(batch)

    # Explicit columns keep the schema stable even when no play was logged.
    log_columns = ['play_id', 'period', 'clock', 'seconds_remaining', 'play_type',
                   'team', 'home_lineup', 'away_lineup',
                   'home_lineup_size', 'away_lineup_size']
    return pd.DataFrame(lineup_log, columns=log_columns), lineups


# ---------------------------------------------------------------------------
# Lineup key helper
# ---------------------------------------------------------------------------
def lineup_to_key(lineup_list):
    """Build a canonical lineup key: names sorted and joined with ' | '.

    Anything that is not a `list` (None, NaN, tuple, ...) yields None.
    """
    if not isinstance(lineup_list, list):
        return None
    ordered_names = sorted(lineup_list)
    return ' | '.join(ordered_names)


# ---------------------------------------------------------------------------
# Lineup stints
# ---------------------------------------------------------------------------
def get_lineup_stints(pbp_df):
    """Collapse consecutive plays with identical lineups into stints.

    Rows are consumed in the order given, so the caller is responsible for
    passing them in chronological order.  A new stint starts whenever the
    (sorted home lineup, sorted away lineup) pair changes.  A stint's end
    clock and end score are taken from the first row of the *next* stint;
    the final stint ends at 0 seconds with the score of the last row.

    Args:
        pbp_df: DataFrame with columns 'home_lineup', 'away_lineup' (iterables
            of player names), 'secondsRemaining', 'homeScore', 'awayScore'.

    Returns:
        DataFrame with one row per stint and the columns home_lineup_key,
        away_lineup_key, start_seconds, end_seconds, start_home_score,
        start_away_score, end_home_score, end_away_score.  For empty input an
        empty frame with these same columns is returned so callers can do
        column arithmetic without a KeyError.
    """
    stint_columns = [
        'home_lineup_key', 'away_lineup_key',
        'start_seconds', 'end_seconds',
        'start_home_score', 'start_away_score',
        'end_home_score', 'end_away_score',
    ]
    if pbp_df is None or len(pbp_df) == 0:
        return pd.DataFrame(columns=stint_columns)

    stints = []

    def close_stint(lineup_pair, start, end_sec, end_home, end_away):
        """Append the finished stint described by its start state and end values."""
        start_sec, start_home, start_away = start
        stints.append({
            'home_lineup_key': ' | '.join(lineup_pair[0]),
            'away_lineup_key': ' | '.join(lineup_pair[1]),
            'start_seconds':    start_sec,
            'end_seconds':      end_sec,
            'start_home_score': start_home,
            'start_away_score': start_away,
            'end_home_score':   end_home,
            'end_away_score':   end_away,
        })

    current_pair = None   # (home tuple, away tuple) of the open stint
    current_start = None  # (secondsRemaining, homeScore, awayScore) at stint start
    home_score = away_score = None

    rows = zip(pbp_df['home_lineup'], pbp_df['away_lineup'],
               pbp_df['secondsRemaining'], pbp_df['homeScore'], pbp_df['awayScore'])
    for home_lineup, away_lineup, seconds, home_score, away_score in rows:
        pair = (tuple(sorted(home_lineup)), tuple(sorted(away_lineup)))
        if pair == current_pair:
            continue
        if current_pair is not None:
            close_stint(current_pair, current_start, seconds, home_score, away_score)
        current_pair = pair
        current_start = (seconds, home_score, away_score)

    # The last stint runs to the end of the data: clock 0, score of last row.
    close_stint(current_pair, current_start, 0, home_score, away_score)
    return pd.DataFrame(stints, columns=stint_columns)


# ---------------------------------------------------------------------------
# Text-format helpers (API uses two formats across seasons)
#   NEW: "Player makes 18-foot jumper" / "makes free throw 2 of 2"
#   OLD: "Player made Layup." / "made Free Throw." (no N-of-N)
# ---------------------------------------------------------------------------
# playType values treated as field-goal attempts throughout this file
# (membership is tested against the raw `playType` string).
FG_TYPES = {'JumpShot', 'LayUpShot', 'DunkShot', 'TipShot'}


def _is_made(txt):
    """Check if play text indicates a made shot (handles both formats)."""
    return 'makes' in txt or (' made ' in txt and 'missed' not in txt)


def _is_missed(txt):
    """Check if play text indicates a missed shot (handles both formats)."""
    return 'misses' in txt or 'missed' in txt


def _safe_txt(val):
    """Safely convert a value to lowercase string (handles NaN)."""
    if pd.notna(val):
        return str(val).lower()
    return ''


def _safe_str(val):
    """Safely convert a value to string (handles NaN)."""
    if pd.notna(val):
        return str(val)
    return ''


def _precompute_last_ft_flags(game_df):
    """Flag, for every 'MadeFreeThrow' row, whether it is the last FT of its trip.

    Rows are ordered by period (ascending) then secondsRemaining (descending).
    Two text formats are handled:
      * text containing '1 of 1', '2 of 2' or '3 of 3' -> last FT;
      * otherwise the row is the last FT unless the row immediately after it
        (in sorted order) is also a 'MadeFreeThrow' with the same period and
        secondsRemaining.

    Returns:
        dict mapping the ORIGINAL game_df index label of each free-throw row
        to True (last in sequence) or False.
    """
    ordered = game_df.sort_values(
        ['period', 'secondsRemaining'], ascending=[True, False]
    ).reset_index()  # keeps the original index labels in the 'index' column
    total_rows = len(ordered)
    flags = {}

    for pos in ordered.index[ordered['playType'] == 'MadeFreeThrow']:
        current = ordered.iloc[pos]
        original_label = current['index']
        text = _safe_txt(current['playText'])

        # "N of N" wording marks the final attempt explicitly.
        if '1 of 1' in text or '2 of 2' in text or '3 of 3' in text:
            flags[original_label] = True
            continue

        # No following row at all: necessarily the last free throw.
        if pos + 1 >= total_rows:
            flags[original_label] = True
            continue

        following = ordered.iloc[pos + 1]
        another_ft_follows = (
            following['period'] == current['period']
            and following['secondsRemaining'] == current['secondsRemaining']
            and following['playType'] == 'MadeFreeThrow'
        )
        flags[original_label] = not another_ft_follows

    return flags


# ---------------------------------------------------------------------------
# Possession tracker (state machine) — v4
#   - dual text format (makes/made, misses/missed)
#   - and-1 detection
#   - old FT format without N-of-N (lookahead last-FT detection)
#   - NaN-safe text handling
# ---------------------------------------------------------------------------
def track_possessions_v2(game_df):
    """State-machine possession tracker.

    Walks the plays of one game in order (period ascending, secondsRemaining
    descending) and assigns every play a running `possession_id` plus the
    team currently considered to be in possession.

    Args:
        game_df: Play-by-play rows for a single game.  Reads 'period',
            'secondsRemaining', 'team', 'playType', 'playText', 'id' and
            'gameId'.

    Returns:
        DataFrame with one row per play and the columns play_id, gameId,
        possession_id, possession_team, play_type, play_text, team, outcome.
        `outcome` is None for plays that do not affect possession state.
    """
    game_df = game_df.sort_values(
        ['period', 'secondsRemaining'], ascending=[True, False]
    ).copy()
    # Non-null team names, in order of first appearance.
    teams = [t for t in game_df['team'].unique() if pd.notna(t)]

    def other_team(t):
        # First team that differs from t, or None when there is no such team.
        others = [x for x in teams if x != t]
        return others[0] if others else None

    # Pre-compute last-FT flags
    last_ft_flags = _precompute_last_ft_flags(game_df)

    poss_id = 0
    poss_team = None
    records = []
    # Outcome / team of the most recently ended possession; used only by the
    # and-1 detection below and cleared by most intervening play types.
    last_end_reason = None
    last_end_team = None

    for idx, row in game_df.iterrows():
        pt   = _safe_str(row.get('playType'))
        txt  = _safe_txt(row.get('playText'))
        team = row.get('team')
        outcome = None
        end_poss = False
        next_team = None

        if pt == 'Jumpball':
            # Only establishes possession when none is known yet.
            if 'won' in txt and team and poss_team is None:
                poss_team = team
        elif pt in FG_TYPES:
            if poss_team is None:
                poss_team = team
            # A made field goal ends the possession; misses leave it open
            # until a rebound (or other ending play) is seen.
            if _is_made(txt):
                outcome = 'made_fg'
                end_poss = True
                next_team = other_team(poss_team)
        elif pt == 'MadeFreeThrow':
            # This play type is checked with _is_made below, i.e. it is
            # treated as covering both made and missed free throws.
            # --- And-1 detection ---
            # The FT shooter's team is not the team currently in possession,
            # and the previous possession ended with a made FG by that same
            # team: undo the possession change and fold the plays recorded
            # since then back into the scoring possession.
            if (team and poss_team is not None
                    and team != poss_team
                    and last_end_reason == 'made_fg'
                    and last_end_team == team):
                poss_id -= 1
                poss_team = team
                # Relabel only the trailing run of records that carry the
                # just-abandoned possession id.
                for rec in reversed(records):
                    if rec['possession_id'] == poss_id + 1:
                        rec['possession_id'] = poss_id
                        rec['possession_team'] = poss_team
                    else:
                        break
                last_end_reason = 'and1_ft'
            elif poss_team is None and team:
                poss_team = team

            # Keyed by the original index label, which the sort+copy above preserves.
            is_last = last_ft_flags.get(idx, False)
            if is_last:
                if _is_made(txt):
                    outcome = 'made_ft'
                    end_poss = True
                    next_team = other_team(poss_team)
                else:
                    # Possession stays open; a later rebound play decides it.
                    outcome = 'missed_last_ft'
        elif 'Turnover' in pt:
            if poss_team is None and team:
                poss_team = team
            outcome = 'turnover'
            end_poss = True
            next_team = other_team(poss_team)
        elif pt == 'Steal':
            # Recorded as an outcome but does not itself end the possession.
            outcome = 'steal'
        elif pt == 'Defensive Rebound':
            outcome = 'def_rebound'
            end_poss = True
            # The rebounding team takes the next possession.
            next_team = team
        elif pt == 'Offensive Rebound':
            outcome = 'off_rebound'
        elif pt == 'Dead Ball Rebound':
            outcome = 'dead_ball_rebound'
            end_poss = True
            next_team = team
        elif pt in ('End Period', 'End Game'):
            outcome = 'end_period'
            end_poss = True
            # Possession is re-established by the first qualifying play afterwards.
            next_team = None

        # Record the play under the possession it belongs to, BEFORE any
        # possession change triggered by this play is applied.
        records.append({
            'play_id': row.get('id'),
            'gameId':  row.get('gameId'),
            'possession_id': poss_id,
            'possession_team': poss_team,
            'play_type': pt,
            'play_text': row.get('playText', ''),
            'team': team,
            'outcome': outcome,
        })

        if end_poss:
            last_end_reason = outcome
            last_end_team = poss_team
            poss_id += 1
            poss_team = next_team
        elif pt not in ('PersonalFoul', 'MadeFreeThrow', 'Substitution',
                        'Official TV Timeout', ''):
            # Any other play between a made FG and a free throw cancels the
            # and-1 linkage.
            last_end_reason = None
            last_end_team = None

    return pd.DataFrame(records)


# ---------------------------------------------------------------------------
# Possession classifier (prev ender + type)
# ---------------------------------------------------------------------------
def classify_possessions(possessions_df, game_df):
    """Build possession-level features: refined_outcome, prev_poss_ender, possession_type.

    Args:
        possessions_df: Play-level output of the possession tracker; reads
            'play_id', 'possession_id', 'possession_team', 'play_type',
            'play_text', 'outcome' and, when present, 'gameId'.
        game_df: Raw play-by-play for the same game; supplies 'id',
            'period' and 'secondsRemaining' for each play.

    Returns:
        DataFrame with one row per possession and the columns gameId,
        possession_id, possession_team, period, start_seconds, end_seconds,
        duration_sec, raw_outcome, refined_outcome, possession_type, has_oreb,
        time_to_first_fga, time_oreb_to_fga, prev_poss_ender.  Empty input
        yields an empty frame with these columns instead of raising.
    """
    out_columns = [
        'gameId', 'possession_id', 'possession_team', 'period',
        'start_seconds', 'end_seconds', 'duration_sec', 'raw_outcome',
        'refined_outcome', 'possession_type', 'has_oreb',
        'time_to_first_fga', 'time_oreb_to_fga', 'prev_poss_ender',
    ]
    if possessions_df is None or len(possessions_df) == 0:
        return pd.DataFrame(columns=out_columns)

    # Attach clock/period and a chronological ordinal to every tracked play.
    game_sorted = game_df.sort_values(
        ['period', 'secondsRemaining'], ascending=[True, False]
    ).reset_index(drop=True)
    game_sorted['play_order'] = range(len(game_sorted))
    time_info = game_sorted[['id', 'secondsRemaining', 'period', 'play_order']].rename(
        columns={'id': 'play_id'}
    )
    poss = possessions_df.merge(time_info, on='play_id', how='left')
    poss = poss.sort_values('play_order').reset_index(drop=True)

    has_game_id = 'gameId' in poss.columns
    known_ids = set(poss['possession_id'].unique())
    rebound_outcomes = ('def_rebound', 'off_rebound', 'dead_ball_rebound')

    possession_rows = []
    # groupby keeps the rows of each group in frame order (already
    # chronological) and visits possession ids in ascending order.
    for pid, grp in poss.groupby('possession_id', sort=True):
        plays = grp.to_dict('records')
        game_id   = grp['gameId'].iloc[0] if has_game_id else None
        poss_team = grp['possession_team'].iloc[0]
        period    = grp['period'].iloc[0]
        start_sec = grp['secondsRemaining'].iloc[0]
        end_sec   = grp['secondsRemaining'].iloc[-1]
        duration  = start_sec - end_sec

        all_outcomes = [p['outcome'] for p in plays if p['outcome'] is not None]
        final_outcome = all_outcomes[-1] if all_outcomes else None
        has_steal = 'steal' in all_outcomes
        has_oreb  = 'off_rebound' in all_outcomes

        # ---- Refined outcome ----
        if final_outcome == 'turnover':
            refined = 'live_ball_turnover' if has_steal else 'dead_ball_turnover'
        elif final_outcome == 'def_rebound':
            # Walk backwards from the last defensive rebound to the attempt
            # that produced it: a field-goal play or a missed free throw.
            miss_type = 'fga'
            found_dreb = False
            for p in reversed(plays):
                if not found_dreb:
                    if p['outcome'] == 'def_rebound':
                        found_dreb = True
                    continue
                if p['play_type'] in FG_TYPES:
                    miss_type = 'fga'
                    break
                if p['play_type'] == 'MadeFreeThrow' and _is_missed(
                        _safe_txt(p.get('play_text'))):
                    miss_type = 'fta'
                    break
            refined = f'{miss_type}_def_rebound'
        else:
            refined = final_outcome

        # ---- Block out-of-bounds override ----
        # Only the FIRST blocked, missed field goal of the possession is
        # examined: with no rebound after it in this possession and a
        # following possession on record, the outcome becomes 'block_oob'.
        for i_p, p in enumerate(plays):
            pt_lower = _safe_txt(p.get('play_text'))
            if (p['play_type'] in FG_TYPES
                    and 'block' in pt_lower
                    and _is_missed(pt_lower)):
                has_reb = any(r['outcome'] in rebound_outcomes for r in plays[i_p + 1:])
                if not has_reb and (pid + 1) in known_ids:
                    refined = 'block_oob'
                break

        # ---- Possession type ----
        fga_plays = [p for p in plays if p['play_type'] in FG_TYPES]
        first_fga_sec = fga_plays[0]['secondsRemaining'] if fga_plays else None
        time_to_first_fga = (start_sec - first_fga_sec) if first_fga_sec is not None else None

        oreb_list = [p for p in plays if p['outcome'] == 'off_rebound']
        time_oreb_to_fga = None
        if oreb_list:
            oreb_sec = oreb_list[0]['secondsRemaining']
            post_oreb_fga = [p for p in fga_plays if p['secondsRemaining'] < oreb_sec]
            if post_oreb_fga:
                time_oreb_to_fga = oreb_sec - post_oreb_fga[0]['secondsRemaining']

        foul_plays = [p for p in plays
                      if 'Foul' in (p.get('play_type') or '')
                      and 'shooting' not in _safe_txt(p.get('play_text'))]
        foul_within_10s = bool(foul_plays) and (
            start_sec - foul_plays[0]['secondsRemaining']) <= 10

        if has_oreb:
            quick_putback = time_oreb_to_fga is not None and time_oreb_to_fga <= 3
            poss_type = 'scramble_putback' if quick_putback else 'second_chance'
        elif foul_plays and foul_within_10s and not fga_plays and start_sec <= 120:
            poss_type = 'intentional_foul'
        elif time_to_first_fga is not None:
            poss_type = 'transition' if time_to_first_fga <= 7 else 'half_court'
        else:
            poss_type = 'half_court'

        possession_rows.append({
            'gameId': game_id, 'possession_id': pid, 'possession_team': poss_team,
            'period': period, 'start_seconds': start_sec, 'end_seconds': end_sec,
            'duration_sec': duration, 'raw_outcome': final_outcome,
            'refined_outcome': refined, 'possession_type': poss_type,
            'has_oreb': has_oreb, 'time_to_first_fga': time_to_first_fga,
            'time_oreb_to_fga': time_oreb_to_fga,
        })

    result = pd.DataFrame(possession_rows)

    # ---- Previous possession ender ----
    # The first possession overall, and the first of every period, is tagged
    # 'start_of_period'; all others inherit the previous refined outcome.
    periods = result['period'].tolist()
    refined_outcomes = result['refined_outcome'].tolist()
    prev_enders = []
    for i_r, per in enumerate(periods):
        if i_r == 0 or per != periods[i_r - 1]:
            prev_enders.append('start_of_period')
        else:
            prev_enders.append(refined_outcomes[i_r - 1])
    result['prev_poss_ender'] = prev_enders
    return result


# ---------------------------------------------------------------------------
# Four Factors
# ---------------------------------------------------------------------------
def compute_four_factors(game_df):
    """Compute Four Factors for each team from raw play-by-play data.

    Args:
        game_df: Play-by-play rows for one game; reads 'team', 'playType',
            'playText' and 'period'.

    Returns:
        dict mapping team -> dict of counting stats (FGA, FGM, 3PA, 3PM, 2PA,
        2PM, FTA, FTM, TOV, ORB, DRB, Opp_DRB as plain ints) and derived
        values rounded to one decimal (Possessions, eFG%, TO%, ORB%, FT_Rate,
        3PA_Rate, Tempo).  Rates are 0 when their denominator is 0.

    Notes:
        Possessions are estimated as FGA - ORB + TOV + 0.475 * FTA.
        Tempo scales possessions to 40 minutes, treating every period beyond
        the second as 5 additional minutes.
        With more than two teams in game_df the "opponent" is simply the
        first other team found.
    """
    teams = game_df[game_df['team'].notna()]['team'].unique()

    # Game length depends only on the number of periods: compute it once.
    n_per = game_df['period'].max()
    mins  = 40 if n_per <= 2 else 40 + (n_per - 2) * 5

    def made_mask(lowered_text):
        """Vectorised made-shot test for both text formats ('makes' / ' made ')."""
        return (lowered_text.str.contains('makes')
                | (lowered_text.str.contains(' made ')
                   & ~lowered_text.str.contains('missed')))

    results = {}
    for team in teams:
        tp  = game_df[game_df['team'] == team]
        opp = [t for t in teams if t != team]

        # Field goals
        fg       = tp[tp['playType'].isin(FG_TYPES)]
        fg_txt   = fg['playText'].fillna('').str.lower()
        fg_made  = made_mask(fg_txt)
        fg_three = fg_txt.str.contains('three point')
        fga = len(fg)
        fgm = int(fg_made.sum())
        tpa = int(fg_three.sum())
        tpm = int((fg_three & fg_made).sum())

        # Free throws ('MadeFreeThrow' rows are counted as attempts; the
        # text decides whether each one was made)
        ft     = tp[tp['playType'] == 'MadeFreeThrow']
        ft_txt = ft['playText'].fillna('').str.lower()
        fta = len(ft)
        ftm = int(made_mask(ft_txt).sum())

        tov = int(tp['playType'].str.contains('Turnover', na=False).sum())
        orb = int((tp['playType'] == 'Offensive Rebound').sum())
        drb = int((tp['playType'] == 'Defensive Rebound').sum())
        if opp:
            opp_drb = int(((game_df['team'] == opp[0])
                           & (game_df['playType'] == 'Defensive Rebound')).sum())
        else:
            opp_drb = 0

        possessions = fga - orb + tov + 0.475 * fta
        efg   = (fgm + 0.5 * tpm) / fga * 100 if fga else 0
        to_p  = tov / possessions * 100 if possessions else 0
        orb_p = orb / (orb + opp_drb) * 100 if (orb + opp_drb) else 0
        ft_r  = fta / fga * 100 if fga else 0
        tpa_r = tpa / fga * 100 if fga else 0
        tempo = possessions / (mins / 40)

        results[team] = {
            'FGA': fga, 'FGM': fgm, '3PA': tpa, '3PM': tpm,
            '2PA': fga - tpa, '2PM': fgm - tpm,
            'FTA': fta, 'FTM': ftm, 'TOV': tov,
            'ORB': orb, 'DRB': drb, 'Opp_DRB': opp_drb,
            'Possessions': round(possessions, 1),
            'eFG%': round(efg, 1), 'TO%': round(to_p, 1),
            'ORB%': round(orb_p, 1), 'FT_Rate': round(ft_r, 1),
            '3PA_Rate': round(tpa_r, 1), 'Tempo': round(float(tempo), 1),
        }
    return results


# Cell-completion marker: printed once every definition in this cell has run.
print("All analysis functions defined.")

## 6. Per-Game Pipeline Function

In [None]:
def fetch_game_roster(game_id, game_df, configuration, season):
    """Fetch roster via API and return (players_flat_df, starters_by_team).

    Args:
        game_id: Identifier of the game being processed.  Used to keep only
            the roster rows of this game when the API response carries a
            matching 'gameId'.
        game_df: Play-by-play rows of the game; supplies 'gameStartDate'
            (first row) and the team names.
        configuration: cbbd.Configuration used to open the API client.
        season: Season passed through to the API query.

    Returns:
        players_flat_df: one row per player, with a 'team' column added.
        starters_by_team: dict team -> list of names whose 'starter' flag is True.

    Raises:
        ValueError: when the API returns no players for the game (previously
            this surfaced as an opaque KeyError: 'team').
    """
    game_date  = game_df['gameStartDate'].iloc[0]
    game_teams = game_df[game_df['team'].notna()]['team'].unique()

    team_rows = []
    with cbbd.ApiClient(configuration) as api_client:
        games_api = cbbd.GamesApi(api_client)
        for team in game_teams:
            gp = games_api.get_game_players(
                start_date_range=game_date,
                end_date_range=game_date,
                team=team,
                season=season,
            )
            team_rows.extend(p.to_dict() for p in gp)

    # The query is by team + date, not by game.  When the rows identify their
    # game, keep only this one; if nothing matches (e.g. the field is absent)
    # fall back to the unfiltered rows so behaviour is never worse than before.
    same_game = [r for r in team_rows if r.get('gameId') == game_id]
    if same_game:
        team_rows = same_game

    flat = []
    for row in team_rows:
        team = row['team']
        for player in (row.get('players') or []):
            flat.append({**player, 'team': team})
    players_flat_df = pd.DataFrame(flat)

    if players_flat_df.empty:
        raise ValueError(f"Game {game_id}: roster API returned no players")

    starters = players_flat_df[players_flat_df['starter'].eq(True)]
    starters_by_team = {
        team: starters.loc[starters['team'] == team, 'name'].tolist()
        for team in players_flat_df['team'].unique()
    }

    return players_flat_df, starters_by_team


# Columns derived from each play's nested `shotInfo` object.
_SHOT_INFO_COLUMNS = ('shooter_name', 'shooter_id', 'made', 'assisted',
                      'assisted_by', 'shot_range', 'x', 'y')


def _flatten_shot_info(shot_info):
    """Flatten one `shotInfo` value into a dict keyed by _SHOT_INFO_COLUMNS.

    Non-dict values give all-None fields.  Nested objects that are present
    but None (e.g. no assister) are treated like missing ones.
    """
    if not isinstance(shot_info, dict):
        return dict.fromkeys(_SHOT_INFO_COLUMNS)
    shooter  = shot_info.get('shooter') or {}
    assister = shot_info.get('assistedBy') or {}
    location = shot_info.get('location') or {}
    return {
        'shooter_name': shooter.get('name'),
        'shooter_id':   shooter.get('id'),
        'made':         shot_info.get('made'),
        'assisted':     shot_info.get('assisted'),
        'assisted_by':  assister.get('name'),
        'shot_range':   shot_info.get('range'),
        'x':            location.get('x'),
        'y':            location.get('y'),
    }


def process_single_game(game_id, game_df, configuration, season):
    """
    Full per-game pipeline.  Returns dict of DataFrames:
        possessions_df, poss_enriched, shots_df,
        lineup_stints_df, players_df, pbp_flat

    Args:
        game_id: Identifier of the game (written to the 'gameId' column of
            players_df and lineup_stints_df).
        game_df: Play-by-play rows of this game.
        configuration: cbbd.Configuration for the roster API call.
        season: Season passed to the roster API call.

    Raises:
        ValueError: when fewer than two teams appear in game_df.
    """
    teams = game_df[game_df['team'].notna()]['team'].unique()
    if len(teams) < 2:
        raise ValueError(f"Game {game_id}: found {len(teams)} teams, need 2")

    # Roster & starters (API call)
    players_flat_df, starters_by_team = fetch_game_roster(
        game_id, game_df, configuration, season
    )

    # Lineup tracking
    lineup_df, _ = track_lineups_with_real_starters(game_df, starters_by_team)

    # Merge lineups with PBP
    pbp_with_lineups = game_df.merge(
        lineup_df[['play_id', 'home_lineup', 'away_lineup',
                    'home_lineup_size', 'away_lineup_size']],
        left_on='id', right_on='play_id', how='left',
    )

    # ---- shots_df ----
    shots_df = pbp_with_lineups[pbp_with_lineups['shootingPlay'] == True].copy()
    if 'shotInfo' in shots_df.columns:
        shot_infos = shots_df['shotInfo']
    else:
        shot_infos = [None] * len(shots_df)
    # Built in one pass so the shot columns always exist, even for a game in
    # which no row carries shot details.
    shot_details = pd.DataFrame(
        [_flatten_shot_info(si) for si in shot_infos],
        index=shots_df.index, columns=list(_SHOT_INFO_COLUMNS),
    )
    for col in _SHOT_INFO_COLUMNS:
        shots_df[col] = shot_details[col]
    shots_df['distance']  = shots_df['playText'].str.extract(r'(\d+)-foot', expand=False).astype(float)
    shots_df['is_three']  = shots_df['playText'].str.contains('three point', case=False, na=False)
    shots_df['home_lineup_key'] = shots_df['home_lineup'].apply(lineup_to_key)
    shots_df['away_lineup_key'] = shots_df['away_lineup'].apply(lineup_to_key)
    shots_df = shots_df.drop(columns=['home_lineup', 'away_lineup', 'shotInfo', 'participants'], errors='ignore')

    # ---- players_df ----
    # Expand each nested stat dict into '<col>_<key>' columns, then drop the
    # nested column itself.
    players_df = players_flat_df.copy()
    players_df['gameId'] = game_id
    for col in ['rebounds', 'freeThrows', 'threePointFieldGoals', 'twoPointFieldGoals', 'fieldGoals']:
        if col not in players_df.columns:
            continue
        expanded = pd.DataFrame(
            [v if isinstance(v, dict) else {} for v in players_df[col]],
            index=players_df.index,
        )
        for key in expanded.columns:
            players_df[f'{col}_{key}'] = expanded[key]
        players_df = players_df.drop(columns=[col])

    # ---- lineup_stints_df ----
    # Order by period first, like every other chronological sort in this
    # file, so plays of different periods are never interleaved.
    lineup_stints_df = get_lineup_stints(
        pbp_with_lineups.sort_values(['period', 'secondsRemaining'],
                                     ascending=[True, False])
    )
    lineup_stints_df['gameId'] = game_id
    lineup_stints_df['duration_seconds'] = lineup_stints_df['start_seconds'] - lineup_stints_df['end_seconds']
    lineup_stints_df['home_pts_scored']  = lineup_stints_df['end_home_score'] - lineup_stints_df['start_home_score']
    lineup_stints_df['away_pts_scored']  = lineup_stints_df['end_away_score'] - lineup_stints_df['start_away_score']
    lineup_stints_df['home_plus_minus']  = lineup_stints_df['home_pts_scored'] - lineup_stints_df['away_pts_scored']

    # ---- pbp_flat ----
    pbp_flat = pbp_with_lineups.copy()
    pbp_flat['home_lineup_key'] = pbp_flat['home_lineup'].apply(lineup_to_key)
    pbp_flat['away_lineup_key'] = pbp_flat['away_lineup'].apply(lineup_to_key)
    pbp_flat = pbp_flat.drop(columns=['home_lineup', 'away_lineup', 'shotInfo', 'participants'], errors='ignore')

    # ---- possessions ----
    possessions_df = track_possessions_v2(game_df)
    poss_enriched  = classify_possessions(possessions_df, game_df)

    return {
        'possessions_df':   possessions_df,
        'poss_enriched':    poss_enriched,
        'shots_df':         shots_df,
        'lineup_stints_df': lineup_stints_df,
        'players_df':       players_df,
        'pbp_flat':         pbp_flat,
    }

# Cell-completion marker: printed once the per-game pipeline function is defined.
print("process_single_game() defined.")

## 7. Run Pipeline Over All Games

In [None]:
# An empty plays_df (no games / no plays fetched) has no 'gameId' column.
unique_game_ids = plays_df['gameId'].unique() if 'gameId' in plays_df.columns else []
n_games = len(unique_game_ids)
print(f"Processing {n_games} games...\n")

_accum = {k: [] for k in
          ['possessions_df', 'poss_enriched', 'shots_df',
           'lineup_stints_df', 'players_df', 'pbp_flat']}
_failed_games = []
_ff_rows = []

for i, gid in enumerate(unique_game_ids):
    game_df = plays_df[plays_df['gameId'] == gid].copy()
    teams = game_df[game_df['team'].notna()]['team'].unique()
    label = f"[{i+1}/{n_games}] Game {gid}"
    try:
        print(f"{label}: {' vs '.join(teams[:2])} -- ", end="", flush=True)
        result = process_single_game(gid, game_df, configuration, season)

        # Four Factors (lightweight, no API call)
        ff = compute_four_factors(game_df)
        _game_ff_rows = []
        for team, stats in ff.items():
            stats['game_id'] = gid
            stats['team'] = team
            stats['opponent'] = [t for t in teams if t != team][0]
            _game_ff_rows.append(stats)
    except Exception as e:
        # Batch boundary: record the failure and move on to the next game.
        print(f"FAILED -- {e}")
        _failed_games.append({'gameId': gid, 'error': str(e)})
    else:
        # Accumulate only after EVERY step succeeded, so a game listed as
        # failed never contributes partial rows to the combined outputs.
        for key in _accum:
            _accum[key].append(result[key])
        _ff_rows.extend(_game_ff_rows)
        print(f"{len(result['pbp_flat'])} plays, {len(result['poss_enriched'])} poss")

    # Rate limit: 1 s between games (roster API calls)
    if i < n_games - 1:
        time.sleep(1.0)


def _concat_or_empty(frames):
    """Concatenate per-game frames, or return an empty DataFrame when there are none."""
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# Concatenate
all_possessions_df   = _concat_or_empty(_accum['possessions_df'])
all_poss_enriched_df = _concat_or_empty(_accum['poss_enriched'])
all_shots_df         = _concat_or_empty(_accum['shots_df'])
all_lineup_stints_df = _concat_or_empty(_accum['lineup_stints_df'])
all_players_df       = _concat_or_empty(_accum['players_df'])
all_pbp_flat_df      = _concat_or_empty(_accum['pbp_flat'])
all_ff_df            = pd.DataFrame(_ff_rows)

print(f"\n{'='*65}")
print("PIPELINE COMPLETE")
print(f"{'='*65}")
print(f"  Games processed : {n_games - len(_failed_games)}/{n_games}")
if _failed_games:
    print(f"  Games failed    : {len(_failed_games)}")
    for fg in _failed_games:
        print(f"    Game {fg['gameId']}: {fg['error']}")
print(f"  all_possessions_df   : {len(all_possessions_df):>7,} rows")
print(f"  all_poss_enriched_df : {len(all_poss_enriched_df):>7,} rows")
print(f"  all_shots_df         : {len(all_shots_df):>7,} rows")
print(f"  all_lineup_stints_df : {len(all_lineup_stints_df):>7,} rows")
print(f"  all_players_df       : {len(all_players_df):>7,} rows")
print(f"  all_pbp_flat_df      : {len(all_pbp_flat_df):>7,} rows")
print(f"  all_ff_df            : {len(all_ff_df):>7,} rows")

## 8. Save Combined CSVs

In [None]:
output_dir = 'cbbd_data'
os.makedirs(output_dir, exist_ok=True)

# File-name stamp: start date alone, or start_end for a multi-day range.
date_str = start_date.strftime('%Y%m%d')
if start_date != end_date:
    date_str = f"{date_str}_{end_date.strftime('%Y%m%d')}"

# Raw collection data (always written, even when empty)
for _raw_name, _raw_df in (('games', games_df), ('plays', plays_df)):
    _raw_df.to_csv(f'{output_dir}/{_raw_name}_{date_str}_{season}.csv', index=False)
print(f"Saved {len(games_df)} games and {len(plays_df):,} raw plays")

# Combined analysis DataFrames: file stem -> (frame, description)
_to_save = {
    'possessions':          (all_possessions_df,   'Play-level possession tracking'),
    'possessions_enriched': (all_poss_enriched_df, 'Possession-level summaries'),
    'shots':                (all_shots_df,         'Shots with x/y and lineups'),
    'lineup_stints':        (all_lineup_stints_df, 'Lineup stints with +/-'),
    'players':              (all_players_df,       'Player box scores'),
    'pbp_flat':             (all_pbp_flat_df,      'Full PBP flattened'),
    'four_factors':         (all_ff_df,            'Four Factors per team-game'),
}

for name, (df, desc) in _to_save.items():
    # Nothing to write for a missing or empty frame.
    if df is None or len(df) == 0:
        continue
    fname = f'{output_dir}/{name}_{date_str}_{season}.csv'
    df.to_csv(fname, index=False)
    # The game identifier column is 'gameId' in most frames, 'game_id' in others.
    if 'gameId' in df.columns:
        n_g = df['gameId'].nunique()
    elif 'game_id' in df.columns:
        n_g = df['game_id'].nunique()
    else:
        n_g = '?'
    print(f"  {len(df):>7,} rows ({n_g} games) -> {fname}  -- {desc}")

print(f"\nAll files saved to {output_dir}/")