# Prequisite

In [None]:
!pip install --no-cache-dir --force-reinstall numpy==1.26.4 pandas==2.2.2

In [None]:
!pip install nba_api

In [6]:
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from collections import defaultdict
from nba_api.stats.endpoints import (PlayByPlayV2,
                                     BoxScoreSummaryV2,
                                     BoxScoreTraditionalV2,
                                     LeagueGameFinder)

# Pbp data Lineup events

In [158]:
game_id = '0022300001'

In [159]:
pbp = PlayByPlayV2(game_id=game_id).get_data_frames()[0] # for events
box = BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0] # for starter players
summary = BoxScoreSummaryV2(game_id=game_id).get_data_frames()[0] # for home and away team ids

home_team_id = summary['HOME_TEAM_ID'][0]
away_team_id = summary['VISITOR_TEAM_ID'][0]

starters_df = box[box['START_POSITION'] != '']
starters_by_team = starters_df.groupby('TEAM_ID')['PLAYER_NAME'].apply(list).to_dict()

current_lineups = {
    home_team_id: starters_by_team[home_team_id][:],
    away_team_id: starters_by_team[away_team_id][:]
}

We’ve got:

- Clean PBP DataFrame
- Clear home/away team IDs
- Initial 5-man lineup per team

In [160]:
important_event_types = list(range(1, 14))  # all 13 official event types

lineup_timeline = []

for _, row in pbp.iterrows():
    event_type = row['EVENTMSGTYPE']
    home_desc = str(row['HOMEDESCRIPTION']) if pd.notna(row['HOMEDESCRIPTION']) else ''
    away_desc = str(row['VISITORDESCRIPTION']) if pd.notna(row['VISITORDESCRIPTION']) else ''

    # Handle substitutions
    if event_type == 8:
        out_player = row['PLAYER1_NAME']
        in_player = row['PLAYER2_NAME']
        team_id = row['PLAYER1_TEAM_ID'] if pd.notna(row['PLAYER1_TEAM_ID']) else row['PLAYER2_TEAM_ID']
        team_id = int(team_id) if pd.notna(team_id) else None

        if team_id in current_lineups:
            if out_player in current_lineups[team_id]:
                current_lineups[team_id].remove(out_player)
            if in_player not in current_lineups[team_id]:
                current_lineups[team_id].append(in_player)

    # Record all important events with lineup snapshot
    if event_type in important_event_types:
        lineup_timeline.append({
            'EVENTNUM': row['EVENTNUM'],
            'PERIOD': row['PERIOD'],
            'TIME': row['PCTIMESTRING'],
            'EVENT_TYPE': event_type,
            'SCORE': row['SCORE'],
            'HOME_DESCRIPTION': home_desc,
            'AWAY_DESCRIPTION': away_desc,
            'HOME_LINEUP': current_lineups[home_team_id][:],
            'AWAY_LINEUP': current_lineups[away_team_id][:],
            'PLAYER1_NAME': row['PLAYER1_NAME'],
            'PLAYER2_NAME': row['PLAYER2_NAME'],
            'PLAYER3_NAME': row['PLAYER3_NAME']
        })

lineup_df = pd.DataFrame(lineup_timeline)

In [None]:
# What kind of 2PTS shot names exist
result_df = lineup_df[lineup_df['SCORE'].notna()][['SCORE', 'HOME_DESCRIPTION', 'AWAY_DESCRIPTION']]

proba_df = result_df[
    ~result_df['HOME_DESCRIPTION'].str.contains('|'.join([ 'Fadeaway','3PT','Free Throw', 'Dunk', 'Layup', 'Jump Shot', 'Hook Shot']), case=False, na=False) &
    ~result_df['AWAY_DESCRIPTION'].str.contains('|'.join(['Fadeaway', '3PT','Free Throw','Dunk', 'Layup', 'Jump Shot', 'Hook Shot']), case=False, na=False)
]
proba_df

# Constructing lineup_segments
Stretches of continuous play with the same home and away lineups.
For each:
  - Count team-level stats (points, assists, TOs, etc.)
  - Count player-level stats for only those on the floor
  - End the segment when either team makes a substitution



In [162]:
from collections import defaultdict

lineup_segments = []
segment = None
prev_home = None
prev_away = None
last_shot_team = None  # Used to infer rebound type (off vs def)
SKIP_KEYWORDS = [
    'SUB',
    'Jump Ball',
    'Delay',
    'Offensive',
    'Timeout',
    'Rebound'
]

# Loop through every event in the play-by-play lineup_df
for _, row in lineup_df.iterrows():
    home_lineup = tuple(sorted(row['HOME_LINEUP']))
    away_lineup = tuple(sorted(row['AWAY_LINEUP']))

    # ---- 1. START NEW SEGMENT IF LINEUPS CHANGED ----
    if segment is None or home_lineup != prev_home or away_lineup != prev_away:
        if segment:
            segment['end_event'] = row['EVENTNUM']
            segment['end_time'] = row['TIME']
            lineup_segments.append(segment)

        segment = {
            'home_lineup': home_lineup,
            'away_lineup': away_lineup,
            'start_event': row['EVENTNUM'],
            'start_time': row['TIME'],
            'period': row['PERIOD'],
            'team_stats': {
                'home': defaultdict(int),
                'away': defaultdict(int)
            },
            'player_stats': defaultdict(lambda: defaultdict(int)),
            'events': []
        }

        prev_home = home_lineup
        prev_away = away_lineup

    # ---- 2. LOG THE EVENT ----
    segment['events'].append({
        'EVENTNUM': row['EVENTNUM'],
        'TIME': row['TIME'],
        'HOME_DESCRIPTION': row['HOME_DESCRIPTION'],
        'AWAY_DESCRIPTION': row['AWAY_DESCRIPTION']
    })

    # ---- 3. PROCESS HOME AND AWAY DESCRIPTIONS ----
    for team_side, desc in [('home', row['HOME_DESCRIPTION']), ('away', row['AWAY_DESCRIPTION'])]:
        if not desc or any(skip_word in desc for skip_word in SKIP_KEYWORDS):
            continue

        lineup = home_lineup if team_side == 'home' else away_lineup
        players_on_court = set(lineup)

        # Helper to increment team & player stats
        def inc(stat, players, value=1):
            segment['team_stats'][team_side][stat] += value
            for p in players:
                if p in players_on_court:
                    segment['player_stats'][p][stat] += value

        # Get involved players from row (if names are not null and on court)
        players = []
        for p in [row['PLAYER1_NAME'], row['PLAYER2_NAME'], row['PLAYER3_NAME']]:
            if isinstance(p, str) and p in players_on_court:
                players.append(p)

        # ---- 4. ACTION DETECTION ----

        # --- Shots ---
        if '3PT' in desc and 'MISS' not in desc:
            inc('3pt_made', [row['PLAYER1_NAME']])
            inc('points', [row['PLAYER1_NAME']], value=3)
        elif 'Free Throw' in desc and 'MISS' not in desc:
            inc('ft_made', [row['PLAYER1_NAME']])
            inc('points', [row['PLAYER1_NAME']])
        elif 'MISS' not in desc and any(kw in desc for kw in ['Fadeaway', 'Dunk', 'Layup', 'Jump Shot', 'Hook Shot']):
            inc('2pt_made', [row['PLAYER1_NAME']])
            inc('points', [row['PLAYER1_NAME']], value=2)
        elif 'MISS' in desc:
            last_shot_team = team_side

        if 'AST' in desc:
            inc('assists', [row['PLAYER2_NAME']])
        if 'Turnover' in desc:
            inc('turnovers', [row['PLAYER1_NAME']])
        if 'STL' in desc:
            inc('steals', [row['PLAYER2_NAME']])
        if 'BLK' in desc:
            inc('blocks', [row['PLAYER2_NAME']])
        if '.FOUL' in desc:
            inc('fouls', [row['PLAYER1_NAME']])

        # --- Rebounds ---
        if 'REBOUND' in desc:
            rebound_team = team_side
            rebounder = row['PLAYER1_NAME']
            if last_shot_team:
                if rebound_team == last_shot_team:
                    inc('off_rebounds', [rebounder])
                else:
                    inc('def_rebounds', [rebounder])
            last_shot_team = None  # Reset after rebound

# ---- 5. FLUSH FINAL SEGMENT ----
if segment:
    segment['end_event'] = row['EVENTNUM']
    segment['end_time'] = row['TIME']
    lineup_segments.append(segment)

# lineup_vs_lineup_df
**Team stats per segment**<br>
Each row represents:
 - One continuous period where the same home and away lineups were on the floor
 - Aggregated team-level stats (points, 3pt, rebounds, assists, TOs, etc.)

In [163]:
lineup_vs_lineup_data = []

for seg in lineup_segments:
    row = {
        'period': seg['period'],
        'start_event': seg['start_event'],
        'end_event': seg['end_event'],
        'start_time': seg['start_time'],
        'end_time': seg['end_time'],
        'home_lineup': seg['home_lineup'],
        'away_lineup': seg['away_lineup'],
    }

    # Flatten team stats into prefixed keys
    for team in ['home', 'away']:
        for stat, value in seg['team_stats'][team].items():
            row[f'{team}_{stat}'] = value

    lineup_vs_lineup_data.append(row)

lineup_vs_lineup_df = pd.DataFrame(lineup_vs_lineup_data)

In [165]:
def time_str_to_sec(t):
    """Convert 'M:SS' string to total seconds."""
    if isinstance(t, str):
        m, s = map(int, t.strip().split(":"))
        return m * 60 + s
    return 0  # handle missing or bad values

def compute_duration(row):
    start_sec = time_str_to_sec(row['start_time'])
    end_sec = time_str_to_sec(row['end_time'])

    start_period = row['period']
    end_period = row['period']  # assume same period unless time increases

    # If clock resets (e.g. start_time = 0:03, end_time = 10:39), period must've changed
    if time_str_to_sec(row['end_time']) > time_str_to_sec(row['start_time']):
        end_period = start_period + 1

    # Duration across possible multiple periods
    period_diff = end_period - start_period
    duration = start_sec + period_diff * 720 - end_sec

    return duration


In [166]:
lineup_vs_lineup_df['duration'] = lineup_vs_lineup_df.apply(compute_duration, axis=1)

In [167]:
non_stat_cols = ['home_lineup', 'away_lineup', 'start_time', 'end_time', 'period', 'start_event', 'end_event', 'duration']
stat_cols = [col for col in lineup_vs_lineup_df.columns if col not in non_stat_cols]
lineup_vs_lineup_df['non_nan_stats'] = lineup_vs_lineup_df[stat_cols].notna().sum(axis=1)

In [175]:
# Filter: remove short or empty segments
lu_vs_lu_filtered = lineup_vs_lineup_df[
    (lineup_vs_lineup_df['duration'] >= 25) |   # at least 24 seconds
    (lineup_vs_lineup_df['non_nan_stats'] >= 3)  # at least 2 non-NaN stats
].copy()
lu_vs_lu_filtered = lu_vs_lu_filtered.fillna(0)

In [176]:
total_home_points = lineup_vs_lineup_df['home_points'].sum()
total_away_points = lineup_vs_lineup_df['away_points'].sum()
print(f"The total home points is: {total_home_points}")
print(f"The total away points is: {total_away_points}")

The total home points is: 121.0
The total away points is: 116.0


# player_stats_df
**Player stats per segment**<br>
Each row:
- Represents a single player's contributions in a specific segment
- Includes which lineups they were in
- Stats across that segment only

In [170]:
player_stats_data = []

for seg in lineup_segments:
    for player_name, stats in seg['player_stats'].items():
        row = {
            'player': player_name,
            'period': seg['period'],
            'start_event': seg['start_event'],
            'end_event': seg['end_event'],
            'start_time': seg['start_time'],
            'end_time': seg['end_time'],
            'home_lineup': seg['home_lineup'],
            'away_lineup': seg['away_lineup'],
        }

        # Flatten stat values
        for stat, value in stats.items():
            row[stat] = value

        player_stats_data.append(row)

player_stats_df = pd.DataFrame(player_stats_data)

In [174]:
player_stats_df.head(10)

Unnamed: 0,player,period,start_event,end_event,start_time,end_time,home_lineup,away_lineup,2pt_made,points,def_rebounds,turnovers,3pt_made,steals,ft_made,assists,off_rebounds,fouls
0,Myles Turner,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",1.0,7.0,2.0,1.0,1.0,1.0,2.0,,,
1,Tyrese Haliburton,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,,1.0,1.0,,,,1.0,,
2,Bruce Brown,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",2.0,7.0,2.0,,1.0,,,1.0,1.0,
3,Bennedict Mathurin,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,3.0,,,1.0,,,2.0,1.0,
4,Donovan Mitchell,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",2.0,7.0,,1.0,1.0,,,,,
5,Darius Garland,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,,,,,,,4.0,,1.0
6,Max Strus,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,,1.0,,,,,,,1.0
7,Jarrett Allen,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",2.0,4.0,1.0,,,1.0,,,1.0,1.0
8,Obi Toppin,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,,,,,,,1.0,,
9,Evan Mobley,1,2,69,12:00,6:43,"(Bennedict Mathurin, Bruce Brown, Myles Turner...","(Darius Garland, Donovan Mitchell, Evan Mobley...",,,,,,1.0,,,1.0,


In [173]:
player_stats_filtered_df = player_stats_df.merge(
    lu_vs_lu_filtered[['start_event', 'end_event', 'period', 'duration']],
    on=['start_event', 'end_event', 'period'],
    how='inner'
)

In [177]:
player_stats_filtered_df = player_stats_filtered_df.fillna(0)