In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob

import time
import requests

from bs4 import BeautifulSoup
from IPython.display import display, HTML

# Set up notebook display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# Set visualization style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)

# NBA Player Scoring Progression Analysis - Data Collection
This notebook collects NBA player statistics from Basketball-Reference.com 

download_season_data(season_year, season_end_year=None, data_type='per_game'): Downloads statistics for a single NBA season.

download_multiple_seasons(start_year, end_year, data_type='per_game', delay=3): Iteratively downloads data for a range of seasons.

# Data
The downloaded data includes per-game statistics for NBA players with the following key metrics:

* Player demographics (name, position, team)
* Scoring statistics (points, field goal %, three-point %, free throw %)
* Minutes played and games played
* Season identifiers

This data will form the foundation for our visualization of scoring progression patterns by position.


In [None]:
# Define helper functions

# download season data
def download_season_data(season_year, season_end_year=None, data_type='per_game'):
    """
    Download NBA player statistics from Basketball-Reference for a specific season
    
    Parameters:
    season_year (int): Starting year of the season (e.g., 2022 for the 2022-23 season)
    season_end_year (int, optional): Ending year of the season (calculated automatically if not provided)
    data_type (str): Type of statistics to download ('per_game', 'totals', 'advanced', etc.)
    
    Returns:
    pd.DataFrame: DataFrame containing the player statistics
    """
    if season_end_year is None:
        # Calculate the end year of the season (e.g., 2022 -> 2023)
        season_end_year = season_year + 1
    
    # Generate the URL based on the pattern used by Basketball-Reference
    url = f"https://www.basketball-reference.com/leagues/NBA_{season_end_year}_{data_type}.html"
    
    print(f"Downloading data from {url}...")
    
    try:
        # Send a request to the website with proper headers to simulate a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the main stats table - Basketball-Reference uses id="per_game_stats" for per game data
            table_id = f"{data_type}_stats"
            table = soup.find('table', id=table_id)
            
            if table is None:
                print(f"Error: Table with id '{table_id}' not found on the page.")
                return None
            
            # Extract the data from the table
            df = pd.read_html(str(table))[0]
            
            # Handle the fact that Basketball-Reference includes repeated headers in the table
            df = df[df['Rk'] != 'Rk']
            
            # Convert numeric columns
            for col in df.columns:
                if col not in ['Player', 'Pos', 'Tm']:
                    try:
                        df[col] = pd.to_numeric(df[col])
                    except:
                        pass
            
            # Add season information
            df['Season'] = f"{season_year}-{str(season_end_year)[2:]}"
            df['Season_Year'] = season_end_year
            
            print(f"Successfully downloaded data for {season_year}-{str(season_end_year)[2:]} season.")
            
            # Save the data to a CSV file
            filename = f"NBA_{season_year}_{data_type}.csv"
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            
            return df
        else:
            print(f"Error: Failed to retrieve data. Status code: {response.status_code}")
            return None
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def download_multiple_seasons(start_year, end_year, data_type='per_game', delay=3):
    """
    Download NBA player statistics for multiple seasons
    
    Parameters:
    start_year (int): First season to download (starting year)
    end_year (int): Last season to download (starting year)
    data_type (str): Type of statistics to download
    delay (int): Delay between requests in seconds (to avoid overloading the server)
    
    Returns:
    pd.DataFrame: Combined DataFrame containing statistics for all seasons
    """
    all_seasons = []
    
    for year in range(start_year, end_year + 1):
        season_data = download_season_data(year, data_type=data_type)
        
        if season_data is not None:
            all_seasons.append(season_data)
            
            # Display a small preview of the data
            display(HTML(f"<h4>Preview of {year}-{year+1} Season Data:</h4>"))
            display(season_data.head(3))
        
        # Add a delay to avoid overloading the server
        if year < end_year:
            print(f"Waiting {delay} seconds before the next request...")
            time.sleep(delay)
    
    if all_seasons:
        # Combine all seasons into a single DataFrame
        combined_data = pd.concat(all_seasons, ignore_index=True)
        
        # Save the combined data
        combined_filename = f"NBA_{start_year}_to_{end_year}_{data_type}.csv"
        combined_data.to_csv(combined_filename, index=False)
        print(f"Combined data saved to {combined_filename}")
        
        return combined_data
    else:
        print("No data was downloaded successfully.")
        return None


# Load the Data from BB reference or from the available local files

In [2]:
start_year = 1975
end_year = 2024

In [3]:
# execute this to download a fresh copy of the data - should only need to be done if we want more years
# or identify some corruption in the data - once the files are local, we can load from the local csvs

if False: 
    # Execute the download
    nba_data = download_multiple_seasons(start_year, end_year, 'per_game', delay=3)
    
    # Display information about the downloaded data
    if nba_data is not None:
        print("\nDownload Summary:")
        print(f"Total seasons: {end_year - start_year + 1}")
        print(f"Total player-seasons: {len(nba_data)}")
        print(f"Unique players: {nba_data['Player'].nunique()}")
        
        # Display available columns
        print("\nAvailable statistics:")
        for column in sorted(nba_data.columns):
            print(f"- {column}")
        
        # Create a preliminary plot of the data
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='Season', y='PTS', data=nba_data)
        plt.title('Points Per Game Distribution by Season')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        
        # Show position distribution
        plt.figure(figsize=(10, 6))
        pos_counts = nba_data['Pos'].value_counts()
        sns.barplot(x=pos_counts.index, y=pos_counts.values)
        plt.title('Player Count by Position')
        plt.ylabel('Number of Player-Seasons')
        plt.tight_layout()
        plt.show()

### Read from local files by default - download manually above for first run thru

In [3]:
def load_nba_data(start_year, end_year, data_type='per_game'):
    """
    Load NBA data from local CSV files for the specified years
    
    Parameters:
    start_year (int): First season to load (starting year)
    end_year (int): Last season to load (starting year)
    data_type (str): Type of statistics to load ('per_game', 'totals', 'advanced', etc.)
    
    Returns:
    pd.DataFrame: Combined DataFrame with all seasons' data
    """
    # Check if combined file exists
    combined_filename = f"NBA_{start_year}_to_{end_year}_{data_type}.csv"
    
    if os.path.exists(combined_filename):
        print(f"Loading combined data from {combined_filename}")
        return pd.read_csv(combined_filename)
    
    # If not, load individual season files
    print("Loading individual season files...")
    all_seasons = []
    
    for year in range(start_year, end_year + 1):
        filename = f"NBA_{year}_{data_type}.csv"
        
        if os.path.exists(filename):
            print(f"Loading {filename}")
            season_df = pd.read_csv(filename)
            all_seasons.append(season_df)
        else:
            print(f"Warning: Could not find {filename}")
    
    if not all_seasons:
        raise FileNotFoundError("No data files found for the specified years.")
    
    # Combine all the loaded seasons
    print(f"Combining data from {len(all_seasons)} seasons...")
    combined_data = pd.concat(all_seasons, ignore_index=True)
    
    return combined_data


nba_data = load_nba_data(start_year, end_year)
    
# Display basic information about the dataset
print("\nData Summary:")
print(f"Total player-seasons: {len(nba_data)}")
print(f"Unique players: {nba_data['Player'].nunique()}")
print(f"Seasons: {nba_data['Season'].nunique()}")
    
# Preview the data
print("\nData Preview:")
display(nba_data.head())

Loading combined data from NBA_1975_to_2024_per_game.csv

Data Summary:
Total player-seasons: 25950
Unique players: 3894
Seasons: 50

Data Preview:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Awards,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
0,1.0,Bob McAdoo,24.0,BUF,C,78.0,,42.7,12.0,24.6,0.487,7.2,9.4,0.762,3.1,9.3,12.4,4.0,1.2,2.1,3.8,31.1,"MVP-2,AS",1975-76,1976,,,,,,,,
1,2.0,Kareem Abdul-Jabbar,28.0,LAL,C,82.0,82.0,41.2,11.1,21.1,0.529,5.5,7.8,0.703,3.3,13.5,16.9,5.0,1.5,4.1,3.6,27.7,"MVP-1,AS,NBA1",1975-76,1976,,,,,,,,
2,3.0,Pete Maravich,28.0,NOJ,SG,62.0,,38.3,9.7,21.2,0.459,6.4,7.9,0.811,0.7,4.1,4.8,5.4,1.4,0.4,3.2,25.9,"MVP-15,NBA1",1975-76,1976,,,,,,,,
3,4.0,Tiny Archibald,27.0,KCK,PG,78.0,,40.8,9.2,20.3,0.453,6.4,8.0,0.802,0.9,1.9,2.7,7.9,1.6,0.2,2.2,24.8,"MVP-9,AS,NBA1",1975-76,1976,,,,,,,,
4,5.0,Fred Brown,27.0,SEA,SG,76.0,,33.1,9.8,20.0,0.488,3.6,4.1,0.869,1.5,2.7,4.2,2.7,1.9,0.2,2.4,23.1,"MVP-13,AS",1975-76,1976,,,,,,,,


# Data cleaning
### Drop columns we don't expect to use
### Drop NAs in critical columns
### need to clean up stats for players who had stints on multiple teams in a season
The stats from basketballreference.com have multiple rows for players who were traded or had short stints on multiple teams within a single year - some of these include totals for the year with correct per-game stats, labeled '2TM' or '3TM' - select these for the yearly totals, and drop the stats for the individual team stints

In [4]:
print(f"NBA starting dataset rows: {nba_data.shape[0]}")

# Drop players with NAs on team - not usable
nba_data = nba_data.dropna(subset=['Team'])
#
nba_data.drop('Awards', axis=1, inplace=True)

print(f"Dropped rows with NaN Team values. Remaining rows: {len(nba_data)}")


NBA starting dataset rows: 25950
Dropped rows with NaN Team values. Remaining rows: 25900


### Example of duplicate data - Eric Bledsoe 2017-18 has entries for '2TM', 'PHO', and 'MIL'
We will be working with yearly totals, so will be dropping 'PHO' and 'MIL' and keeping '2TM'

In [5]:
# Prep for removing multiple entries
nba_data_clean = nba_data.copy()

# These are the 'Team' values used to indicate the
multi_team_indicators = ['2TM', '3TM', '4TM', '5TM', 'TOT']

multi_team_mask = nba_data_clean['Team'].isin(multi_team_indicators)
multi_team_players = nba_data_clean[multi_team_mask][['Player', 'Season']].drop_duplicates()
    
print(f"Found {len(multi_team_players)} player-season pairs with multi-team entries")

display(HTML(f"<h4>Sample Single Player Data with multiple entries:</h4>"))
display(nba_data_clean[(nba_data_clean['Player'] == 'Eric Bledsoe') & (nba_data_clean['Season'] == '2017-18')].head(10))



Found 2350 player-season pairs with multi-team entries


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
20362,45.0,Eric Bledsoe,28.0,2TM,PG,74.0,74.0,31.4,6.3,13.4,0.473,3.4,4.3,0.795,0.7,3.1,3.8,5.0,2.0,0.6,2.5,17.7,2017-18,2018,2.9,1.7,4.9,0.347,4.6,8.5,0.545,0.536
20363,46.0,Eric Bledsoe,28.0,PHO,PG,3.0,3.0,27.7,5.3,13.3,0.4,3.7,4.7,0.786,1.0,1.3,2.3,3.0,1.3,0.7,2.0,15.7,2017-18,2018,3.3,1.3,4.3,0.308,4.0,9.0,0.444,0.45
20364,47.0,Eric Bledsoe,28.0,MIL,PG,71.0,71.0,31.5,6.4,13.4,0.476,3.4,4.3,0.795,0.7,3.2,3.9,5.1,2.0,0.6,2.5,17.8,2017-18,2018,2.9,1.7,4.9,0.349,4.7,8.5,0.55,0.54


In [6]:

# we're going to create a column called 'player_season_key' and use that for a boolean mask
# to keep or drop the rows
try:
    nba_data_clean.drop('player_season_key', axis=1, inplace=True)
except: # don't care about dropping a non-existent column if we're re-running this in test
    pass
nba_data_clean['player_season_key'] = nba_data_clean['Player'] + "|" + nba_data_clean['Season']

multi_team_players['player_season_key'] = multi_team_players['Player'] + "|" + multi_team_players['Season']
multi_team_keys = set(multi_team_players['player_season_key'])

# Create a mask for rows to keep
keep_mask = ~(
    nba_data_clean['player_season_key'].isin(multi_team_keys) & 
    ~nba_data_clean['Team'].isin(multi_team_indicators)
)
    
# Apply the mask to get the clean data
nba_data_clean = nba_data_clean[keep_mask].copy()

# Remove the temporary key column
nba_data_clean.drop('player_season_key', axis=1, inplace=True)

# print out the sample data to show the 2017-18 season has been cleaned for our test player
display(HTML(f"<h4>Single Player Data:</h4>"))
display(nba_data_clean[nba_data_clean['Player'] == 'Eric Bledsoe'].head(10))


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
16437,311.0,Eric Bledsoe,21.0,LAC,PG,81.0,25.0,22.7,2.5,6.0,0.424,1.2,1.6,0.744,0.8,1.9,2.8,3.6,1.1,0.3,1.6,6.7,2010-11,2011,2.4,0.4,1.4,0.276,2.1,4.6,0.47,0.457
17175,423.0,Eric Bledsoe,22.0,LAC,PG,40.0,1.0,11.6,1.2,3.2,0.389,0.7,1.1,0.636,0.6,1.1,1.6,1.7,0.8,0.4,1.5,3.3,2011-12,2012,1.2,0.2,0.8,0.2,1.1,2.4,0.448,0.413
17503,199.0,Eric Bledsoe,23.0,LAC,PG,76.0,12.0,20.4,3.3,7.5,0.445,1.4,1.8,0.791,1.0,1.9,3.0,3.1,1.4,0.7,1.5,8.5,2012-13,2013,1.8,0.4,1.0,0.397,2.9,6.4,0.453,0.473
17919,41.0,Eric Bledsoe,24.0,PHO,PG,43.0,40.0,32.9,6.2,12.9,0.477,4.3,5.5,0.772,0.6,4.1,4.7,5.5,1.6,0.3,2.3,17.7,2013-14,2014,3.3,1.2,3.3,0.357,5.0,9.7,0.517,0.522
18528,38.0,Eric Bledsoe,25.0,PHO,PG,81.0,81.0,34.6,5.8,12.9,0.447,4.4,5.4,0.8,0.9,4.3,5.2,6.1,1.6,0.6,2.3,17.0,2014-15,2015,3.4,1.1,3.4,0.324,4.7,9.6,0.491,0.489
19164,22.0,Eric Bledsoe,26.0,PHO,PG,31.0,31.0,34.2,7.2,15.9,0.453,4.5,5.5,0.802,0.6,3.5,4.0,6.1,2.0,0.6,2.4,20.4,2015-16,2016,3.5,1.5,4.2,0.372,5.6,11.7,0.482,0.502
19751,30.0,Eric Bledsoe,27.0,PHO,PG,66.0,66.0,33.0,6.8,15.7,0.434,5.9,6.9,0.847,0.8,4.1,4.8,6.3,1.4,0.5,2.5,21.1,2016-17,2017,3.4,1.6,4.7,0.335,5.2,11.0,0.477,0.485
20362,45.0,Eric Bledsoe,28.0,2TM,PG,74.0,74.0,31.4,6.3,13.4,0.473,3.4,4.3,0.795,0.7,3.1,3.8,5.0,2.0,0.6,2.5,17.7,2017-18,2018,2.9,1.7,4.9,0.347,4.6,8.5,0.545,0.536
21059,77.0,Eric Bledsoe,29.0,MIL,PG,78.0,78.0,29.1,6.0,12.4,0.484,2.3,3.0,0.75,1.1,3.6,4.6,5.5,1.5,0.4,2.0,15.9,2018-19,2019,2.1,1.6,4.8,0.329,4.4,7.6,0.582,0.548
21792,101.0,Eric Bledsoe,30.0,MIL,PG,61.0,61.0,27.0,5.5,11.5,0.475,2.8,3.5,0.79,0.7,3.9,4.6,5.4,0.9,0.4,2.1,14.9,2019-20,2020,2.4,1.2,3.5,0.344,4.3,8.0,0.533,0.528


# remaining outliers
From visual inspection, we see that these players are individuals with common names and may be distinct individuals. We will retain these entries, and treat them as separate datapoints.

In [7]:

# Let's test if this was successful or if there are still outliers

print(f"Original dataset: {len(nba_data)} rows")
print(f"Cleaned dataset: {len(nba_data_clean)} rows")
print(f"Removed {len(nba_data) - len(nba_data_clean)} individual team entries")

player_season_counts = nba_data_clean.groupby(['Player', 'Season']).size()
if (player_season_counts > 1).any():
    print("\nWarning: Some player-seasons still have multiple entries")
    print(f"Number of player-seasons with multiple entries: {sum(player_season_counts > 1)}")

    # Find the specific player-seasons with multiple entries
    problematic_pairs = player_season_counts[player_season_counts > 1].reset_index()
    print("\nPlayer-seasons with multiple entries:")
    display(problematic_pairs)
    
    # Show the actual data for these problematic pairs
    print("\nDetailed data for these problematic entries:")
    for _, row in problematic_pairs.iterrows():
        player = row['Player']
        season = row['Season']
        entry_count = row[0]  # The count is in the first unnamed column
        
        print(f"\n{player} in {season} has {entry_count} entries:")
        display(nba_data_clean[(nba_data_clean['Player'] == player) & 
                             (nba_data_clean['Season'] == season)])
else:
    print("\nSuccess: Each player has exactly one entry per season")



Original dataset: 25900 rows
Cleaned dataset: 21036 rows
Removed 4864 individual team entries

Number of player-seasons with multiple entries: 21

Player-seasons with multiple entries:


Unnamed: 0,Player,Season,0
0,Charles Jones,1985-86,2
1,Charles Jones,1987-88,2
2,Charles Jones,1988-89,2
3,Charles Smith,1989-90,2
4,Charles Smith,1990-91,2
...,...,...,...
16,George Johnson,1984-85,2
17,George Johnson,1985-86,2
18,Marcus Williams,2008-09,2
19,Michael Smith,1994-95,2



Detailed data for these problematic entries:

Charles Jones in 1985-86 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
3811,264.0,Charles Jones,24.0,PHO,PF,43.0,18.0,17.3,1.7,3.8,0.457,1.2,2.3,0.51,1.5,3.0,4.5,1.2,0.7,0.6,2.0,4.7,1985-86,1986,1.3,0.0,0.0,0.0,1.7,3.8,0.46,0.457
3840,293.0,Charles Jones,28.0,WSB,C,81.0,58.0,19.9,1.6,3.1,0.508,0.7,1.1,0.628,1.5,2.5,4.0,0.9,0.7,1.6,2.9,3.9,1985-86,1986,0.9,0.0,0.0,0.0,1.6,3.1,0.51,0.508



Charles Jones in 1987-88 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
4665,359.0,Charles Jones,30.0,WSB,PF,69.0,49.0,19.0,1.0,2.6,0.407,0.8,1.1,0.707,1.5,3.2,4.7,0.9,0.8,1.6,3.3,2.9,1987-88,1988,0.8,0.0,0.0,0.0,1.0,2.6,0.409,0.407
4716,410.0,Charles Jones,26.0,POR,PF,37.0,0.0,5.0,0.4,1.1,0.4,0.5,0.9,0.576,0.3,0.5,0.8,0.2,0.1,0.2,0.8,1.4,1987-88,1988,0.3,0.0,0.0,0.0,0.4,1.1,0.41,0.4



Charles Jones in 1988-89 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
5092,359.0,Charles Jones,31.0,WSB,C,53.0,45.0,21.8,1.1,2.4,0.48,0.3,0.5,0.64,1.5,3.4,4.8,0.8,0.7,1.4,3.5,2.6,1988-89,1989,0.7,0.0,0.0,0.0,1.1,2.3,0.484,0.48
5093,360.0,Charles Jones,27.0,WSB,PF,43.0,0.0,12.0,0.9,1.9,0.463,0.8,1.2,0.623,1.3,2.0,3.3,0.4,0.4,0.4,1.1,2.6,1988-89,1989,0.5,0.0,0.1,0.333,0.9,1.8,0.468,0.47



Charles Smith in 1989-90 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
5199,27.0,Charles Smith,24.0,LAC,PF,78.0,76.0,35.0,7.6,14.7,0.52,5.8,7.3,0.794,2.3,4.4,6.7,1.5,1.1,1.5,3.8,21.1,1989-90,1990,2.1,0.0,0.2,0.083,7.6,14.5,0.524,0.52
5537,365.0,Charles Smith,22.0,BOS,PG,60.0,0.0,8.7,1.0,2.2,0.444,0.9,1.3,0.697,0.2,0.9,1.2,1.7,0.6,0.1,1.3,2.9,1989-90,1990,0.6,0.0,0.1,0.0,1.0,2.1,0.468,0.444



Charles Smith in 1990-91 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
5659,27.0,Charles Smith,25.0,LAC,SF,74.0,74.0,36.5,7.4,15.8,0.469,5.2,6.5,0.793,2.9,5.3,8.2,1.8,1.1,2.0,3.6,20.0,1990-91,1991,2.2,0.0,0.1,0.0,7.4,15.7,0.472,0.469
6040,408.0,Charles Smith,23.0,BOS,PG,5.0,0.0,6.0,0.6,1.4,0.429,0.6,1.0,0.6,0.0,0.4,0.4,1.2,0.2,0.0,1.4,1.8,1990-91,1991,0.6,0.0,0.0,,0.6,1.4,0.429,0.429



Chris Johnson in 2012-13 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
17706,402.0,Chris Johnson,27.0,MIN,C,30.0,0.0,9.5,1.6,2.5,0.64,0.7,1.1,0.618,0.7,1.3,2.0,0.3,0.2,0.9,1.6,3.9,2012-13,2013,0.4,0.0,0.0,,1.6,2.5,0.64,0.64
17719,415.0,Chris Johnson,22.0,MEM,SF,8.0,0.0,12.8,1.4,3.1,0.44,0.1,0.3,0.5,0.6,0.8,1.4,0.3,0.5,0.0,0.6,3.6,2012-13,2013,0.4,0.8,2.3,0.333,0.6,0.9,0.714,0.56



Eddie Johnson in 1981-82 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
2117,43.0,Eddie Johnson,26.0,ATL,SG,68.0,57.0,34.0,6.7,14.9,0.45,4.3,5.7,0.764,0.9,1.9,2.8,5.3,1.5,0.2,2.8,17.8,1981-82,1982,2.7,0.1,0.4,0.233,6.6,14.4,0.457,0.454
2242,168.0,Eddie Johnson,22.0,KCK,SF,74.0,27.0,20.5,4.0,8.7,0.459,1.3,2.0,0.664,1.7,2.6,4.4,1.5,0.7,0.2,2.8,9.3,1981-82,1982,1.3,0.0,0.1,0.091,4.0,8.5,0.465,0.46



Eddie Johnson in 1982-83 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
2479,31.0,Eddie Johnson,23.0,KCK,SF,82.0,82.0,35.8,8.3,16.7,0.494,3.0,3.9,0.779,2.3,3.8,6.1,2.6,0.9,0.2,3.2,19.8,1982-83,1983,2.2,0.2,0.9,0.282,8.0,15.8,0.506,0.501
2503,55.0,Eddie Johnson,27.0,ATL,SG,61.0,57.0,29.7,6.4,14.1,0.453,3.0,3.9,0.785,0.4,1.6,2.0,5.2,1.0,0.1,2.3,16.0,1982-83,1983,2.6,0.2,0.7,0.341,6.1,13.4,0.459,0.462



Eddie Johnson in 1983-84 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
2856,16.0,Eddie Johnson,24.0,KCK,SF,82.0,82.0,35.6,9.2,18.9,0.485,3.3,4.0,0.81,2.0,3.5,5.5,3.6,0.9,0.3,3.2,21.9,1983-84,1984,2.6,0.2,0.8,0.313,8.9,18.1,0.493,0.492
2915,75.0,Eddie Johnson,28.0,ATL,SG,67.0,43.0,28.3,5.3,11.9,0.442,2.4,3.2,0.77,0.5,1.7,2.2,5.6,0.9,0.1,2.3,13.2,1983-84,1984,2.6,0.2,0.6,0.372,5.0,11.3,0.446,0.452



Eddie Johnson in 1984-85 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
3196,12.0,Eddie Johnson,25.0,KCK,SF,82.0,81.0,36.9,9.4,19.1,0.491,4.0,4.5,0.871,1.8,3.1,5.0,3.3,1.0,0.3,2.9,22.9,1984-85,1985,2.7,0.2,0.7,0.241,9.2,18.4,0.5,0.496
3240,56.0,Eddie Johnson,29.0,ATL,SG,73.0,66.0,32.4,6.2,13.0,0.479,3.6,4.5,0.798,0.5,2.1,2.6,7.8,0.6,0.1,2.5,16.3,1984-85,1985,3.3,0.3,1.0,0.306,5.9,12.0,0.493,0.49



Eddie Johnson in 1986-87 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
3964,37.0,Eddie Johnson,27.0,SAC,SF,81.0,30.0,30.3,7.5,16.2,0.463,3.3,4.0,0.829,1.8,2.6,4.4,3.1,0.5,0.2,2.7,18.7,1986-87,1987,2.0,0.5,1.5,0.314,7.0,14.7,0.478,0.477
4079,152.0,Eddie Johnson,31.0,SEA,SG,24.0,0.0,21.2,3.5,7.8,0.457,1.8,2.3,0.764,0.5,1.5,1.9,4.8,0.5,0.0,1.5,9.0,1986-87,1987,1.7,0.2,0.6,0.333,3.3,7.1,0.468,0.47



George Johnson in 1978-79 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
1210,203.0,George Johnson,30.0,NJN,C,78.0,,26.4,2.6,6.2,0.427,1.3,1.8,0.761,2.6,5.3,7.9,1.1,0.9,3.2,4.0,6.6,1978-79,1979,2.3,,,,,,,
1224,217.0,George Johnson,22.0,MIL,PF,67.0,,17.3,2.5,5.1,0.482,1.3,1.7,0.718,1.6,3.8,5.4,1.2,1.1,0.7,2.8,6.2,1978-79,1979,1.5,,,,,,,



George Johnson in 1979-80 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
1500,148.0,George Johnson,23.0,DEN,PF,75.0,,25.8,4.1,8.7,0.476,2.0,2.5,0.783,2.5,5.3,7.8,2.1,1.1,0.9,3.5,10.2,1979-80,1980,2.0,0.0,0.1,0.222,4.1,8.5,0.48,0.478
1564,212.0,George Johnson,31.0,NJN,C,81.0,,26.2,3.1,6.7,0.457,1.1,1.6,0.706,2.4,5.1,7.4,2.1,0.7,3.2,3.9,7.2,1979-80,1980,2.5,0.0,0.0,0.0,3.1,6.7,0.458,0.457



George Johnson in 1980-81 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
1831,121.0,George Johnson,24.0,IND,PF,43.0,,21.6,4.2,9.2,0.462,2.2,2.8,0.762,2.3,4.2,6.5,2.0,1.1,0.5,2.8,10.6,1980-81,1981,2.0,0.0,0.1,0.0,4.2,9.0,0.468,0.462
1955,245.0,George Johnson,32.0,SAS,C,82.0,82.0,23.6,2.0,4.2,0.473,1.0,1.3,0.734,2.6,4.7,7.3,1.1,0.6,3.4,3.3,5.0,1980-81,1981,1.3,0.0,0.0,,2.0,4.2,0.473,0.473



George Johnson in 1981-82 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
2326,252.0,George Johnson,25.0,IND,PF,59.0,4.0,12.2,2.0,4.9,0.412,1.0,1.4,0.75,1.2,2.5,3.7,0.7,0.6,0.4,2.5,5.1,1981-82,1982,1.2,0.0,0.0,0.0,2.0,4.9,0.415,0.412
2403,329.0,George Johnson,33.0,SAS,C,75.0,62.0,21.0,1.2,2.6,0.467,0.6,0.9,0.672,2.0,4.0,6.1,1.1,0.3,3.1,3.5,3.0,1981-82,1982,1.2,0.0,0.0,,1.2,2.6,0.467,0.467



George Johnson in 1982-83 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
2559,111.0,George Johnson,26.0,IND,SF,82.0,64.0,28.0,5.0,10.5,0.477,1.5,2.1,0.733,2.1,4.5,6.6,2.7,0.9,0.6,3.4,11.6,1982-83,1983,3.0,0.1,0.5,0.184,4.9,10.0,0.49,0.481
2824,376.0,George Johnson,34.0,ATL,C,37.0,0.0,12.5,0.7,1.5,0.439,0.4,0.5,0.737,1.2,2.0,3.2,0.5,0.3,1.6,1.9,1.7,1982-83,1983,0.5,0.0,0.0,,0.7,1.5,0.439,0.439



George Johnson in 1984-85 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
3429,245.0,George Johnson,28.0,PHI,SF,55.0,3.0,13.7,1.9,4.8,0.407,0.9,1.0,0.875,0.9,2.1,3.0,0.7,0.6,0.3,1.8,4.8,1984-85,1985,0.9,0.0,0.2,0.1,1.9,4.6,0.419,0.409
3518,334.0,George Johnson,36.0,NJN,C,65.0,0.0,12.3,0.6,1.2,0.532,0.3,0.4,0.815,1.1,1.7,2.8,0.3,0.3,1.2,2.3,1.6,1984-85,1985,0.6,0.0,0.0,1.0,0.6,1.2,0.526,0.538



George Johnson in 1985-86 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
3897,350.0,George Johnson,29.0,WSB,PF,2.0,0.0,3.5,0.5,1.5,0.333,1.0,1.0,1.0,0.5,0.5,1.0,0.0,0.0,0.0,0.5,2.0,1985-86,1986,0.5,0.0,0.0,,0.5,1.5,0.333,0.333
3916,369.0,George Johnson,37.0,SEA,C,41.0,0.0,6.4,0.3,0.6,0.522,0.3,0.4,0.688,0.6,0.8,1.5,0.3,0.1,0.9,1.1,0.9,1985-86,1986,0.3,0.0,0.0,,0.3,0.6,0.522,0.522



Marcus Williams in 2008-09 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
15498,534.0,Marcus Williams,22.0,SAS,PF,2.0,0.0,1.5,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.5,2.0,2008-09,2009,0.0,0.0,0.0,,1.0,1.0,1.0,1.0
15523,559.0,Marcus Williams,23.0,GSW,PG,9.0,0.0,6.0,0.4,1.9,0.235,0.2,0.7,0.333,0.0,0.4,0.4,1.4,0.1,0.1,0.7,1.3,2008-09,2009,0.4,0.2,0.7,0.333,0.2,1.2,0.182,0.294



Michael Smith in 1994-95 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
7687,222.0,Michael Smith,22.0,SAC,PF,82.0,0.0,21.2,2.7,5.0,0.542,1.5,3.2,0.485,2.1,3.8,5.9,0.8,0.7,0.6,2.9,6.9,1994-95,1995,1.3,0.0,0.0,0.0,2.7,4.9,0.545,0.542
7729,264.0,Michael Smith,29.0,LAC,PF,29.0,0.0,11.0,2.2,4.6,0.47,0.9,1.0,0.867,0.4,1.5,1.9,0.7,0.2,0.1,1.4,5.3,1994-95,1995,0.6,0.0,0.3,0.125,2.1,4.3,0.492,0.474



Tony Mitchell in 2013-14 has 2 entries:


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%
18422,544.0,Tony Mitchell,24.0,MIL,SF,3.0,0.0,3.3,1.0,1.7,0.6,0.0,0.0,,0.3,0.0,0.3,0.3,0.3,0.0,0.0,2.0,2013-14,2014,0.0,0.0,0.3,0.0,1.0,1.3,0.75,0.6
18462,584.0,Tony Mitchell,21.0,DET,PF,21.0,0.0,3.8,0.2,0.6,0.417,0.5,0.9,0.579,0.7,0.5,1.2,0.1,0.3,0.1,0.4,1.0,2013-14,2014,0.2,0.0,0.0,1.0,0.2,0.5,0.364,0.458


# adding positional data
Some positions are closely related - let's add in some groupings to make the data a bit richer

In [8]:

position_mapping = {
    'PG': 'Guard',
    'SG': 'Guard',
    'G': 'Guard',
    'SF': 'Forward',
    'PF': 'Forward',
    'F': 'Forward',
    'C': 'Center',
    # Handle combined positions
    'PG-SG': 'Guard',
    'SG-PG': 'Guard',
    'SF-SG': 'Forward',
    'SG-SF': 'Forward',
    'PF-SF': 'Forward',
    'SF-PF': 'Forward',
    'PF-C': 'Forward',
    'C-PF': 'Center',
    'F-C': 'Forward',
    'C-F': 'Center'
}

# Map positions to standard groups
nba_data_clean['Position_Group'] = nba_data_clean['Pos'].map(position_mapping)
        
# Handle any missing mappings
if nba_data_clean['Position_Group'].isna().any():
    missing_positions = nba_data_clean[nba_data_clean['Position_Group'].isna()][pos_column].unique()
    print(f"Warning: Some positions couldn't be mapped: {missing_positions}")
    # Set unknown positions to their original value
    nba_data_clean.loc[nba_data_clean['Position_Group'].isna(), 'Position_Group'] = nba_data_clean[pos_column]

# Building the Interactions


This visualization tracks NBA scoring progression throughout player careers, with separate trend lines for guards (blue), forwards (orange), and centers (green). It reveals that guards consistently score more points than forwards and centers across all age ranges, with peak scoring occurring around age 28 for guards, with forwards following a similar trajectory while centers reach their scoring peak slightly earlier. The confidence bands and point sizes provide statistical context, showing both the uncertainty in the averages (wider bands indicate greater uncertainty) and the number of players contributing to each data point (larger circles represent more players), with the data becoming less reliable at the extremes of the age spectrum where sample sizes are smaller.

In [13]:
# Scoring Progression by Age and Position with Era Filter (alternative approach)

import altair as alt
import pandas as pd
import numpy as np

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# Create position grouping if not already done
if 'Position_Group' not in nba_data_clean.columns:
    position_mapping = {
        'PG': 'Guard',
        'SG': 'Guard',
        'G': 'Guard',
        'SF': 'Forward',
        'PF': 'Forward',
        'F': 'Forward',
        'C': 'Center',
        # Handle combined positions
        'PG-SG': 'Guard',
        'SG-PG': 'Guard',
        'SF-SG': 'Forward',
        'SG-SF': 'Forward',
        'PF-SF': 'Forward',
        'SF-PF': 'Forward',
        'PF-C': 'Forward',
        'C-PF': 'Center',
        'F-C': 'Forward',
        'C-F': 'Center'
    }
    nba_data_clean['Position_Group'] = nba_data_clean['Pos'].map(position_mapping)

# Create Era column based on Season_Year
# Define eras with an additional pre-2000 era
def assign_era(year):
    if year < 1981:
        return "Pre-1980"    
    if year < 1986:
        return "1981-1985"
    if year < 1991:
        return "1986-1990"
    if year < 1996:
        return "1991-1995"    
    elif year < 2001:
        return "1996-2000"
    elif year < 2006:
        return "2001-2005"
    elif year < 2011:
        return "2006-2010"
    elif year < 2016:
        return "2011-2015"       
    elif year < 2021:
        return "2016-2020"
    else:
        return "2021-present"

nba_data_clean['Era'] = nba_data_clean['Season_Year'].apply(assign_era)

era_order = ["All Eras", 
             "Pre-1980", "1981-1985", "1986-1990", 
             "1991-1995", "1996-2000",
             "2001-2005", "2006-2010",
             "2011-2015", "2016-2020", 
             "2021-present"]


# Print the era distribution to check
era_counts = nba_data_clean['Era'].value_counts().sort_index()
print("Era distribution:")
for era, count in era_counts.items():
    print(f"{era}: {count} entries")



data_for_viz = nba_data_clean.dropna(subset=['Age', 'Position_Group'])
# Filter for a reasonable age range to avoid outliers
data_for_viz = data_for_viz[(data_for_viz['Age'] >= 19) & (data_for_viz['Age'] <= 40)]
# Filter for minimum playing time to ensure meaningful stats
data_for_viz = data_for_viz[data_for_viz['MP'] >= 15]


nba_data_clean.head()


Era distribution:
1981-1985: 1565 entries
1986-1990: 1725 entries
1991-1995: 1973 entries
1996-2000: 2187 entries
2001-2005: 2215 entries
2006-2010: 2253 entries
2011-2015: 2373 entries
2016-2020: 2561 entries
2021-present: 2799 entries
Pre-1980: 1385 entries


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,PF,PTS,Season,Season_Year,TOV,3P,3PA,3P%,2P,2PA,2P%,eFG%,Position_Group,Era
0,1.0,Bob McAdoo,24.0,BUF,C,78.0,,42.7,12.0,24.6,0.487,7.2,9.4,0.762,3.1,9.3,12.4,4.0,1.2,2.1,3.8,31.1,1975-76,1976,,,,,,,,,Center,Pre-1980
1,2.0,Kareem Abdul-Jabbar,28.0,LAL,C,82.0,82.0,41.2,11.1,21.1,0.529,5.5,7.8,0.703,3.3,13.5,16.9,5.0,1.5,4.1,3.6,27.7,1975-76,1976,,,,,,,,,Center,Pre-1980
2,3.0,Pete Maravich,28.0,NOJ,SG,62.0,,38.3,9.7,21.2,0.459,6.4,7.9,0.811,0.7,4.1,4.8,5.4,1.4,0.4,3.2,25.9,1975-76,1976,,,,,,,,,Guard,Pre-1980
3,4.0,Tiny Archibald,27.0,KCK,PG,78.0,,40.8,9.2,20.3,0.453,6.4,8.0,0.802,0.9,1.9,2.7,7.9,1.6,0.2,2.2,24.8,1975-76,1976,,,,,,,,,Guard,Pre-1980
4,5.0,Fred Brown,27.0,SEA,SG,76.0,,33.1,9.8,20.0,0.488,3.6,4.1,0.869,1.5,2.7,4.2,2.7,1.9,0.2,2.4,23.1,1975-76,1976,,,,,,,,,Guard,Pre-1980


In [14]:

import altair as alt
import pandas as pd
import numpy as np

# Set the renderer to a more stable option
alt.renderers.enable('default')


RendererRegistry.enable('default')

### Initial version of interactive Altair chart


In [15]:

# Calculate average points per game by age and position
scoring_by_age_pos = data_for_viz.groupby(['Age', 'Position_Group'])['PTS'].agg(
    ['mean', 'count', 'std']
).reset_index()

scoring_by_age_pos.columns = ['Age', 'Position_Group', 'Avg_Points', 'Player_Count', 'Std_Dev']

# Create confidence intervals
scoring_by_age_pos['CI_Lower'] = scoring_by_age_pos['Avg_Points'] - (
    1.96 * scoring_by_age_pos['Std_Dev'] / np.sqrt(scoring_by_age_pos['Player_Count'])
)
scoring_by_age_pos['CI_Upper'] = scoring_by_age_pos['Avg_Points'] + (
    1.96 * scoring_by_age_pos['Std_Dev'] / np.sqrt(scoring_by_age_pos['Player_Count'])
)

# Create the main line chart with Altair
line_chart = alt.Chart(scoring_by_age_pos).mark_line().encode(
    x=alt.X('Age:Q', title='Age'),
    y=alt.Y('Avg_Points:Q', title='Average Points Per Game'),
    color=alt.Color('Position_Group:N', title='Position', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                   range=['#1f77b4', '#ff7f0e', '#2ca02c'])),
    tooltip=['Age', 'Position_Group', 'Avg_Points', 'Player_Count']
)

# Add confidence interval bands
ci_bands = alt.Chart(scoring_by_age_pos).mark_area(opacity=0.2).encode(
    x='Age:Q',
    y='CI_Lower:Q',
    y2='CI_Upper:Q',
    color='Position_Group:N'
)

# Add points to show sample size
points = alt.Chart(scoring_by_age_pos).mark_circle().encode(
    x='Age:Q',
    y='Avg_Points:Q',
    color='Position_Group:N',
    size=alt.Size('Player_Count:Q', title='Number of Players',
                 scale=alt.Scale(range=[10, 200])),
    tooltip=['Age', 'Position_Group', 'Avg_Points', 'Player_Count']
)

# Combine the visualizations
chart = (line_chart + ci_bands + points).properties(
    title='NBA Scoring Progression by Age and Position',
    width=800,
    height=500
).interactive()

# Add annotations for peak scoring ages
peak_ages = scoring_by_age_pos.loc[scoring_by_age_pos.groupby('Position_Group')['Avg_Points'].idxmax()]

annotations = alt.Chart(peak_ages).mark_text(
    align='left',
    baseline='middle',
    dx=15,
    fontSize=12,
    fontWeight='bold'
).encode(
    x='Age:Q',
    y='Avg_Points:Q',
    text=alt.Text('Age:Q', format='.0f', title='Peak Age'),
    color='Position_Group:N'
)

# Combine everything
final_chart = (chart + annotations).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the chart
final_chart

In [16]:
# Interactive Scoring Progression Chart with "All" Option for Eras

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# Calculate average points per game by age, position, and era
scoring_by_age_pos_era = data_for_viz.groupby(['Age', 'Position_Group', 'Era'])['PTS'].agg(
    ['mean', 'count', 'std']
).reset_index()

# Rename columns for clarity
scoring_by_age_pos_era.columns = ['Age', 'Position_Group', 'Era', 'Avg_Points', 'Player_Count', 'Std_Dev']

# Create confidence intervals
scoring_by_age_pos_era['CI_Lower'] = scoring_by_age_pos_era['Avg_Points'] - (
    1.96 * scoring_by_age_pos_era['Std_Dev'] / np.sqrt(scoring_by_age_pos_era['Player_Count'])
)
scoring_by_age_pos_era['CI_Upper'] = scoring_by_age_pos_era['Avg_Points'] + (
    1.96 * scoring_by_age_pos_era['Std_Dev'] / np.sqrt(scoring_by_age_pos_era['Player_Count'])
)

# Also calculate aggregate data across all eras
scoring_by_age_pos_all = data_for_viz.groupby(['Age', 'Position_Group'])['PTS'].agg(
    ['mean', 'count', 'std']
).reset_index()
scoring_by_age_pos_all.columns = ['Age', 'Position_Group', 'Avg_Points', 'Player_Count', 'Std_Dev']
scoring_by_age_pos_all['Era'] = 'All Eras'  # Add Era column with 'All Eras' value
scoring_by_age_pos_all['CI_Lower'] = scoring_by_age_pos_all['Avg_Points'] - (
    1.96 * scoring_by_age_pos_all['Std_Dev'] / np.sqrt(scoring_by_age_pos_all['Player_Count'])
)
scoring_by_age_pos_all['CI_Upper'] = scoring_by_age_pos_all['Avg_Points'] + (
    1.96 * scoring_by_age_pos_all['Std_Dev'] / np.sqrt(scoring_by_age_pos_all['Player_Count'])
)

# Combine the era-specific data with the all-eras data
scoring_combined = pd.concat([scoring_by_age_pos_era, scoring_by_age_pos_all], ignore_index=True)

# Pre-calculate peak ages for each position and era (including 'All Eras')
peak_ages_list = []
for era in scoring_combined['Era'].unique():
    for position in scoring_combined['Position_Group'].unique():
        era_pos_data = scoring_combined[(scoring_combined['Era'] == era) & 
                                     (scoring_combined['Position_Group'] == position)]
        if not era_pos_data.empty:
            max_idx = era_pos_data['Avg_Points'].idxmax()
            peak_row = era_pos_data.loc[max_idx].copy()
            peak_row['Label'] = f"{int(peak_row['Age'])}"
            peak_ages_list.append(peak_row)

peak_ages_df = pd.DataFrame(peak_ages_list)

# Define explicit ordering for eras, with 'All Eras' as the first option
#era_order = ["All Eras", 
#             "Pre-1980", "1981-1985" "1986-1990", 
#             "1991-1995", "1996-2000",
#             "2001-2005", "2006-2010",
#             "2011-2015", "2016-2020", 
#             "2021-present"]

# Verify all eras from the data are in our ordering
all_eras = set(scoring_combined['Era'].unique())
if not all(era in era_order for era in all_eras):
    missing_eras = [era for era in all_eras if era not in era_order]
    print(f"Warning: Some eras in the data aren't in our manual ordering: {missing_eras}")
    # Add any missing eras to the end of the list
    era_order.extend([era for era in all_eras if era not in era_order])

# Create parameters for era and position selection
era_param = alt.param(name='Era', value='All Eras', 
                      bind=alt.binding_radio(options=era_order, name='Era:'))

# Create a radio button selection for positions
position_options = ['All', 'Guard', 'Forward', 'Center']
position_param = alt.param(name='Position', value='All',
                          bind=alt.binding_radio(options=position_options, name='Position:'))

# Create the main line chart
line_chart = alt.Chart(scoring_combined).mark_line().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('Avg_Points:Q', title='Average Points Per Game', 
           scale=alt.Scale(domain=[5, 25])),  # Fixed Y-scale
    color=alt.Color('Position_Group:N', title='Position', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                   range=['#1f77b4', '#ff7f0e', '#2ca02c'])),
    tooltip=['Age', 'Position_Group', 'Era', 'Avg_Points', 'Player_Count']
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add confidence interval bands (clip=True to prevent extreme values from affecting the chart)
ci_bands = alt.Chart(scoring_combined).mark_area(opacity=0.2, clip=True).encode(
    x='Age:Q',
    y=alt.Y('CI_Lower:Q', scale=alt.Scale(domain=[5, 25])),  # Same fixed Y-scale
    y2=alt.Y2('CI_Upper:Q'),
    color='Position_Group:N'
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add points to show sample size
points = alt.Chart(scoring_combined).mark_circle().encode(
    x='Age:Q',
    y=alt.Y('Avg_Points:Q', scale=alt.Scale(domain=[5, 25])),  # Same fixed Y-scale
    color='Position_Group:N',
    size=alt.Size('Player_Count:Q', title='Number of Players',
                 scale=alt.Scale(range=[10, 200])),
    tooltip=['Age', 'Position_Group', 'Era', 'Avg_Points', 'Player_Count']
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add peak age labels
peak_labels = alt.Chart(peak_ages_df).mark_text(
    align='left',
    baseline='middle',
    dx=15,
    fontSize=12,
    fontWeight='bold'
).encode(
    x='Age:Q',
    y=alt.Y('Avg_Points:Q', scale=alt.Scale(domain=[5, 25])),  # Same fixed Y-scale
    text='Label:N',
    color='Position_Group:N'
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create a supplementary chart showing player count
player_count_chart = alt.Chart(scoring_combined).mark_bar().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('sum(Player_Count):Q', title='Number of Players'),
    color=alt.Color('Position_Group:N', title='Position')
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
).properties(
    title='Sample Size by Age and Position',
    width=800,
    height=200
)

# Combine the visualizations
chart = (line_chart + ci_bands + points + peak_labels).properties(
    title='NBA Scoring Progression by Age and Position',
    width=800,
    height=500
).add_params(
    era_param,
    position_param
).interactive()

# Combine main chart with sample size chart
final_chart = alt.vconcat(
    chart,
    player_count_chart
).resolve_scale(
    color='shared'
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the chart
final_chart

In [17]:
# 3-Point Shooting Analysis by Age, Position, and Era

import altair as alt
import pandas as pd
import numpy as np

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# Filter for players with at least 1 three-point attempt per game
three_pt_data = data_for_viz[data_for_viz['3PA'] >= 1.0].copy()

# Add an "All Eras" copy of the data for aggregation
all_eras_data = three_pt_data.copy()
all_eras_data['Era'] = 'All Eras'

# Combine the original data with the 'All Eras' data
combined_data = pd.concat([three_pt_data, all_eras_data], ignore_index=True)

# Group by age, position, and era
grouped_data = combined_data.groupby(['Age', 'Position_Group', 'Era']).agg({
    '3P': 'mean',
    '3PA': 'mean',
    '3P%': ['mean', 'std', 'count']
}).reset_index()

# Flatten the multi-level columns
grouped_data.columns = [
    f"{col[0]}_{col[1]}" if col[1] != '' else col[0] 
    for col in grouped_data.columns
]

# Calculate confidence intervals
grouped_data['CI_Lower'] = grouped_data['3P%_mean'] - (
    1.96 * grouped_data['3P%_std'] / np.sqrt(grouped_data['3P%_count'])
)
grouped_data['CI_Upper'] = grouped_data['3P%_mean'] + (
    1.96 * grouped_data['3P%_std'] / np.sqrt(grouped_data['3P%_count'])
)

# Ensure percentage values are within reasonable ranges
grouped_data['CI_Lower'] = grouped_data['CI_Lower'].clip(0, 1)
grouped_data['CI_Upper'] = grouped_data['CI_Upper'].clip(0, 1)

# Define era order
#era_order = ["All Eras", "Pre-2000", "2000-2005", "2005-2010", "2011-2015", "2016-2020", "2021-present"]

# Create parameters for era and position selection
era_param = alt.param(name='Era', value='All Eras', 
                     bind=alt.binding_radio(options=era_order, name='Era:'))

position_options = ['All', 'Guard', 'Forward', 'Center']
position_param = alt.param(name='Position', value='All',
                          bind=alt.binding_radio(options=position_options, name='Position:'))

# Create the main line chart showing 3-point percentage
line_chart = alt.Chart(grouped_data).mark_line().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('3P%_mean:Q', title='3-Point Percentage', 
           scale=alt.Scale(domain=[0.25, 0.45])),
    color=alt.Color('Position_Group:N', title='Position', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                   range=['#1f77b4', '#ff7f0e', '#2ca02c'])),
    tooltip=['Age', 'Position_Group', 'Era', '3P%_mean', '3PA_mean', '3P%_count']
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add confidence interval bands
ci_bands = alt.Chart(grouped_data).mark_area(opacity=0.2).encode(
    x='Age:Q',
    y='CI_Lower:Q',
    y2='CI_Upper:Q',
    color='Position_Group:N'
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add points sized by 3-point attempts
points = alt.Chart(grouped_data).mark_circle().encode(
    x='Age:Q',
    y='3P%_mean:Q',
    color='Position_Group:N',
    size=alt.Size('3PA_mean:Q', title='3PA Per Game',
                 scale=alt.Scale(range=[10, 200])),
    tooltip=['Age', 'Position_Group', 'Era', '3P%_mean', '3PA_mean', '3P%_count']
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create chart showing 3-point attempt volume by age
volume_chart = alt.Chart(grouped_data).mark_line().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('3PA_mean:Q', title='3-Point Attempts Per Game',
           scale=alt.Scale(domain=[0, 8])),
    color='Position_Group:N',
    strokeDash=alt.value([5, 5])  # Dashed lines for attempt volume
).transform_filter(
    alt.datum.Era == era_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
).properties(
    title='3-Point Attempt Volume by Age and Position',
    width=800,
    height=200
)

# Combine the visualizations
main_chart = (line_chart + ci_bands + points).properties(
    title='NBA 3-Point Shooting Percentage by Age and Position',
    width=800,
    height=500
).add_params(
    era_param,
    position_param
).interactive()

# Combine main chart with volume chart
final_chart = alt.vconcat(
    main_chart,
    volume_chart
).resolve_scale(
    color='shared'
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the chart
final_chart

In [18]:
# 3-Point Shooting Analysis with Year Slider (±3 year window)

import altair as alt
import pandas as pd
import numpy as np

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# Filter for players with at least 1 three-point attempt per game
three_pt_data = data_for_viz[data_for_viz['3PA'] >= 1.0].copy()

# Instead of era, we'll use the specific year (Season_Year)
# First, determine the range of years available in the data
min_year = int(three_pt_data['Season_Year'].min())
max_year = int(three_pt_data['Season_Year'].max())

# Create a function to filter data for a specific year range (±3 years)
def filter_year_range(df, center_year, window=3):
    return df[(df['Season_Year'] >= center_year - window) & 
              (df['Season_Year'] <= center_year + window)]

# Group data by age, position for each possible center year
# We'll pre-calculate statistics for all possible year ranges
year_ranges = []

for center_year in range(min_year, max_year + 1):
    # Filter data for this year range
    year_data = filter_year_range(three_pt_data, center_year)
    
    # If we have sufficient data for this year range
    if len(year_data) > 0:
        # Group by age and position
        grouped = year_data.groupby(['Age', 'Position_Group']).agg({
            '3P': 'mean',
            '3PA': 'mean',
            '3P%': ['mean', 'std', 'count']
        }).reset_index()
        
        # Flatten the multi-level columns
        grouped.columns = [
            f"{col[0]}_{col[1]}" if col[1] != '' else col[0] 
            for col in grouped.columns
        ]
        
        # Calculate confidence intervals
        grouped['CI_Lower'] = grouped['3P%_mean'] - (
            1.96 * grouped['3P%_std'] / np.sqrt(grouped['3P%_count'])
        )
        grouped['CI_Upper'] = grouped['3P%_mean'] + (
            1.96 * grouped['3P%_std'] / np.sqrt(grouped['3P%_count'])
        )
        
        # Ensure percentage values are within reasonable ranges
        grouped['CI_Lower'] = grouped['CI_Lower'].clip(0, 1)
        grouped['CI_Upper'] = grouped['CI_Upper'].clip(0, 1)
        
        # Add the center year
        grouped['Center_Year'] = center_year
        
        # Add year range description
        grouped['Year_Range'] = f"{center_year-3} to {center_year+3}"
        
        # Append to our list
        year_ranges.append(grouped)

# Combine all the year ranges
all_year_data = pd.concat(year_ranges, ignore_index=True)

# Create a parameter for the center year selection
year_param = alt.param(
    name='Year',
    value=2016,  # Default to 2016 (or any reasonable default)
    bind=alt.binding_range(
        min=min_year + 3,  # Add 3 to ensure we have ±3 years of data
        max=max_year - 3,  # Subtract 3 to ensure we have ±3 years of data
        step=1,
        name="Center Year:"
    )
)

# Create position selection radio buttons
position_options = ['All', 'Guard', 'Forward', 'Center']
position_param = alt.param(
    name='Position', 
    value='All',
    bind=alt.binding_radio(options=position_options, name='Position:')
)

# Create the main line chart showing 3-point percentage
line_chart = alt.Chart(all_year_data).mark_line().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('3P%_mean:Q', title='3-Point Percentage', 
           scale=alt.Scale(domain=[0.25, 0.45])),
    color=alt.Color('Position_Group:N', title='Position', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                   range=['#1f77b4', '#ff7f0e', '#2ca02c'])),
    tooltip=['Age', 'Position_Group', 'Year_Range', '3P%_mean', '3PA_mean', '3P%_count']
).transform_filter(
    alt.datum.Center_Year == year_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add confidence interval bands
ci_bands = alt.Chart(all_year_data).mark_area(opacity=0.2).encode(
    x='Age:Q',
    y='CI_Lower:Q',
    y2='CI_Upper:Q',
    color='Position_Group:N'
).transform_filter(
    alt.datum.Center_Year == year_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Add points sized by 3-point attempts
points = alt.Chart(all_year_data).mark_circle().encode(
    x='Age:Q',
    y='3P%_mean:Q',
    color='Position_Group:N',
    size=alt.Size('3PA_mean:Q', title='3PA Per Game',
                 scale=alt.Scale(range=[10, 200])),
    tooltip=['Age', 'Position_Group', 'Year_Range', '3P%_mean', '3PA_mean', '3P%_count']
).transform_filter(
    alt.datum.Center_Year == year_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create chart showing 3-point attempt volume by age
volume_chart = alt.Chart(all_year_data).mark_line().encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('3PA_mean:Q', title='3-Point Attempts Per Game',
           scale=alt.Scale(domain=[0, 8])),
    color='Position_Group:N',
    strokeDash=alt.value([5, 5])  # Dashed lines for attempt volume
).transform_filter(
    alt.datum.Center_Year == year_param
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
).properties(
    title='3-Point Attempt Volume by Age and Position',
    width=800,
    height=200
)

# Add year range text as a subtitle
year_text = alt.Chart(all_year_data).mark_text(
    align='center',
    baseline='top',
    fontSize=16,
    dy=30  # Move down from the top
).encode(
    text='Year_Range:N'
).transform_filter(
    alt.datum.Center_Year == year_param
).transform_filter(
    alt.datum.Age == all_year_data['Age'].min()  # Just need one row per year
).transform_filter(
    alt.datum.Position_Group == 'Guard'  # Just need one position
)

# Combine the visualizations
main_chart = (line_chart + ci_bands + points).properties(
    title='NBA 3-Point Shooting Percentage by Age and Position',
    width=800,
    height=500
).add_params(
    year_param,
    position_param
).interactive()

# Combine main chart with volume chart
final_chart = alt.vconcat(
    main_chart,
    volume_chart
).resolve_scale(
    color='shared'
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the chart
final_chart

In [19]:

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# First, calculate the average scoring progression by age and position
avg_by_age_pos = data_for_viz.groupby(['Age', 'Position_Group'])['PTS'].mean().reset_index()
avg_by_age_pos.rename(columns={'PTS': 'Avg_PTS'}, inplace=True)

# Now we need to identify players with sufficiently long careers to show trajectories
# We'll count seasons played for each player
player_seasons = data_for_viz.groupby('Player').size().reset_index(name='Seasons_Played')

# Filter for players with careers of at least 10 seasons
long_career_players = player_seasons[player_seasons['Seasons_Played'] >= 10]['Player'].tolist()

# If there are too many long-career players, we can limit to the top 20
if len(long_career_players) > 20:
    # Get career scoring averages to find high-scoring players
    player_career_pts = data_for_viz.groupby('Player')['PTS'].mean().reset_index()
    player_career_pts = player_career_pts[player_career_pts['Player'].isin(long_career_players)]
    player_career_pts = player_career_pts.sort_values('PTS', ascending=False)
    long_career_players = player_career_pts.head(20)['Player'].tolist()

# Print selected players for reference
print(f"Selected {len(long_career_players)} players for trajectory analysis")

# Get data for these players
player_data = data_for_viz[data_for_viz['Player'].isin(long_career_players)].copy()

# Create selectors for position and player
position_options = ['All'] + sorted(data_for_viz['Position_Group'].unique().tolist())
position_param = alt.param(name='Position', value='All',
                          bind=alt.binding_radio(options=position_options, name='Position:'))

player_options = ['None'] + sorted(long_career_players)
player_param = alt.param(name='HighlightPlayer', value='None',
                         bind=alt.binding_select(options=player_options, name='Highlight Player:'))

# Create the base chart with average lines
avg_line = alt.Chart(avg_by_age_pos).mark_line(
    stroke='black',
    strokeWidth=3
).encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('Avg_PTS:Q', title='Points Per Game', scale=alt.Scale(domain=[5, 30])),
    color=alt.Color('Position_Group:N', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                  range=['#1f77b4', '#ff7f0e', '#2ca02c']))
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create individual player trajectories
player_lines = alt.Chart(player_data).mark_line(
    opacity=0.3,
    strokeWidth=1
).encode(
    x='Age:Q',
    y='PTS:Q',
    color='Position_Group:N',
    detail='Player:N',  # Each player gets a separate line
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create highlighted player line
highlight_line = alt.Chart(player_data).mark_line(
    stroke='red',
    strokeWidth=3
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Add points for the highlighted player
highlight_points = alt.Chart(player_data).mark_circle(
    color='red',
    size=60
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine the visualizations
chart = (avg_line + player_lines + highlight_line + highlight_points).properties(
    title='NBA Scoring Progression: Individual Players vs. Position Averages',
    width=900,
    height=600
).add_params(
    position_param,
    player_param
).interactive()

# Create a supplementary chart showing selected player's career in more detail
career_chart = alt.Chart(player_data).mark_line(
    strokeWidth=2,
    stroke='blue'
).encode(
    x=alt.X('Season_Year:O', title='Season'),
    y=alt.Y('PTS:Q', title='Points Per Game'),
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
).properties(
    title='Selected Player Career Progression',
    width=900,
    height=300
)

# Add points to the career chart
career_points = alt.Chart(player_data).mark_circle(
    size=60
).encode(
    x='Season_Year:O',
    y='PTS:Q',
    color='Team:N',  # Color by team to show team changes
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine career charts
full_career_chart = (career_chart + career_points).properties(
    title='Selected Player Career Progression by Season'
)

# Final visualization
final_vis = alt.vconcat(
    chart,
    full_career_chart
).resolve_scale(
    color='independent'  # Independent color scales for each chart
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the visualization
final_vis

Selected 20 players for trajectory analysis


In [20]:
# Visualization of Individual Player Trajectories vs Average Scoring Progression
# With improved player selection and manual additions

import altair as alt
import pandas as pd
import numpy as np

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# First, calculate the average scoring progression by age and position
avg_by_age_pos = data_for_viz.groupby(['Age', 'Position_Group'])['PTS'].mean().reset_index()
avg_by_age_pos.rename(columns={'PTS': 'Avg_PTS'}, inplace=True)

# For player selection, we'll use multiple criteria to ensure we include significant players
# 1. First, get all player stats
player_career_stats = data_for_viz.groupby('Player').agg({
    'PTS': ['mean', 'sum', 'count'],  # Points per game, total points, and seasons played
    'Position_Group': 'first'  # Get player's position
}).reset_index()

# Flatten the multi-level columns
player_career_stats.columns = [
    f"{col[0]}_{col[1]}" if col[1] != '' else col[0] 
    for col in player_career_stats.columns
]

# Rename columns for clarity
player_career_stats = player_career_stats.rename(columns={
    'PTS_mean': 'Career_PPG',
    'PTS_sum': 'Total_Points',
    'PTS_count': 'Seasons_Played',
    'Position_Group_first': 'Position_Group'
})

# 2. Score players by a composite metric to balance career length and scoring prowess
# This helps include both high-scoring short careers and long careers with moderate scoring
player_career_stats['Composite_Score'] = (
    player_career_stats['Career_PPG'] * 
    np.sqrt(player_career_stats['Seasons_Played'])  # Square root gives diminishing returns to career length
)

# 3. Select top 25 players by composite score
top_players = player_career_stats.sort_values('Composite_Score', ascending=False).head(25)['Player'].tolist()

# 4. Now add specific players of interest who might not be in the top 25
# These could be historically significant players or players of specific interest
manual_additions = [
    # Manually add players who should be included but might not be in the dataset
    # or might not rank in the top 25 by our composite metric
    # For example: "Michael Jordan", "Tim Duncan", etc.
]

# Add these to our top players list if they're in the dataset
for player in manual_additions:
    if player in data_for_viz['Player'].values and player not in top_players:
        top_players.append(player)

# Print the selected players for reference
print(f"Selected {len(top_players)} players for trajectory analysis:")
for player in top_players:
    player_stats = player_career_stats[player_career_stats['Player'] == player].iloc[0]
    print(f"{player}: {player_stats['Career_PPG']:.1f} PPG over {int(player_stats['Seasons_Played'])} seasons ({player_stats['Position_Group']})")

# Get data for these players
player_data = data_for_viz[data_for_viz['Player'].isin(top_players)].copy()

# Create selectors for position and player
position_options = ['All'] + sorted(data_for_viz['Position_Group'].unique().tolist())
position_param = alt.param(name='Position', value='All',
                          bind=alt.binding_radio(options=position_options, name='Position:'))

player_options = ['None'] + sorted(top_players)
player_param = alt.param(name='HighlightPlayer', value='None',
                        bind=alt.binding_select(options=player_options, name='Highlight Player:'))

# Create the base chart with average lines
avg_line = alt.Chart(avg_by_age_pos).mark_line(
    stroke='black',
    strokeWidth=3
).encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('Avg_PTS:Q', title='Points Per Game', scale=alt.Scale(domain=[5, 30])),
    color=alt.Color('Position_Group:N', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                  range=['#1f77b4', '#ff7f0e', '#2ca02c']))
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create individual player trajectories
player_lines = alt.Chart(player_data).mark_line(
    opacity=0.3,
    strokeWidth=1
).encode(
    x='Age:Q',
    y='PTS:Q',
    color='Position_Group:N',
    detail='Player:N',  # Each player gets a separate line
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create highlighted player line
highlight_line = alt.Chart(player_data).mark_line(
    stroke='red',
    strokeWidth=3
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Add points for the highlighted player
highlight_points = alt.Chart(player_data).mark_circle(
    color='red',
    size=60
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine the visualizations
chart = (avg_line + player_lines + highlight_line + highlight_points).properties(
    title='NBA Scoring Progression: Individual Players vs. Position Averages',
    width=900,
    height=600
).add_params(
    position_param,
    player_param
).interactive()

# Create a supplementary chart showing selected player's career in more detail
career_chart = alt.Chart(player_data).mark_line(
    strokeWidth=2,
    stroke='blue'
).encode(
    x=alt.X('Season_Year:O', title='Season'),
    y=alt.Y('PTS:Q', title='Points Per Game'),
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
).properties(
    title='Selected Player Career Progression',
    width=900,
    height=300
)

# Add points to the career chart
career_points = alt.Chart(player_data).mark_circle(
    size=60
).encode(
    x='Season_Year:O',
    y='PTS:Q',
    color='Team:N',  # Color by team to show team changes
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine career charts
full_career_chart = (career_chart + career_points).properties(
    title='Selected Player Career Progression by Season'
)

# Final visualization
final_vis = alt.vconcat(
    chart,
    full_career_chart
).resolve_scale(
    color='independent'  # Independent color scales for each chart
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the visualization
final_vis

Selected 25 players for trajectory analysis:
LeBron James: 27.0 PPG over 22 seasons (Guard)
Michael Jordan: 29.5 PPG over 15 seasons (Guard)
Kevin Durant: 27.2 PPG over 17 seasons (Guard)
Kobe Bryant: 25.1 PPG over 19 seasons (Forward)
Karl Malone: 24.7 PPG over 19 seasons (Forward)
Shaquille O'Neal: 23.0 PPG over 19 seasons (Center)
Allen Iverson: 26.1 PPG over 14 seasons (Guard)
Stephen Curry: 24.3 PPG over 16 seasons (Guard)
Carmelo Anthony: 22.0 PPG over 19 seasons (Forward)
James Harden: 23.9 PPG over 16 seasons (Guard)
Dominique Wilkins: 25.4 PPG over 14 seasons (Forward)
Dirk Nowitzki: 20.1 PPG over 21 seasons (Forward)
Damian Lillard: 25.4 PPG over 13 seasons (Guard)
Adrian Dantley: 24.4 PPG over 14 seasons (Forward)
Kyrie Irving: 24.0 PPG over 14 seasons (Guard)
Hakeem Olajuwon: 21.0 PPG over 18 seasons (Center)
Russell Westbrook: 21.2 PPG over 17 seasons (Guard)
Dwyane Wade: 21.8 PPG over 16 seasons (Guard)
Moses Malone: 21.7 PPG over 16 seasons (Center)
Anthony Davis: 24.0 P

In [22]:
# Visualization of Individual Player Trajectories vs Average Scoring Progression
# With improved player selection and smoothed highlight line

import altair as alt
import pandas as pd
import numpy as np

# Set Altair to render larger charts
alt.data_transformers.disable_max_rows()

# First, calculate the average scoring progression by age and position
avg_by_age_pos = data_for_viz.groupby(['Age', 'Position_Group'])['PTS'].mean().reset_index()
avg_by_age_pos.rename(columns={'PTS': 'Avg_PTS'}, inplace=True)

# For player selection, using the improved composite metric
player_career_stats = data_for_viz.groupby('Player').agg({
    'PTS': ['mean', 'sum', 'count'],  # Points per game, total points, and seasons played
    'Position_Group': 'first'  # Get player's position
}).reset_index()

# Flatten the multi-level columns
player_career_stats.columns = [
    f"{col[0]}_{col[1]}" if col[1] != '' else col[0] 
    for col in player_career_stats.columns
]

# Rename columns for clarity
player_career_stats = player_career_stats.rename(columns={
    'PTS_mean': 'Career_PPG',
    'PTS_sum': 'Total_Points',
    'PTS_count': 'Seasons_Played',
    'Position_Group_first': 'Position_Group'
})

# Score players by a composite metric to balance career length and scoring prowess
player_career_stats['Composite_Score'] = (
    player_career_stats['Career_PPG'] * 
    np.sqrt(player_career_stats['Seasons_Played'])
)

# Select top 25 players by composite score
top_players = player_career_stats.sort_values('Composite_Score', ascending=False).head(25)['Player'].tolist()

# Add specific players of interest who might not be in the top 25
manual_additions = [
    'Tim Duncan', 'Kawhi Leonard'
    # Add any specific players here
]

# Add these to our top players list if they're in the dataset
for player in manual_additions:
    if player in data_for_viz['Player'].values and player not in top_players:
        top_players.append(player)

# Print the selected players for reference
print(f"Selected {len(top_players)} players for trajectory analysis:")
for player in top_players:
    player_stats = player_career_stats[player_career_stats['Player'] == player].iloc[0]
    print(f"{player}: {player_stats['Career_PPG']:.1f} PPG over {int(player_stats['Seasons_Played'])} seasons ({player_stats['Position_Group']})")

# Get data for these players
player_data = data_for_viz[data_for_viz['Player'].isin(top_players)].copy()

# Create selectors for position and player
position_options = ['All'] + sorted(data_for_viz['Position_Group'].unique().tolist())
position_param = alt.param(name='Position', value='All',
                          bind=alt.binding_radio(options=position_options, name='Position:'))

player_options = ['None'] + sorted(top_players)
player_param = alt.param(name='HighlightPlayer', value='None',
                        bind=alt.binding_select(options=player_options, name='Highlight Player:'))

# Create the base chart with average lines
avg_line = alt.Chart(avg_by_age_pos).mark_line(
    stroke='black',
    strokeWidth=3
).encode(
    x=alt.X('Age:Q', title='Age', scale=alt.Scale(domain=[19, 40])),
    y=alt.Y('Avg_PTS:Q', title='Points Per Game', scale=alt.Scale(domain=[5, 30])),
    color=alt.Color('Position_Group:N', 
                   scale=alt.Scale(domain=['Guard', 'Forward', 'Center'],
                                  range=['#1f77b4', '#ff7f0e', '#2ca02c']))
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create individual player trajectories
player_lines = alt.Chart(player_data).mark_line(
    opacity=0.3,
    strokeWidth=1
).encode(
    x='Age:Q',
    y='PTS:Q',
    color='Position_Group:N',
    detail='Player:N',  # Each player gets a separate line
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    (alt.datum.Position_Group == position_param) | (position_param == 'All')
)

# Create highlighted player line WITH SMOOTHING using a basis curve
highlight_line = alt.Chart(player_data).mark_line(
    stroke='red',
    strokeWidth=3,
    interpolate='basis'  # This adds smoothing to the line
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Add points for the highlighted player
highlight_points = alt.Chart(player_data).mark_circle(
    color='red',
    size=60
).encode(
    x='Age:Q',
    y='PTS:Q',
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine the visualizations
chart = (avg_line + player_lines + highlight_line + highlight_points).properties(
    title='NBA Scoring Progression: Individual Players vs. Position Averages',
    width=900,
    height=600
).add_params(
    position_param,
    player_param
).interactive()

# Create a supplementary chart showing selected player's career in more detail
# Also use smoothing here for consistency
career_chart = alt.Chart(player_data).mark_line(
    strokeWidth=2,
    stroke='blue',
    interpolate='basis'  # Add smoothing here too
).encode(
    x=alt.X('Season_Year:O', title='Season'),
    y=alt.Y('PTS:Q', title='Points Per Game'),
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
).properties(
    title='Selected Player Career Progression',
    width=900,
    height=300
)

# Add points to the career chart
career_points = alt.Chart(player_data).mark_circle(
    size=60
).encode(
    x='Season_Year:O',
    y='PTS:Q',
    color='Team:N',  # Color by team to show team changes
    tooltip=['Player', 'Age', 'PTS', 'Season', 'Team']
).transform_filter(
    alt.datum.Player == player_param
).transform_filter(
    player_param != 'None'  # Only show when a player is selected
)

# Combine career charts
full_career_chart = (career_chart + career_points).properties(
    title='Selected Player Career Progression by Season'
)

# Final visualization
final_vis = alt.vconcat(
    chart,
    full_career_chart
).resolve_scale(
    color='independent'  # Independent color scales for each chart
).configure_title(
    fontSize=20,
    font='Arial',
    anchor='start'
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

# Display the visualization
final_vis

Selected 27 players for trajectory analysis:
LeBron James: 27.0 PPG over 22 seasons (Guard)
Michael Jordan: 29.5 PPG over 15 seasons (Guard)
Kevin Durant: 27.2 PPG over 17 seasons (Guard)
Kobe Bryant: 25.1 PPG over 19 seasons (Forward)
Karl Malone: 24.7 PPG over 19 seasons (Forward)
Shaquille O'Neal: 23.0 PPG over 19 seasons (Center)
Allen Iverson: 26.1 PPG over 14 seasons (Guard)
Stephen Curry: 24.3 PPG over 16 seasons (Guard)
Carmelo Anthony: 22.0 PPG over 19 seasons (Forward)
James Harden: 23.9 PPG over 16 seasons (Guard)
Dominique Wilkins: 25.4 PPG over 14 seasons (Forward)
Dirk Nowitzki: 20.1 PPG over 21 seasons (Forward)
Damian Lillard: 25.4 PPG over 13 seasons (Guard)
Adrian Dantley: 24.4 PPG over 14 seasons (Forward)
Kyrie Irving: 24.0 PPG over 14 seasons (Guard)
Hakeem Olajuwon: 21.0 PPG over 18 seasons (Center)
Russell Westbrook: 21.2 PPG over 17 seasons (Guard)
Dwyane Wade: 21.8 PPG over 16 seasons (Guard)
Moses Malone: 21.7 PPG over 16 seasons (Center)
Anthony Davis: 24.0 P