In [33]:
import pandas as pd
import requests

import time
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

# Lift pandas' column display limit so wide stat tables are not elided when shown.
pd.set_option('display.max_columns', None)


## Data Scraping

In [29]:
# One hard-coded leagueLeaders request (Season=2012-13, SeasonType=Regular Season,
# PerMode=Totals, StatCategory=PTS) used to inspect the response layout.
test_url = "https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S&Season=2012-13&SeasonType=Regular%20Season&StatCategory=PTS"

In [30]:
# Fetch the trial URL and decode the JSON payload.
# A timeout keeps a stalled connection from hanging the kernel indefinitely, and
# raise_for_status() surfaces an HTTP error directly instead of an opaque JSON
# decoding failure on an error page.
test_response = requests.get(url=test_url, timeout=30)
test_response.raise_for_status()
r = test_response.json()

In [31]:
# Column names supplied by the API; they label the rows in r['resultSet']['rowSet'].
table_headers = r['resultSet']['headers']

In [32]:
# Preview the response rows as a table labelled with the API-provided headers.
pd.DataFrame(r['resultSet']['rowSet'], columns = table_headers)

Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,201142,1,Kevin Durant,1610612760,OKC,81,3119,731,1433,0.510,...,640,374,116,105,280,143,2280,2462,1.34,0.41
1,977,2,Kobe Bryant,1610612747,LAL,78,3013,738,1595,0.463,...,433,469,106,25,287,173,2133,1921,1.63,0.37
2,2544,3,LeBron James,1610612748,MIA,76,2877,765,1354,0.565,...,610,551,129,67,226,110,2036,2446,2.44,0.57
3,201935,4,James Harden,1610612745,HOU,78,2985,585,1337,0.438,...,379,455,142,38,295,178,2023,1872,1.54,0.48
4,2546,5,Carmelo Anthony,1610612752,NYK,67,2482,669,1489,0.449,...,460,171,52,32,175,205,1920,1553,0.98,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,203130,463,Darius Johnson-Odom,1610612747,LAL,4,6,0,4,0.000,...,4,1,0,0,0,0,0,1,0.00,0.00
464,2545,463,Darko Milicic,1610612738,BOS,1,5,0,1,0.000,...,1,0,0,0,2,1,0,-2,0.00,0.00
465,202458,463,Justin Dentmon,1610612742,DAL,2,4,0,2,0.000,...,0,0,0,0,0,0,0,-2,0.00,0.00
466,2679,463,Matt Carroll,1610612766,CHA,1,6,0,0,0.000,...,0,1,0,0,0,1,0,1,0.00,0.00


In [33]:
# Prototype of the per-season frame: the API rows, prefixed with two constant
# columns recording which season and season type the rows belong to.
temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)
temp_df2 = pd.DataFrame(
    {
        'Year': ['2012-13'] * len(temp_df1),
        'Season_type': ['Regular%20Season'] * len(temp_df1),
    }
)

# Side-by-side concat: tag columns first, stat columns after.
temp_df3 = pd.concat([temp_df2, temp_df1], axis=1)
temp_df3

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,2012-13,Regular%20Season,201142,1,Kevin Durant,1610612760,OKC,81,3119,731,...,640,374,116,105,280,143,2280,2462,1.34,0.41
1,2012-13,Regular%20Season,977,2,Kobe Bryant,1610612747,LAL,78,3013,738,...,433,469,106,25,287,173,2133,1921,1.63,0.37
2,2012-13,Regular%20Season,2544,3,LeBron James,1610612748,MIA,76,2877,765,...,610,551,129,67,226,110,2036,2446,2.44,0.57
3,2012-13,Regular%20Season,201935,4,James Harden,1610612745,HOU,78,2985,585,...,379,455,142,38,295,178,2023,1872,1.54,0.48
4,2012-13,Regular%20Season,2546,5,Carmelo Anthony,1610612752,NYK,67,2482,669,...,460,171,52,32,175,205,1920,1553,0.98,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2012-13,Regular%20Season,203130,463,Darius Johnson-Odom,1610612747,LAL,4,6,0,...,4,1,0,0,0,0,0,1,0.00,0.00
464,2012-13,Regular%20Season,2545,463,Darko Milicic,1610612738,BOS,1,5,0,...,1,0,0,0,2,1,0,-2,0.00,0.00
465,2012-13,Regular%20Season,202458,463,Justin Dentmon,1610612742,DAL,2,4,0,...,0,0,0,0,0,0,0,-2,0.00,0.00
466,2012-13,Regular%20Season,2679,463,Matt Carroll,1610612766,CHA,1,6,0,...,0,1,0,0,0,1,0,1,0.00,0.00


In [34]:
# Discard the scratch frames built for the trial run so they do not linger in the kernel.
del temp_df1, temp_df2, temp_df3

In [35]:
# Full column layout of the scraped table: the two tag columns, then the API headers.
df_cols = ['Year', 'Season_type', *table_headers]

In [36]:
# Browser-like request headers passed to requests.get for the stats.nba.com calls.
# The User-Agent and Sec-Ch-Ua values describe Chrome 121 on Windows, with
# www.nba.com as Origin/Referer.
# NOTE(review): presumably needed because the endpoint rejects or stalls on default
# client headers — confirm. Also, advertising 'br' in Accept-Encoding presumably
# requires brotli support in the HTTP stack to decode such responses — confirm.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': "Windows",
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}

In [39]:
# Trial request that exercises the browser-like headers.
# The previous literal was double-quoted, so the '+y+' and '+s+' fragments were part
# of the URL text rather than concatenations (and y / s are not defined until the
# scraping loop). Use a concrete season and season type instead.
api_url = (
    'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S'
    '&Season=2012-13&SeasonType=Regular%20Season&StatCategory=PTS'
)
# Bound the wait and fail loudly on an HTTP error instead of on JSON decoding.
trial_response = requests.get(url=api_url, headers=headers, timeout=30)
trial_response.raise_for_status()
r = trial_response.json()

In [41]:
# Scrape league-leader totals for every season / season-type pair, tag each row with
# the season and season type it came from, and save the combined table to Excel.
season_types = ['Regular%20Season', 'Playoffs']
years = ['2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23']

begin_loop = time.time()

# Collect one frame per request and concatenate once at the end; growing `df` with
# pd.concat on every iteration re-copies all earlier rows each time.
season_frames = []

for y in years:
    for s in season_types:
        api_url = 'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S&Season='+y+'&SeasonType='+s+'&StatCategory=PTS'
        # Bound the wait so a stalled connection cannot hang the whole scrape, and
        # stop on an HTTP error rather than failing later while decoding JSON.
        response = requests.get(url=api_url, headers=headers, timeout=30)
        response.raise_for_status()
        r = response.json()

        season_df = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)
        # Prefix the tag columns so the layout matches df_cols (Year, Season_type, stats...).
        season_df.insert(0, 'Year', y)
        season_df.insert(1, 'Season_type', s)
        season_frames.append(season_df)

        print(f'Finished scraping data for the {y} {s}.')
        # Random pause between requests to avoid hitting the server at a fixed cadence.
        lag = np.random.uniform(low=5,high=40)
        print(f'...waiting {round(lag,1)} seconds')
        time.sleep(lag)

# ignore_index gives the combined table a clean 0..n-1 index instead of repeating
# each season's own row numbers (the index is not written to the file either way).
df = pd.concat(season_frames, axis=0, ignore_index=True)

# The original expression `round(...)/60,2` formatted a (value, 2) tuple; round the
# elapsed minutes to two decimals instead.
print(f'Process completed. Total run time: {round((time.time() - begin_loop)/60, 2)} minutes')

df.to_excel('nba_player_data.xlsx', index=False)

Finished scraping data for the 2012-13 Regular%20Season.
...waiting 29.1 seconds
Finished scraping data for the 2012-13 Playoffs.
...waiting 39.9 seconds
Finished scraping data for the 2013-14 Regular%20Season.
...waiting 13.3 seconds
Finished scraping data for the 2013-14 Playoffs.
...waiting 9.1 seconds
Finished scraping data for the 2014-15 Regular%20Season.
...waiting 23.9 seconds
Finished scraping data for the 2014-15 Playoffs.
...waiting 17.2 seconds
Finished scraping data for the 2015-16 Regular%20Season.
...waiting 35.7 seconds
Finished scraping data for the 2015-16 Playoffs.
...waiting 39.4 seconds
Finished scraping data for the 2016-17 Regular%20Season.
...waiting 5.8 seconds
Finished scraping data for the 2016-17 Playoffs.
...waiting 11.7 seconds
Finished scraping data for the 2017-18 Regular%20Season.
...waiting 21.4 seconds
Finished scraping data for the 2017-18 Playoffs.
...waiting 36.5 seconds
Finished scraping data for the 2018-19 Regular%20Season.
...waiting 26.9 secon

## Data Cleaning/Preparing

In [2]:
# Load the spreadsheet written by the scraping step; the cells below work from this frame.
data = pd.read_excel('nba_player_data.xlsx')
# (rows, columns) of the loaded table.
data.shape


(8049, 30)

In [3]:
# Eyeball 10 random rows; no random_state is passed, so the sample differs on each run.
data.sample(10)

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
1720,2014-15,Regular%20Season,2202,359,Jason Richardson,1610612755,PHI,19,416,62,...,66,38,13,3,16,28,172,155,2.38,0.81
1356,2013-14,Playoffs,1737,185,Nazr Mohammed,1610612741,CHI,2,5,0,...,2,0,0,0,0,3,0,1,0.0,0.0
1897,2014-15,Playoffs,101162,44,Marcin Gortat,1610612764,WAS,10,307,54,...,88,22,6,11,16,29,124,195,1.38,0.38
6528,2021-22,Regular%20Season,1627734,58,Domantas Sabonis,1610612758,SAC,62,2136,443,...,752,323,59,27,195,203,1171,1721,1.66,0.3
1197,2013-14,Playoffs,2772,39,Trevor Ariza,1610612764,WAS,11,407,52,...,98,19,17,4,14,26,150,212,1.36,1.21
4485,2018-19,Regular%20Season,1628425,282,Sterling Brown,1610612749,MIL,58,1034,145,...,184,84,25,8,46,88,372,447,1.83,0.54
1598,2014-15,Regular%20Season,203918,237,Rodney Hood,1610612762,UTA,50,1064,155,...,117,83,30,12,45,119,433,392,1.84,0.67
3840,2017-18,Regular%20Season,203957,387,Dante Exum,1610612762,UTA,14,235,42,...,26,43,8,3,20,20,114,123,2.15,0.4
4247,2018-19,Regular%20Season,1627783,44,Pascal Siakam,1610612761,TOR,80,2548,519,...,549,248,73,52,154,241,1354,1631,1.61,0.47
7349,2022-23,Regular%20Season,1626156,57,D'Angelo Russell,1610612747,LAL,71,2304,445,...,215,437,70,29,186,140,1263,1288,2.35,0.38


In [4]:
# Count missing values per column.
data.isna().sum()

Year           0
Season_type    0
PLAYER_ID      0
RANK           0
PLAYER         0
TEAM_ID        0
TEAM           0
GP             0
MIN            0
FGM            0
FGA            0
FG_PCT         0
FG3M           0
FG3A           0
FG3_PCT        0
FTM            0
FTA            0
FT_PCT         0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
PF             0
PTS            0
EFF            0
AST_TOV        0
STL_TOV        0
dtype: int64

In [5]:
# Remove the RANK and EFF columns. Reassigning the result
# (instead of inplace=True) together with errors='ignore' makes the cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['RANK', 'EFF'], errors='ignore')

In [6]:
# Numeric season key: the first four characters of 'YYYY-YY' (e.g. '2012-13' -> 2012).
data['season_start_year'] = data['Year'].str.slice(0, 4).astype(int)

In [7]:
# Merge the two New Orleans abbreviations ('NOP' and 'NOH') under a single label.
# Assign the result back to the column: replace(inplace=True) on data['TEAM'] is a
# chained in-place call on an intermediate Series, which pandas warns about and which
# does not update `data` under Copy-on-Write.
data['TEAM'] = data['TEAM'].replace(to_replace=['NOP','NOH'], value='NO')

In [8]:
# Shorten the URL-encoded regular-season label to 'RS'.
# Assign the result back to the column: replace(inplace=True) on data['Season_type']
# is a chained in-place call on an intermediate Series, which pandas warns about and
# which does not update `data` under Copy-on-Write.
data['Season_type'] = data['Season_type'].replace('Regular%20Season', 'RS')

In [9]:
# Split the table by season type: regular-season rows and playoff rows.
rs_df = data.loc[data['Season_type'].eq('RS')]
playoffs_df = data.loc[data['Season_type'].eq('Playoffs')]

In [10]:
# List the columns currently in `data`.
data.columns

Index(['Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')

In [11]:
# Stat columns that the analysis cells sum per group and then rescale.
# 'MIN' is listed first and is used as the per-minute denominator.
total_cols = ['MIN', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

## Data Analysis

Which player stats are correlated with each other?

In [12]:
# Sum each player's stats per season. Season_type is not a grouping key, so
# regular-season and playoff rows for the same player and season are combined.
data_per_min = data.groupby(['PLAYER', 'PLAYER_ID', 'Year'])[total_cols].sum().reset_index()

# Express every stat except MIN itself per minute played. Selecting the columns by
# name avoids relying on their position after reset_index.
per_min_cols = [col for col in total_cols if col != 'MIN']
data_per_min[per_min_cols] = data_per_min[per_min_cols].div(data_per_min['MIN'], axis=0)

# Shooting and efficiency ratios. Each is a ratio of two per-minute rates, so the
# minutes cancel and the value equals the ratio of the underlying totals.
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']

# Keep player-seasons with at least 50 total minutes (MIN was not rescaled above) and
# drop the id column so it is not correlated as if it were a stat. Chaining a
# non-inplace drop avoids mutating a filtered slice in place.
data_per_min = data_per_min[data_per_min['MIN'] >= 50].drop(columns='PLAYER_ID')

# PLAYER and Year are strings; numeric_only=True restricts the correlation matrix to
# the numeric columns, which pandas >= 2.0 no longer does implicitly.
fig = px.imshow(data_per_min.corr(numeric_only=True))
fig.show()



  fig = px.imshow(data_per_min.corr())


- 3-pointers made and FG% are negatively correlated
- FGA and FTA are positively correlated
- Minutes played and fouls are negatively correlated
- Turnovers and assists are positively correlated

In [13]:
# Share of rows in data_per_min with MIN >= 50. The frame was already filtered on this
# same condition when it was built, so this is expected to be 1.0.
(data_per_min['MIN'] >=50).mean()

1.0

How are minutes played distributed?

In [14]:
# Histogram of the MIN column over the rows of playoffs_df, with bar heights
# normalised to percent of rows.
fig = px.histogram(x=playoffs_df['MIN'], histnorm='percent')
fig.show()

In [15]:
def hist_data(df=rs_df, min_MIN=0, min_GP=0):
    """Return MIN divided by GP for the rows of ``df`` that pass both thresholds.

    Parameters
    ----------
    df : DataFrame with 'MIN' and 'GP' columns. Defaults to the ``rs_df`` object
        that exists when this function is defined.
    min_MIN : keep rows whose 'MIN' is at least this value.
    min_GP : keep rows whose 'GP' is at least this value.

    Returns
    -------
    Series of MIN / GP for the qualifying rows, indexed like those rows.
    """
    qualifies = (df['MIN'] >= min_MIN) & (df['GP'] >= min_GP)
    kept = df.loc[qualifies, ['MIN', 'GP']]
    return kept['MIN'] / kept['GP']

In [16]:
# Overlay the minutes-per-game distributions for the regular season and the playoffs.
# Both traces use the same 1-minute bins from 0 to 46 and are half-transparent so the
# overlapping bars stay visible.
fig = go.Figure(
    data=[
        go.Histogram(
            x=hist_data(rs_df, 50, 5),
            histnorm='percent',
            name='RS',
            xbins={'start': 0, 'end': 46, 'size': 1},
            opacity=0.5,
        ),
        go.Histogram(
            x=hist_data(playoffs_df, 5, 1),
            histnorm='percent',
            name='Playoffs',
            xbins={'start': 0, 'end': 46, 'size': 1},
            opacity=0.5,
        ),
    ],
    layout={'barmode': 'overlay'},
)
fig.show()

Rotations are tighter during the playoffs than in the regular season.

In [17]:
# Share of qualifying playoff rows (MIN >= 5, GP >= 1) whose minutes per game fall
# between 12 and 34, both ends included.
hist_data(playoffs_df, 5, 1).between(12, 34).mean()

# 49% of NBA players play between 12 and 34 minutes in the playoffs

0.4904993371630579

In [18]:
# Share of qualifying regular-season rows (MIN >= 5, GP >= 1) whose minutes per game
# fall between 12 and 34, both ends included.
hist_data(rs_df, 5, 1).between(12, 34).mean()

# 70% of NBA players play between 12 and 34 minutes in the reg.season

0.7029281277728483

How has the game changed over time?

In [24]:
# League-wide totals per season (all rows of `data` for that season summed).
change_df = data.groupby('season_start_year')[total_cols].sum().reset_index()

# Estimated possessions, placed right after MIN so the column order is
# season_start_year, MIN, POSS_est, then the remaining totals.
change_df.insert(
    2,
    'POSS_est',
    change_df['FGA']-change_df['OREB']+change_df['TOV']+0.44*change_df['FTA'],
)

# Derived shooting / efficiency ratios, appended in this order.
change_df = change_df.assign(**{
    'FG%': lambda t: t['FGM']/t['FGA'],
    '3PT%': lambda t: t['FG3M']/t['FG3A'],
    'FT%': lambda t: t['FTM']/t['FTA'],
    'AST%': lambda t: t['AST']/t['FGM'],
    'FG3A%': lambda t: t['FG3A']/t['FGA'],
    'PTS/FGA': lambda t: t['PTS']/t['FGA'],
    'FG3M/FGM': lambda t: t['FG3M']/t['FGM'],
    'FTA/FGA': lambda t: t['FTA']/t['FGA'],
    'TRU%': lambda t: 0.5*t['PTS']/(t['FGA']+0.475*t['FTA']),
    'AST_TOV': lambda t: t['AST']/t['TOV'],
})

change_df

Unnamed: 0,season_start_year,MIN,POSS_est,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,...,FG%,3PT%,FT%,AST%,FG3A%,PTS/FGA,FG3M/FGM,FTA/FGA,TRU%,AST_TOV
0,2012,635884,248201.92,97235,215105,18808,52569,44125,58618,29237,...,0.452035,0.357777,0.752755,0.593346,0.244388,1.196639,0.193428,0.272509,0.529748,1.578841
1,2013,638373,254032.8,99251,218411,20480,56952,47219,62420,28669,...,0.454423,0.359601,0.756472,0.580921,0.260756,1.218808,0.206346,0.285791,0.536565,1.56566
2,2014,634546,253004.12,98251,219265,20724,59276,45098,60248,28566,...,0.448092,0.349619,0.748539,0.587546,0.27034,1.196379,0.210929,0.274773,0.529129,1.612666
3,2015,636391,258064.8,100351,222344,22524,63673,46516,61520,27426,...,0.451332,0.353745,0.756112,0.580473,0.286372,1.213174,0.224452,0.276688,0.536126,1.614585
4,2016,632482,258443.8,102147,223333,25408,71018,46806,60620,26470,...,0.457375,0.357768,0.772121,0.579185,0.317992,1.238097,0.24874,0.271433,0.54835,1.694798
5,2017,633425,260904.52,103729,225523,27530,76245,43721,57008,25397,...,0.459949,0.361073,0.766927,0.585555,0.338081,1.235834,0.265403,0.252781,0.551677,1.701611
6,2018,634231,268739.84,107374,233717,29817,84143,46671,60811,27128,...,0.459419,0.354361,0.767476,0.598441,0.360021,1.246105,0.277693,0.260191,0.554519,1.815477
7,2019,552262,234384.64,92997,202223,28032,78279,40949,52906,22802,...,0.459874,0.358104,0.773995,0.596202,0.387092,1.260861,0.301429,0.261622,0.560746,1.749882
8,2020,562518,235759.48,95849,205754,29549,80653,39624,50917,22918,...,0.465843,0.366372,0.778208,0.59793,0.391988,1.267878,0.308287,0.247465,0.56726,1.877818
9,2021,635572,264004.96,106569,231293,32733,92552,44740,57709,27052,...,0.460753,0.353671,0.775269,0.606349,0.40015,1.256463,0.307153,0.249506,0.561665,1.87996


In [35]:
# Per 48 Minutes played

change_per48_df = change_df.copy()

# Columns 2..17 are POSS_est followed by the summed stats (FGM ... PTS). Rescale them
# to a per-48-minutes, five-players-on-court basis; the ratio columns after them are
# left unchanged.
per48_cols = change_per48_df.columns[2:18]
change_per48_df[per48_cols] = change_per48_df[per48_cols].div(change_per48_df['MIN'], axis=0)*48*5

# MIN has served as the denominator and is not plotted.
change_per48_df = change_per48_df.drop(columns='MIN')

# One line per remaining column, plotted against the season start year.
fig = go.Figure(
    data=[
        go.Scatter(
            x=change_per48_df['season_start_year'],
            y=change_per48_df[col],
            name=col,
        )
        for col in change_per48_df.columns[1:]
    ]
)

fig.show()


In [37]:
# Per 100 Possessions

change_per100_df = change_df.copy()

# Columns 3..17 are the summed stats (FGM ... PTS). Rescale them to a
# per-100-estimated-possessions basis; the ratio columns after them are left unchanged.
per100_cols = change_per100_df.columns[3:18]
change_per100_df[per100_cols] = (
    change_per100_df[per100_cols].div(change_per100_df["POSS_est"], axis=0) * 100
)

# MIN and POSS_est are not plotted.
change_per100_df = change_per100_df.drop(columns=["MIN", 'POSS_est'])

# One line per remaining column, plotted against the season start year.
fig = go.Figure(
    data=[
        go.Scatter(
            x=change_per100_df["season_start_year"],
            y=change_per100_df[col],
            name=col,
        )
        for col in change_per100_df.columns[1:]
    ]
)

fig.show()

- Offensive efficiency has increased over time (TRU%, PTS, AST%)
- Slight decline in blocks, which could be attributed to fewer close-range shots relative to long-distance ones
- Decline in fouls, which is interesting given how much foul baiting happens around the league