In [24]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pd.options.plotting.backend = 'plotly'


For the purpose of this code, Austin Reaves will be the subject. First, we must establish of point of comparison of least year's stats as to determine if he has truly regressed. All data comes from basketball-reference.com as they prepare all game and season logs as csv form.

In [25]:
reaves_22_23_path = Path('data') / 'reaves_22_23_game_logs.csv'
ar22_23 = pd.read_csv(reaves_22_23_path)
ar22_23.head()

Unnamed: 0,G,Date,Opp,Unnamed: 3,GS,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,10/18/2022,GSW,L (-14),0.0,16:00,1.0,3.0,0.333,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,1.6,-6.0
2,2.0,10/20/2022,LAC,L (-6),0.0,21:55,1.0,2.0,0.5,0.0,...,2.0,4.0,2.0,0.0,0.0,0.0,5.0,4.0,4.4,0.0
3,3.0,10/23/2022,POR,L (-2),0.0,18:58,2.0,5.0,0.4,2.0,...,0.0,0.0,1.0,1.0,0.0,2.0,1.0,7.0,3.6,-6.0
4,4.0,10/26/2022,DEN,L (-11),1.0,26:29:00,3.0,5.0,0.6,2.0,...,2.0,2.0,1.0,0.0,0.0,1.0,0.0,8.0,6.0,-23.0


Fortunately, the data is fairly clean. But I will not consider any games he did not play. So I will drop games that he did not play.

In [26]:
def clean_player_data(game_logs):
    game_logs = game_logs.rename(columns={"Unnamed: 3": "Result"})
    game_logs = game_logs.replace("Inac", np.NaN)
    game_logs = game_logs.replace("Did", np.NaN)
    game_logs = game_logs.dropna(subset=['MP'])
    game_logs["G"] = game_logs['G'].apply(int)
    game_logs = game_logs.fillna(0)
    game_logs["GS"] = game_logs['GS'].apply(int)
    game_logs.loc[:, "FG": "+/-"] = game_logs.loc[:, "FG": "+/-"].map(float)
    return game_logs

In [27]:
cleaned_ar = clean_player_data(ar22_23)
cleaned_ar.tail()

Unnamed: 0,G,Date,Opp,Result,GS,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
78,60,4/2/2023,HOU,W (+25),1,32:44:00,4.0,7.0,0.571,1.0,...,1.0,1.0,8.0,0.0,0.0,2.0,2.0,18.0,17.4,15.0
79,61,4/4/2023,UTA,W (+2),1,41:32:00,8.0,13.0,0.615,4.0,...,3.0,3.0,6.0,0.0,0.0,2.0,1.0,28.0,24.8,10.0
80,62,4/5/2023,LAC,L (-7),1,30:41:00,6.0,11.0,0.545,3.0,...,4.0,4.0,2.0,0.0,0.0,2.0,5.0,20.0,13.3,9.0
81,63,4/7/2023,PHO,W (+14),1,31:15:00,9.0,13.0,0.692,1.0,...,0.0,0.0,5.0,1.0,0.0,2.0,1.0,22.0,17.8,18.0
82,64,4/9/2023,UTA,W (+11),1,33:59:00,3.0,5.0,0.6,3.0,...,4.0,4.0,6.0,0.0,0.0,0.0,2.0,12.0,14.3,20.0


As Reaves only played 64 games in a season, the relevant data is now saved in the cleaned dataframe. NaN values resulting from values not being calculated are set as zero. Furthermore, we will consider all numeric, continuous variables as floats. Whenever we calculate averages, they tend to be floats anyways.

In [28]:
def mean_calc(stat):
    return stat.mean()

In [29]:
def sum_calc(stat):
    return stat.sum()

In [30]:
ar_average_points = mean_calc(cleaned_ar['PTS'])
ar_average_points

13.015625

In [31]:
ar_total_points = sum_calc(cleaned_ar['PTS'])
ar_total_points

833.0

Now we can calcualte relevant values for comparison. It is necessary to make the distinction between average and totals as certain metrics such as +/- is a cumulative stat. 

In [32]:
league_totals_22_23_path = Path('data') / 'league_totals_22_23.csv'
totals_22_23 = pd.read_csv(league_totals_22_23_path, encoding = 'latin-1')
totals_22_23.head()

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional
0,,,,,,,,,,,...,,,,,,,,,,-9999
1,1.0,Precious Achiuwa,C,TOR,55.0,12.0,1140.0,196.0,404.0,0.485,...,100.0,228.0,328.0,50.0,31.0,30.0,59.0,102.0,508.0,achiupr01
2,2.0,Steven Adams,C,MEM,42.0,42.0,1133.0,157.0,263.0,0.597,...,214.0,271.0,485.0,97.0,36.0,46.0,79.0,98.0,361.0,adamsst01
3,3.0,Bam Adebayo,C,MIA,75.0,75.0,2598.0,602.0,1114.0,0.54,...,184.0,504.0,688.0,240.0,88.0,61.0,187.0,208.0,1529.0,adebaba01
4,4.0,Ochai Agbaji,SG,UTA,59.0,22.0,1209.0,165.0,386.0,0.427,...,43.0,78.0,121.0,67.0,16.0,15.0,41.0,99.0,467.0,agbajoc01


Ok, so now we want to load in league averages and totals to get a true sense of where Reaves was at last year.

In [33]:
def clean_league_data(league_stats):
    league_stats = league_stats.drop(columns=['Player-additional'])
    league_stats = league_stats.dropna(how='all')
    league_stats['Rk'] = league_stats['Rk'].apply(int)
    league_stats.loc[:, "G":'PTS'] = league_stats.loc[:, "G":'PTS'].map(float)
    return league_stats

In [34]:
clean_totals_22_23 = clean_league_data(totals_22_23)
clean_totals_22_23.head()

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,Precious Achiuwa,C,TOR,55.0,12.0,1140.0,196.0,404.0,0.485,...,0.702,100.0,228.0,328.0,50.0,31.0,30.0,59.0,102.0,508.0
2,2,Steven Adams,C,MEM,42.0,42.0,1133.0,157.0,263.0,0.597,...,0.364,214.0,271.0,485.0,97.0,36.0,46.0,79.0,98.0,361.0
3,3,Bam Adebayo,C,MIA,75.0,75.0,2598.0,602.0,1114.0,0.54,...,0.806,184.0,504.0,688.0,240.0,88.0,61.0,187.0,208.0,1529.0
4,4,Ochai Agbaji,SG,UTA,59.0,22.0,1209.0,165.0,386.0,0.427,...,0.812,43.0,78.0,121.0,67.0,16.0,15.0,41.0,99.0,467.0
5,5,Santi Aldama,PF,MEM,77.0,20.0,1682.0,247.0,525.0,0.47,...,0.75,85.0,286.0,371.0,97.0,45.0,48.0,60.0,143.0,696.0


It turns out there are over 500 players that played at least 1 game in the NBA last season. But many of those do not serve in comparison to Reaves. For example, Thanasis Antetokounmpo.

In [35]:
clean_totals_22_23[clean_totals_22_23['Player'] == "Thanasis Antetokounmpo"]

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
14,12,Thanasis Antetokounmpo,PF,MIL,37.0,0.0,206.0,20.0,46.0,0.435,...,0.5,15.0,29.0,44.0,15.0,3.0,3.0,12.0,22.0,51.0


In [36]:
league_average_mp = mean_calc(clean_totals_22_23['MP'])
league_average_mp

984.421207658321

What we will do is remove players who have under the league average for minutes played.

In [37]:
clean_totals_22_23 = clean_totals_22_23[clean_totals_22_23['MP'] >= league_average_mp]
clean_totals_22_23

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,Precious Achiuwa,C,TOR,55.0,12.0,1140.0,196.0,404.0,0.485,...,0.702,100.0,228.0,328.0,50.0,31.0,30.0,59.0,102.0,508.0
2,2,Steven Adams,C,MEM,42.0,42.0,1133.0,157.0,263.0,0.597,...,0.364,214.0,271.0,485.0,97.0,36.0,46.0,79.0,98.0,361.0
3,3,Bam Adebayo,C,MIA,75.0,75.0,2598.0,602.0,1114.0,0.540,...,0.806,184.0,504.0,688.0,240.0,88.0,61.0,187.0,208.0,1529.0
4,4,Ochai Agbaji,SG,UTA,59.0,22.0,1209.0,165.0,386.0,0.427,...,0.812,43.0,78.0,121.0,67.0,16.0,15.0,41.0,99.0,467.0
5,5,Santi Aldama,PF,MEM,77.0,20.0,1682.0,247.0,525.0,0.470,...,0.750,85.0,286.0,371.0,97.0,45.0,48.0,60.0,143.0,696.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,523,Patrick Williams,PF,CHI,82.0,65.0,2323.0,314.0,677.0,0.464,...,0.857,78.0,249.0,327.0,100.0,72.0,70.0,101.0,147.0,833.0
671,531,Christian Wood,C,DAL,67.0,17.0,1738.0,396.0,769.0,0.515,...,0.772,87.0,404.0,491.0,121.0,30.0,72.0,121.0,169.0,1114.0
672,532,Delon Wright,PG,WAS,50.0,14.0,1221.0,138.0,291.0,0.474,...,0.867,58.0,122.0,180.0,194.0,92.0,17.0,44.0,59.0,369.0
676,536,Trae Young,PG,ATL,73.0,73.0,2541.0,597.0,1390.0,0.429,...,0.886,56.0,161.0,217.0,741.0,80.0,9.0,300.0,104.0,1914.0


Now, this cuts down over 50 percent of players. Although starters appear to play more than 2000 minutes, they can serve as comparators. For example, last years sixth man of the year, Malcolm Brogdon.

In [39]:
clean_totals_22_23[clean_totals_22_23['Player'] == "Malcolm Brogdon"]

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
82,60,Malcolm Brogdon,PG,BOS,67.0,0.0,1744.0,354.0,732.0,0.484,...,0.87,42.0,238.0,280.0,248.0,45.0,18.0,98.0,109.0,1000.0


In [45]:
per_game_22_23_path = Path('data') / 'per_game_22_23.csv'
per_game_22_23 = pd.read_csv(per_game_22_23_path)
clean_per_game_22_23 = clean_league_data(per_game_22_23)

league_average_games = mean_calc(clean_per_game_22_23["G"])
clean_per_game_22_23 = clean_per_game_22_23[clean_per_game_22_23["G"] >= league_average_games]

clean_per_game_22_23.head()

Unnamed: 0,Rk,Player,Pos,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1,1,Precious Achiuwa,C,TOR,55.0,12.0,20.7,3.6,7.3,0.485,...,0.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
3,3,Bam Adebayo,C,MIA,75.0,75.0,34.6,8.0,14.9,0.54,...,0.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
4,4,Ochai Agbaji,SG,UTA,59.0,22.0,20.5,2.8,6.5,0.427,...,0.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
5,5,Santi Aldama,PF,MEM,77.0,20.0,21.8,3.2,6.8,0.47,...,0.75,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0
6,6,Nickeil Alexander-Walker,SG,TOT,59.0,3.0,15.0,2.2,5.0,0.444,...,0.667,0.3,1.5,1.7,1.8,0.5,0.4,0.9,1.5,6.2


Above is also the per game stats of every player in the league but sorted out to only players who have played over the league average.

Now that we have the data for the entire league, we can start making comparisons. For the sake of simplicity, we will take 8 categories as a player's output.
- Rebounding
- Scoring
- Efficiency (True Shooting Percentage)
- 3 Pointers
- Playmaking (Assists)
- Ball Handling (Assist / Turnover Ratio)
- Open Court Defense (Steals)
- At Rim Defense (Blocks)

Albeit, these metrics are simplications of what is actually happening on the court. Coachs' strategical play can affect what players are doing on the court. I.e. Just because a player may not have high steals may not be imply he is a bad defender. Nevertheless, someone with high amount of steals still highlights good open court defense. 

In [78]:
total_position_averages = clean_totals_22_23.groupby("Pos")[['ORB', 'DRB', 'TRB', 'PTS', 'eFG%', '3P', 'AST', 'TOV', 'STL', 'BLK']].mean()
total_position_averages

Unnamed: 0_level_0,ORB,DRB,TRB,PTS,eFG%,3P,AST,TOV,STL,BLK
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C,161.1,374.06,535.16,851.98,0.61234,34.38,142.6,100.56,44.44,72.76
PF,85.833333,283.814815,369.648148,860.703704,0.551352,82.777778,157.648148,96.685185,47.62963,36.796296
PG,40.185185,189.314815,229.5,966.481481,0.523519,111.277778,344.833333,130.962963,66.222222,20.111111
PG-SG,59.0,245.0,304.0,1623.0,0.572,188.0,331.0,128.0,66.0,45.0
SF,60.421053,239.403509,299.824561,879.929825,0.546825,115.263158,153.368421,86.701754,57.403509,25.140351
SF-SG,59.5,192.5,252.0,982.0,0.5385,111.5,163.0,75.5,87.0,46.5
SG,40.757576,176.969697,217.727273,885.575758,0.541667,120.151515,189.5,98.348485,54.621212,18.515152
SG-PG,36.0,222.5,258.5,892.0,0.5175,130.5,354.5,102.5,65.0,32.5


In [70]:
per_game_position_avgs = clean_per_game_22_23.groupby("Pos")[['ORB', 'DRB', 'TRB', 'PTS', 'eFG%', '3P', 'AST', 'TOV', 'STL', 'BLK']].mean()
per_game_position_avgs

Unnamed: 0_level_0,ORB,DRB,TRB,PTS,eFG%,3P,AST,TOV,STL,BLK
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C,2.104615,4.803077,6.913846,10.835385,0.606277,0.466154,1.772308,1.3,0.595385,0.96
PF,1.179688,3.79375,4.973438,11.303125,0.551063,1.084375,2.073437,1.275,0.65,0.501563
PF-C,0.9,2.7,3.6,6.4,0.54,0.8,1.3,1.0,0.4,0.1
PG,0.581967,2.67377,3.24918,13.532787,0.516541,1.590164,4.857377,1.859016,0.947541,0.285246
PG-SG,0.65,2.8,3.45,16.05,0.572,1.9,3.95,1.4,0.85,0.45
SF,0.813043,3.152174,3.968116,11.452174,0.537304,1.508696,2.017391,1.152174,0.75942,0.33913
SF-PF,0.5,2.3,2.8,6.6,0.556,1.0,0.5,0.8,0.4,0.2
SF-SG,0.8,2.4,3.2,12.1,0.5385,1.4,2.0,0.9,1.15,0.6
SG,0.546753,2.42987,2.979221,12.203896,0.538208,1.65974,2.572727,1.354545,0.751948,0.258442
SG-PG,0.55,3.05,3.55,11.75,0.5175,1.75,4.7,1.35,0.85,0.45


From these stats, it may look like certain positions perform much better or worse than others; for example, the PG-SG position seems way better just in terms of both averages. However, this only person that applies to this position is All-Star Kyrie Irving.

In [52]:
clean_totals_22_23['Pos'].value_counts()

Pos
SG       66
SF       57
PF       54
PG       54
C        50
SG-PG     2
SF-SG     2
PG-SG     1
Name: count, dtype: int64

In [79]:
subset_total_avgs = total_position_averages.reset_index().copy()
subset_total_avgs = subset_total_avgs[subset_total_avgs['Pos'].isin(['PG','SG','SF','PF','C'])]
subset_total_avgs

Unnamed: 0,Pos,ORB,DRB,TRB,PTS,eFG%,3P,AST,TOV,STL,BLK
0,C,161.1,374.06,535.16,851.98,0.61234,34.38,142.6,100.56,44.44,72.76
1,PF,85.833333,283.814815,369.648148,860.703704,0.551352,82.777778,157.648148,96.685185,47.62963,36.796296
2,PG,40.185185,189.314815,229.5,966.481481,0.523519,111.277778,344.833333,130.962963,66.222222,20.111111
4,SF,60.421053,239.403509,299.824561,879.929825,0.546825,115.263158,153.368421,86.701754,57.403509,25.140351
6,SG,40.757576,176.969697,217.727273,885.575758,0.541667,120.151515,189.5,98.348485,54.621212,18.515152


In [80]:
subset_per_game_avgs = per_game_position_avgs.reset_index().copy()
subset_per_game_avgs = subset_per_game_avgs[subset_per_game_avgs['Pos'].isin(['PG','SG','SF','PF','C'])]
subset_per_game_avgs

Unnamed: 0,Pos,ORB,DRB,TRB,PTS,eFG%,3P,AST,TOV,STL,BLK
0,C,2.104615,4.803077,6.913846,10.835385,0.606277,0.466154,1.772308,1.3,0.595385,0.96
1,PF,1.179688,3.79375,4.973438,11.303125,0.551063,1.084375,2.073437,1.275,0.65,0.501563
2,PG,0.581967,2.67377,3.24918,13.532787,0.516541,1.590164,4.857377,1.859016,0.947541,0.285246
3,SF,0.813043,3.152174,3.968116,11.452174,0.537304,1.508696,2.017391,1.152174,0.75942,0.33913
4,SG,0.546753,2.42987,2.979221,12.203896,0.538208,1.65974,2.572727,1.354545,0.751948,0.258442


Now we can see that in both cases, there is a big enough data set to create approprivate averages. In general, Shoot Guards (Austin Reaves's Position) tend to have the least amount of rebounds, 2nd to most in points, best in three-pointers, 2nd in assists, 3rd in turnovers, 3rd in steals, last in blocks. Alternatively, where Shooting guards rank last, Centers and Power Forwards tend to rank first. So once again, I am going to subset the data. But from game's tactical perspective, Forwards and Guards have fundamentally have different roles on the court, so it makes sense that I am subsetting from a philosophical standpoint.

In [81]:
subset_total_avgs = subset_total_avgs[subset_total_avgs['Pos'].isin(['PG', 'SG', 'SF'])]
subset_per_game_avgs = subset_per_game_avgs[subset_per_game_avgs['Pos'].isin(['PG', 'SG', 'SF'])]
subset_total_avgs

Unnamed: 0,Pos,ORB,DRB,TRB,PTS,eFG%,3P,AST,TOV,STL,BLK
2,PG,40.185185,189.314815,229.5,966.481481,0.523519,111.277778,344.833333,130.962963,66.222222,20.111111
4,SF,60.421053,239.403509,299.824561,879.929825,0.546825,115.263158,153.368421,86.701754,57.403509,25.140351
6,SG,40.757576,176.969697,217.727273,885.575758,0.541667,120.151515,189.5,98.348485,54.621212,18.515152


In [88]:
ar_subset_avg = cleaned_ar.loc[:, ['ORB', 'DRB', 'TRB', 'PTS', 'FG%', '3P', 'AST', 'TOV', 'STL', 'BLK']].mean()
ar_subset_avg

ORB     0.515625
DRB          2.5
TRB     3.015625
PTS    13.015625
FG%     0.529984
3P       1.34375
AST     3.359375
TOV     1.546875
STL     0.515625
BLK     0.296875
dtype: object

In [87]:
ar_subset_sum = cleaned_ar.loc[:, ['ORB', 'DRB', 'TRB', 'PTS', 'FG%', '3P', 'AST', 'TOV', 'STL', 'BLK']].sum()
ar_subset_sum

ORB      33.0
DRB     160.0
TRB     193.0
PTS     833.0
FG%    33.919
3P       86.0
AST     215.0
TOV      99.0
STL      33.0
BLK      19.0
dtype: object