# Shot Quality based on Andrew Patton's Tutorial

In [None]:
import os, sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
from nbafuns import *

from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

# Folder holding the per-season shot-detail parquet files, relative to this notebook.
shots_DIR = "../fdata/ShotLocationData/"

# Andrew Patton's Code

In [None]:
# Configuration for the tutorial replication.
league = "NBA"
season = "2022"  # single-season label; the loading loop reuses this name as its loop variable
season_start, season_end = 2015, 2024
# One string label per season in [season_start, season_end).
seasons = np.arange(season_start, season_end).astype(str)

In [None]:
# Read one parquet file per season and stack them into a single frame.
shota = []
for season in seasons:
    df1 = pd.read_parquet(f"{shots_DIR}{league}_Shot_Details_{season}.parquet")
    shota.append(df1)
all_shots = pd.concat(shota)

# Normalise the schema: lower-case column names, shorter name for the make flag.
all_shots.columns = [col.lower() for col in all_shots.columns]
all_shots = all_shots.rename(columns={"shot_made_flag": "shot_made"}).assign(
    # Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
    loc_x=lambda d: d["loc_x"] / 10,
    loc_y=lambda d: d["loc_y"] / 10,
    # 0/1 indicators for the shot value, then points actually scored on the attempt.
    fg2=lambda d: np.where(d["shot_type"] == "2PT Field Goal", 1, 0),
    fg3=lambda d: np.where(d["shot_type"] == "3PT Field Goal", 1, 0),
    points=lambda d: (2 * d["fg2"] + 3 * d["fg3"]) * d["shot_made"],
)

# Keep only attempts with shot_distance of at most 32.
within_range = all_shots["shot_distance"] <= 32
shots = all_shots[within_range].reset_index(drop=True)

In [None]:
# fig, ax = plt.subplots(figsize=(4, 3))
# #plt.axis('equal')
# plt.xlim(-25, 25)
# plt.ylim(-4, 45)
# plt.title("Shots!")
# sns.scatterplot(
#     data=shots, 
#     x="loc_x", 
#     y="loc_y", 
#     hue="shot_type",
#     alpha = 0.25)
# plt.show()

In [None]:
# 75/25 random split with a fixed seed so the split is repeatable.
train_shots, test_shots = train_test_split(shots, train_size=0.75, random_state=42)

# Baseline: regress points on the two shot-value indicators only.
# statsmodels takes the response first and the design matrix second (the reverse of sklearn).
X_train = train_shots[['fg2', 'fg3']]
y_train = train_shots['points']
model = sm.OLS(endog=y_train, exog=X_train).fit()
model.summary()

In [None]:
# Score the held-out shots with the fitted baseline model.
y_test = test_shots['points']
X_test = test_shots[['fg2', 'fg3']]
test_shots['pred_PPS'] = model.predict(X_test).values

# Per player and shot type: points scored, attempts, and predicted points.
results_by_player = (test_shots
                     .groupby(['player_name', 'shot_type'])[['points', 'pred_PPS']]
                     .agg({'points': ['sum', 'count'],
                           'pred_PPS': ['sum']}))

results_by_player.columns = ['total_points', 'number_of_shots', 'total_pred_points']
results_by_player = results_by_player.reset_index()
# Residual is predicted minus actual: positive means the player scored less than predicted.
results_by_player['total_resid'] = results_by_player['total_pred_points'] - results_by_player['total_points']
results_by_player['resid_per_shot'] = results_by_player['total_resid']/results_by_player['number_of_shots']
results_by_player['pps_actual'] = results_by_player['total_points']/results_by_player['number_of_shots']
results_by_player['pps_pred'] = results_by_player['total_pred_points']/results_by_player['number_of_shots']

twos = results_by_player.loc[results_by_player['shot_type'] == "2PT Field Goal"]
threes = results_by_player.loc[results_by_player['shot_type'] == "3PT Field Goal"]

# The horizontal jitter only spreads the markers out; it is seeded so the figure
# is identical on every Restart & Run All.
jitter_rng = np.random.default_rng(42)
jittered = jitter_rng.normal(1, 0.05, len(twos))

f, ax = plt.subplots(figsize=(6, 10))
plt.title("I wish this was ggplot :(")
plt.ylabel("Actual PPS")
plt.xlabel("Jittered (1.10 predicted PPS)")
sns.scatterplot(data=twos,
                y="pps_actual",
                x=jittered,
                hue="resid_per_shot",
                size="number_of_shots",
                alpha=0.65,
                sizes=(20, 200),
                palette=sns.color_palette("vlag", as_cmap=True),
                ax=ax,
                )
plt.show()

# Attempt-weighted RMSE of predicted vs. actual points per shot.
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and
# removed in 1.6; for a single output the two are numerically the same quantity.
two_error = np.round(np.sqrt(mean_squared_error(y_true=twos['pps_actual'],
                                                y_pred=twos['pps_pred'],
                                                sample_weight=twos['number_of_shots'])), 3)

three_error = np.round(np.sqrt(mean_squared_error(y_true=threes['pps_actual'],
                                                  y_pred=threes['pps_pred'],
                                                  sample_weight=threes['number_of_shots'])), 3)

# Printed here because the values are not interpolated into a markdown cell.
print(two_error)
print(three_error)

In [None]:
# Second OLS: shot-value indicators plus shot distance.
# statsmodels takes the response first and the design matrix second (the reverse of sklearn).
X_train = train_shots[['fg2', 'fg3', 'shot_distance']]
X_test = test_shots[['fg2', 'fg3', 'shot_distance']]
model = sm.OLS(y_train, X_train).fit()
# A bare `model.summary()` in the middle of a cell is never displayed; print it explicitly.
print(model.summary())
test_shots['pred_PPS'] = model.predict(X_test).values

# Per player and shot type: points scored, attempts, and predicted points.
results_by_player = (test_shots
                     .groupby(['player_name', 'shot_type'])[['points', 'pred_PPS']]
                     .agg({'points': ['sum', 'count'],
                           'pred_PPS': ['sum']}))

results_by_player.columns = ['total_points', 'number_of_shots', 'total_pred_points']
results_by_player = results_by_player.reset_index()
# Residual is predicted minus actual: positive means the player scored less than predicted.
results_by_player['total_resid'] = results_by_player['total_pred_points'] - results_by_player['total_points']
results_by_player['resid_per_shot'] = results_by_player['total_resid']/results_by_player['number_of_shots']
results_by_player['pps_actual'] = results_by_player['total_points']/results_by_player['number_of_shots']
results_by_player['pps_pred'] = results_by_player['total_pred_points']/results_by_player['number_of_shots']

twos = results_by_player.loc[results_by_player['shot_type'] == "2PT Field Goal"]
threes = results_by_player.loc[results_by_player['shot_type'] == "3PT Field Goal"]

# Attempt-weighted RMSE of predicted vs. actual points per shot.
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and
# removed in 1.6; for a single output the two are numerically the same quantity.
two_error = np.round(np.sqrt(mean_squared_error(y_true=twos['pps_actual'],
                                                y_pred=twos['pps_pred'],
                                                sample_weight=twos['number_of_shots'])), 3)

three_error = np.round(np.sqrt(mean_squared_error(y_true=threes['pps_actual'],
                                                  y_pred=threes['pps_pred'],
                                                  sample_weight=threes['number_of_shots'])), 3)

# Printed here because the values are not interpolated into a markdown cell.
print(two_error)
print(three_error)

In [None]:
pd.set_option("mode.chained_assignment", None)

# Two extra 0/1 context flags: attempts at distance <= 6, and threes with loc_y <= 5.
is_rim = shots['shot_distance'] <= 6
is_corner_three = (shots['fg3'] == 1) & (shots['loc_y'] <= 5)
shots['rim_fg'] = np.where(is_rim, 1, 0)
shots['corner_three'] = np.where(is_corner_three, 1, 0)

# Re-split (same seed) so both partitions carry the new columns.
train_shots, test_shots = train_test_split(shots, train_size=0.75, random_state=42)

# Binomial GLM on the make flag; statsmodels adds no intercept by itself, so a
# constant column is prepended explicitly.
glm_features = ['fg2', "shot_distance", "rim_fg", "corner_three"]
y_train = train_shots['shot_made']
X_train = sm.add_constant(train_shots[glm_features])
model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial()).fit()
model.summary()

In [None]:
# Design matrix for the held-out shots, built the same way as the training one.
glm_features = ['fg2', "shot_distance", "rim_fg", "corner_three"]
X_test = sm.add_constant(test_shots[glm_features])
preds = model.predict(X_test).values

# Expected make probability and expected points (probability times shot value).
test_shots['xFG'] = preds
shot_value = np.where(test_shots['shot_type'] == "2PT Field Goal", 2, 3)
test_shots['xPTS'] = shot_value * test_shots['xFG']

# One row per (player, shot type): points, attempts, expected points, makes, mean xFG.
fg = test_shots.groupby(['player_name', 'shot_type']).agg(
    PTS=('points', 'sum'),
    number_of_shots=('points', 'count'),
    xPTS=('xPTS', 'sum'),
    number_of_makes=('shot_made', 'sum'),
    **{'xFG%': ('xFG', 'mean')},
)

# Two-point rows only, with rounded rates and presentation-friendly column names.
fg2 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '2PT Field Goal']
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG2A',
                     'number_of_makes': 'FG2M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG2A']).round(2)})
)
fg2[['Player', 'FG2A', 'FG2M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# Three-point rows of the per-player aggregate, with rounded rates and display names.
fg3 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '3PT Field Goal']
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG3A',
                     'number_of_makes': 'FG3M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG3A']).round(2)})
)
fg3[['Player', 'FG3A', 'FG3M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# Inner-join the two- and three-point tables on player; overlapping columns get _2 / _3 suffixes.
fg2.merge(fg3, on="Player", suffixes=("_2", "_3"))

In [None]:
# `fg` is the aggregated DataFrame indexed by (player_name, shot_type), not a GroupBy
# object, so it has no `.groups` / `.get_group` and the old expression raised
# AttributeError. Regroup on the player index level to get one small frame per player.
[player_rows for _, player_rows in fg.groupby(level="player_name")]

In [None]:
# Display the aggregate table indexed by (player_name, shot_type).
fg

# My Model: Statsmodels

In [None]:
# Training window: seasons season_start .. season_end - 1 (np.arange excludes the end).
league = "NBA"
season_start = 2010
season_end = 2023
seasons = np.arange(season_start, season_end, 1).astype(str)

# Read one parquet file per season and stack them.
shota = []
for season in seasons:
    df1 = pd.read_parquet(shots_DIR + league + "_Shot_Details_" + season + ".parquet")
    shota.append(df1)
all_shots = pd.concat(shota)
all_shots.columns = map(str.lower, all_shots.columns)
all_shots = all_shots.rename(columns={"shot_made_flag":"shot_made"})
# Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
all_shots["loc_x"] = all_shots["loc_x"]/10
all_shots["loc_y"] = all_shots["loc_y"]/10
all_shots['fg2'] = np.where(all_shots['shot_type'] == '2PT Field Goal', 1, 0)
all_shots['fg3'] = np.where(all_shots['shot_type'] == '3PT Field Goal', 1, 0)
all_shots["points"] = (2*all_shots['fg2'] + 3*all_shots['fg3'])*all_shots["shot_made"]
shots = all_shots.loc[all_shots['shot_distance'] <= 32].reset_index(drop=True)

# Collapse every zone whose name contains "Corner" into a single "Corner 3" label.
# This must be ONE .loc assignment: the chained form shots[col][mask] = value writes
# to a temporary and is a silent no-op under pandas Copy-on-Write, which would leave
# the Cor_3 indicator below all zeros. na=False keeps missing zone names out of the mask.
is_corner = shots["shot_zone_basic"].str.contains("Corner", na=False)
shots.loc[is_corner, "shot_zone_basic"] = "Corner 3"

# 0/1 indicator column per shot zone (insertion order fixes the column order).
zone_flags = {
    "RA": "Restricted Area",
    "Paint": "In The Paint (Non-RA)",
    "Abv_3": "Above the Break 3",
    "MidR": "Mid-Range",
    "Cor_3": "Corner 3",
}
for flag_name, zone_label in zone_flags.items():
    shots[flag_name] = np.where(shots["shot_zone_basic"] == zone_label, 1, 0)

# Every loaded shot is used for fitting in this section.
train_shots = shots.copy()

In [None]:
# No random split in this section: every shot in train_shots is used for fitting.
zone_features = ["shot_distance", "RA", "Paint", "MidR", "Abv_3", "Cor_3"]
y_train = train_shots['shot_made']
# statsmodels adds no intercept by itself, so a constant column is prepended.
# NOTE(review): if every retained shot falls in exactly one of the five zones, the
# dummies sum to the constant column (perfect collinearity) - confirm the individual
# coefficients are meant to be interpreted.
X_train = sm.add_constant(train_shots[zone_features])
model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial()).fit()
model.summary()

In [None]:
# Evaluation window: seasons season_start .. season_end - 1 (np.arange excludes the end).
league = "NBA"
season_start = 2023
season_end = 2024
seasons = np.arange(season_start, season_end, 1).astype(str)

# Read one parquet file per season and stack them.
shota = []
for season in seasons:
    df1 = pd.read_parquet(shots_DIR + league + "_Shot_Details_" + season + ".parquet")
    shota.append(df1)
all_shots = pd.concat(shota)
all_shots.columns = map(str.lower, all_shots.columns)
all_shots = all_shots.rename(columns={"shot_made_flag":"shot_made"})
# Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
all_shots["loc_x"] = all_shots["loc_x"]/10
all_shots["loc_y"] = all_shots["loc_y"]/10
all_shots['fg2'] = np.where(all_shots['shot_type'] == '2PT Field Goal', 1, 0)
all_shots['fg3'] = np.where(all_shots['shot_type'] == '3PT Field Goal', 1, 0)
all_shots["points"] = (2*all_shots['fg2'] + 3*all_shots['fg3'])*all_shots["shot_made"]
shots = all_shots.loc[all_shots['shot_distance'] <= 32].reset_index(drop=True)

# Collapse every zone whose name contains "Corner" into a single "Corner 3" label.
# This must be ONE .loc assignment: the chained form shots[col][mask] = value writes
# to a temporary and is a silent no-op under pandas Copy-on-Write, which would leave
# the Cor_3 indicator below all zeros. na=False keeps missing zone names out of the mask.
is_corner = shots["shot_zone_basic"].str.contains("Corner", na=False)
shots.loc[is_corner, "shot_zone_basic"] = "Corner 3"

# 0/1 indicator column per shot zone (insertion order fixes the column order).
zone_flags = {
    "RA": "Restricted Area",
    "Paint": "In The Paint (Non-RA)",
    "Abv_3": "Above the Break 3",
    "MidR": "Mid-Range",
    "Cor_3": "Corner 3",
}
for flag_name, zone_label in zone_flags.items():
    shots[flag_name] = np.where(shots["shot_zone_basic"] == zone_label, 1, 0)

# Every loaded shot is scored in this section.
test_shots = shots.copy()

In [None]:
# Same feature columns, in the same order, as the design matrix the model was fitted on.
zone_features = ["shot_distance", "RA", "Paint", "MidR", "Abv_3", "Cor_3"]
X_test = sm.add_constant(test_shots[zone_features])
preds = model.predict(X_test).to_numpy()

In [None]:
# Expected make probability and expected points (probability times shot value).
test_shots['xFG'] = preds
shot_value = np.where(test_shots['shot_type'] == "2PT Field Goal", 2, 3)
test_shots['xPTS'] = shot_value * test_shots['xFG']

# One row per (player, shot type): points, attempts, expected points, makes, mean xFG.
fg = test_shots.groupby(['player_name', 'shot_type']).agg(
    PTS=('points', 'sum'),
    number_of_shots=('points', 'count'),
    xPTS=('xPTS', 'sum'),
    number_of_makes=('shot_made', 'sum'),
    **{'xFG%': ('xFG', 'mean')},
)

In [None]:

# Ten highest-volume two-point shooters, with rounded rates and display names.
fg2 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '2PT Field Goal']
    .nlargest(10, 'number_of_shots')
    .sort_values('number_of_shots', ascending=False)
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG2A',
                     'number_of_makes': 'FG2M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG2A']).round(2)})
)
fg2_1 = fg2[['Player', 'FG2A', 'FG2M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# Ten highest-volume three-point shooters, with rounded rates and display names.
fg3 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '3PT Field Goal']
    .nlargest(10, 'number_of_shots')
    .sort_values('number_of_shots', ascending=False)
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG3A',
                     'number_of_makes': 'FG3M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG3A']).round(2)})
)
fg3_1 = fg3[['Player', 'FG3A', 'FG3M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# (scratch cell - no content)

# My Model: Statsmodels - Location

In [None]:
# Training window: seasons season_start .. season_end - 1 (np.arange excludes the end).
league = "NBA"
season_start, season_end = 2010, 2023
seasons = np.arange(season_start, season_end).astype(str)

# Read one parquet file per season and stack them into a single frame.
shota = []
for season in seasons:
    df1 = pd.read_parquet(f"{shots_DIR}{league}_Shot_Details_{season}.parquet")
    shota.append(df1)
all_shots = pd.concat(shota)

# Lower-case the column names, shorten the make flag, and derive the modelling columns.
all_shots.columns = [col.lower() for col in all_shots.columns]
all_shots = all_shots.rename(columns={"shot_made_flag": "shot_made"}).assign(
    # Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
    loc_x=lambda d: d["loc_x"] / 10,
    loc_y=lambda d: d["loc_y"] / 10,
    fg2=lambda d: np.where(d["shot_type"] == "2PT Field Goal", 1, 0),
    fg3=lambda d: np.where(d["shot_type"] == "3PT Field Goal", 1, 0),
    points=lambda d: (2 * d["fg2"] + 3 * d["fg3"]) * d["shot_made"],
)

# Keep only attempts with shot_distance of at most 32; all of them are used for fitting.
within_range = all_shots["shot_distance"] <= 32
shots = all_shots[within_range].reset_index(drop=True)
train_shots = shots.copy()

In [None]:
# Location-only model: binomial GLM of the make flag on the two court coordinates.
location_features = ["loc_x", "loc_y"]
y_train = train_shots['shot_made']
# statsmodels adds no intercept by itself, so a constant column is prepended.
X_train = sm.add_constant(train_shots[location_features])
model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial()).fit()
model.summary()

In [None]:
# Evaluation window: seasons season_start .. season_end - 1 (np.arange excludes the end).
league = "NBA"
season_start, season_end = 2023, 2024
seasons = np.arange(season_start, season_end).astype(str)

# Read one parquet file per season and stack them into a single frame.
shota = []
for season in seasons:
    df1 = pd.read_parquet(f"{shots_DIR}{league}_Shot_Details_{season}.parquet")
    shota.append(df1)
all_shots = pd.concat(shota)

# Lower-case the column names, shorten the make flag, and derive the modelling columns.
all_shots.columns = [col.lower() for col in all_shots.columns]
all_shots = all_shots.rename(columns={"shot_made_flag": "shot_made"}).assign(
    # Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
    loc_x=lambda d: d["loc_x"] / 10,
    loc_y=lambda d: d["loc_y"] / 10,
    fg2=lambda d: np.where(d["shot_type"] == "2PT Field Goal", 1, 0),
    fg3=lambda d: np.where(d["shot_type"] == "3PT Field Goal", 1, 0),
    points=lambda d: (2 * d["fg2"] + 3 * d["fg3"]) * d["shot_made"],
)

# Keep only attempts with shot_distance of at most 32; all of them are scored.
within_range = all_shots["shot_distance"] <= 32
shots = all_shots[within_range].reset_index(drop=True)
test_shots = shots.copy()

In [None]:
# Same feature columns, in the same order, as the design matrix the model was fitted on.
location_features = ["loc_x", "loc_y"]
X_test = sm.add_constant(test_shots[location_features])
preds = model.predict(X_test).to_numpy()

In [None]:
# Expected make probability and expected points (probability times shot value).
test_shots['xFG'] = preds
shot_value = np.where(test_shots['shot_type'] == "2PT Field Goal", 2, 3)
test_shots['xPTS'] = shot_value * test_shots['xFG']

# Aggregate per (player, shot type); the two-level column labels are flattened in the next cell.
agg_spec = {
    'points': ['sum', 'count'],
    'xPTS': ['sum'],
    'shot_made': ['sum'],
    'xFG': ['mean'],
}
fg = test_shots.groupby(['player_name', 'shot_type'])[list(agg_spec)].agg(agg_spec)

In [None]:

# Flatten the aggregate's column labels (this also prepares `fg` for the three-point cell).
fg.columns = ['PTS', 'number_of_shots', 'xPTS', 'number_of_makes', 'xFG%']

# Ten highest-volume two-point shooters, with rounded rates and display names.
fg2 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '2PT Field Goal']
    .nlargest(10, 'number_of_shots')
    .sort_values('number_of_shots', ascending=False)
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG2A',
                     'number_of_makes': 'FG2M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG2A']).round(2)})
)
fg2_2 = fg2[['Player', 'FG2A', 'FG2M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# Ten highest-volume three-point shooters, with rounded rates and display names.
fg3 = (
    fg.reset_index()
    .loc[lambda d: d['shot_type'] == '3PT Field Goal']
    .nlargest(10, 'number_of_shots')
    .sort_values('number_of_shots', ascending=False)
    .assign(**{
        'FG%': lambda d: (d['number_of_makes'] / d['number_of_shots']).round(3),
        'xFG%': lambda d: d['xFG%'].round(3),
        'xPTS': lambda d: d['xPTS'].round(3),
    })
    .rename(columns={'player_name': "Player",
                     'number_of_shots': 'FG3A',
                     'number_of_makes': 'FG3M'})
    # Shot making = points above expectation per attempt.
    .assign(**{'Shot Making': lambda d: ((d['PTS'] - d['xPTS']) / d['FG3A']).round(2)})
)
fg3_2 = fg3[['Player', 'FG3A', 'FG3M', 'FG%', 'xFG%', 'PTS', 'xPTS', 'Shot Making']]

In [None]:
# Top-10 two-point table from the zone-feature model.
fg2_1

In [None]:
# Top-10 two-point table from the location (loc_x, loc_y) model.
fg2_2

In [None]:
# Top-10 three-point table from the zone-feature model.
fg3_1

In [None]:
# Top-10 three-point table from the location (loc_x, loc_y) model.
fg3_2

In [None]:
# NOTE: this cell used to contain only an undefined bare name, which raised NameError
# and aborted "Run All" at this point. It is now a comment so the notebook executes
# top to bottom; interrupt manually if the scikit-learn section is not wanted.

# My Model: Scikit-learn

In [None]:
# Seasons season_start .. season_end - 1 (np.arange excludes the end).
league = "NBA"
season_start, season_end = 2010, 2023
seasons = np.arange(season_start, season_end).astype(str)

# Read one parquet file per season and stack them into a single frame.
shota = []
for season in seasons:
    df1 = pd.read_parquet(f"{shots_DIR}{league}_Shot_Details_{season}.parquet")
    shota.append(df1)
all_shots = pd.concat(shota)

# Lower-case the column names, shorten the make flag, and derive the modelling columns.
all_shots.columns = [col.lower() for col in all_shots.columns]
all_shots = all_shots.rename(columns={"shot_made_flag": "shot_made"}).assign(
    # Coordinates are divided by 10 (presumably tenths of a foot -> feet; confirm against the data source).
    loc_x=lambda d: d["loc_x"] / 10,
    loc_y=lambda d: d["loc_y"] / 10,
    fg2=lambda d: np.where(d["shot_type"] == "2PT Field Goal", 1, 0),
    fg3=lambda d: np.where(d["shot_type"] == "3PT Field Goal", 1, 0),
    points=lambda d: (2 * d["fg2"] + 3 * d["fg3"]) * d["shot_made"],
)

# Keep only attempts with shot_distance of at most 32.
within_range = all_shots["shot_distance"] <= 32
shots = all_shots[within_range].reset_index(drop=True)

In [None]:
# Store the two zone descriptors with pandas' categorical dtype.
for zone_col in ("shot_zone_basic", "shot_zone_area"):
    shots[zone_col] = shots[zone_col].astype("category")

In [None]:
# One indicator column per shot_zone_basic category; all other columns pass through unchanged.
df_encoded = pd.get_dummies(data=shots, columns=["shot_zone_basic"])

In [None]:
# Display the frame with the shot_zone_basic indicator columns.
df_encoded

In [None]:
# Inspect the first row of the filtered shots frame, one field per line.
shots.iloc[0]

In [None]:
# Group by a scalar key rather than a one-element list: with a length-1 list key,
# get_group("<name>") is deprecated in pandas 2.2 and requires a 1-tuple afterwards,
# whereas a scalar key accepts the plain player name on every pandas version.
shots_group = shots.groupby("player_name")

In [None]:
# Display the GroupBy object itself (its repr, not the grouped rows).
shots_group

In [None]:
# Pull one player's rows out of the grouping, restricted to the columns of interest.
player_columns = ["player_name", "shot_made", "shot_zone_basic", "shot_zone_area"]
shots_group[player_columns].get_group("Stephen Curry")