In [1]:
import pandas as pd

In [2]:
all_shots = pd.read_csv("shots.csv") # original dataset

In [3]:
# Convert shot type to numeric shot value
all_shots["SHOT_VALUE"] = all_shots["SHOT_TYPE"].apply(lambda x: 3 if "3PT" in x else 2)

# Get time to end of period in seconds
all_shots["TIME_LEFT_SEC"] = all_shots["MINUTES_REMAINING"] * 60 + all_shots["SECONDS_REMAINING"]

all_shots = all_shots.drop(columns=["MINUTES_REMAINING", "SECONDS_REMAINING", "SHOT_TYPE"], errors = 'ignore')

# Removing variables which will never be necessary
cols_to_drop = [
    "GRID_TYPE",
    "SHOT_ATTEMPTED_FLAG",
    "PLAYER_ID",
    "TEAM_ID",
    "EVENT_TYPE",
    "GAME_DATE",
    "HTM",
    "VTM",
    "GAME_ID",
    "GAME_EVENT_ID",
    "LOC_X",
    "LOC_Y"

]

In [4]:
# SHOT FREQUENCY OF EACH PLAYER

player_counts = all_shots['PLAYER_NAME'].value_counts()
print(player_counts.describe())

count     443.000000
mean      463.968397
std       357.667423
min         2.000000
25%       174.500000
50%       388.000000
75%       691.500000
max      1617.000000
Name: count, dtype: float64


In [5]:
# Count shots per player
player_counts = all_shots['PLAYER_NAME'].value_counts()

# Keep only players with 175 or more shots
players_to_keep = player_counts[player_counts >= 175].index

# Filter the dataframe
all_shots = all_shots[all_shots['PLAYER_NAME'].isin(players_to_keep)].copy()

player_counts = all_shots['PLAYER_NAME'].value_counts()
print(player_counts.describe())

count     332.000000
mean      594.015060
std       319.866215
min       175.000000
25%       336.500000
50%       512.500000
75%       781.000000
max      1617.000000
Name: count, dtype: float64


In [17]:
# ORIGINAL DATASET CLEANED
all_shots_clean = all_shots.drop(columns=cols_to_drop, errors='ignore')

# Keep only players with 174 or more shots
players_to_keep = player_counts[player_counts >= 174].index

# Filter the dataframe
all_shots_clean = all_shots_clean[all_shots_clean['PLAYER_NAME'].isin(players_to_keep)].copy()

# COMBINES THREE SHOT DESCRIPTORS INTO ONE
all_shots_zones_comb = all_shots_clean.copy()
all_shots_zones_comb['SHOT_ZONE'] = all_shots_zones_comb['SHOT_ZONE_BASIC'] + " - " + all_shots_zones_comb['SHOT_ZONE_AREA'] + " - " + all_shots_zones_comb['SHOT_ZONE_RANGE']
all_shots_zones_comb = all_shots_zones_comb.drop(columns=["SHOT_ZONE_BASIC","SHOT_ZONE_AREA","SHOT_ZONE_RANGE"], errors = 'ignore')

In [18]:
# SHOW ZONE REDUCTION (difference between orig & _zones dataframes )

In [19]:
cols = ["SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE"]

# Get unique values for each column
unique_lists = [list(all_shots_clean[col].unique()) for col in cols]

# Find max length to pad shorter lists
max_len = max(len(lst) for lst in unique_lists)

# Pad each list with empty strings so they align
padded_lists = [lst + [""]*(max_len - len(lst)) for lst in unique_lists]

# Combine into a DataFrame
unique_df = pd.DataFrame({col: padded for col, padded in zip(cols, padded_lists)})

print(unique_df)

         SHOT_ZONE_BASIC         SHOT_ZONE_AREA  SHOT_ZONE_RANGE
0      Above the Break 3  Right Side Center(RC)          24+ ft.
1              Mid-Range   Left Side Center(LC)        16-24 ft.
2        Restricted Area          Right Side(R)  Less Than 8 ft.
3  In The Paint (Non-RA)              Center(C)         8-16 ft.
4         Right Corner 3           Left Side(L)  Back Court Shot
5          Left Corner 3         Back Court(BC)                 
6              Backcourt                                        


In [20]:
# Get value counts for ACTION_TYPE
zone_counts = all_shots_zones_comb["SHOT_ZONE"].value_counts()

# Print each unique action type with its frequency
for action, count in zone_counts.items():
    print(f"{action}: {count}")

Restricted Area - Center(C) - Less Than 8 ft.: 63602
In The Paint (Non-RA) - Center(C) - Less Than 8 ft.: 18394
Above the Break 3 - Left Side Center(LC) - 24+ ft.: 15834
Above the Break 3 - Right Side Center(RC) - 24+ ft.: 15116
Above the Break 3 - Center(C) - 24+ ft.: 10428
Mid-Range - Right Side Center(RC) - 16-24 ft.: 7737
Left Corner 3 - Left Side(L) - 24+ ft.: 7458
Mid-Range - Left Side Center(LC) - 16-24 ft.: 7338
Mid-Range - Right Side(R) - 8-16 ft.: 7279
Mid-Range - Left Side(L) - 8-16 ft.: 7272
Right Corner 3 - Right Side(R) - 24+ ft.: 7006
Mid-Range - Center(C) - 16-24 ft.: 6972
In The Paint (Non-RA) - Center(C) - 8-16 ft.: 6508
Mid-Range - Left Side(L) - 16-24 ft.: 5158
Mid-Range - Right Side(R) - 16-24 ft.: 4689
Mid-Range - Center(C) - 8-16 ft.: 2152
In The Paint (Non-RA) - Left Side(L) - 8-16 ft.: 1990
In The Paint (Non-RA) - Right Side(R) - 8-16 ft.: 1830
Backcourt - Back Court(BC) - Back Court Shot: 397
Above the Break 3 - Back Court(BC) - Back Court Shot: 53


In [21]:
# CAN REMOVE BACKCOURT CHOTS HERE
all_shots_clean_ = all_shots_clean[
    ~(
        all_shots_clean['SHOT_ZONE_BASIC'].str.contains('Backcourt', na=False) |
        all_shots_clean['SHOT_ZONE_AREA'].str.contains('Back Court', na=False) |
        all_shots_clean['SHOT_ZONE_RANGE'].str.contains('Back Court', na=False)
    )
]

all_shots_zones_comb = all_shots_zones_comb[~all_shots_zones_comb ['SHOT_ZONE'].str.contains('Back Court', na=False)]

In [22]:
# SHOW SHOT CATEGORIZATION (difference between orig & _cat dataframes)

In [29]:
import importlib
import CategorizeShots

importlib.reload(CategorizeShots)
from CategorizeShots import categorize_shot

# 'cat' dataframes categorize 57 different shot types into 5 categories
# Bank, Hook, Dunk, Layup, Jump Shot, Other

all_shots_clean_cat = all_shots_clean.copy()
all_shots_zones_comb_cat = all_shots_zones_comb.copy()

all_shots_clean_cat["SHOT_CATEGORY"] = all_shots_clean_cat["ACTION_TYPE"].apply(categorize_shot)
all_shots_zones_comb_cat["SHOT_CATEGORY"] = all_shots_zones_comb_cat["ACTION_TYPE"].apply(categorize_shot)

all_shots_clean_cat = all_shots_clean_cat.drop(columns=["ACTION_TYPE"], errors = 'ignore')
all_shots_zones_comb_cat = all_shots_zones_comb_cat.drop(columns=["ACTION_TYPE"], errors = 'ignore')

In [30]:
# Get value counts for ACTION_TYPE
action_counts = all_shots_clean["ACTION_TYPE"].value_counts()

# Print each unique action type with its frequency
for action, count in action_counts.items():
    print(f"{action}: {count}")

Jump Shot: 93487
Layup Shot: 17102
Driving Layup Shot: 12488
Pullup Jump shot: 11781
Floating Jump shot: 4954
Hook Shot: 4430
Step Back Jump shot: 4334
Tip Layup Shot: 3888
Running Layup Shot: 3432
Turnaround Jump Shot: 3393
Cutting Layup Shot: 3041
Dunk Shot: 3002
Fadeaway Jump Shot: 2891
Driving Finger Roll Layup Shot: 2134
Driving Floating Jump Shot: 2116
Putback Layup Shot: 1832
Reverse Layup Shot: 1802
Running Jump Shot: 1764
Turnaround Hook Shot: 1695
Jump Bank Shot: 1689
Turnaround Fadeaway shot: 1600
Alley Oop Dunk Shot: 1395
Cutting Dunk Shot: 1395
Driving Reverse Layup Shot: 1376
Driving Dunk Shot: 1369
Running Dunk Shot: 1231
Driving Hook Shot: 947
Driving Bank shot: 934
Alley Oop Layup shot: 817
Finger Roll Layup Shot: 619
Putback Dunk Shot: 583
Driving Floating Bank Jump Shot: 488
Running Finger Roll Layup Shot: 454
Cutting Finger Roll Layup Shot: 358
Turnaround Bank shot: 355
Running Pull-Up Jump Shot: 333
Running Reverse Layup Shot: 271
Pullup Bank shot: 269
Tip Dunk Sho

In [31]:
# Get value counts for ACTION_TYPE
shot_category_counts = all_shots_clean_cat["SHOT_CATEGORY"].value_counts()

# Print each unique action type with its frequency
for action, count in shot_category_counts.items():
    print(f"{action}: {count}")

Jump Shot: 126721
Layup Shot: 49645
Dunk Shot: 9497
Hook Shot: 7250
Bank Shot: 4079
Other: 21


In [32]:
# REMOVE NO SHOT / OTHER SHOT 
all_shots_clean = all_shots_clean[~all_shots_clean['ACTION_TYPE'].str.contains('No Shot', na=False)]
all_shots_zones_comb = all_shots_zones_comb[~all_shots_zones_comb['ACTION_TYPE'].str.contains('No Shot', na=False)]

all_shots_clean_cat = all_shots_clean_cat[~all_shots_clean_cat['SHOT_CATEGORY'].str.contains('OTHER', na=False)]
all_shots_zones_comb_cat = all_shots_zones_comb_cat[~all_shots_zones_comb_cat['SHOT_CATEGORY'].str.contains('OTHER', na=False)]

In [33]:
# SAVE TO CSVs

# Seperate zones, 57 shot types
all_shots_clean.to_csv("shots_clean_all_types.csv", index=False)

# Merged zones, 57 shot types
all_shots_zones_comb.to_csv("shots_comb_zones_all_types.csv", index=False)

# Seperate zones, 5 shot types
all_shots_clean_cat.to_csv("shots_clean_5_types.csv", index=False)

# Merged zones, 5 shot types
all_shots_zones_comb_cat.to_csv("shots_comb_zones_5_types.csv", index=False)