In [1]:
import pandas as pd

In [2]:
# Load the raw shot-level table; later cells derive columns from it and filter it by player.
all_shots = pd.read_csv("shots.csv") # original dataset

In [3]:
# Convert shot type to numeric shot value: 3 when SHOT_TYPE contains "3PT", otherwise 2.
# Vectorised literal match instead of a per-row lambda; a missing SHOT_TYPE yields a missing
# SHOT_VALUE rather than raising a TypeError from `"3PT" in x`.
all_shots["SHOT_VALUE"] = (
    all_shots["SHOT_TYPE"]
    .str.contains("3PT", regex=False)
    .map({True: 3, False: 2})
)

# Variables that will never be necessary, even if some moderately improve accuracy
# (e.g. backcourt shots being missed) -- details like these are not significant for coaching staff.
# SHOT_TYPE is listed because SHOT_VALUE, derived above, replaces it.
cols_to_drop = [
    "GRID_TYPE",
    "SHOT_ATTEMPTED_FLAG",
    "PLAYER_ID",
    "TEAM_ID",
    "EVENT_TYPE",
    "GAME_DATE",
    "HTM",
    "VTM",
    "GAME_ID",
    "GAME_EVENT_ID",
    "LOC_X",
    "LOC_Y",
    "MINUTES_REMAINING",
    "SECONDS_REMAINING",
    "SHOT_TYPE",
    "PERIOD",
]

In [4]:
# SHOT FREQUENCY OF EACH PLAYER
# One entry per PLAYER_NAME holding that player's number of rows (shots) in all_shots.
player_counts = all_shots['PLAYER_NAME'].value_counts()

# Summary statistics of the per-player shot counts
shot_count_summary = player_counts.describe()
print(shot_count_summary)

count     443.000000
mean      463.968397
std       357.667423
min         2.000000
25%       174.500000
50%       388.000000
75%       691.500000
max      1617.000000
Name: count, dtype: float64


In [5]:
# Minimum number of shots a player needs to stay in the dataset. 175 sits just above the
# 25th percentile (174.5) of the per-player shot counts printed by the previous cell.
MIN_SHOTS_PER_PLAYER = 175

# Count shots per player
player_counts = all_shots['PLAYER_NAME'].value_counts()

# Keep only players with at least MIN_SHOTS_PER_PLAYER shots
players_to_keep = player_counts[player_counts >= MIN_SHOTS_PER_PLAYER].index

# Filter the dataframe (copy so later column assignments do not touch a view)
all_shots = all_shots[all_shots['PLAYER_NAME'].isin(players_to_keep)].copy()

# Recount on the filtered frame so the summary reflects only the retained players
player_counts = all_shots['PLAYER_NAME'].value_counts()
print(player_counts.describe())

count     332.000000
mean      594.015060
std       319.866215
min       175.000000
25%       336.500000
50%       512.500000
75%       781.000000
max      1617.000000
Name: count, dtype: float64


In [6]:
# ORIGINAL DATASET CLEANED
# all_shots was already restricted to players with 175 or more shots in the previous cell, and
# player_counts was recomputed from that filtered frame, so the former ">= 174" re-filter here
# could never remove a row; it has been dropped to avoid two conflicting thresholds.
all_shots_clean = all_shots.drop(columns=cols_to_drop, errors='ignore')

# COMBINE ZONE DESCRIPTORS INTO ONE
# SHOT_ZONE is "<SHOT_ZONE_BASIC> - <SHOT_ZONE_AREA>"; SHOT_ZONE_RANGE is dropped, not merged in.
all_shots_zones_comb = (
    all_shots_clean
    .assign(SHOT_ZONE=all_shots_clean['SHOT_ZONE_BASIC'] + " - " + all_shots_clean['SHOT_ZONE_AREA'])
    .drop(columns=["SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE"], errors='ignore')
)

In [7]:
# SHOW ZONE REDUCTION (difference between orig & _zones dataframes )

In [8]:
cols = ["SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE"]

# Distinct labels of each zone column, in order of first appearance
unique_lists = []
for col in cols:
    unique_lists.append(list(all_shots_clean[col].unique()))

# The longest list decides how many rows the side-by-side table needs
max_len = max(map(len, unique_lists))

# Right-pad the shorter lists with empty strings so every column has max_len entries
padded_lists = []
for labels in unique_lists:
    padded_lists.append(labels + [""] * (max_len - len(labels)))

# One table column per zone descriptor
unique_df = pd.DataFrame(dict(zip(cols, padded_lists)))

print(unique_df)

         SHOT_ZONE_BASIC         SHOT_ZONE_AREA  SHOT_ZONE_RANGE
0      Above the Break 3  Right Side Center(RC)          24+ ft.
1              Mid-Range   Left Side Center(LC)        16-24 ft.
2        Restricted Area          Right Side(R)  Less Than 8 ft.
3  In The Paint (Non-RA)              Center(C)         8-16 ft.
4         Right Corner 3           Left Side(L)  Back Court Shot
5          Left Corner 3         Back Court(BC)                 
6              Backcourt                                        


In [9]:
# Number of shots in each combined zone, and the same figure as a percentage of all shots
zone_counts = all_shots_zones_comb["SHOT_ZONE"].value_counts()
zone_percent = zone_counts / zone_counts.sum() * 100

# Both series share the same index order, so they can be walked in lockstep
for zone, count, pct in zip(zone_counts.index, zone_counts, zone_percent):
    print(f"{zone}: {count}  ({pct:.2f}%)")


Restricted Area - Center(C): 63602  (32.25%)
In The Paint (Non-RA) - Center(C): 24902  (12.63%)
Above the Break 3 - Left Side Center(LC): 15834  (8.03%)
Above the Break 3 - Right Side Center(RC): 15116  (7.66%)
Mid-Range - Left Side(L): 12430  (6.30%)
Mid-Range - Right Side(R): 11968  (6.07%)
Above the Break 3 - Center(C): 10428  (5.29%)
Mid-Range - Center(C): 9124  (4.63%)
Mid-Range - Right Side Center(RC): 7737  (3.92%)
Left Corner 3 - Left Side(L): 7458  (3.78%)
Mid-Range - Left Side Center(LC): 7338  (3.72%)
Right Corner 3 - Right Side(R): 7006  (3.55%)
In The Paint (Non-RA) - Left Side(L): 1990  (1.01%)
In The Paint (Non-RA) - Right Side(R): 1830  (0.93%)
Backcourt - Back Court(BC): 397  (0.20%)
Above the Break 3 - Back Court(BC): 53  (0.03%)


In [10]:
# CAN REMOVE BACKCOURT SHOTS HERE
# Flag rows whose basic zone, area, or range label marks a backcourt attempt
# (missing labels count as "not backcourt").
is_backcourt = (
    all_shots_clean['SHOT_ZONE_BASIC'].str.contains('Backcourt', na=False)
    | all_shots_clean['SHOT_ZONE_AREA'].str.contains('Back Court', na=False)
    | all_shots_clean['SHOT_ZONE_RANGE'].str.contains('Back Court', na=False)
)

# NOTE(review): the result is stored under a trailing-underscore name, and the later cells shown
# in this notebook keep reading all_shots_clean (no underscore) -- confirm whether that is intended.
all_shots_clean_ = all_shots_clean[~is_backcourt]

# Drop rows of the combined-zone frame whose SHOT_ZONE label contains 'Back Court'
in_back_court_zone = all_shots_zones_comb['SHOT_ZONE'].str.contains('Back Court', na=False)
all_shots_zones_comb = all_shots_zones_comb[~in_back_court_zone]

In [11]:
# SHOW SHOT CATEGORIZATION (difference between orig & _cat dataframes)

In [12]:
# Share of all shots accounted for by each action type
action_props = all_shots_clean["ACTION_TYPE"].value_counts(normalize=True)

# Action types that make up at least 1% of the shots
top_actions = action_props.loc[action_props.ge(0.01)].index

# Rows belonging to those action types only
actions_filtered = all_shots_clean.loc[all_shots_clean["ACTION_TYPE"].isin(top_actions)].copy()

# Counts and percentages are recomputed on the filtered rows, so the percentages sum to 100
action_counts = actions_filtered["ACTION_TYPE"].value_counts()
action_percentages = actions_filtered["ACTION_TYPE"].value_counts(normalize=True).mul(100)

# Side-by-side view: one row per action type
action_summary = action_counts.to_frame(name="Count").assign(Percentage=action_percentages)

print(action_summary)

                                Count  Percentage
ACTION_TYPE                                      
Jump Shot                       93487   54.203846
Layup Shot                      17102    9.915755
Driving Layup Shot              12488    7.240554
Pullup Jump shot                11781    6.830634
Floating Jump shot               4954    2.872334
Hook Shot                        4430    2.568518
Step Back Jump shot              4334    2.512857
Tip Layup Shot                   3888    2.254266
Running Layup Shot               3432    1.989877
Turnaround Jump Shot             3393    1.967264
Cutting Layup Shot               3041    1.763175
Dunk Shot                        3002    1.740562
Fadeaway Jump Shot               2891    1.676204
Driving Finger Roll Layup Shot   2134    1.237295
Driving Floating Jump Shot       2116    1.226859


In [13]:
# REMOVE NO SHOT ROWS
# Only rows whose ACTION_TYPE contains 'No Shot' are filtered out here (missing ACTION_TYPE is kept);
# there is no separate filter for an "other shot" category in this cell.
all_shots_zones_comb = all_shots_zones_comb[~all_shots_zones_comb['ACTION_TYPE'].str.contains('No Shot', na=False)]

In [14]:
# SAVE TO CSV

# Merged-zone frame, written without the index column so it reloads with the same columns.
# Original note: "57 shot types" -- that count is not computed anywhere in this notebook; TODO confirm.
all_shots_zones_comb.to_csv("shots_for_model.csv", index=False)

In [15]:
# Hyperparameter tuning for the XGBoost model

In [16]:
# Reload the modelling table from the file written by the save cell above
shots = pd.read_csv("shots_for_model.csv")

In [17]:
# Banner, column names of the modelling table, then one blank line
print("=== shots ===", list(shots.columns), "", sep="\n")

=== shots ===
['PLAYER_NAME', 'TEAM_NAME', 'ACTION_TYPE', 'SHOT_DISTANCE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'SHOT_ZONE']



In [27]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
import numpy as np
import warnings
warnings.filterwarnings("ignore", message=".*use_label_encoder.*")



# --------------------------
# DATA & FEATURES
# --------------------------
# Predictor columns fed to the model
features = [
    "PLAYER_NAME",
    "TEAM_NAME",
    "ACTION_TYPE",
    "SHOT_ZONE",
    "SHOT_DISTANCE",
]

# Label column
target = "SHOT_MADE_FLAG"

# --------------------------
# PARAM GRID (first tuning round)
# --------------------------
# Candidate values per XGBoost hyperparameter. The "xgb__" prefix added below addresses the
# pipeline step named "xgb".
_xgb_candidates = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "max_depth": [3, 5, 8, 10, 12],
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "subsample": [0.5, 0.75, 1.0],
    "colsample_bytree": [0.5, 0.75, 1.0],
}
param_grid = {f"xgb__{name}": values for name, values in _xgb_candidates.items()}

# --------------------------
# TRAIN + TUNE
# --------------------------
def train_xgb_tuned(shots, features, target, param_distributions=None):
    """Tune an XGBoost classifier with a randomized search and score it on a hold-out split.

    Non-numeric feature columns are one-hot encoded (categories not seen during fitting are
    ignored at prediction time); all other feature columns are passed through unchanged.
    The search is fitted on a stratified 80% training split, and the refitted best pipeline
    is evaluated on the remaining 20%.

    Parameters
    ----------
    shots : pandas.DataFrame
        Table containing every column named in ``features`` plus ``target``.
    features : list of str
        Names of the predictor columns.
    target : str
        Name of the label column. It is scored with the default binary ``f1_score``, so it
        is presumably a 0/1 flag -- confirm against the data.
    param_distributions : dict, optional
        Search space keyed by pipeline parameter name (``"xgb__..."``). Defaults to the
        notebook-level ``param_grid``, which is the space this function always used before.

    Returns
    -------
    tuple
        ``(best_model, X_test, y_test, y_proba, accuracy, f1)``. ``y_proba`` is column 1 of
        ``predict_proba`` on the test split; ``accuracy`` and ``f1`` are both computed from
        those probabilities thresholded at 0.5.
    """
    if param_distributions is None:
        # Resolved at call time so the notebook-level grid can be edited between runs
        param_distributions = param_grid

    X = shots[features]
    y = shots[target]

    # Treat every non-numeric column as categorical. Checking numeric-ness (rather than
    # dtype == "object") also catches string/category dtypes that are not stored as object.
    cat_features = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

    preprocessor = ColumnTransformer(
        [("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)],
        remainder="passthrough"
    )

    # Base pipeline: encoding happens inside the pipeline, so each CV fold fits its own encoder
    base_model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            eval_metric="logloss",
            n_jobs=1,
            random_state=42
        ))
    ])

    # Randomized search using F1
    search = RandomizedSearchCV(
        base_model,
        param_distributions=param_distributions,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=3,
        verbose=1,
        n_jobs=1,
        random_state=42
    )

    # Train/test split; the test part is never seen by the search
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest Hyperparameters:")
    print(search.best_params_)

    # Evaluate: predict once and derive both metrics from the same thresholded predictions
    y_proba = best_model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > 0.5).astype(int)

    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    f1 = f1_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.3f} | F1: {f1:.3f}")

    return best_model, X_test, y_test, y_proba, accuracy, f1


# --------------------------
# RUN
# --------------------------
# Runs the whole search (20 sampled candidates x 3 CV folds = 60 fits) and keeps the fitted
# pipeline, the hold-out split, the test probabilities and both metrics at notebook level.
best_model, X_test, y_test, y_proba, acc, f1 = train_xgb_tuned(shots, features, target)

Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Hyperparameters:
{'xgb__subsample': 0.5, 'xgb__n_estimators': 750, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.2, 'xgb__colsample_bytree': 1.0}

Accuracy: 0.637 | F1: 0.550


In [19]:
# Second round of tuning. Best result noted from an earlier run: Accuracy 0.654 | F1 0.559 (the output recorded below shows 0.641 | 0.552).

In [28]:
# --------------------------
# DATA & FEATURES
# --------------------------
# Predictor columns fed to the model
features = [
    "PLAYER_NAME",
    "TEAM_NAME",      # significant because of coaching style and shot selection
    "ACTION_TYPE",
    "SHOT_ZONE",
    "SHOT_DISTANCE",  # more precise insight into tendencies
]

# Label column
target = "SHOT_MADE_FLAG"

# --------------------------
# PARAM GRID
# --------------------------
# Candidate values per XGBoost hyperparameter for this tuning round. The "xgb__" prefix added
# below addresses the pipeline step named "xgb".
_xgb_candidates2 = {
    "n_estimators": [650, 750, 850],
    "max_depth": [6, 7, 8, 9],
    "learning_rate": [0.15, 0.2, 0.25],
    "subsample": [0.25, 0.5, 0.75],
    "colsample_bytree": [0.8, 0.9, 1.0],
}
param_grid2 = {f"xgb__{name}": values for name, values in _xgb_candidates2.items()}

# --------------------------
# TRAIN + TUNE
# --------------------------
def train_xgb_tuned2(shots, features, target, param_distributions=None):
    """Second-round randomized search for the XGBoost classifier, scored on a hold-out split.

    Same procedure as the first tuning round, but the default search space is the
    notebook-level ``param_grid2``. Non-numeric feature columns are one-hot encoded
    (categories not seen during fitting are ignored at prediction time); all other feature
    columns are passed through unchanged. The search is fitted on a stratified 80% training
    split, and the refitted best pipeline is evaluated on the remaining 20%.

    Parameters
    ----------
    shots : pandas.DataFrame
        Table containing every column named in ``features`` plus ``target``.
    features : list of str
        Names of the predictor columns.
    target : str
        Name of the label column. It is scored with the default binary ``f1_score``, so it
        is presumably a 0/1 flag -- confirm against the data.
    param_distributions : dict, optional
        Search space keyed by pipeline parameter name (``"xgb__..."``). Defaults to
        ``param_grid2``, which is the space this function always used before. Passing a
        space explicitly removes the need for one copy of this function per grid.

    Returns
    -------
    tuple
        ``(best_model, X_test, y_test, y_proba, accuracy, f1)``. ``y_proba`` is column 1 of
        ``predict_proba`` on the test split; ``accuracy`` and ``f1`` are both computed from
        those probabilities thresholded at 0.5.
    """
    if param_distributions is None:
        # Resolved at call time so the notebook-level grid can be edited between runs
        param_distributions = param_grid2

    X = shots[features]
    y = shots[target]

    # Treat every non-numeric column as categorical. Checking numeric-ness (rather than
    # dtype == "object") also catches string/category dtypes that are not stored as object.
    cat_features = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

    preprocessor = ColumnTransformer(
        [("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)],
        remainder="passthrough"
    )

    # Base pipeline: encoding happens inside the pipeline, so each CV fold fits its own encoder
    base_model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            eval_metric="logloss",
            n_jobs=1,
            random_state=42
        ))
    ])

    # Randomized search using F1
    search = RandomizedSearchCV(
        base_model,
        param_distributions=param_distributions,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=3,
        verbose=1,
        n_jobs=1,
        random_state=42
    )

    # Train/test split; the test part is never seen by the search
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest Hyperparameters:")
    print(search.best_params_)

    # Evaluate: predict once and derive both metrics from the same thresholded predictions
    y_proba = best_model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > 0.5).astype(int)

    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    f1 = f1_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.3f} | F1: {f1:.3f}")

    return best_model, X_test, y_test, y_proba, accuracy, f1


# --------------------------
# RUN
# --------------------------
# Runs the second search (20 sampled candidates x 3 CV folds = 60 fits). The results are bound
# to the same notebook-level names as the first run, so best_model, X_test, y_test, y_proba,
# acc and f1 now refer to this round.
best_model, X_test, y_test, y_proba, acc, f1 = train_xgb_tuned2(shots, features, target)

Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Hyperparameters:
{'xgb__subsample': 0.75, 'xgb__n_estimators': 650, 'xgb__max_depth': 9, 'xgb__learning_rate': 0.15, 'xgb__colsample_bytree': 0.9}

Accuracy: 0.641 | F1: 0.552


In [30]:
# --------------------------
# Top 3 teams by Accuracy and by F1
# --------------------------
# NOTE(review): team_results is not defined in any cell visible in this notebook. It is read
# here as a mapping of team name -> dict with "accuracy" and "f1" entries.
def _top_teams(metric, k=3):
    """Return the k (team, metrics) pairs with the highest value of `metric`."""
    ranked = sorted(team_results.items(), key=lambda item: item[1][metric], reverse=True)
    return ranked[:k]


top_acc = _top_teams("accuracy")
top_f1 = _top_teams("f1")

# Same report layout for both rankings; the second heading carries the separating blank line
for heading, ranking in (
    ("Top 3 Teams by Accuracy:", top_acc),
    ("\nTop 3 Teams by F1:", top_f1),
):
    print(heading)
    for team, metrics in ranking:
        print(f"{team}: Accuracy={metrics['accuracy']:.3f}, F1={metrics['f1']:.3f}")


Top 3 Teams by Accuracy:
Washington Wizards: Accuracy=0.749, F1=0.705
Philadelphia 76ers: Accuracy=0.741, F1=0.662
New Orleans Pelicans: Accuracy=0.713, F1=0.642

Top 3 Teams by F1:
Washington Wizards: Accuracy=0.749, F1=0.705
Golden State Warriors: Accuracy=0.703, F1=0.671
Milwaukee Bucks: Accuracy=0.709, F1=0.667


In [None]:
# TRYING OUT?


import pandas as pd
import itertools

# --------------------------
# Build the scenarios to score
# --------------------------
# A full Cartesian product over every feature's unique values grows multiplicatively with each
# feature (players x teams x action types x zones x distances), has to be materialised as a
# Python list first, and pairs players with teams they never appear with in the data.
# Instead, cross every observed player/team pairing with every observed shot description.
context_cols = [col for col in features if col in ("PLAYER_NAME", "TEAM_NAME")]
shot_cols = [col for col in features if col not in context_cols]

player_contexts = shots[context_cols].drop_duplicates()
shot_profiles = shots[shot_cols].drop_duplicates()

# how="cross" pairs each row of the left frame with each row of the right frame;
# reindex restores the column order the model was trained with.
combo_df = player_contexts.merge(shot_profiles, how="cross").reindex(columns=features)

# --------------------------
# Predict XG for each combination
# --------------------------
# Column 1 of predict_proba is the probability the pipeline assigns to the second class
combo_df = combo_df.assign(xg=best_model.predict_proba(combo_df)[:, 1])

# --------------------------
# Sort by highest probability
# --------------------------
combo_df = combo_df.sort_values(by='xg', ascending=False).reset_index(drop=True)

# Display top 20 as an example
print(combo_df.head(20))
