In [1]:
import pandas as pd

In [2]:
# Load the raw shot log from shots.csv; the cells below filter and reshape this frame.
all_shots = pd.read_csv("shots.csv") # original dataset

In [3]:
# Convert shot type to numeric shot value: 3 when SHOT_TYPE mentions "3PT", otherwise 2.
# A vectorised literal substring test (regex=False) replaces the per-row lambda.
# Rows with a missing SHOT_TYPE end up as NaN here instead of raising inside the lambda.
all_shots["SHOT_VALUE"] = (
    all_shots["SHOT_TYPE"].str.contains("3PT", regex=False).map({True: 3, False: 2})
)

# Variables that will never be necessary, even if some moderately improve accuracy
# (e.g. backcourt shots being missed); details like these are not significant for
# coaching staff. The list is only defined here; it is applied later via
# DataFrame.drop(columns=cols_to_drop).
cols_to_drop = [
    "GRID_TYPE",
    "SHOT_ATTEMPTED_FLAG",
    "PLAYER_ID",
    "TEAM_ID",
    "EVENT_TYPE",
    "GAME_DATE",
    "HTM",
    "VTM",
    "GAME_ID",
    "GAME_EVENT_ID",
    "LOC_X",
    "LOC_Y",
    "MINUTES_REMAINING",
    "SECONDS_REMAINING",
    "SHOT_TYPE",
    "PERIOD",
    "SHOT_DISTANCE",
]

In [4]:
# SHOT FREQUENCY OF EACH PLAYER
# value_counts() gives the number of shot rows per player name;
# describe() then summarises the distribution of those per-player totals.

player_counts = all_shots['PLAYER_NAME'].value_counts()
print(player_counts.describe())

count     443.000000
mean      463.968397
std       357.667423
min         2.000000
25%       174.500000
50%       388.000000
75%       691.500000
max      1617.000000
Name: count, dtype: float64


In [5]:
# Tally shot attempts for every player
player_counts = all_shots['PLAYER_NAME'].value_counts()

# Names of the players who reach the 175-shot minimum
players_to_keep = player_counts.loc[player_counts.ge(175)].index

# Restrict the frame to those players (explicit copy, not a view of the original)
all_shots = all_shots.loc[all_shots['PLAYER_NAME'].isin(players_to_keep)].copy()

# Recompute and summarise the per-player totals after filtering
player_counts = all_shots['PLAYER_NAME'].value_counts()
print(player_counts.describe())

count     332.000000
mean      594.015060
std       319.866215
min       175.000000
25%       336.500000
50%       512.500000
75%       781.000000
max      1617.000000
Name: count, dtype: float64


In [6]:
# ORIGINAL DATASET CLEANED
# all_shots was already restricted to players with 175 or more shots when the
# per-player filter ran, so the rows need no second player filter here; only the
# unused columns are removed.
all_shots_clean = all_shots.drop(columns=cols_to_drop, errors='ignore')

# COMBINES TWO SHOT DESCRIPTORS (SHOT_ZONE_BASIC + SHOT_ZONE_AREA) INTO ONE "SHOT_ZONE" COLUMN.
# The three original descriptor columns, including SHOT_ZONE_RANGE, are then dropped.
# assign() returns a new frame, so all_shots_clean itself is left unchanged.
all_shots_zones_comb = all_shots_clean.assign(
    SHOT_ZONE=all_shots_clean['SHOT_ZONE_BASIC'] + " - " + all_shots_clean['SHOT_ZONE_AREA']
).drop(columns=["SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE"], errors='ignore')

In [7]:
# SHOW ZONE REDUCTION (difference between orig & _zones dataframes)

In [8]:
cols = ["SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE"]

# Distinct values of each zone column, in order of first appearance
unique_lists = [list(all_shots_clean[name].unique()) for name in cols]

# Length of the longest list; shorter ones are padded up to it
max_len = max(map(len, unique_lists))

# Right-pad with empty strings so every column has max_len entries
padded_lists = [values + [""] * (max_len - len(values)) for values in unique_lists]

# Side-by-side table: one column per zone descriptor
unique_df = pd.DataFrame(dict(zip(cols, padded_lists)))

print(unique_df)

         SHOT_ZONE_BASIC         SHOT_ZONE_AREA  SHOT_ZONE_RANGE
0      Above the Break 3  Right Side Center(RC)          24+ ft.
1              Mid-Range   Left Side Center(LC)        16-24 ft.
2        Restricted Area          Right Side(R)  Less Than 8 ft.
3  In The Paint (Non-RA)              Center(C)         8-16 ft.
4         Right Corner 3           Left Side(L)  Back Court Shot
5          Left Corner 3         Back Court(BC)                 
6              Backcourt                                        


In [9]:
# Number of shots in each combined zone, most frequent first
zone_counts = all_shots_zones_comb["SHOT_ZONE"].value_counts()
# Same counts expressed as a percentage of all shots
zone_percent = zone_counts.div(zone_counts.sum()).mul(100)

# zone_percent shares zone_counts' index, so the two can be walked in lockstep
for (zone, count), pct in zip(zone_counts.items(), zone_percent):
    print(f"{zone}: {count}  ({pct:.2f}%)")


Restricted Area - Center(C): 63602  (32.25%)
In The Paint (Non-RA) - Center(C): 24902  (12.63%)
Above the Break 3 - Left Side Center(LC): 15834  (8.03%)
Above the Break 3 - Right Side Center(RC): 15116  (7.66%)
Mid-Range - Left Side(L): 12430  (6.30%)
Mid-Range - Right Side(R): 11968  (6.07%)
Above the Break 3 - Center(C): 10428  (5.29%)
Mid-Range - Center(C): 9124  (4.63%)
Mid-Range - Right Side Center(RC): 7737  (3.92%)
Left Corner 3 - Left Side(L): 7458  (3.78%)
Mid-Range - Left Side Center(LC): 7338  (3.72%)
Right Corner 3 - Right Side(R): 7006  (3.55%)
In The Paint (Non-RA) - Left Side(L): 1990  (1.01%)
In The Paint (Non-RA) - Right Side(R): 1830  (0.93%)
Backcourt - Back Court(BC): 397  (0.20%)
Above the Break 3 - Back Court(BC): 53  (0.03%)


In [10]:
# REMOVE BACKCOURT SHOTS HERE
# Any SHOT_ZONE containing "Back Court" is dropped; in the zone listing above that is
# "Backcourt - Back Court(BC)" and "Above the Break 3 - Back Court(BC)".
all_shots_zones_comb = all_shots_zones_comb.loc[
    ~all_shots_zones_comb['SHOT_ZONE'].str.contains('Back Court', na=False)
]

In [11]:
# SHOW SHOT CATEGORIZATION (difference between orig & _cat dataframes)

In [12]:
# Share of all remaining shots accounted for by each action type
action_props = all_shots_zones_comb["ACTION_TYPE"].value_counts(normalize=True)

# Action types that make up at least 1% of shots
top_actions = action_props.loc[action_props.ge(0.01)].index

# Keep only the rows whose action type cleared the 1% bar
actions_filtered = all_shots_zones_comb.loc[
    all_shots_zones_comb["ACTION_TYPE"].isin(top_actions)
].copy()

# Per-action counts and shares, recomputed on the filtered rows
action_counts = actions_filtered["ACTION_TYPE"].value_counts()
action_percentages = actions_filtered["ACTION_TYPE"].value_counts(normalize=True).mul(100)  # percentage

# Make rate (%) for each action type
action_shot_made_pct = actions_filtered.groupby("ACTION_TYPE")["SHOT_MADE_FLAG"].mean().mul(100)

# One table with all three measures, ordered from highest to lowest make percentage
action_summary = pd.DataFrame({
    "Count": action_counts,
    "Percentage": action_percentages,
    "MadePercentage": action_shot_made_pct
}).sort_values(by="MadePercentage", ascending=False)

print(action_summary)

                                Count  Percentage  MadePercentage
ACTION_TYPE                                                      
Dunk Shot                        3002    1.744608       86.842105
Cutting Layup Shot               3041    1.767273       80.170996
Driving Finger Roll Layup Shot   2134    1.240171       74.695408
Running Layup Shot               3432    1.994502       73.484848
Driving Layup Shot              12488    7.257385       61.082639
Pullup Jump shot                11779    6.845351       53.892521
Driving Floating Jump Shot       2112    1.227386       51.136364
Floating Jump shot               4953    2.878430       49.101555
Step Back Jump shot              4334    2.518698       48.823258
Tip Layup Shot                   3888    2.259506       47.222222
Hook Shot                        4430    2.574489       45.304740
Fadeaway Jump Shot               2890    1.679520       44.948097
Turnaround Jump Shot             3393    1.971838       42.793988
Layup Shot

In [13]:
# SAVE TO CSVs

# Merged zones, 57 shot types
# NOTE(review): this writes all_shots_zones_comb, i.e. the frame without the >= 1%
# ACTION_TYPE filter (that filter only produced actions_filtered, which is not saved).
# Confirm the "57 shot types" figure refers to this frame.
all_shots_zones_comb.to_csv("shots_for_model.csv", index=False)

In [14]:
# Hyperparameter tuning for the XGBoost model

In [15]:
# Reload the modelling dataset from shots_for_model.csv (the file written by the save
# cell above), so the tuning section starts from the data on disk.
shots = pd.read_csv("shots_for_model.csv")

In [16]:
# Banner, column list, then a blank line — emitted with one call, one item per line
print("=== shots ===", list(shots.columns), "", sep="\n")

=== shots ===
['PLAYER_NAME', 'TEAM_NAME', 'ACTION_TYPE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'SHOT_ZONE']



In [20]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, log_loss, brier_score_loss, roc_auc_score, make_scorer
import numpy as np
import numpy as np
import warnings
warnings.filterwarnings("ignore", message=".*use_label_encoder.*")



# --------------------------
# DATA & FEATURES
# --------------------------
# Input columns fed to the model and the column it predicts.
features = ["PLAYER_NAME", "TEAM_NAME", "ACTION_TYPE", "SHOT_ZONE"]
target = "SHOT_MADE_FLAG"

# --------------------------
# PARAM GRID
# --------------------------
# Candidate values sampled by the randomized search. The "xgb__" prefix routes
# each setting to the pipeline step named "xgb".
param_grid = dict(
    xgb__n_estimators=[100, 250, 500, 750, 1000],
    xgb__max_depth=[3, 5, 8, 10, 12],
    xgb__learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2],
    xgb__subsample=[0.5, 0.75, 1.0],
    xgb__colsample_bytree=[0.5, 0.75, 1.0],
)

# --------------------------
# TRAIN + TUNE
# --------------------------
def train_xgb_tuned(shots, features, target, param_grid):
    """
    Tune an XGBoost classifier with a randomized search and report hold-out metrics.

    The data is split 80/20 (stratified on the target, random_state=42). A pipeline
    of one-hot encoding followed by XGBClassifier is tuned on the training part with
    RandomizedSearchCV (20 sampled candidates, 3-fold CV, F1 scoring). The best
    pipeline is then evaluated on the held-out 20% and the metrics are printed.

    Parameters:
    - shots: pd.DataFrame, dataset including features and target
    - features: list, column names to use as features
    - target: str, name of the target column (assumed to be a binary 0/1 label,
      since F1 and the positive-class probability column are used)
    - param_grid: dict, hyperparameter search space for RandomizedSearchCV; keys
      must carry the "xgb__" prefix so they reach the pipeline's classifier step

    Returns (one tuple, in this order):
    - best_model: the best pipeline found by the search (preprocessing + XGBClassifier)
    - X_test, y_test: test set
    - y_proba: predicted probabilities for the positive class
    - accuracy, f1, logloss, brier, auc: evaluation metrics on the test set
    - predicted_success_rate, actual_success_rate: mean predicted probability and
      mean observed outcome on the test set
    """
    # --------------------------
    # Prepare data
    # --------------------------
    X = shots[features]
    y = shots[target]

    # Every non-numeric column is treated as categorical and one-hot encoded.
    # Testing "not numeric" rather than `dtype == "object"` also catches columns
    # stored with pandas' string or category dtypes, which would otherwise be
    # passed through to XGBoost un-encoded.
    cat_features = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

    preprocessor = ColumnTransformer(
        [("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)],
        remainder="passthrough"
    )

    # --------------------------
    # Pipeline
    # --------------------------
    pipeline = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(eval_metric="logloss", n_jobs=1, random_state=42))
    ])

    # --------------------------
    # Train/test split
    # --------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # --------------------------
    # Randomized hyperparameter search
    # --------------------------
    # Only the training part is seen by the search; the test part stays held out.
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_grid,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=3,
        verbose=1,
        n_jobs=1,
        random_state=42
    )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print("\nBest Hyperparameters:")
    print(search.best_params_)

    # --------------------------
    # Predictions
    # --------------------------
    y_proba = best_model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > 0.5).astype(int)

    # --------------------------
    # Standard metrics
    # --------------------------
    # Accuracy is taken from the same thresholded predictions as F1, so both
    # metrics share one decision rule and no second prediction pass is needed.
    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    f1 = f1_score(y_test, y_pred)

    # --------------------------
    # Probability-based metrics
    # --------------------------
    logloss = log_loss(y_test, y_proba)
    brier = brier_score_loss(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)

    # --------------------------
    # Overall predicted vs actual shot success
    # --------------------------
    predicted_success_rate = np.mean(y_proba)
    actual_success_rate = np.mean(y_test)

    # --------------------------
    # Print metrics with interpretation
    # --------------------------
    print(f"\nPredicted Overall Shot Success: {predicted_success_rate:.10f}")
    print(f"Actual Overall Shot Success: {actual_success_rate:.10f}")
    print(f"\nAccuracy: {accuracy:.3f} | F1 Score: {f1:.3f}")
    print(f"Log Loss: {logloss:.4f} (lower is better; measures how well predicted probabilities match actual outcomes)")
    print(f"Brier Score: {brier:.4f} (lower is better; mean squared error of predicted probabilities)")
    print(f"AUC-ROC: {auc:.3f} (1.0 = perfect ranking of shots, 0.5 = random)")
    # NOTE(review): no calibration curve is drawn inside this function; the text
    # below only explains how to read one if it is plotted separately.
    print("Calibration curve indicates probability reliability.\n"
          "Points on diagonal = well-calibrated; above = underestimates; below = overestimates.")

    return best_model, X_test, y_test, y_proba, accuracy, f1, logloss, brier, auc, predicted_success_rate, actual_success_rate

# --------------------------
# Example run
# --------------------------
# Tune on the reloaded shots frame and unpack the returned values into notebook-level names.
(
    best_model,
    X_test,
    y_test,
    y_proba,
    acc,
    f1,
    logloss,
    brier,
    auc,
    predicted_success,
    actual_success,
) = train_xgb_tuned(shots, features, target, param_grid)

Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Hyperparameters:
{'xgb__subsample': 0.5, 'xgb__n_estimators': 1000, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.1, 'xgb__colsample_bytree': 0.5}

Predicted Overall Shot Success: 0.4551987350
Actual Overall Shot Success: 0.4543745077

Accuracy: 0.646 | F1 Score: 0.552
Log Loss: 0.6253 (lower is better; measures how well predicted probabilities match actual outcomes)
Brier Score: 0.2185 (lower is better; mean squared error of predicted probabilities)
AUC-ROC: 0.687 (1.0 = perfect ranking of shots, 0.5 = random)
Calibration curve indicates probability reliability.
Points on diagonal = well-calibrated; above = underestimates; below = overestimates.
