In [1]:
import pandas as pd

# Read both shot tables from CSV files located in the current working directory.
shots_clean_all_types, shots_comb_zones_all_types = map(
    pd.read_csv,
    ("shots_clean_all_types.csv", "shots_comb_zones_all_types.csv"),
)


In [2]:
# Show the column names of each loaded table, one banner + list + blank line per table.
for banner, frame in (
    ("=== shots_clean_all_types ===", shots_clean_all_types),
    ("=== shots_comb_zones_all_types ===", shots_comb_zones_all_types),
):
    print(banner)
    print(list(frame.columns))
    print()

=== shots_clean_all_types ===
['PLAYER_NAME', 'TEAM_NAME', 'PERIOD', 'ACTION_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'TIME_LEFT_SEC']

=== shots_comb_zones_all_types ===
['PLAYER_NAME', 'TEAM_NAME', 'PERIOD', 'ACTION_TYPE', 'SHOT_DISTANCE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'TIME_LEFT_SEC', 'SHOT_ZONE']



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
import numpy as np

# --------------------------
# DATASETS
# --------------------------
# Map a short dataset key to the DataFrame it refers to; the same keys index feature_sets.
datasets = dict(
    clean_all=shots_clean_all_types,
    comb_all=shots_comb_zones_all_types,
)

# --------------------------
# FEATURE SETS
# --------------------------
feature_sets = dict(
    # List order is kept exactly: it fixes the column order handed to the model pipeline.
    clean_all=[
        "PLAYER_NAME",
        "TEAM_NAME",
        "PERIOD",
        "TIME_LEFT_SEC",
        "ACTION_TYPE",
        "SHOT_ZONE_BASIC",
        "SHOT_ZONE_AREA",
        "SHOT_ZONE_RANGE",
        "SHOT_DISTANCE",
    ],
    comb_all=[
        "PLAYER_NAME",
        "TEAM_NAME",
        "PERIOD",
        "ACTION_TYPE",
        "SHOT_ZONE",
        "SHOT_DISTANCE",
        "TIME_LEFT_SEC",
    ],
)

# Label column passed to train_xgb_tuned for every dataset.
target = "SHOT_MADE_FLAG"

# --------------------------
# HYPERPARAMETER GRID
# --------------------------
# Candidate values drawn by the randomized search; the "xgb__" prefix addresses
# the pipeline step named "xgb".
param_grid = dict(
    xgb__n_estimators=[50, 100, 200, 300, 400, 500, 600, 700, 800, 1000],
    xgb__max_depth=[2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20],
    xgb__learning_rate=[0.01, 0.02, 0.03, 0.05, 0.07, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5],
    xgb__subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    xgb__colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

# --------------------------
# TRAIN + TUNE FUNCTION
# --------------------------
def train_xgb_tuned(df, features, target, param_distributions=None,
                    n_iter=20, cv=3, test_size=0.2, random_state=42):
    """Tune an XGBoost pipeline with randomized search and score it on a hold-out split.

    The rows are split into train and test parts. A pipeline of one-hot encoding
    followed by ``XGBClassifier`` is tuned with ``RandomizedSearchCV`` (F1 scoring)
    on the training part only, and the refitted best pipeline is then evaluated
    on the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and the target column.
    features : list of str
        Input columns, in the order they are handed to the pipeline.
    target : str
        Label column. The binary F1 metric used here requires two-class labels
        with positive class 1.
    param_distributions : dict, optional
        Search space keyed by pipeline parameter name (``"xgb__..."``).
        Defaults to the notebook-level ``param_grid``.
    n_iter : int, default 20
        Number of random hyperparameter combinations to try.
    cv : int, default 3
        Number of cross-validation folds used by the search.
    test_size : float, default 0.2
        Fraction of rows held out for the final evaluation.
    random_state : int, default 42
        Seed shared by the split, the search and the classifier.

    Returns
    -------
    tuple
        ``(best_model, X_test, y_test, y_proba, acc, f1)`` where ``best_model``
        is the refitted best pipeline, ``y_proba`` holds the predicted
        probability of class 1 on the test rows, ``acc`` is the test accuracy
        and ``f1`` is the test F1 at a 0.5 probability threshold.

    Raises
    ------
    KeyError
        If any requested feature or the target is not a column of ``df``.
    """
    # Fail early with a readable message instead of a bare pandas lookup error.
    missing = [col for col in [*features, target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from df: {missing}")

    if param_distributions is None:
        # Fall back to the search space defined at notebook level.
        param_distributions = param_grid

    X = df[features].copy()
    y = df[target]

    # Every non-numeric column is one-hot encoded. Checking "not numeric" rather
    # than `dtype == "object"` also covers pandas `category` and `string` dtypes,
    # which would otherwise be passed through raw to XGBoost.
    cat_features = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

    preprocessor = ColumnTransformer(
        transformers=[
            # Categories unseen during fit are encoded as all-zero rows.
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
        ],
        remainder="passthrough",  # numeric columns go through unchanged
    )

    # `use_label_encoder` was dropped here: XGBoost ignores it and emits a
    # "Parameters: { "use_label_encoder" } are not used." warning on every fit.
    model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(eval_metric="logloss", random_state=random_state)),
    ])

    # Hold out the test rows before tuning so the search never sees them.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    search = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring="f1",  # built-in binary F1 scorer, same as make_scorer(f1_score)
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    y_proba = best_model.predict_proba(X_test)[:, 1]
    acc = best_model.score(X_test, y_test)
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))

    print("Best hyperparameters:", search.best_params_)

    return best_model, X_test, y_test, y_proba, acc, f1

# --------------------------
# TRAIN + TUNE ALL DATASETS
# --------------------------
# Per-dataset outputs of the tuning run, keyed by dataset name.
results_tuned = {}

for name, df in datasets.items():
    print(f"\n===== Training and tuning {name} dataset =====")

    # Each dataset is tuned on its own feature list; the helper returns a 6-tuple.
    feats = feature_sets[name]
    model, X_test, y_test, y_proba, acc, f1 = train_xgb_tuned(df, feats, target)

    # Store the fitted model together with its hold-out data and scores.
    results_tuned[name] = dict(
        model=model,
        X_test=X_test,
        y_test=y_test,
        pred_proba=y_proba,
        accuracy=acc,
        f1=f1,
    )

    print(f"{name} accuracy: {acc:.3f} | F1: {f1:.3f}")



===== Training and tuning clean_all dataset =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters: {'xgb__subsample': 0.7, 'xgb__n_estimators': 800, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.05, 'xgb__colsample_bytree': 0.7}
clean_all accuracy: 0.653 | F1: 0.555

===== Training and tuning comb_all dataset =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters: {'xgb__subsample': 0.8, 'xgb__n_estimators': 500, 'xgb__max_depth': 10, 'xgb__learning_rate': 0.05, 'xgb__colsample_bytree': 0.5}
comb_all accuracy: 0.652 | F1: 0.553


In [None]:
# Hyperparameter tuning looks converged: both datasets reach nearly identical scores (accuracy ~0.65, F1 ~0.55)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

# --------------------------
# DATASETS
# --------------------------
# Same two DataFrames as before, registered under keys that mark the reduced feature sets.
datasets = dict(
    clean_all_reduced=shots_clean_all_types,
    comb_all_reduced=shots_comb_zones_all_types,
)

# --------------------------
# FEATURE SETS
# --------------------------
feature_sets = dict(
    # List order is kept exactly: it fixes the column order handed to the model pipeline.
    clean_all_reduced=[
        "PLAYER_NAME",
        "TEAM_NAME",
        "ACTION_TYPE",
        "SHOT_ZONE_BASIC",
        "SHOT_ZONE_AREA",
        "SHOT_ZONE_RANGE",
    ],
    comb_all_reduced=[
        "PLAYER_NAME",
        "TEAM_NAME",
        "ACTION_TYPE",
        "SHOT_ZONE",
    ],
)

# Label column passed to train_xgb_tuned for every dataset.
target = "SHOT_MADE_FLAG"

# --------------------------
# HYPERPARAMETER GRID
# --------------------------
# Candidate values drawn by the randomized search; the "xgb__" prefix addresses
# the pipeline step named "xgb".
param_grid = dict(
    xgb__n_estimators=[50, 100, 200, 300, 400, 500, 600, 700, 800, 1000],
    xgb__max_depth=[2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20],
    xgb__learning_rate=[0.01, 0.02, 0.03, 0.05, 0.07, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5],
    xgb__subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    xgb__colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

# --------------------------
# TRAIN + TUNE FUNCTION
# --------------------------
def train_xgb_tuned(df, features, target, param_distributions=None,
                    n_iter=20, cv=3, test_size=0.2, random_state=42):
    """Tune an XGBoost pipeline with randomized search and score it on a hold-out split.

    NOTE: a function with this name is already defined in an earlier cell;
    running this cell replaces that definition.

    The rows are split into train and test parts. A pipeline of one-hot encoding
    followed by ``XGBClassifier`` is tuned with ``RandomizedSearchCV`` (F1 scoring)
    on the training part only, and the refitted best pipeline is then evaluated
    on the test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and the target column.
    features : list of str
        Input columns, in the order they are handed to the pipeline.
    target : str
        Label column. The binary F1 metric used here requires two-class labels
        with positive class 1.
    param_distributions : dict, optional
        Search space keyed by pipeline parameter name (``"xgb__..."``).
        Defaults to the notebook-level ``param_grid``.
    n_iter : int, default 20
        Number of random hyperparameter combinations to try.
    cv : int, default 3
        Number of cross-validation folds used by the search.
    test_size : float, default 0.2
        Fraction of rows held out for the final evaluation.
    random_state : int, default 42
        Seed shared by the split, the search and the classifier.

    Returns
    -------
    tuple
        ``(best_model, X_test, y_test, y_proba, acc, f1)`` where ``best_model``
        is the refitted best pipeline, ``y_proba`` holds the predicted
        probability of class 1 on the test rows, ``acc`` is the test accuracy
        and ``f1`` is the test F1 at a 0.5 probability threshold.

    Raises
    ------
    KeyError
        If any requested feature or the target is not a column of ``df``.
    """
    # Fail early with a readable message instead of a bare pandas lookup error.
    missing = [col for col in [*features, target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from df: {missing}")

    if param_distributions is None:
        # Fall back to the search space defined at notebook level.
        param_distributions = param_grid

    X = df[features].copy()
    y = df[target]

    # Every non-numeric column is one-hot encoded. Checking "not numeric" rather
    # than `dtype == "object"` also covers pandas `category` and `string` dtypes,
    # which would otherwise be passed through raw to XGBoost.
    cat_features = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]

    preprocessor = ColumnTransformer(
        transformers=[
            # Categories unseen during fit are encoded as all-zero rows.
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
        ],
        remainder="passthrough",  # numeric columns go through unchanged
    )

    # `use_label_encoder` was dropped here: XGBoost ignores it and emits a
    # "Parameters: { "use_label_encoder" } are not used." warning on every fit.
    model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(eval_metric="logloss", random_state=random_state)),
    ])

    # Hold out the test rows before tuning so the search never sees them.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    search = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring="f1",  # built-in binary F1 scorer, same as make_scorer(f1_score)
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    y_proba = best_model.predict_proba(X_test)[:, 1]
    acc = best_model.score(X_test, y_test)
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))

    print("Best hyperparameters:", search.best_params_)

    return best_model, X_test, y_test, y_proba, acc, f1

# --------------------------
# TRAIN + TUNE ALL REDUCED FEATURE SETS
# --------------------------
# Per-dataset outputs of the reduced-feature tuning run, keyed by dataset name.
results_reduced = {}

for name, df in datasets.items():
    print(f"\n===== Training and tuning {name} dataset =====")

    # Each dataset is tuned on its own feature list; the helper returns a 6-tuple.
    feats = feature_sets[name]
    model, X_test, y_test, y_proba, acc, f1 = train_xgb_tuned(df, feats, target)

    # Store the fitted model together with its hold-out data and scores.
    results_reduced[name] = dict(
        model=model,
        X_test=X_test,
        y_test=y_test,
        pred_proba=y_proba,
        accuracy=acc,
        f1=f1,
    )

    print(f"{name} accuracy: {acc:.3f} | F1: {f1:.3f}")


===== Training and tuning clean_all_reduced dataset =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters: {'xgb__subsample': 0.5, 'xgb__n_estimators': 800, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.07, 'xgb__colsample_bytree': 0.7}
clean_all_reduced accuracy: 0.651 | F1: 0.554

===== Training and tuning comb_all_reduced dataset =====
Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters: {'xgb__subsample': 0.5, 'xgb__n_estimators': 800, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.07, 'xgb__colsample_bytree': 0.7}
comb_all_reduced accuracy: 0.649 | F1: 0.553


In [68]:
# Reduced feature sets don't improve accuracy when trained on league-wide data.

In [8]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

print("\n================ TEAM-LEVEL PERFORMANCE ================")

for dataset_name, res in results_tuned.items():
    print(f"\n===== {dataset_name} Dataset =====")

    model, y_test = res["model"], res["y_test"]
    X_test = res["X_test"].copy()

    # Grouping by team is impossible without the TEAM_NAME column; skip such datasets.
    if "TEAM_NAME" not in X_test.columns:
        print("TEAM_NAME missing — cannot compute team stats.\n")
        continue

    # Hard-label predictions recomputed from the stored model.
    y_pred = model.predict(X_test)

    # One row per test sample: team, true label, predicted label.
    df_team = pd.DataFrame({
        "TEAM_NAME": X_test["TEAM_NAME"],
        "y_true": y_test,
        "y_pred": y_pred,
    })

    # Accuracy and F1 computed separately inside each team's group of rows.
    team_stats = [
        (
            team,
            accuracy_score(group["y_true"], group["y_pred"]),
            f1_score(group["y_true"], group["y_pred"], zero_division=0),
        )
        for team, group in df_team.groupby("TEAM_NAME")
    ]
    team_df = pd.DataFrame(team_stats, columns=["TEAM_NAME", "Accuracy", "F1"])

    # Five best teams under each metric.
    top_acc, top_f1 = (
        team_df.sort_values(metric, ascending=False).head(5)
        for metric in ("Accuracy", "F1")
    )

    for heading, table in (
        ("\nTop Teams by Accuracy:", top_acc),
        ("\nTop Teams by F1:", top_f1),
    ):
        print(heading)
        print(table.to_string(index=False))




===== clean_all Dataset =====

Top Teams by Accuracy:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.742969 0.677134
   Philadelphia 76ers  0.699071 0.616226
Golden State Warriors  0.686022 0.633779
      Milwaukee Bucks  0.678516 0.627240
   Los Angeles Lakers  0.677545 0.544457

Top Teams by F1:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.742969 0.677134
Golden State Warriors  0.686022 0.633779
      Milwaukee Bucks  0.678516 0.627240
   Philadelphia 76ers  0.699071 0.616226
     Sacramento Kings  0.664454 0.599119

===== comb_all Dataset =====

Top Teams by Accuracy:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.738005 0.697183
   Philadelphia 76ers  0.706833 0.606120
   Los Angeles Lakers  0.678274 0.517594
         Phoenix Suns  0.677468 0.543710
Golden State Warriors  0.674174 0.638333

Top Teams by F1:
             TEAM_NAME  Accuracy       F1
    Washington Wizards  0.738005 0.697183
 Golden State Warriors  0.674174

In [9]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

print("\n================ TEAM-LEVEL PERFORMANCE ================")

for dataset_name, res in results_reduced.items():
    print(f"\n===== {dataset_name} Dataset =====")

    model, y_test = res["model"], res["y_test"]
    X_test = res["X_test"].copy()

    # Grouping by team is impossible without the TEAM_NAME column; skip such datasets.
    if "TEAM_NAME" not in X_test.columns:
        print("TEAM_NAME missing — cannot compute team stats.\n")
        continue

    # Hard-label predictions recomputed from the stored model.
    y_pred = model.predict(X_test)

    # One row per test sample: team, true label, predicted label.
    df_team = pd.DataFrame({
        "TEAM_NAME": X_test["TEAM_NAME"],
        "y_true": y_test,
        "y_pred": y_pred,
    })

    # Accuracy and F1 computed separately inside each team's group of rows.
    team_stats = [
        (
            team,
            accuracy_score(group["y_true"], group["y_pred"]),
            f1_score(group["y_true"], group["y_pred"], zero_division=0),
        )
        for team, group in df_team.groupby("TEAM_NAME")
    ]
    team_df = pd.DataFrame(team_stats, columns=["TEAM_NAME", "Accuracy", "F1"])

    # Five best teams under each metric.
    top_acc, top_f1 = (
        team_df.sort_values(metric, ascending=False).head(5)
        for metric in ("Accuracy", "F1")
    )

    for heading, table in (
        ("\nTop Teams by Accuracy:", top_acc),
        ("\nTop Teams by F1:", top_f1),
    ):
        print(heading)
        print(table.to_string(index=False))




===== clean_all_reduced Dataset =====

Top Teams by Accuracy:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.739844 0.671273
   Philadelphia 76ers  0.705504 0.619926
Golden State Warriors  0.693907 0.639053
     Sacramento Kings  0.679941 0.614565
   Los Angeles Lakers  0.679099 0.533333

Top Teams by F1:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.739844 0.671273
Golden State Warriors  0.693907 0.639053
   Philadelphia 76ers  0.705504 0.619926
      Milwaukee Bucks  0.672334 0.618018
     Sacramento Kings  0.679941 0.614565

===== comb_all_reduced Dataset =====

Top Teams by Accuracy:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.738766 0.694023
   Philadelphia 76ers  0.700955 0.604470
   Los Angeles Lakers  0.691900 0.543210
 New Orleans Pelicans  0.678214 0.582834
Golden State Warriors  0.670420 0.633250

Top Teams by F1:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.738766 0.694023
Golden State Warri

In [71]:
# Reducing features improves accuracy when tested on a specific team, although the score is still lower than the original because it's not trained on all teams' data.