In [4]:
import pandas as pd

shots_clean_all_types = pd.read_csv("shots_clean_all_types.csv")
shots_comb_zones_all_types = pd.read_csv("shots_comb_zones_all_types.csv")


In [5]:
print("=== shots_clean_all_types ===")
print(list(shots_clean_all_types.columns))
print()

print("=== shots_comb_zones_all_types ===")
print(list(shots_comb_zones_all_types.columns))
print()

=== shots_clean_all_types ===
['PLAYER_NAME', 'TEAM_NAME', 'PERIOD', 'ACTION_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'TIME_LEFT_SEC']

=== shots_comb_zones_all_types ===
['PLAYER_NAME', 'TEAM_NAME', 'PERIOD', 'ACTION_TYPE', 'SHOT_DISTANCE', 'SHOT_MADE_FLAG', 'SHOT_VALUE', 'TIME_LEFT_SEC', 'SHOT_ZONE']



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# --------------------------
# DATASETS
# --------------------------
datasets = {
    "clean_all": shots_clean_all_types,
    "comb_all": shots_comb_zones_all_types,
}

# --------------------------
# FEATURE SETS
# --------------------------
feature_sets = {
    "clean_all": [
        "PLAYER_NAME", "TEAM_NAME",
        "PERIOD", "TIME_LEFT_SEC",
        "ACTION_TYPE",
        "SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE",
        "SHOT_DISTANCE"
    ],
    "comb_all": [
        "PLAYER_NAME", "TEAM_NAME", "PERIOD",
        "ACTION_TYPE",
        "SHOT_ZONE",
        "SHOT_DISTANCE", "TIME_LEFT_SEC"
    ]
}

target = "SHOT_MADE_FLAG"

# --------------------------
# RANDOM FOREST TRAIN FUNCTION
# --------------------------
def train_rf(df, features, target):
    X = df[features].copy()
    y = df[target]

    # detect categorical columns
    cat_features = [c for c in X.columns if X[c].dtype == "object"]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
        ],
        remainder="passthrough"
    )

    # model pipeline
    model = Pipeline([
        ("prep", preprocessor),
        ("rf", RandomForestClassifier(
            n_estimators=500,
            max_depth=6,    # 
            random_state=42
        ))
    ])

    # split + train
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = model.score(X_test, y_test)
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))
    
    return model, X_test, y_test, y_proba, acc, f1


# --------------------------
# TRAIN ALL MODELS
# --------------------------
results_rf = {}

for name, df in datasets.items():
    print(f"\n===== Training {name} dataset (RF) =====")
    feats = feature_sets[name]

    model, X_test, y_test, y_proba, acc, f1 = train_rf(df, feats, target)

    results_rf[name] = {
        "model": model,
        "X_test": X_test,
        "y_test": y_test,
        "pred_proba": y_proba,
        "accuracy": acc,
        "f1": f1
    }

    print(f"{name} accuracy: {acc:.3f} | F1: {f1:.3f}")



===== Training clean_all dataset (RF) =====
clean_all accuracy: 0.623 | F1: 0.509

===== Training comb_all dataset (RF) =====
comb_all accuracy: 0.631 | F1: 0.486


In [20]:
import pandas as pd

# fi_df: the one you already computed
# Example columns: "Feature", "Importance"

# Create a new column for feature group
def get_feature_group(name):
    if name.startswith("cat__ACTION_TYPE"):
        return "ACTION_TYPE"
    elif name.startswith("cat__PLAYER_NAME"):
        return "PLAYER_NAME"
    elif name.startswith("cat__TEAM_NAME"):
        return "TEAM_NAME"
    elif name.startswith("remainder__SHOT_DISTANCE"):
        return "SHOT_DISTANCE"
    elif name.startswith("remainder__PERIOD"):
        return "PERIOD" # numeric feature name
    elif name.startswith("remainder__TIME_LEFT_SEC"):
        return "TIME_LEFT_SEC"  
    elif name.startswith("cat__SHOT_ZONE"):
        return "SHOT_ZONE"  
    else:
        return name  # fallback

fi_df["Feature_Group"] = fi_df["Feature"].apply(get_feature_group)

# Sum importance per group
grouped_fi = fi_df.groupby("Feature_Group")["Importance"].sum().sort_values(ascending=False)

print("\nFeature Importance by Group:")
print(grouped_fi.to_string())



Feature Importance by Group:
Feature_Group
PLAYER_NAME      0.555614
ACTION_TYPE      0.323093
TEAM_NAME        0.081948
SHOT_ZONE        0.029440
SHOT_DISTANCE    0.007114
TIME_LEFT_SEC    0.001562
PERIOD           0.001229


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


# --------------------------
# DATASETS
# --------------------------
datasets = {
    "clean_all": shots_clean_all_types,
    "comb_all": shots_comb_zones_all_types,
    "clean_all_reduced": shots_clean_all_types,
    "comb_all_reduced": shots_comb_zones_all_types,
}

# --------------------------
# FEATURE SETS
# --------------------------
feature_sets = {
    "clean_all": [
        "PLAYER_NAME", "TEAM_NAME",
        "PERIOD", "TIME_LEFT_SEC",
        "ACTION_TYPE",
        "SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE",
        "SHOT_DISTANCE"
    ],
    "comb_all": [
        "PLAYER_NAME", "TEAM_NAME", "PERIOD",
        "ACTION_TYPE",
        "SHOT_ZONE",
        "SHOT_DISTANCE", "TIME_LEFT_SEC"
    ],
    "clean_all_reduced": [
        "PLAYER_NAME", "TEAM_NAME",
        "ACTION_TYPE",
        "SHOT_ZONE_BASIC", "SHOT_ZONE_AREA", "SHOT_ZONE_RANGE",
    ],
    "comb_all_reduced": [
        "PLAYER_NAME", "TEAM_NAME", 
        "ACTION_TYPE",
        "SHOT_ZONE"
    ]
}

target = "SHOT_MADE_FLAG"

# --------------------------
# XGBOOST TRAIN FUNCTION
# --------------------------
def train_xgb(df, features, target):
    X = df[features].copy()
    y = df[target]

    # detect categoricals
    cat_features = [c for c in X.columns if X[c].dtype == "object"]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
        ],
        remainder="passthrough"
    )

    # model pipeline
    model = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.1,
            eval_metric="logloss",
            random_state=42
        ))
    ])

    # split + train
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = model.score(X_test, y_test)
    f1 = f1_score(y_test, (y_proba > 0.5).astype(int))
    
    return model, X_test, y_test, y_proba, acc, f1


# --------------------------
# TRAIN ALL MODELS
# --------------------------
results = {}

for name, df in datasets.items():
    print(f"\n===== Training {name} dataset =====")
    feats = feature_sets[name]

    model, X_test, y_test, y_proba, acc, f1 = train_xgb(df, feats, target)

    results[name] = {
        "model": model,
        "X_test": X_test,
        "y_test": y_test,
        "pred_proba": y_proba,
        "accuracy": acc
    }

    model, X_test, y_test, y_proba, acc, f1 = train_xgb(df, feats, target)
    print(f"{name} accuracy: {acc:.3f} | F1: {f1:.3f}")




===== Training clean_all dataset =====
clean_all accuracy: 0.649 | F1: 0.550

===== Training comb_all dataset =====
comb_all accuracy: 0.652 | F1: 0.554

===== Training clean_all_reduced dataset =====
clean_all_reduced accuracy: 0.651 | F1: 0.550

===== Training comb_all_reduced dataset =====
comb_all_reduced accuracy: 0.649 | F1: 0.547


In [22]:
# reduced feature sets don't improve accuracy when trained on league wide data

In [23]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

print("\n================ TEAM-LEVEL PERFORMANCE ================")

for dataset_name, res in results.items():
    print(f"\n===== {dataset_name} Dataset =====")

    model  = res["model"]
    X_test = res["X_test"].copy()
    y_test = res["y_test"]

    # Ensure TEAM_NAME is available for grouping
    if "TEAM_NAME" not in X_test.columns:
        print("TEAM_NAME missing â€” cannot compute team stats.\n")
        continue

    # Recompute predictions using the stored model
    y_pred = model.predict(X_test)

    # Build dataframe for team-level analysis
    df_team = pd.DataFrame({
        "TEAM_NAME": X_test["TEAM_NAME"],
        "y_true": y_test,
        "y_pred": y_pred
    })

    team_stats = []
    for team, group in df_team.groupby("TEAM_NAME"):
        acc = accuracy_score(group["y_true"], group["y_pred"])
        f1  = f1_score(group["y_true"], group["y_pred"], zero_division=0)
        team_stats.append((team, acc, f1))

    team_df = pd.DataFrame(team_stats, columns=["TEAM_NAME", "Accuracy", "F1"])

    # Top 5 teams by each metric
    top_acc = team_df.sort_values("Accuracy", ascending=False).head(5)
    top_f1  = team_df.sort_values("F1", ascending=False).head(5)

    print("\nTop Teams by Accuracy:")
    print(top_acc.to_string(index=False))

    print("\nTop Teams by F1:")
    print(top_f1.to_string(index=False))




===== clean_all Dataset =====

Top Teams by Accuracy:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.742969 0.676500
   Philadelphia 76ers  0.697641 0.605778
Golden State Warriors  0.679570 0.616309
      Milwaukee Bucks  0.672334 0.616637
     Sacramento Kings  0.667404 0.599823

Top Teams by F1:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.742969 0.676500
      Milwaukee Bucks  0.672334 0.616637
Golden State Warriors  0.679570 0.616309
   Philadelphia 76ers  0.697641 0.605778
     Sacramento Kings  0.667404 0.599823

===== comb_all Dataset =====

Top Teams by Accuracy:
         TEAM_NAME  Accuracy       F1
Washington Wizards  0.742574 0.702465
Philadelphia 76ers  0.709037 0.609467
Los Angeles Lakers  0.685844 0.528944
    Boston Celtics  0.678102 0.553191
      Phoenix Suns  0.672946 0.552577

Top Teams by F1:
            TEAM_NAME  Accuracy       F1
   Washington Wizards  0.742574 0.702465
Golden State Warriors  0.658408 0.618609
   Philadel

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Filter Wizards shots
df_wiz = shots_comb_zones_all_types[shots_comb_zones_all_types["TEAM_NAME"] == "Washington Wizards"].copy()
target = "SHOT_MADE_FLAG"

# Feature sets
feature_sets = {
    "full": ["PLAYER_NAME", "PERIOD", "ACTION_TYPE", "SHOT_ZONE", "SHOT_DISTANCE", "TIME_LEFT_SEC"],
    "reduced": ["PLAYER_NAME", "ACTION_TYPE", "SHOT_ZONE"]
}

results_wiz = {}

for name, features in feature_sets.items():
    print(f"\n===== Wizards XGBoost ({name} features) =====")

    X = df_wiz[features]
    y = df_wiz[target]

    # Identify categorical/numeric
    categorical_features = [f for f in features if f in ["PLAYER_NAME", "ACTION_TYPE", "SHOT_ZONE"]]
    numeric_features = [f for f in features if f not in categorical_features]

    # Preprocessor
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ], remainder="passthrough")

    # Pipeline
    pipeline = Pipeline([
        ("prep", preprocessor),
        ("xgb", XGBClassifier(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.1,
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42
        ))
    ])

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit model
    pipeline.fit(X_train, y_train)

    # Predict & evaluate
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {acc:.3f} | F1: {f1:.3f}")

    # Save results
    results_wiz[name] = {
        "model": pipeline,
        "X_test": X_test,
        "y_test": y_test,
        "y_pred": y_pred,
        "accuracy": acc,
        "f1": f1
    }



===== Wizards XGBoost (full features) =====


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.685 | F1: 0.643

===== Wizards XGBoost (reduced features) =====


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.708 | F1: 0.650


In [25]:
# reducing features improves accuracy when tested on a specific team, although the score is still lower than original because its not trained on all teams data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Filter Wizards shots
df_wiz = shots_comb_zones_all_types[shots_comb_zones_all_types["TEAM_NAME"] == "Washington Wizards"].copy()
target = "SHOT_MADE_FLAG"

# Feature sets
feature_sets = {
    "full": ["PLAYER_NAME", "PERIOD", "ACTION_TYPE", "SHOT_ZONE", "SHOT_DISTANCE", "TIME_LEFT_SEC"],
    "reduced": ["PLAYER_NAME", "ACTION_TYPE", "SHOT_ZONE"]
}

results_wiz = {}

for name, features in feature_sets.items():
    print(f"\n===== Wizards Random Forest ({name} features) =====")

    X = df_wiz[features]
    y = df_wiz[target]

    # Identify categorical/numeric
    categorical_features = [f for f in features if f in ["PLAYER_NAME", "ACTION_TYPE", "SHOT_ZONE"]]
    numeric_features = [f for f in features if f not in categorical_features]

    # Preprocessor
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ], remainder="passthrough")

    # Pipeline
    pipeline = Pipeline([
        ("prep", preprocessor),
        ("rf", RandomForestClassifier(
            n_estimators=500,
            max_depth=None,      # You can change this
            random_state=42
        ))
    ])

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit model
    pipeline.fit(X_train, y_train)

    # Predict & evaluate
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {acc:.3f} | F1: {f1:.3f}")

    # Save results
    results_wiz[name] = {
        "model": pipeline,
        "X_test": X_test,
        "y_test": y_test,
        "y_pred": y_pred,
        "accuracy": acc,
        "f1": f1
    }



===== Wizards Random Forest (full features) =====
