In [816]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier

In [817]:
def create_rate_statistics(df):
    '''
    Add career-length, per-season and volume features to a career-level table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player. The code reads the columns "games", "start_year",
        "end_year", the counting/value columns listed in ``per_season_sources``
        below, and the five columns in ``volume_cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which zero values of
        "games" are replaced by NaN, plus the new columns "career_years",
        "<col>_per_season" for every source column, "games_per_season" and
        "volume_score".
    '''
    out = df.copy()

    # Treat zero games as missing (the original intent: guard against
    # divide-by-zero for brand-new players).  The result is assigned back
    # rather than using replace(..., inplace=True) on the column selection:
    # that chained in-place form emits a FutureWarning and no longer updates
    # the parent frame under pandas 3.0 / Copy-on-Write.
    out["games"] = out["games"].replace(0, np.nan)

    # Career length (inclusive of both the first and the last season)
    out["career_years"] = out["end_year"] - out["start_year"] + 1

    # ----- Per-season production (body of work) -----
    per_season_sources = ["pts", "ast", "trb", "stl", "blk",
                          "fg", "fga", "ft", "fta",
                          "vorp", "ws", "ows", "dws"]
    for col in per_season_sources:
        out[f"{col}_per_season"] = out[col] / out["career_years"]

    # Durability baseline (NaN wherever "games" was zero)
    out["games_per_season"] = out["games"] / out["career_years"]

    # Simple log-scaled volume score: 10 * sum of log1p over the counting stats
    volume_cols = ["pts", "ast", "trb", "stl", "blk"]
    out["volume_score"] = np.log1p(out[volume_cols]).sum(axis=1) * 10

    return out

In [818]:
def create_longevity_features(df):
    '''
    Derive longevity and durability features from career-level statistics.

    Reads "start_year", "end_year", "games", "pts", "ast", "trb" and
    "asg_selections" and returns a copy of ``df`` with these extra columns:
    career length, binary career-length and games-played tiers, combined
    production/longevity flags, games per season with an "iron man" flag,
    All-Star selections per season with a consistency flag, and a weighted
    composite "longevity_score".
    '''
    out = df.copy()

    # Inclusive season count; reused by every feature below
    seasons = out["end_year"] - out["start_year"] + 1
    out["career_years"] = seasons

    # Binary tiers on career length
    for cutoff in (15, 18, 20):
        out[f"long_career_{cutoff}"] = (seasons >= cutoff).astype(int)

    # Binary tiers on total games played
    for cutoff in (1000, 1200, 1400):
        out[f"games_{cutoff}"] = (out["games"] >= cutoff).astype(int)

    # Production + longevity blends: (flag name, min seasons, stat, min total)
    blends = (
        ("longevity_scorer", 15, "pts", 15000),
        ("longevity_playmaker", 12, "ast", 4000),
        ("longevity_rebounder", 12, "trb", 8000),
    )
    for flag, min_seasons, stat, min_total in blends:
        qualifies = (seasons >= min_seasons) & (out[stat] >= min_total)
        out[flag] = qualifies.astype(int)

    # Iron-man durability: at least 70 games per season on average
    games_rate = out["games"] / seasons
    out["games_per_season"] = games_rate
    out["iron_man"] = (games_rate >= 70).astype(int)

    # All-Star consistency: selected in at least 40% of seasons
    allstar_rate = out["asg_selections"] / seasons
    out["asg_consistency"] = allstar_rate
    out["consistent_allstar"] = (allstar_rate >= 0.40).astype(int)

    # Composite score: seasons, durability, star-level seasons, blend flags
    blend_total = out[[flag for flag, *_ in blends]].sum(axis=1)
    out["longevity_score"] = (
        seasons * 2.0 +
        games_rate * 0.5 +
        allstar_rate * 25 +
        blend_total * 5
    )

    return out

In [819]:
def create_accolade_enhanced_features(df):
    """
    Turn raw award counts into binary flags, weighted scores and a rate.

    Reads "mvp_count", "finals_mvp_count", "dpoy_count", "asg_mvp_count",
    the All-NBA team counts "1st"/"2nd"/"3rd", "asg_selections",
    "start_year" and "end_year".  Returns a copy of ``df`` with the new
    columns appended; the input frame is left untouched.
    """
    out = df.copy()

    mvps = out["mvp_count"]
    finals_mvps = out["finals_mvp_count"]
    dpoys = out["dpoy_count"]

    # Binary elite flags
    out["has_mvp"] = mvps.ge(1).astype(int)
    out["has_multi_mvp"] = mvps.ge(2).astype(int)
    out["has_finals_mvp"] = finals_mvps.ge(1).astype(int)
    out["has_dpoy"] = dpoys.ge(1).astype(int)

    # Weighted award score: MVP 10, Finals MVP 8, DPOY 6, All-Star-Game MVP 2
    major = mvps * 10 + finals_mvps * 8 + dpoys * 6 + out["asg_mvp_count"] * 2
    out["major_awards_score"] = major

    # All-NBA score: first team 5, second team 3, third team 1
    all_nba = out["1st"] * 5 + out["2nd"] * 3 + out["3rd"] * 1
    out["all_nba_score"] = all_nba

    # Total accolades = both weighted scores plus raw All-Star selections
    total = major + all_nba + out["asg_selections"]
    out["total_accolades"] = total

    # Rate per season (inclusive season count)
    seasons = out["end_year"] - out["start_year"] + 1
    out["career_years"] = seasons
    out["accolades_per_season"] = total / seasons

    return out

In [820]:
def create_greatness_scores(df):
    """
    Synthesise awards, longevity, and volume into a single 'greatness'
    perspective without leaning on PER/BPM/TS%-style efficiency stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain the derived columns "longevity_score",
        "major_awards_score", "all_nba_score" and "volume_score", plus the
        raw columns "mvp_count", "pts" and "ast".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with "mvp_calibre", "superstar_flag" and
        "greatness_score" appended.

    Raises
    ------
    ValueError
        If any of the derived prerequisite columns is missing, i.e. one of
        the upstream feature builders has not been run yet.
    """
    out = df.copy()

    # Validate every derived column the body reads, not just two of them, so
    # a skipped builder gives an actionable message instead of a KeyError.
    required = ("longevity_score", "major_awards_score",
                "all_nba_score", "volume_score")
    missing = [col for col in required if col not in out.columns]
    if missing:
        raise ValueError(
            "Run the rate-statistics, longevity and accolade feature builders "
            "before calling create_greatness_scores "
            f"(missing columns: {', '.join(missing)}).")

    # MVP-calibre check – actual MVP or extreme career totals
    out["mvp_calibre"] = (
        (out["mvp_count"] >= 1) |
        # rough 'elite franchise anchor' proxy
        ((out["pts"] >= 25000) & (out["ast"] >= 6000))
    ).astype(int)

    # Superstar proxy – sustained All-NBA presence or major awards
    out["superstar_flag"] = (
        (out["all_nba_score"] >= 15) |
        (out["major_awards_score"] >= 10)
    ).astype(int)

    # Composite greatness: weighted sum of the component scores and flags
    out["greatness_score"] = (
        out["major_awards_score"] * 2.0 +
        out["longevity_score"] * 1.5 +
        out["volume_score"] * 0.8 +
        out["mvp_calibre"] * 10 +
        out["superstar_flag"] * 5
    )

    return out

In [821]:
def weight_features_for_accolades(X: pd.DataFrame, multiplier: float = 2.5) -> pd.DataFrame:
    """
    Scale every accolade-related column by ``multiplier``.

    A column counts as accolade-related when its lower-cased name contains
    any of the substrings in ``accolade_keys``; all other columns are left
    as they are.  The input frame is copied, never modified.
    """
    accolade_keys = ("asg_selections", "all_nba", "total_accolades",
                     "accolades_per_season", "major_awards", "mvp",
                     "dpoy", "finals_mvp")

    def _is_accolade(label):
        # Case-insensitive substring match against the keyword list
        lowered = label.lower()
        return any(key in lowered for key in accolade_keys)

    out = X.copy()
    for col in [label for label in out.columns if _is_accolade(label)]:
        out[col] = out[col] * multiplier
    return out

In [822]:
def weight_features_for_longevity(X: pd.DataFrame,
                                  longevity_mult: float = 1.5,
                                  totals_mult: float = 1.3) -> pd.DataFrame:
    """
    Scale longevity columns and raw counting totals on a copy of ``X``.

    Matching is by case-insensitive substring of the column name:
    a longevity keyword wins and applies ``longevity_mult``; otherwise a
    counting-stat keyword applies ``totals_mult`` unless the name also
    contains "per_game", "per_season" or "pct".
    """
    longevity_tokens = ("career_years", "longevity",
                        "iron_man", "games_per_season")
    total_tokens = ("pts", "ast", "trb", "stl", "blk", "fg", "ft", "games")
    rate_tokens = ("per_game", "per_season", "pct")

    def _factor_for(label):
        # Returns the multiplier for a column, or None when it is left alone
        lowered = label.lower()
        if any(token in lowered for token in longevity_tokens):
            return longevity_mult
        if any(token in lowered for token in rate_tokens):
            return None
        if any(token in lowered for token in total_tokens):
            return totals_mult
        return None

    out = X.copy()
    for col in out.columns:
        factor = _factor_for(col)
        if factor is not None:
            out[col] = out[col] * factor
    return out

In [823]:
def weight_features_for_greatness(X: pd.DataFrame,
                                  greatness_mult: float = 1.8) -> pd.DataFrame:
    """
    Scale the composite greatness columns by ``greatness_mult``.

    Works on a copy of ``X``.  Only columns whose lower-cased name contains
    "greatness_score", "mvp_calibre" or "superstar_flag" are touched.
    """
    greatness_keys = ("greatness_score", "mvp_calibre", "superstar_flag")
    out = X.copy()

    boosted = [
        label for label in out.columns
        if any(key in label.lower() for key in greatness_keys)
    ]
    for label in boosted:
        out[label] = out[label] * greatness_mult

    return out

In [824]:
# 1. Load the raw career-level table
df = pd.read_csv('../data/master.csv')

# A player counts as active when the last recorded season is 2023 or later
df['is_active'] = df['end_year'].ge(2023)

# 2. Feature engineering: run the builders in dependency order
#    (greatness scores need the columns produced by the three before it)
feature_builders = (
    create_rate_statistics,
    create_accolade_enhanced_features,
    create_longevity_features,
    create_greatness_scores,
)
df_feat = df
for build in feature_builders:
    df_feat = build(df_feat)

# 3. Split into the training cohort (not active) and the active cohort to score
active_mask = df_feat["is_active"]
train_df = df_feat.loc[~active_mask]
active_df = df_feat.loc[active_mask]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  out["games"].replace(0, np.nan, inplace=True)


In [825]:
# 4. Prepare the data (X, y)

# Columns kept out of the feature matrix, grouped by reason
target_cols = ["hof"]                                   # prediction target
descriptor_cols = ["name", "pos",
                   "start_year", "end_year",
                   "weight", "height",
                   "is_active"]
# raw award counts already folded into composite scores (optional to drop)
raw_award_cols = ["mvp_count", "finals_mvp_count", "asg_mvp_count"]

drop_cols = target_cols + descriptor_cols + raw_award_cols

# errors="ignore" tolerates any listed column that is absent from the table
y = train_df["hof"]
X = train_df.drop(columns=drop_cols, errors="ignore")

In [826]:
# 5. Apply the weighting helpers one after another.
#    The same multipliers are used again when scoring the active cohort,
#    so keep the two places in sync.
X_accolades = weight_features_for_accolades(X, multiplier=2.5)
X_longevity = weight_features_for_longevity(
    X_accolades, longevity_mult=2.0, totals_mult=1.8)
Xw = weight_features_for_greatness(X_longevity, greatness_mult=1.5)

# 6. Stratified 80/20 train / test split with a fixed seed
split_parts = train_test_split(
    Xw, y, test_size=0.20, stratify=y, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

In [827]:
#7. Model definitions

# Logistic regression: the pipeline scales features to unit variance (without
# centering) before fitting, so callers pass the same matrix as for the trees.
logreg = Pipeline([
    ("scale", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(
        class_weight="balanced", max_iter=2000, solver="saga"))
])

rf = RandomForestClassifier(
    n_estimators=800, max_depth=None, n_jobs=-1,
    class_weight="balanced", random_state=42)

# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not recognise it and only reports 'Parameters: { "use_label_encoder" } are
# not used.' on every fit.
xgb = XGBClassifier(
    eval_metric="logloss",
    learning_rate=0.05, n_estimators=800,
    subsample=0.9, colsample_bytree=0.8,
    random_state=42)

gb = GradientBoostingClassifier(
    learning_rate=0.05, n_estimators=1000,
    max_depth=3, random_state=42)

# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
# with the default frequency this setting is presumably a no-op — confirm
# whether row subsampling was intended before changing it.
lgbm = LGBMClassifier(
    objective="binary",
    class_weight="balanced",
    n_estimators=800,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42)

# Display name -> estimator; the names are used as labels and column suffixes
models = {
    "LogReg": logreg,
    "RandomForest": rf,
    "XGBoost": xgb,
    "GradBoost": gb,
    "LightGBM": lgbm,
}

In [828]:
#8. Fit and evaluate

# Every estimator is scored on the same hold-out matrix: the tree models take
# it as-is and the LogReg pipeline scales it internally, so no per-model
# branching is needed.
X_eval = X_test

for name, model in models.items():
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_eval)[:, 1]   # probability of class 1
    preds = model.predict(X_eval)

    print(f"\n=== {name} ===")
    print("ROC-AUC :", round(roc_auc_score(y_test, probs), 3))
    print("Brier   :", round(brier_score_loss(y_test, probs), 3))
    print(classification_report(y_test, preds, digits=3))


=== LogReg ===
ROC-AUC : 0.98
Brier   : 0.031
              precision    recall  f1-score   support

           0      0.990     0.980     0.985       293
           1      0.833     0.909     0.870        33

    accuracy                          0.972       326
   macro avg      0.911     0.944     0.927       326
weighted avg      0.974     0.972     0.973       326


=== RandomForest ===
ROC-AUC : 0.993
Brier   : 0.016
              precision    recall  f1-score   support

           0      0.986     0.997     0.992       293
           1      0.967     0.879     0.921        33

    accuracy                          0.985       326
   macro avg      0.977     0.938     0.956       326
weighted avg      0.984     0.985     0.984       326



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
ROC-AUC : 0.981
Brier   : 0.015
              precision    recall  f1-score   support

           0      0.983     0.997     0.990       293
           1      0.966     0.848     0.903        33

    accuracy                          0.982       326
   macro avg      0.974     0.923     0.947       326
weighted avg      0.981     0.982     0.981       326


=== GradBoost ===
ROC-AUC : 0.987
Brier   : 0.02
              precision    recall  f1-score   support

           0      0.983     0.993     0.988       293
           1      0.933     0.848     0.889        33

    accuracy                          0.979       326
   macro avg      0.958     0.921     0.939       326
weighted avg      0.978     0.979     0.978       326

[LightGBM] [Info] Number of positive: 133, number of negative: 1170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]

In [829]:
# 9. Predict HoF probabilities for active players

# Feature matrix for the active cohort: same dropped columns and the same
# weighting multipliers as were applied to the training matrix
Xa = active_df.drop(columns=drop_cols, errors="ignore")
Xa = weight_features_for_accolades(Xa, multiplier=2.5)
Xa = weight_features_for_longevity(Xa, longevity_mult=2.0, totals_mult=1.8)
Xa = weight_features_for_greatness(Xa, greatness_mult=1.5)

# Result table starts with the identifying columns only
pred_df = active_df.loc[:, ["name", "start_year", "end_year"]].copy()

# One probability column per model (the log-reg pipeline scales internally)
for name, model in models.items():
    probs = model.predict_proba(Xa)[:, 1]
    pred_df[f"prob_{name.lower()}"] = probs

# One rank column per model: rank 1 = highest probability, ties share the min
rank_cols = []
for name in models:
    suffix = name.lower()
    rank_col = f"rank_{suffix}"
    pred_df[rank_col] = pred_df[f"prob_{suffix}"].rank(
        ascending=False, method="min"
    )
    rank_cols.append(rank_col)

# Consensus ordering: mean rank across models, best first
pred_df["avg_rank"] = pred_df[rank_cols].mean(axis=1)
pred_df = pred_df.sort_values("avg_rank").reset_index(drop=True)

# Display top 25 active HoF candidates
display(pred_df.head(25))

Unnamed: 0,name,start_year,end_year,prob_logreg,prob_randomforest,prob_xgboost,prob_gradboost,prob_lightgbm,rank_logreg,rank_randomforest,rank_xgboost,rank_gradboost,rank_lightgbm,avg_rank
0,Kevin Durant,2008,2025,1.0,0.9475,0.998557,1.0,1.0,3.0,2.0,3.0,2.0,2.0,2.4
1,Stephen Curry,2010,2025,1.0,0.93875,0.998569,1.0,1.0,4.0,4.0,2.0,3.0,1.0,2.8
2,Chris Paul,2006,2025,0.99985,0.92875,0.998748,0.999998,1.0,10.0,6.0,1.0,7.0,3.0,5.4
3,James Harden,2010,2025,1.0,0.93875,0.997133,0.999999,1.0,6.0,4.0,7.0,6.0,5.0,5.6
4,LeBron James,2004,2025,1.0,0.915,0.997213,0.999995,1.0,1.0,8.0,6.0,9.0,6.0,6.0
5,Giannis Antetokounmpo,2014,2025,1.0,0.9025,0.995337,0.999999,1.0,2.0,9.0,10.0,4.0,7.0,6.4
6,Anthony Davis,2013,2025,0.997343,0.945,0.996077,1.0,1.0,13.0,3.0,8.0,1.0,8.0,6.6
7,Russell Westbrook,2009,2025,0.999993,0.95125,0.998366,0.999996,0.999993,8.0,1.0,4.0,8.0,15.0,7.2
8,Damian Lillard,2013,2025,0.998865,0.92375,0.995672,0.999999,1.0,12.0,7.0,9.0,5.0,4.0,7.4
9,Nikola Jokić,2016,2025,1.0,0.7575,0.976829,0.999982,0.999999,5.0,14.0,14.0,11.0,9.0,10.6


In [830]:
#10. Feature importance extraction

def top_feats_from_series(series, k=15):
    """Format the first ``k`` entries of ``series`` as 'label: value' strings.

    Values are rendered with three decimals.  The series is taken in its
    existing order, so the caller is expected to sort it beforehand.
    """
    formatted = []
    for feature, weight in series.iloc[:k].items():
        formatted.append(f"{feature}: {weight:.3f}")
    return formatted


for name, model in models.items():
    print(f"\n### {name}")

    # Pick the importance source and the heading that match the model type
    if name == "LogReg":
        # Absolute coefficient size of the classifier step inside the pipeline
        coef = model.named_steps["clf"].coef_[0]
        s = pd.Series(coef, index=Xw.columns).abs()
        heading = "Top coefficients →"
    elif name in ("RandomForest", "GradBoost", "LightGBM"):
        s = pd.Series(model.feature_importances_, index=Xw.columns)
        heading = "Top feature_importances_ →"
    elif name == "XGBoost":
        # get_score returns a dict; gain-based importance is requested explicitly
        score = model.get_booster().get_score(importance_type="gain")
        s = pd.Series(score)
        heading = "Top gain-based features →"
    else:
        continue

    # Largest first, then print the top 15 on indented lines under the heading
    s = s.sort_values(ascending=False)
    print(heading, *top_feats_from_series(s, k=15), sep="\n  ")


### LogReg
Top coefficients →
  vorp: 1.641
  vorp_per_season: 1.587
  blk_pct: 1.489
  pf: 1.437
  tov_pct: 1.414
  stl: 1.355
  drb: 1.306
  ast_pct: 1.295
  orb_pct: 1.293
  usg_pct: 1.205
  ts_pct: 1.196
  asg_selections: 1.187
  orb: 1.070
  ast_per_season: 0.996
  consistent_allstar: 0.812

### RandomForest
Top feature_importances_ →
  asg_selections: 0.077
  total_accolades: 0.069
  accolades_per_season: 0.063
  asg_consistency: 0.062
  bpm: 0.047
  fta_per_season: 0.040
  ws_per_season: 0.040
  obpm: 0.038
  all_nba_score: 0.035
  tov: 0.032
  longevity_score: 0.031
  ft_per_season: 0.030
  consistent_allstar: 0.026
  vorp: 0.024
  fta: 0.024

### XGBoost
Top gain-based features →
  asg_selections: 36.153
  total_accolades: 22.614
  accolades_per_season: 6.291
  asg_consistency: 5.651
  ows: 4.803
  bpm: 4.559
  greatness_score: 3.294
  volume_score: 2.633
  tov: 2.608
  ft_per_season: 2.286
  drb: 2.056
  ws: 1.627
  pts: 1.627
  vorp: 1.531
  vorp_per_season: 1.484

### Grad