In [71]:
import os
import time
import datetime as dt
from typing import List, Dict, Optional

import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandingsv3
from nba_api.stats.library.parameters import SeasonTypeAllStar
from requests.exceptions import ReadTimeout, ConnectionError as RequestsConnectionError

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [72]:
# =========================
# CONFIG
# =========================

CURRENT_SEASON = "2025-26"
MIN_GAMES = 10  # minimum games played for eligibility
MIN_MINUTES = 15.0  # minimum minutes per game

TOP_N = 5  # how many candidates to report

TRAINING_CSV = "historical_dpoy_training.csv"
DAILY_DIR = "dpoy_daily"
MASTER_LOG = "dpoy_daily_log.csv"

SLEEP = 1.0  # pause between nba_api calls to be polite


In [73]:
# Features used by the logistic model for BOTH historical and current data.
# Keep these simple and consistent:
MODEL_FEATURES = [
    "bpg",  # blocks per game
    "spg",  # steals per game
    "rpg",  # rebounds per game
    "stocks",  # spg + bpg
    "inv_def_rank",  # 1 / team_def_rank
]


In [74]:
# =========================
# UTILS
# =========================


def _sleep_backoff(attempt: int, base: float = 1.0):
    """Exponential backoff sleep for retries."""
    time.sleep(base * (2 ** (attempt - 1)))


def ensure_dir(path: str):
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)


In [75]:
# =========================
# 1. BUILD HISTORICAL DATA (INCLUDING 2024-25)
# =========================


def build_historical_dpoy_training() -> pd.DataFrame:
    """
    Build a training dataset from past DPOY races.
    Each row = player-season with basic defensive stats + label (1 = DPOY, 0 = not).
    Numbers are approximate but directionally correct.

    Covers: 2007-08 through 2024-25.
    """

    rows: List[Dict] = []

    def add(season, name, team, bpg, spg, rpg, drtg, dws, def_rank, label):
        rows.append(
            dict(
                season=season,
                player=name,
                team=team,
                bpg=bpg,
                spg=spg,
                rpg=rpg,
                drtg=drtg,
                dws=dws,
                def_rank=def_rank,  # lower is better (1 = best defense)
                label=label,
            )
        )

    # ===== 2024-25 (Evan Mobley) =====
    add(
        "2024-25",
        "Evan Mobley",
        "CLE",
        bpg=1.6,
        spg=0.9,
        rpg=9.3,
        drtg=108.6,
        dws=5.0,
        def_rank=8,
        label=1,
    )

    # Extra strong defenders as negatives (approximate placeholders)
    add(
        "2024-25",
        "Dyson Daniels",
        "ATL",
        bpg=0.5,
        spg=2.0,
        rpg=6.5,
        drtg=110.0,
        dws=3.5,
        def_rank=15,
        label=0,
    )

    add(
        "2024-25",
        "Draymond Green",
        "GSW",
        bpg=0.8,
        spg=1.2,
        rpg=7.5,
        drtg=108.0,
        dws=3.8,
        def_rank=12,
        label=0,
    )

    # ===== 2023-24 (Gobert) =====
    add("2023-24", "Rudy Gobert", "MIN", 2.1, 0.7, 12.9, 108, 5.6, 1, 1)
    add("2023-24", "Victor Wembanyama", "SAS", 3.6, 1.2, 10.6, 112, 4.0, 20, 0)
    add("2023-24", "Bam Adebayo", "MIA", 1.0, 1.1, 10.4, 110, 3.5, 5, 0)
    add("2023-24", "Anthony Davis", "LAL", 2.3, 1.2, 12.6, 108, 4.5, 9, 0)

    # ===== 2022-23 (Jaren Jackson Jr.) =====
    add("2022-23", "Jaren Jackson Jr.", "MEM", 3.0, 1.0, 6.8, 110, 4.0, 2, 1)
    add("2022-23", "Brook Lopez", "MIL", 2.5, 0.5, 6.7, 110, 3.5, 4, 0)
    add("2022-23", "Bam Adebayo", "MIA", 0.8, 1.2, 9.2, 111, 3.0, 9, 0)

    # ===== 2021-22 (Marcus Smart) =====
    add("2021-22", "Marcus Smart", "BOS", 0.3, 1.7, 3.8, 107, 4.6, 1, 1)
    add("2021-22", "Mikal Bridges", "PHX", 0.5, 1.2, 4.2, 108, 3.5, 3, 0)
    add("2021-22", "Rudy Gobert", "UTA", 2.1, 0.7, 14.7, 104, 4.8, 10, 0)

    # ===== 2020-21 (Rudy Gobert) =====
    add("2020-21", "Rudy Gobert", "UTA", 2.7, 0.6, 13.5, 101, 5.2, 3, 1)
    add("2020-21", "Ben Simmons", "PHI", 0.6, 1.6, 7.2, 106, 3.7, 2, 0)
    add("2020-21", "Draymond Green", "GSW", 0.8, 1.7, 7.1, 104, 3.5, 5, 0)

    # ===== 2019-20 (Giannis) =====
    add("2019-20", "Giannis Antetokounmpo", "MIL", 1.0, 1.0, 13.6, 97, 5.0, 1, 1)
    add("2019-20", "Anthony Davis", "LAL", 2.3, 1.5, 9.3, 102, 4.1, 3, 0)
    add("2019-20", "Rudy Gobert", "UTA", 2.0, 0.7, 13.5, 105, 4.4, 13, 0)

    # ===== 2018-19 (Rudy Gobert) =====
    add("2018-19", "Rudy Gobert", "UTA", 2.3, 0.8, 12.9, 104, 6.0, 2, 1)
    add("2018-19", "Paul George", "OKC", 0.4, 2.2, 8.2, 104, 4.7, 4, 0)

    # ===== 2017-18 (Rudy Gobert) =====
    add("2017-18", "Rudy Gobert", "UTA", 2.3, 0.8, 10.7, 101, 4.9, 1, 1)
    add("2017-18", "Joel Embiid", "PHI", 1.8, 0.6, 11.0, 103, 4.3, 4, 0)

    # ===== 2016-17 (Draymond Green) =====
    add("2016-17", "Draymond Green", "GSW", 1.4, 2.0, 7.9, 101, 4.9, 2, 1)
    add("2016-17", "Kawhi Leonard", "SAS", 0.7, 1.8, 5.8, 98, 4.6, 1, 0)
    add("2016-17", "Rudy Gobert", "UTA", 2.6, 0.6, 12.8, 101, 5.6, 3, 0)

    # ===== 2015-16 (Kawhi Leonard) =====
    add("2015-16", "Kawhi Leonard", "SAS", 1.0, 1.8, 6.8, 96, 5.5, 1, 1)
    add("2015-16", "Draymond Green", "GSW", 1.4, 1.5, 9.5, 100, 5.0, 4, 0)
    add("2015-16", "Hassan Whiteside", "MIA", 3.7, 0.6, 11.8, 100, 4.5, 7, 0)

    # ===== 2014-15 (Kawhi Leonard) =====
    add("2014-15", "Kawhi Leonard", "SAS", 0.8, 2.3, 7.2, 97, 5.0, 3, 1)
    add("2014-15", "Draymond Green", "GSW", 1.3, 1.6, 8.2, 99, 4.8, 1, 0)
    add("2014-15", "DeAndre Jordan", "LAC", 2.2, 0.7, 15.0, 100, 5.2, 16, 0)

    # ===== 2013-14 (Joakim Noah) =====
    add("2013-14", "Joakim Noah", "CHI", 1.5, 1.2, 11.3, 97, 6.6, 2, 1)
    add("2013-14", "Roy Hibbert", "IND", 2.2, 0.4, 6.6, 99, 4.0, 1, 0)
    add("2013-14", "DeAndre Jordan", "LAC", 2.5, 1.0, 13.6, 102, 4.3, 7, 0)

    # ===== 2012-13 (Marc Gasol) =====
    add("2012-13", "Marc Gasol", "MEM", 1.7, 1.0, 7.8, 98, 5.4, 2, 1)
    add("2012-13", "Serge Ibaka", "OKC", 3.0, 0.4, 7.7, 101, 4.7, 4, 0)
    add("2012-13", "LeBron James", "MIA", 0.9, 1.7, 8.0, 103, 4.8, 5, 0)

    # ===== 2011-12 (Tyson Chandler) =====
    add("2011-12", "Tyson Chandler", "NYK", 1.4, 0.9, 9.9, 99, 5.0, 5, 1)
    add("2011-12", "Serge Ibaka", "OKC", 3.7, 0.5, 7.5, 100, 4.1, 9, 0)
    add("2011-12", "LeBron James", "MIA", 0.9, 1.9, 8.0, 101, 4.5, 4, 0)

    # ===== 2010-11 (Dwight Howard) =====
    add("2010-11", "Dwight Howard", "ORL", 2.4, 1.4, 14.1, 94, 7.7, 3, 1)
    add("2010-11", "Kevin Garnett", "BOS", 0.8, 1.3, 8.9, 97, 4.5, 2, 0)
    add("2010-11", "Tyson Chandler", "DAL", 1.1, 0.4, 9.4, 100, 4.3, 10, 0)

    return df


def ensure_historical_csv(csv_path: str = TRAINING_CSV):
    """
    If the historical DPOY training CSV does not exist, build it from
    the hard-coded data in build_historical_dpoy_training().
    """
    if os.path.exists(csv_path):
        print(f"[historical] Found existing {csv_path}, using it.")
        return

    print(f"[historical] {csv_path} not found. Building it now...")
    df = build_historical_dpoy_training()
    df.to_csv(csv_path, index=False)
    print(f"[historical] Saved {len(df)} rows to {csv_path}")


def load_historical_dpoy_training(csv_path: str = TRAINING_CSV) -> pd.DataFrame:
    """
    Load historical DPOY training data from CSV.
    Ensures key derived columns exist; if not, creates them.
    """
    ensure_historical_csv(csv_path)

    df = pd.read_csv(csv_path)

    # Ensure derived columns exist
    if "stocks" not in df.columns:
        df["stocks"] = df["bpg"] + df["spg"]
    if "inv_def_rank" not in df.columns:
        df["inv_def_rank"] = 1.0 / df["def_rank"].replace(0, pd.NA)

    # We no longer REQUIRE dws or drtg_inv for the model,
    # but it's fine if they exist.
    return df


In [76]:
# =========================
# 2. TRAIN MODEL
# =========================


def train_dpoy_model(training_csv: str = TRAINING_CSV) -> Pipeline:
    """
    Train a logistic regression DPOY model on a small, consistent feature set
    to avoid instability and extreme probabilities.
    """
    df = load_historical_dpoy_training(training_csv)

    # Use the shared MODEL_FEATURES
    X = df[MODEL_FEATURES].values
    y = df["label"].values

    # Gentler logistic regression:
    # - no class_weight='balanced' (which can cause extreme probs on tiny data)
    # - moderate regularization C=0.5
    model = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            (
                "clf",
                LogisticRegression(
                    max_iter=1000,
                    C=0.5,
                    class_weight=None,
                    penalty="l2",
                ),
            ),
        ]
    )

    model.fit(X, y)
    return model


In [77]:
# =========================
# 3. CURRENT SEASON DEFENSIVE STATS FROM nba_api
# =========================


def fetch_current_defensive_stats(season: str) -> pd.DataFrame:
    """
    Fetch per-game defensive stats and team defensive rank proxy for a given season.
    """

    # --- Player stats (per-game) ---
    last_exc: Optional[Exception] = None
    for attempt in range(1, 4):
        try:
            print(
                f"[stats] Fetching player stats for {season} (attempt {attempt}/3)..."
            )
            resp = leaguedashplayerstats.LeagueDashPlayerStats(
                season=season,
                season_type_all_star=SeasonTypeAllStar.regular,
                per_mode_detailed="PerGame",
                measure_type_detailed_defense="Base",
                timeout=10,
            )
            stats = resp.get_data_frames()[0]
            print(f"[stats] -> got {len(stats)} player rows")
            break
        except (ReadTimeout, RequestsConnectionError) as e:
            print(f"[stats] timeout: {e}")
            last_exc = e
            if attempt < 3:
                _sleep_backoff(attempt)
        except Exception as e:
            print(f"[stats] error: {e}")
            last_exc = e
            if attempt < 3:
                _sleep_backoff(attempt)
    else:
        raise RuntimeError(
            f"Failed to fetch player stats for {season} after 3 attempts."
        ) from last_exc

    stats = stats.rename(
        columns={
            "PTS": "ppg",
            "REB": "rpg",
            "AST": "apg",
            "STL": "spg",
            "BLK": "bpg",
            "PLUS_MINUS": "plus_minus",
        }
    )

    stats["games_played"] = stats.get("GP", 0)
    stats["minutes_per_game"] = stats.get("MIN", 0.0)

    # Eligibility filter
    stats = stats[
        (stats["games_played"] >= MIN_GAMES)
        & (stats["minutes_per_game"] >= MIN_MINUTES)
    ].copy()

    time.sleep(SLEEP)

    # --- Team standings: build defensive rank proxy ---
    for attempt in range(1, 4):
        try:
            print(f"[teams] Fetching standings for {season} (attempt {attempt}/3)...")
            resp = leaguestandingsv3.LeagueStandingsV3(
                season=season,
                timeout=10,
            )
            standings = resp.get_data_frames()[0]
            print(f"[teams] -> got {len(standings)} team rows")
            break
        except Exception as e:
            print(f"[teams] error/timeout: {e}")
            last_exc = e
            if attempt < 3:
                _sleep_backoff(attempt)
    else:
        raise RuntimeError(
            f"Failed to fetch standings for {season} after 3 attempts."
        ) from last_exc

    if "TeamID" in standings.columns:
        standings = standings.rename(columns={"TeamID": "TEAM_ID"})
    if "WinPCT" in standings.columns:
        standings = standings.rename(columns={"WinPCT": "win_pct"})

    # Try to get opponent PPG; fallback to win_pct
    opp_pts_col = None
    for col in standings.columns:
        if "OPP" in col.upper() and "PTS" in col.upper():
            opp_pts_col = col
            break

    if opp_pts_col is not None:
        standings["opp_ppg"] = standings[opp_pts_col]
        standings["def_rank"] = standings["opp_ppg"].rank(
            method="dense", ascending=True
        )
    else:
        standings["def_rank"] = standings["win_pct"].rank(
            method="dense", ascending=False
        )

    team_def = standings[["TEAM_ID", "def_rank"]].copy()

    merged = stats.merge(
        team_def,
        on="TEAM_ID",
        how="left",
        validate="m:1",
    )

    # Derived features for current season
    merged["stocks"] = merged["spg"] + merged["bpg"]
    merged["inv_def_rank"] = 1.0 / merged["def_rank"].clip(lower=1.0)
    merged["dws_proxy"] = merged["stocks"] * merged["minutes_per_game"] / 36.0
    # approximate drtg_inv from plus_minus (no true DRtg here)
    merged["drtg_inv"] = (merged["plus_minus"].fillna(0.0) + 10.0) / 20.0

    return merged


def get_current_defensive_leader_candidates(season: str) -> pd.DataFrame:
    """
    Pull defensive leaders from nba_api:
      - Blocks leaders
      - Steals leaders
      - Defensive rebounds leaders
      - Stocks (steals+blocks)
    Merges them into a unique candidate pool.
    """

    print(f"[leaders] Fetching current defensive leaders for {season}...")

    resp = leaguedashplayerstats.LeagueDashPlayerStats(
        season=season,
        season_type_all_star=SeasonTypeAllStar.regular,
        per_mode_detailed="PerGame",
        measure_type_detailed_defense="Base",
        timeout=10,
    )
    df = resp.get_data_frames()[0]

    df = df.rename(
        columns={
            "PTS": "ppg",
            "REB": "rpg",
            "AST": "apg",
            "STL": "spg",
            "BLK": "bpg",
            "PLUS_MINUS": "plus_minus",
        }
    )

    df["stocks"] = df["spg"] + df["bpg"]

    # Get the top players for each defensive category
    blk_top = df.nlargest(30, "bpg")
    stl_top = df.nlargest(30, "spg")
    reb_top = df.nlargest(30, "rpg")
    stocks_top = df.nlargest(30, "stocks")

    # Combine all pools
    combined = pd.concat([blk_top, stl_top, reb_top, stocks_top], ignore_index=True)
    combined = combined.drop_duplicates(subset="PLAYER_ID").reset_index(drop=True)

    print(f"[leaders] Combined defensive leader pool: {len(combined)} players")

    return combined


In [78]:
# =========================
# 4. PROFILE FILTERS (BASED ON LAST ~5 DPOY WINNERS)
# =========================


def apply_dpoy_profile_filters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter current-season players to those who roughly match the profile
    of recent DPOY winners (Mobley, Gobert, JJJ, Smart, etc.).
    Assumes df has: minutes_per_game, def_rank, spg, bpg, rpg, stocks.
    """

    # 1) Team defense: winners almost always on elite defenses
    # use top-10 as a rule (user requirement)
    mask_team = df["def_rank"] <= 10

    # 2) Minutes: real DPOYs play a solid starter load.
    mask_minutes = df["minutes_per_game"] >= 28.0

    # 3) Stocks: DPOYs are big event creators (blocks + steals).
    # Most winners are at ~2.0+ stocks per game.
    mask_stocks = df["stocks"] >= 2.0

    # 4) Defensive load style:
    #   - Rim protectors: big blocks
    #   - Perimeter stoppers: big steals
    mask_rim_or_perimeter = (df["bpg"] >= 1.3) | (df["spg"] >= 1.5)

    # 5) Rebounding floor: Smart was ~3.8 RPG, bigs are much higher.
    mask_reb = df["rpg"] >= 3.5

    filtered = df[
        mask_team & mask_minutes & mask_stocks & mask_rim_or_perimeter & mask_reb
    ].copy()

    return filtered


In [79]:
# =========================
# 5. PREDICT CURRENT SEASON DPOY CANDIDATES
# =========================


def predict_dpoy_candidates_for_season(
    season: str = CURRENT_SEASON,
    top_n: int = TOP_N,
    training_csv: str = TRAINING_CSV,
) -> pd.DataFrame:
    print("\n== Training DPOY model from historical CSV...")
    model = train_dpoy_model(training_csv)

    print(f"\n== Fetching defensive leaders for {season}...")
    leader_pool = get_current_defensive_leader_candidates(season)

    print(f"\n== Fetching full defensive stats for {season}...")
    cur = fetch_current_defensive_stats(season)

    # Keep only defensive leaders
    cur = cur[cur["PLAYER_ID"].isin(leader_pool["PLAYER_ID"])].copy()

    # Apply profile filters (team defense, minutes, stocks, etc.)
    cur = apply_dpoy_profile_filters(cur)

    if cur.empty:
        print("Warning: no players passed the DPOY profile filters.")
        return cur

    # Use the SAME features as training
    X = cur[MODEL_FEATURES].fillna(0.0).values
    probs = model.predict_proba(X)[:, 1]
    cur["dpoy_prob"] = probs

    cur_sorted = cur.sort_values("dpoy_prob", ascending=False).reset_index(drop=True)
    return cur_sorted.head(top_n)


In [82]:
# =========================
# 6. DAILY PIPELINE
# =========================


def run_daily_dpoy_update(
    season: str = CURRENT_SEASON,
    top_n: int = TOP_N,
    training_csv: str = TRAINING_CSV,
):
    today = dt.date.today().isoformat()
    tag = season.replace("-", "")

    print(f"\n=== Running daily DPOY update for {season} on {today} ===")

    top_df = predict_dpoy_candidates_for_season(
        season=season,
        top_n=top_n,
        training_csv=training_csv,
    )

    if top_df.empty:
        print("No candidates to save for today (empty result after filters).")
        return top_df

    top_df = top_df.copy()
    top_df["run_date"] = today
    top_df["season"] = season

    # 1) Per-day snapshot
    ensure_dir(DAILY_DIR)
    daily_filename = os.path.join(
        DAILY_DIR,
        f"dpoy_daily_top{top_n}_{tag}_{today}.csv",
    )
    top_df.to_csv(daily_filename, index=False)
    print(f"Saved daily top-{top_n} snapshot to {daily_filename}")

    # 2) Append to master log
    if os.path.exists(MASTER_LOG):
        existing = pd.read_csv(MASTER_LOG)
        combined = pd.concat([existing, top_df], ignore_index=True)
    else:
        combined = top_df

    combined.to_csv(MASTER_LOG, index=False)
    print(f"Updated master log at {MASTER_LOG}")

    # 3) Pretty print
    print("\n===== DPOY MODEL TOP CANDIDATES (TODAY) =====")
    for i, row in top_df.iterrows():
        print(
            f"{i + 1}. {row['PLAYER_NAME']} ({row['TEAM_ABBREVIATION']}), "
            f"{row['rpg']:.1f} REB, {row['spg']:.1f} STL, {row['bpg']:.1f} BLK, "
            f"stocks={row['stocks']:.2f}, team_def_rank={row['def_rank']:.0f}, "
            f"prob={row['dpoy_prob']:.3f}"
        )

    return top_df


In [83]:
# =========================
# 7. ENTRYPOINT
# =========================

if __name__ == "__main__":
    run_daily_dpoy_update()



=== Running daily DPOY update for 2025-26 on 2025-11-30 ===

== Training DPOY model from historical CSV...
[historical] Found existing historical_dpoy_training.csv, using it.

== Fetching defensive leaders for 2025-26...
[leaders] Fetching current defensive leaders for 2025-26...
[leaders] Combined defensive leader pool: 79 players

== Fetching full defensive stats for 2025-26...
[stats] Fetching player stats for 2025-26 (attempt 1/3)...
[stats] -> got 486 player rows
[teams] Fetching standings for 2025-26 (attempt 1/3)...
[teams] -> got 30 team rows
Saved daily top-5 snapshot to dpoy_daily/dpoy_daily_top5_202526_2025-11-30.csv
Updated master log at dpoy_daily_log.csv

===== DPOY MODEL TOP CANDIDATES (TODAY) =====
1. Mikal Bridges (NYK), 4.6 REB, 2.1 STL, 1.2 BLK, stocks=3.30, team_def_rank=1, prob=0.681
2. OG Anunoby (NYK), 5.6 REB, 1.9 STL, 0.7 BLK, stocks=2.60, team_def_rank=1, prob=0.653
3. Victor Wembanyama (SAS), 12.9 REB, 1.1 STL, 3.6 BLK, stocks=4.70, team_def_rank=6, prob=0.3