In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib


In [39]:
# --------------------------------------------------------------------------------------
# CONFIG
# --------------------------------------------------------------------------------------

LABELED_SEASONS = [
    "2010-11",
    "2011-12",
    "2012-13",
    "2013-14",
    "2014-15",
    "2015-16",
    "2016-17",
    "2017-18",
    "2018-19",
    "2019-20",
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
]

UNLABELED_SEASONS = ["2024-25"]

MVP_WINNERS = {
    "2010-11": "Derrick Rose",
    "2011-12": "LeBron James",
    "2012-13": "LeBron James",
    "2013-14": "Kevin Durant",
    "2014-15": "Stephen Curry",
    "2015-16": "Stephen Curry",
    "2016-17": "Russell Westbrook",
    "2017-18": "James Harden",
    "2018-19": "Giannis Antetokounmpo",
    "2019-20": "Giannis Antetokounmpo",
    "2020-21": "Nikola Jokic",
    "2021-22": "Nikola Jokic",
    "2022-23": "Joel Embiid",
    "2023-24": "Nikola Jokic",
}

# --------------------------------------------------------------------------------------
# HELPERS
# --------------------------------------------------------------------------------------


def season_end_year(season_str: str) -> int:
    start, end = season_str.split("-")
    return int(start[:2] + end)


def get_player_stats_nbaapi(season: str) -> pd.DataFrame:
    """
    Pull per-game player stats for a season using nba_api.
    Robust: only keeps columns that actually exist in the response.
    """
    print(f"  - Fetching player stats (nba_api) for {season}")
    resp = leaguedashplayerstats.LeagueDashPlayerStats(
        season=season,
        season_type_all_star=SeasonTypeAllStar.regular,
        per_mode_detailed="PerGame",
    )
    df = resp.get_data_frames()[0]

    desired_keep = [
        "PLAYER_ID",
        "PLAYER_NAME",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "GP",
        "MIN",
        "PTS",
        "REB",
        "AST",
        "PLUS_MINUS",
        # "USG_PCT",  # <-- removed because it's not present in your data
        "STL",
        "BLK",
        "TOV",
        "FG_PCT",
        "FG3_PCT",
        "FT_PCT",
    ]

    existing_keep = [c for c in desired_keep if c in df.columns]
    missing = [c for c in desired_keep if c not in df.columns]
    if missing:
        print(f"    > Warning: missing columns in player stats for {season}: {missing}")

    df = df[existing_keep].copy()

    # Rename only columns that exist
    rename_map = {
        "PTS": "ppg",
        "REB": "rpg",
        "AST": "apg",
        "PLUS_MINUS": "plus_minus",
        "STL": "spg",
        "BLK": "bpg",
        "TOV": "tov",
        "FG_PCT": "fg_pct",
        "FG3_PCT": "fg3_pct",
        "FT_PCT": "ft_pct",
    }
    rename_map = {k: v for k, v in rename_map.items() if k in df.columns}
    df.rename(columns=rename_map, inplace=True)

    if "GP" in df.columns:
        df["games_played"] = df["GP"]
    if "MIN" in df.columns:
        df["minutes_per_game"] = df["MIN"]

    return df


def get_team_standings_nbaapi(season: str) -> pd.DataFrame:
    """
    Pull team standings for a season using nba_api.
    """
    print(f"  - Fetching team standings (nba_api) for {season}")
    resp = leaguestandingsv3.LeagueStandingsV3(season=season)
    df = resp.get_data_frames()[0]

    desired_keep = [
        "TeamID",
        "TeamSlug",
        "TeamName",
        "Conference",
        "ConferenceRank",
        "WINS",
        "LOSSES",
        "WinPCT",
    ]
    existing_keep = [c for c in desired_keep if c in df.columns]
    missing = [c for c in desired_keep if c not in df.columns]
    if missing:
        print(f"    > Warning: missing columns in standings for {season}: {missing}")

    df = df[existing_keep].copy()

    rename_map = {
        "TeamID": "TEAM_ID",
        "Conference": "conference",
        "ConferenceRank": "conf_rank",
        "WINS": "wins",
        "LOSSES": "losses",
        "WinPCT": "win_pct",
    }
    rename_map = {k: v for k, v in rename_map.items() if k in df.columns}
    df.rename(columns=rename_map, inplace=True)

    if "wins" in df.columns:
        df["team_wins"] = df["wins"]
    else:
        df["team_wins"] = pd.NA

    return df


def build_season_df(season: str, labeled: bool = True) -> pd.DataFrame:
    """
    Build one season's dataset using ONLY nba_api:
      - per-game player stats
      - team standings
      - MVP label if labeled=True
    """
    print(f"\n=== Building data for {season} (labeled={labeled}) ===")

    players = get_player_stats_nbaapi(season)
    time.sleep(1)

    teams = get_team_standings_nbaapi(season)
    time.sleep(1)

    df = players.merge(teams, on="TEAM_ID", how="left", validate="m:1")

    df["season"] = season
    df["season_end_year"] = season_end_year(season)

    if labeled:
        winner_name = MVP_WINNERS[season]
        df["mvp"] = (df["PLAYER_NAME"] == winner_name).astype(int)
        mvp_count = df["mvp"].sum()
        if mvp_count != 1:
            print(
                f"  ! WARNING: season {season} has {mvp_count} MVP rows "
                f"(expected 1). Check spelling for '{winner_name}'."
            )
    else:
        df["mvp"] = pd.NA

    return df


# --------------------------------------------------------------------------------------
# MAIN
# --------------------------------------------------------------------------------------


def main():
    all_seasons = []

    for season in LABELED_SEASONS:
        try:
            season_df = build_season_df(season, labeled=True)
            if season_df is None or season_df.empty:
                print(f"  ! {season}: empty labeled dataframe, skipping.")
            else:
                all_seasons.append(season_df)
        except Exception as e:
            print(f"FATAL error for labeled season {season}: {e}")

    for season in UNLABELED_SEASONS:
        try:
            season_df = build_season_df(season, labeled=False)
            if season_df is None or season_df.empty:
                print(f"  ! {season}: empty unlabeled dataframe, skipping.")
            else:
                all_seasons.append(season_df)
        except Exception as e:
            print(f"FATAL error for unlabeled season {season}: {e}")

    if not all_seasons:
        raise ValueError("No season data collected. Check earlier error messages.")

    full_df = pd.concat(all_seasons, ignore_index=True)

    cols_order = [
        "season",
        "season_end_year",
        "PLAYER_ID",
        "PLAYER_NAME",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "conference",
        "conf_rank",
        "wins",
        "losses",
        "win_pct",
        "team_wins",
        "games_played",
        "minutes_per_game",
        "ppg",
        "rpg",
        "apg",
        "plus_minus",
        "spg",
        "bpg",
        "tov",
        "fg_pct",
        "fg3_pct",
        "ft_pct",
        "mvp",
    ]

    cols_order = [c for c in cols_order if c in full_df.columns] + [
        c for c in full_df.columns if c not in cols_order
    ]

    full_df = full_df[cols_order]

    out_name = "historical_mvp_data.csv"
    full_df.to_csv(out_name, index=False)
    print(f"\nSaved {len(full_df)} rows to {out_name}")


if __name__ == "__main__":
    main()



=== Building data for 2010-11 (labeled=True) ===
  - Fetching player stats (nba_api) for 2010-11
  - Fetching team standings (nba_api) for 2010-11

=== Building data for 2011-12 (labeled=True) ===
  - Fetching player stats (nba_api) for 2011-12
  - Fetching team standings (nba_api) for 2011-12

=== Building data for 2012-13 (labeled=True) ===
  - Fetching player stats (nba_api) for 2012-13
  - Fetching team standings (nba_api) for 2012-13

=== Building data for 2013-14 (labeled=True) ===
  - Fetching player stats (nba_api) for 2013-14
  - Fetching team standings (nba_api) for 2013-14

=== Building data for 2014-15 (labeled=True) ===
  - Fetching player stats (nba_api) for 2014-15
  - Fetching team standings (nba_api) for 2014-15

=== Building data for 2015-16 (labeled=True) ===
  - Fetching player stats (nba_api) for 2015-16
  - Fetching team standings (nba_api) for 2015-16

=== Building data for 2016-17 (labeled=True) ===
  - Fetching player stats (nba_api) for 2016-17
  - Fetching t

In [40]:
import pandas as pd

df = pd.read_csv("historical_mvp_data.csv")


# Helper: set MVP = 1 for the player whose name contains a substring
def fix_mvp_for_season(df, season, name_substring):
    mask_season = df["season"] == season
    # reset any previous labels (just in case)
    df.loc[mask_season, "mvp"] = 0

    # find any player whose name contains the substring (case insensitive)
    mask_player = mask_season & df["PLAYER_NAME"].str.contains(
        name_substring, case=False, na=False
    )

    matches = df.loc[mask_player, "PLAYER_NAME"].unique()
    print(f"{season}: matching players for '{name_substring}': {matches}")

    df.loc[mask_player, "mvp"] = 1
    return df


# Fix Jokić seasons
df = fix_mvp_for_season(df, "2020-21", "jokic")
df = fix_mvp_for_season(df, "2021-22", "jokic")
df = fix_mvp_for_season(
    df, "2023-24", "jokic"
)  # adjust if that season’s MVP changes later

df.to_csv("historical_mvp_data.csv", index=False)
print("Patched historical_mvp_data.csv with correct Jokić MVP labels.")


2020-21: matching players for 'jokic': []
2021-22: matching players for 'jokic': []
2023-24: matching players for 'jokic': []
Patched historical_mvp_data.csv with correct Jokić MVP labels.


In [41]:
import pandas as pd

df = pd.read_csv("historical_mvp_data.csv")

# some seasons will have NaNs for conference / wins on players if something glitchy,
# so guard against that
df["wins"] = df["wins"].fillna(0)
df["conference"] = df["conference"].fillna("Unknown")

# compute conference rank per season & conference based on team wins (descending)
df["conf_rank"] = (
    df.groupby(["season", "conference"])["wins"]
    .rank(method="dense", ascending=False)
    .astype(int)
)

df.to_csv("historical_mvp_data.csv", index=False)
print("Added conf_rank column based on wins within each conference.")


Added conf_rank column based on wins within each conference.


In [42]:
CSV_PATH = "historical_mvp_data.csv"
MODEL_PATH = "mvp_model_rf.pkl"

# 1. Load data
df = pd.read_csv(CSV_PATH)

# 2. Use only labeled rows (mvp = 0 or 1)
train_df = df[df["mvp"].isin([0, 1])].copy()

# 3. Features
FEATURE_COLS = [
    "ppg",
    "rpg",
    "apg",
    "plus_minus",
    "team_wins",
    "conf_rank",
    "win_pct",
    "minutes_per_game",
    "games_played",
]

# Drop rows missing any of these
train_df = train_df.dropna(subset=FEATURE_COLS + ["mvp"])

X = train_df[FEATURE_COLS]
y = train_df["mvp"]

# 4. Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. RandomForest model
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    class_weight="balanced",  # helps with the big class imbalance
    n_jobs=-1,  # use all cores
)

model.fit(X_train, y_train)

# 6. Eval
val_pred_proba = model.predict_proba(X_val)[:, 1]
val_pred = (val_pred_proba > 0.5).astype(int)

print("Validation AUC:", roc_auc_score(y_val, val_pred_proba))
print(classification_report(y_val, val_pred))

# 7. Save model + feature list
joblib.dump({"model": model, "feature_cols": FEATURE_COLS}, MODEL_PATH)
print(f"Saved RandomForest MVP model to {MODEL_PATH}")


Validation AUC: 0.7440807799442898
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1436
         1.0       0.00      0.00      0.00         2

    accuracy                           1.00      1438
   macro avg       0.50      0.50      0.50      1438
weighted avg       1.00      1.00      1.00      1438

Saved RandomForest MVP model to mvp_model_rf.pkl


In [43]:
import time
import datetime as dt
import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandingsv3
from nba_api.stats.library.parameters import SeasonTypeAllStar
import joblib

MODEL_PATH = "mvp_model_rf.pkl"

# Set to whatever season you're evaluating, e.g. "2024-25"
CURRENT_SEASON = "2025-26"

MIN_PPG = 25.0
MAX_CONF_RANK = 3
TOP_K = 10

# ---------- Data fetchers ----------


def get_current_player_stats(season: str) -> pd.DataFrame:
    resp = leaguedashplayerstats.LeagueDashPlayerStats(
        season=season,
        season_type_all_star=SeasonTypeAllStar.regular,
        per_mode_detailed="PerGame",
    )
    df = resp.get_data_frames()[0]

    keep = [
        "PLAYER_ID",
        "PLAYER_NAME",
        "TEAM_ID",
        "TEAM_ABBREVIATION",
        "GP",
        "MIN",
        "PTS",
        "REB",
        "AST",
        "PLUS_MINUS",
        "STL",
        "BLK",
        "TOV",
        "FG_PCT",
        "FG3_PCT",
        "FT_PCT",
    ]
    keep = [c for c in keep if c in df.columns]
    df = df[keep].copy()

    rename_map = {
        "PTS": "ppg",
        "REB": "rpg",
        "AST": "apg",
        "PLUS_MINUS": "plus_minus",
        "STL": "spg",
        "BLK": "bpg",
        "TOV": "tov",
        "FG_PCT": "fg_pct",
        "FG3_PCT": "fg3_pct",
        "FT_PCT": "ft_pct",
    }
    rename_map = {k: v for k, v in rename_map.items() if k in df.columns}
    df.rename(columns=rename_map, inplace=True)

    if "GP" in df.columns:
        df["games_played"] = df["GP"]
    else:
        df["games_played"] = None

    if "MIN" in df.columns:
        df["minutes_per_game"] = df["MIN"]
    else:
        df["minutes_per_game"] = None

    return df


def get_current_team_standings(season: str) -> pd.DataFrame:
    resp = leaguestandingsv3.LeagueStandingsV3(season=season)
    df = resp.get_data_frames()[0]

    cols = ["TeamID", "Conference", "WINS", "LOSSES", "WinPCT"]
    cols = [c for c in cols if c in df.columns]
    df = df[cols].copy()

    df.rename(
        columns={
            "TeamID": "TEAM_ID",
            "Conference": "conference",
            "WINS": "wins",
            "LOSSES": "losses",
            "WinPCT": "win_pct",
        },
        inplace=True,
    )

    df["wins"] = df["wins"].fillna(0)
    df["conference"] = df["conference"].fillna("Unknown")

    # derive conf_rank from wins within each conference
    df["conf_rank"] = (
        df.groupby("conference")["wins"]
        .rank(method="dense", ascending=False)
        .astype(int)
    )

    df["team_wins"] = df["wins"]
    return df


def build_current_features(season: str) -> pd.DataFrame:
    players = get_current_player_stats(season)
    time.sleep(1)
    teams = get_current_team_standings(season)
    time.sleep(1)

    df = players.merge(
        teams[
            [
                "TEAM_ID",
                "conference",
                "conf_rank",
                "wins",
                "losses",
                "win_pct",
                "team_wins",
            ]
        ],
        on="TEAM_ID",
        how="left",
        validate="m:1",
    )
    return df


# ---------- MVP ranking ----------


def daily_mvp_ranking(
    season: str = CURRENT_SEASON,
    min_ppg: float = MIN_PPG,
    max_conf_rank: int = MAX_CONF_RANK,
    top_k: int = TOP_K,
):
    obj = joblib.load(MODEL_PATH)
    model = obj["model"]
    feature_cols = obj["feature_cols"]

    df = build_current_features(season)

    # Apply your constraints
    df = df.copy()
    df = df[(df["ppg"] >= min_ppg) & (df["conf_rank"] <= max_conf_rank)]

    if df.empty:
        print("No players satisfy MVP constraints today.")
        return df

    # Drop rows missing any of the features the model expects
    df = df.dropna(subset=feature_cols)
    if df.empty:
        print("No players have full feature data today after dropping NaNs.")
        return df

    X = df[feature_cols]
    df["mvp_probability"] = model.predict_proba(X)[:, 1]

    df = df.sort_values("mvp_probability", ascending=False)

    # Print top_k
    print(
        df[
            [
                "PLAYER_NAME",
                "TEAM_ABBREVIATION",
                "conference",
                "conf_rank",
                "ppg",
                "rpg",
                "apg",
                "mvp_probability",
            ]
        ].head(top_k)
    )

    today = dt.date.today().isoformat()
    out_name = f"mvp_daily_{today}.csv"
    df.to_csv(out_name, index=False)
    print("Saved daily MVP ranking to", out_name)

    return df


if __name__ == "__main__":
    daily_mvp_ranking()


                 PLAYER_NAME TEAM_ABBREVIATION conference  conf_rank   ppg  \
423  Shai Gilgeous-Alexander               OKC       West          1  32.9   
30             Austin Reaves               LAL       West          2  28.5   
363             Nikola Jokić               DEN       West          2  28.9   
59           Cade Cunningham               DET       East          1  28.8   
318              Luka Dončić               LAL       West          2  35.1   
467        Victor Wembanyama               SAS       West          3  26.2   
459              Tyler Herro               MIA       East          3  25.7   

      rpg   apg  mvp_probability  
423   4.9   6.7         0.279988  
30    5.9   6.7         0.233977  
363  12.4  10.9         0.210000  
59    6.4   9.4         0.207981  
318   8.5   9.4         0.103998  
467  12.9   4.0         0.055996  
459   5.3   3.0         0.000000  
Saved daily MVP ranking to mvp_daily_2025-11-30.csv
