# Season-Scale Momentum Analysis

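Build a game-level dataset from a sample of one season's games, quantify how often the last lead change is decisive and how well late-game momentum aligns with the winner, and evaluate the predictive lift that momentum features add over a score-only baseline.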

In [1]:
# Put the directory above the current working directory on sys.path so that
# project packages (e.g. `scripts`) can be imported from this notebook.
import os
import pathlib
import sys

# Parent of the CWD, normalised to an absolute path.
PROJECT_ROOT = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Prepend only once so re-running the cell does not stack duplicate entries.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)

Project root: /Users/suhruth/Desktop/nba_momentum_factor


In [2]:
from scripts.momentum_utils import get_season_game_ids, build_season_dataset
import pandas as pd, numpy as np
from pathlib import Path
# Which slice of the schedule to analyse.
SEASON, SEASON_TYPE = "2023-24", "Regular Season"

# Output folder, relative to the notebook's working directory. Created up
# front so later cells can write into it without checking.
DATA_DIR = Path("../data")
DATA_DIR.mkdir(exist_ok=True, parents=True)

## Step 6: Season Dataset Assembly

In [3]:
# Look up the game IDs for the configured SEASON / SEASON_TYPE via the project
# helper imported from scripts.momentum_utils.
# NOTE(review): presumably this performs a remote stats query — confirm in the helper.
game_ids = get_season_game_ids(SEASON, SEASON_TYPE)
# Cell output: how many IDs came back, plus the first five as a spot check.
len(game_ids), game_ids[:5]

(1230, ['0022300061', '0022300062', '0022300063', '0022300065', '0022300067'])

In [4]:
# Assemble the per-game feature table for a 200-game sample and persist it.
# NOTE(review): sleep_s=0.7 presumably throttles per-game requests, which would
# make this the slow cell of the notebook — confirm in scripts.momentum_utils.
season_df = build_season_dataset(game_ids, sample_size=200, sleep_s=0.7)

# "2023-24" -> "202324" so the season can be embedded in the file name.
season_tag = SEASON.replace("-", "")
SEASON_PATH = DATA_DIR / ("season_" + season_tag + "_momentum_sample.csv")
season_df.to_csv(SEASON_PATH, index=False)

n_games_saved = len(season_df)
print("Saved season dataset →", SEASON_PATH, "| rows =", n_games_saved)

# NOTE(review): the preview shows game_id without the leading zeros present in
# game_ids (e.g. 22300061 vs '0022300061') — check whether the helper casts it to int.
season_df.head()

Processed 25/200 games
Processed 50/200 games
Processed 75/200 games
Processed 100/200 games
Processed 125/200 games
Processed 150/200 games
Processed 175/200 games
Processed 200/200 games
Saved season dataset → ../data/season_202324_momentum_sample.csv | rows = 200


Unnamed: 0,game_id,home_win,final_margin,ot_game,diff_halftime,diff_q4start,diff_at_2min,momentum_l2_mean,max_run_points,avg_run_points,stability_invvar,last_lead_change_time,decisive_last_lead_change
0,22300061,0,12.0,0,-9.0,-7.0,-12.0,-12.98,13.0,2.947368,0.042136,186.0,True
1,22300062,1,4.0,0,15.0,-6.0,1.0,1.925,11.0,2.957746,0.031197,2589.0,True
2,22300063,0,6.0,0,1.0,-5.0,-4.0,-3.916667,7.0,2.461538,0.046531,2621.0,True
3,22300065,1,4.0,0,5.0,9.0,0.0,0.0125,8.0,2.413793,0.049982,2791.0,True
4,22300067,1,1.0,0,0.0,3.0,0.0,-2.983333,10.0,2.986667,0.050603,2868.0,True


### Step 6.4 — Export for Tableau
Export a table with one row per game, including the helper columns used by the dashboard.

In [8]:
# Exports the exact schema the Tableau dashboard expects: one row per game,
# with the column names and order listed in `cols` below.
import numpy as np
import pandas as pd

# Elapsed-seconds value of the 5:00-remaining mark in Q4 (2880 s of regulation
# minus 300 s), named here instead of the bare literal 2580.
TABLEAU_CUTOFF_5MIN = 2880 - 300
# A game counts as a "near tie" when |score diff| at the 2:00 mark is <= this.
NEAR_TIE_MAX_DIFF = 2

out = season_df.copy()

# Helper columns for Tableau.
# A comparison against NaN evaluates to False, so a plain `.astype(int)` would
# export a game whose input is missing as a definite 0. Use the nullable Int64
# dtype and blank those rows so the CSV carries an empty cell (null) instead.
last_flip_time = out["last_lead_change_time"]
out["decided_before_5min"] = (
    (last_flip_time <= TABLEAU_CUTOFF_5MIN).astype("Int64").mask(last_flip_time.isna())
)

# +1 / -1 direction of the late momentum; an exact 0 has no direction -> NaN.
out["late_momentum_sign"] = np.sign(out["momentum_l2_mean"]).replace(0, np.nan)

# +1 when the home team won, -1 otherwise (same sign convention as above).
out["winner_sign"] = np.where(out["home_win"] == 1, 1, -1).astype(int)

diff_2min = out["diff_at_2min"]
out["near_tie_flag"] = (
    (diff_2min.abs() <= NEAR_TIE_MAX_DIFF).astype("Int64").mask(diff_2min.isna())
)

cols = [
    "game_id", "home_win", "final_margin", "ot_game",
    "last_lead_change_time", "decisive_last_lead_change",
    "diff_q4start", "diff_at_2min", "momentum_l2_mean",
    "max_run_points", "stability_invvar",
    "decided_before_5min", "late_momentum_sign", "winner_sign", "near_tie_flag",
]
out = out[cols]

# Write next to the other artifacts via DATA_DIR rather than a second
# hard-coded copy of the folder; kept as a str, as before.
path = str(DATA_DIR / "tableau_momentum_export.csv")
out.to_csv(path, index=False)
print("Saved →", path, "| rows =", len(out))
out.head()

Saved → ../data/tableau_momentum_export.csv | rows = 200


Unnamed: 0,game_id,home_win,final_margin,ot_game,last_lead_change_time,decisive_last_lead_change,diff_q4start,diff_at_2min,momentum_l2_mean,max_run_points,stability_invvar,decided_before_5min,late_momentum_sign,winner_sign,near_tie_flag
0,22300061,0,12.0,0,186.0,True,-7.0,-12.0,-12.98,13.0,0.042136,1,-1.0,-1,0
1,22300062,1,4.0,0,2589.0,True,-6.0,1.0,1.925,11.0,0.031197,0,1.0,1,1
2,22300063,0,6.0,0,2621.0,True,-5.0,-4.0,-3.916667,7.0,0.046531,0,-1.0,-1,0
3,22300065,1,4.0,0,2791.0,True,9.0,0.0,0.0125,8.0,0.049982,0,1.0,1,1
4,22300067,1,1.0,0,2868.0,True,3.0,0.0,-2.983333,10.0,0.050603,0,-1.0,1,1


## Step 7: Momentum Decisiveness & Alignment

In [9]:
# Times are compared in elapsed seconds: Q4_END marks the end of regulation and
# CUTOFF_5MIN the point with 5:00 (300 s) left in Q4.
Q4_END = 2880
CUTOFF_5MIN = Q4_END - 300

df_decisive = season_df.copy()

# Only games where both the last-lead-change time and its decisiveness flag are
# known can be classified. The mask is taken on the RAW time column: the
# comparison below returns False (never NaN) for a missing time, so testing
# `.notna()` on the comparison result would keep every row and silently count
# unknown games as "after 5:00".
mask_known = (
    df_decisive["last_lead_change_time"].notna()
    & df_decisive["decisive_last_lead_change"].notna()
)
df_decisive["last_flip_before_5min"] = df_decisive["last_lead_change_time"] <= CUTOFF_5MIN
subset = df_decisive[mask_known]

# Cast explicitly: a flag column that ever held missing values is object dtype,
# and `&` / `~` should operate on real booleans.
flip_before_cutoff = subset["last_flip_before_5min"].astype(bool)
flip_was_decisive = subset["decisive_last_lead_change"].astype(bool)

# Share of classifiable games whose last lead change came before the cutoff AND was decisive.
rate_decisive = (flip_before_cutoff & flip_was_decisive).mean()
# Share of classifiable games whose last lead change came after the cutoff.
rate_late_flips = (~flip_before_cutoff).mean()

print(f"Share 'decided' by last lead change before 5:00: {rate_decisive:.3f}")
print(f"Share last lead change after 5:00: {rate_late_flips:.3f}")

Share 'decided' by last lead change before 5:00: 0.745
Share last lead change after 5:00: 0.255


In [10]:
def _late_momentum_sign(frame):
    """Return the sign of `momentum_l2_mean` per row; exact zeros become NaN."""
    return np.sign(frame["momentum_l2_mean"].replace(0, np.nan))


# --- All games: does the late-momentum direction match the eventual winner? ---
# Sign convention: positive = home, negative = away (winner_sign is +1 for a home win).
align_df = (
    season_df.copy()
    .assign(
        late_momentum_sign=_late_momentum_sign,
        winner_sign=lambda games: np.where(games["home_win"] == 1, 1, -1),
    )
    .dropna(subset=["late_momentum_sign"])  # games with no momentum direction are excluded
)
alignment_rate = align_df["late_momentum_sign"].eq(align_df["winner_sign"]).mean()
print(f"Alignment (final 2:00 momentum sign vs winner): {alignment_rate:.3f}")

# --- Near-tie games only: within 2 points at the 2:00 mark ---
within_two = season_df["diff_at_2min"].abs() <= 2
near_tie = (
    season_df[within_two]
    .assign(late_momentum_sign=_late_momentum_sign)
    .dropna(subset=["late_momentum_sign"])
)

# Split by which side the momentum favoured and ask how often that side won.
home_favoured = near_tie.loc[near_tie["late_momentum_sign"] > 0, "home_win"]
away_favoured = near_tie.loc[near_tie["late_momentum_sign"] < 0, "home_win"]
win_rate_when_mom_home = home_favoured.eq(1).mean()
win_rate_when_mom_away = away_favoured.eq(0).mean()

print(f"Near-tie — win rate when late momentum favors home: {win_rate_when_mom_home:.3f}")
print(f"Near-tie — win rate when late momentum favors away: {win_rate_when_mom_away:.3f}")

Alignment (final 2:00 momentum sign vs winner): 0.890
Near-tie — win rate when late momentum favors home: 0.731
Near-tie — win rate when late momentum favors away: 0.714


## Step 8: Predictive Lift (Baseline vs +Momentum)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Compare a score-state baseline against the same model with momentum features
# added, using ROC AUC on one stratified hold-out split.
#
# NOTE(review): `ot_game` is only known once regulation has ended, and
# `momentum_l2_mean` is described elsewhere in this notebook as final-2:00
# momentum, so both appear to post-date the `diff_at_2min` snapshot. If the
# intended prediction time is the 2:00 mark, these AUCs measure association
# rather than a usable forecast — confirm.
# NOTE(review): with a 25% hold-out of a few hundred games the test set is
# small; a ΔAUC of around 0.01 is likely within split-to-split noise. Consider
# cross-validation before drawing conclusions.
baseline_feats = ["diff_q4start", "diff_at_2min", "ot_game"]
momentum_feats = ["momentum_l2_mean", "max_run_points", "stability_invvar"]
use_cols = ["home_win"] + baseline_feats + momentum_feats

# Keep only games where the target and every feature are present.
model_df = season_df.copy()
model_df = model_df.dropna(subset=use_cols)

X_full = model_df[baseline_feats + momentum_feats]
X_base = model_df[baseline_feats]
y = model_df["home_win"].astype(int)

# Split ONCE and derive the baseline matrices by column selection. Both models
# then see exactly the same rows and share y_train / y_test by construction,
# instead of relying on two separate calls reproducing the same shuffle.
Xf_train, Xf_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.25, random_state=42, stratify=y
)
Xb_train, Xb_test = Xf_train[baseline_feats], Xf_test[baseline_feats]


def _make_pipe():
    """Return a fresh, unfitted scaler + logistic-regression pipeline."""
    return Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=200))])


# Separate estimator per feature set so the fitted baseline is not overwritten.
pipe_base = _make_pipe().fit(Xb_train, y_train)
pred_base = pipe_base.predict_proba(Xb_test)[:, 1]
auc_base = roc_auc_score(y_test, pred_base)

pipe_full = _make_pipe().fit(Xf_train, y_train)
pred_full = pipe_full.predict_proba(Xf_test)[:, 1]
auc_full = roc_auc_score(y_test, pred_full)

# Backward compatibility: `pipe` previously ended the cell fitted on the full feature set.
pipe = pipe_full

print(f"AUC — Baseline: {auc_base:.3f}")
print(f"AUC — +Momentum: {auc_full:.3f}")
print(f"Δ AUC: {auc_full - auc_base:+.3f}")

AUC — Baseline: 0.950
AUC — +Momentum: 0.958
Δ AUC: +0.008
