# NCAA Men’s Basketball Wins Prediction (Portfolio Version)

This notebook is a clean, reproducible upgrade of the original project. It loads the provided team-season dataset, evaluates simple, interpretable models with season-based testing (a latest-season holdout plus rolling-year splits), and saves the results and figures used in the README.


In [1]:
from pathlib import Path
import sys

def find_project_root(start: Path, marker: str = "src") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook imports from ``src.run_pipeline``, so the directory holding
    ``src/`` is treated as the project root. Searching upward keeps the paths
    correct whether the kernel starts in the repo root or in a sub-folder.

    Parameters
    ----------
    start : Path
        Directory to begin the search from (normally the working directory).
    marker : str
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    Path
        The first matching directory, or ``start.parent`` when no ancestor
        contains ``marker`` (the one-level-up assumption used previously).
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


# Project paths
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data"
FIG_DIR = PROJECT_ROOT / "figures"
RESULTS_DIR = PROJECT_ROOT / "results"

# Make src importable (guarded so re-running the cell does not add duplicates)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from src.run_pipeline import (
    load_data,
    prepare_dataset,
    compute_vif,
    run_evaluations,
    choose_model,
    build_models,
    FEATURES,
    TARGET,
    plot_predicted_vs_actual,
    plot_residuals_vs_predicted,
    plot_residuals_hist,
    plot_coefficients,
)


## Load and audit data

In [3]:
# Load the raw table, then run the project's preparation step on it
df_raw = load_data()
df = prepare_dataset(df_raw)

print("Rows, columns:", df.shape)

# Sort the distinct seasons once and convert them to built-in ints so the
# printout reads [2013, 2014, ...] instead of [np.int64(2013), ...].
years = sorted(df['YEAR'].unique().tolist())
print("Years:", years[:3], "...", years[-3:])

# Missingness snapshot: share of missing values per column, largest first
missing = df.isna().mean().sort_values(ascending=False)
missing.head(10)


Rows, columns: (3876, 25)
Years: [np.int64(2013), np.int64(2014), np.int64(2015)] ... [np.int64(2021), np.int64(2022), np.int64(2023)]


RK            0.908927
SEED          0.824561
POSTSEASON    0.824561
FTR           0.000000
YEAR          0.000000
WAB           0.000000
ADJ_T         0.000000
3P_D          0.000000
3P_O          0.000000
2P_D          0.000000
dtype: float64

In [4]:
# Display the dtype of every column in the prepared frame as a quick schema check
df.dtypes

TEAM           object
CONF           object
G               int64
W               int64
ADJOE         float64
ADJDE         float64
BARTHAG       float64
EFG_O         float64
EFG_D         float64
TOR           float64
TORD          float64
ORB           float64
DRB           float64
FTR           float64
FTRD          float64
2P_O          float64
2P_D          float64
3P_O          float64
3P_D          float64
ADJ_T         float64
WAB           float64
POSTSEASON     object
SEED          float64
YEAR            int64
RK            float64
dtype: object

## Feature correlation + multicollinearity (VIF)

In [5]:
# Pairwise correlations over every model feature plus the target column
corr_columns = FEATURES + [TARGET]
corr_matrix = df[corr_columns].corr()

# Save for README/portfolio (create the results folder on first use)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
corr_path = RESULTS_DIR / "feature_correlations.csv"
corr_matrix.to_csv(corr_path, index=True)

corr_matrix

Unnamed: 0,ADJOE,ADJDE,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,W
ADJOE,1.0,-0.491175,0.721126,-0.308806,-0.591426,-0.121494,0.261225,-0.247823,0.095045,-0.306348,0.632547,-0.303095,0.55949,-0.188022,0.046715,0.732526
ADJDE,-0.491175,1.0,-0.194581,0.78896,0.169334,-0.276782,-0.28286,0.329995,-0.074296,0.174824,-0.235314,0.712163,-0.065039,0.575847,0.194961,-0.653999
EFG_O,0.721126,-0.194581,1.0,-0.107692,-0.366312,-0.159164,-0.134808,-0.319192,-0.061385,-0.358698,0.889703,-0.091683,0.75028,-0.09262,0.093646,0.593221
EFG_D,-0.308806,0.78896,-0.107692,1.0,0.051735,-0.064906,-0.353358,0.159623,-0.182624,0.070364,-0.134469,0.902571,-0.035662,0.720204,0.228029,-0.588539
TOR,-0.591426,0.169334,-0.366312,0.051735,1.0,0.153941,0.172986,0.222977,0.167245,0.284051,-0.29811,0.045888,-0.311475,0.040639,-0.061747,-0.431768
TORD,-0.121494,-0.276782,-0.159164,-0.064906,0.153941,1.0,0.156662,0.298114,0.11292,0.354446,-0.09735,-0.019256,-0.177714,-0.11893,-0.024687,0.137953
ORB,0.261225,-0.28286,-0.134808,-0.353358,0.172986,0.156662,1.0,0.095112,0.335231,0.206984,-0.102869,-0.369493,-0.10475,-0.170655,-0.081068,0.309732
DRB,-0.247823,0.329995,-0.319192,0.159623,0.222977,0.298114,0.095112,1.0,0.181108,0.310466,-0.323907,0.154299,-0.179347,0.113283,0.055509,-0.321245
FTR,0.095045,-0.074296,-0.061385,-0.182624,0.167245,0.11292,0.335231,0.181108,1.0,0.340134,-0.064991,-0.216867,-0.012877,-0.035174,0.04511,0.152017
FTRD,-0.306348,0.174824,-0.358698,0.070364,0.284051,0.354446,0.206984,0.310466,0.340134,1.0,-0.378258,0.02928,-0.180118,0.119331,0.014166,-0.255197


In [6]:
# Variance inflation factors for the feature columns, via the project helper
feature_frame = df[FEATURES]
vif_df = compute_vif(feature_frame)

# Persist the full table; show only the first ten rows inline
vif_path = RESULTS_DIR / "vif.csv"
vif_df.to_csv(vif_path, index=False)
vif_df.head(10)

Unnamed: 0,feature,vif
3,EFG_D,237.234665
2,EFG_O,152.122473
11,2P_D,126.698534
10,2P_O,74.036895
13,3P_D,49.429191
12,3P_O,35.078741
0,ADJOE,11.614078
1,ADJDE,9.620141
4,TOR,2.880881
5,TORD,2.755916


## Evaluation (season holdout + rolling years)

In [7]:
# run_evaluations returns four objects: the full metrics table, the holdout
# metrics, the holdout predictions, and the holdout frame.
evaluation_outputs = run_evaluations(df)
metrics_df, holdout_metrics, holdout_preds, holdout_df = evaluation_outputs

# Persist every metric row; preview the first ten inline
metrics_path = RESULTS_DIR / "metrics.csv"
metrics_df.to_csv(metrics_path, index=False)
metrics_df.head(10)

Unnamed: 0,model_name,evaluation_method,R2,MAE,RMSE
0,BaselineMean,SeasonHoldout_2023,-0.037571,5.082936,6.356769
1,Baseline_ADJOE_minus_ADJDE,SeasonHoldout_2023,0.606848,3.136,3.912983
2,LinearRegression,SeasonHoldout_2023,0.807093,2.161385,2.74095
3,RidgeCV,SeasonHoldout_2023,0.807281,2.159989,2.739619
4,LassoCV,SeasonHoldout_2023,0.807869,2.155424,2.735432
5,BaselineMean,Rolling_2019,-0.023132,5.171116,6.439165
6,Baseline_ADJOE_minus_ADJDE,Rolling_2019,0.628697,3.083723,3.87907
7,LinearRegression,Rolling_2019,0.845595,1.999396,2.501463
8,RidgeCV,Rolling_2019,0.845747,1.998226,2.500234
9,LassoCV,Rolling_2019,0.845164,1.99785,2.504956


In [8]:
# Choose the simplest model that generalizes best on the season holdout.
# choose_model is given only the holdout metrics; its return value is used
# later as the key into the mapping returned by build_models().
# NOTE(review): in the saved metrics LassoCV scores marginally better than
# LinearRegression on the holdout, yet LinearRegression is selected —
# presumably choose_model prefers simpler models within some tolerance;
# confirm the rule in src.run_pipeline.
chosen_model_name = choose_model(holdout_metrics)
chosen_model_name

'LinearRegression'

## Fit chosen model on holdout split and generate plots

In [9]:
# Rebuild the season-holdout split: the most recent season in the holdout frame
# is the test set, and every earlier season is used for training.
latest_year = holdout_df['YEAR'].max()
season_col = df['YEAR']
train_df = df[season_col < latest_year]
test_df = df[season_col == latest_year]

X_train, y_train = train_df[FEATURES], train_df[TARGET]
X_test, y_test = test_df[FEATURES], test_df[TARGET]

# Look up the chosen estimator by name, fit on the training seasons, and
# predict the held-out season.
models = build_models()
chosen_model = models[chosen_model_name]
chosen_model.fit(X_train, y_train)
preds = chosen_model.predict(X_test)

# Save plots
FIG_DIR.mkdir(parents=True, exist_ok=True)
holdout_tag = f"(Holdout {latest_year})"

plot_predicted_vs_actual(
    y_test,
    preds,
    FIG_DIR / "predicted_vs_actual_holdout.png",
    title=f"Predicted vs Actual Wins {holdout_tag}",
)

# Sign convention used throughout: predicted minus actual, so a positive
# residual means the model over-predicted wins.
residuals = preds - y_test.values

plot_residuals_vs_predicted(
    preds,
    residuals,
    FIG_DIR / "residuals_vs_predicted_holdout.png",
    title=f"Residuals vs Predicted {holdout_tag}",
)

plot_residuals_hist(
    residuals,
    FIG_DIR / "residuals_hist_holdout.png",
    title=f"Residuals Distribution {holdout_tag}",
)

plot_coefficients(
    chosen_model,
    FEATURES,
    FIG_DIR / "standardized_coefficients.png",
    title=f"Standardized Coefficients ({chosen_model_name})",
)


In [10]:
# Top errors for holdout season
n_each = 10  # rows kept from each end of the error ranking

# One row per holdout team; error = predicted - actual (positive = over-predicted)
errors_df = pd.DataFrame({
    "team": test_df["TEAM"].values,
    "season": test_df["YEAR"].values,
    "actual_wins": y_test.values,
    "predicted_wins": preds,
}).assign(error=lambda frame: frame["predicted_wins"] - frame["actual_wins"])

# Largest over-predictions first, then largest under-predictions
over = errors_df.sort_values("error", ascending=False).head(n_each)
under = errors_df.sort_values("error", ascending=True).head(n_each)
top_errors = pd.concat([over, under], ignore_index=True)

top_errors_path = RESULTS_DIR / "top_errors.csv"
top_errors.to_csv(top_errors_path, index=False)
top_errors.head(2 * n_each)

Unnamed: 0,team,season,actual_wins,predicted_wins,error
0,Texas Tech,2023,16,21.028151,5.028151
1,New Mexico St.,2023,9,13.784239,4.784239
2,Stanford,2023,14,18.590629,4.590629
3,Wyoming,2023,9,13.338869,4.338869
4,Ohio St.,2023,16,20.158416,4.158416
5,Harvard,2023,14,17.719105,3.719105
6,Pepperdine,2023,9,12.498471,3.498471
7,Tennessee,2023,25,28.455313,3.455313
8,Rutgers,2023,19,22.355497,3.355497
9,Mississippi,2023,12,15.255334,3.255334


## Holdout summary

In [11]:
# Headline metrics for the chosen model on the held-out season, computed from
# the predictions made when the model was refit above. The sklearn metric
# functions (r2_score, mean_absolute_error, mean_squared_error) come from the
# notebook's import cell, so all dependencies are declared in one place.
holdout_mse = mean_squared_error(y_test, preds)

# Cast to built-in int/float so the one-row summary holds plain Python values
summary = {
    "chosen_model": chosen_model_name,
    "holdout_year": int(latest_year),
    "holdout_R2": float(r2_score(y_test, preds)),
    "holdout_MAE": float(mean_absolute_error(y_test, preds)),
    "holdout_RMSE": float(np.sqrt(holdout_mse)),  # RMSE = sqrt(MSE)
}

summary_df = pd.DataFrame([summary])
summary_df.to_csv(RESULTS_DIR / "holdout_summary.csv", index=False)
summary_df

Unnamed: 0,chosen_model,holdout_year,holdout_R2,holdout_MAE,holdout_RMSE
0,LinearRegression,2023,0.807093,2.161385,2.74095
