# NCAA Men’s Basketball Wins Prediction (Portfolio Version)

This notebook is a clean, reproducible upgrade of the original project. It loads the provided team‑season dataset, evaluates simple and interpretable models with season‑based testing, and saves results/figures for the README.


In [None]:
from pathlib import Path
import sys

def find_project_root(start: Path, marker: str = "src") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook imports ``src.run_pipeline``, so the project root is taken to
    be the closest folder holding a ``src`` directory. This works whether the
    kernel starts in the notebook's own folder or in the project root.

    Parameters
    ----------
    start : Path
        Directory to begin the upward search from (normally ``Path.cwd()``).
    marker : str
        Name of the directory that identifies the project root.

    Returns
    -------
    Path
        The first matching directory, or ``start.parent`` when no ancestor
        contains ``marker`` (the previous hard-coded behaviour).
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


# Project paths
PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data"
FIG_DIR = PROJECT_ROOT / "figures"
RESULTS_DIR = PROJECT_ROOT / "results"

# Make src importable (guarded so re-running this cell does not add duplicates)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from src.run_pipeline import (
    load_data,
    prepare_dataset,
    compute_vif,
    run_evaluations,
    choose_model,
    build_models,
    FEATURES,
    TARGET,
    plot_predicted_vs_actual,
    plot_residuals_vs_predicted,
    plot_residuals_hist,
    plot_coefficients,
    plot_learning_curve,
)


## Load and audit data

In [None]:
# Load the dataset, then pass it through prepare_dataset to get the working frame.
df_raw = load_data()
df = prepare_dataset(df_raw)

# Quick audit: table size, plus the first and last three seasons present.
print("Rows, columns:", df.shape)
years_sorted = sorted(df['YEAR'].unique())
print("Years:", years_sorted[:3], "...", years_sorted[-3:])

# Missingness snapshot: fraction of NaN per column, worst first (top ten shown).
missing = df.isna().mean().sort_values(ascending=False)
missing.head(10)


In [None]:
# Inspect the dtype of every column in the prepared frame.
df.dtypes

## Feature correlation + multicollinearity (VIF)

In [None]:
# Pairwise correlations across the model features and the target column.
corr_columns = FEATURES + [TARGET]
corr_matrix = df[corr_columns].corr()

# Save for README/portfolio (the results folder is created on first use).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
corr_path = RESULTS_DIR / "feature_correlations.csv"
corr_matrix.to_csv(corr_path, index=True)

corr_matrix

In [None]:
# Variance inflation factors for the feature columns, written to the results folder.
feature_frame = df[FEATURES]
vif_df = compute_vif(feature_frame)

vif_path = RESULTS_DIR / "vif.csv"
vif_df.to_csv(vif_path, index=False)

vif_df.head(10)

## Evaluation (season holdout + rolling years)

In [None]:
# run_evaluations returns four objects; unpack them under the names the later cells use.
evaluation_outputs = run_evaluations(df)
metrics_df, holdout_metrics, holdout_preds, holdout_df = evaluation_outputs

# Persist the metrics table, then preview its first rows.
metrics_path = RESULTS_DIR / "metrics.csv"
metrics_df.to_csv(metrics_path, index=False)
metrics_df.head(10)

In [None]:
# Choose the simplest model that generalizes best on the season holdout
# NOTE(review): the selection rule lives in src.run_pipeline.choose_model and is
# not visible here — confirm it matches the description above.
# The returned name is used later as a key into the dict from build_models().
chosen_model_name = choose_model(holdout_metrics)
chosen_model_name

## Fit chosen model on pre-holdout seasons and plot holdout-season results

In [None]:
# The holdout season is the most recent year in holdout_df; every earlier
# season in df is used for training.
latest_year = holdout_df['YEAR'].max()
season = df['YEAR']
train_df = df[season < latest_year]
test_df = df[season == latest_year]

X_train, y_train = train_df[FEATURES], train_df[TARGET]
X_test, y_test = test_df[FEATURES], test_df[TARGET]

# Look up the selected model among those returned by build_models(), fit it on
# the training seasons, and predict the holdout season.
models = build_models()
chosen_model = models[chosen_model_name]
chosen_model.fit(X_train, y_train)
preds = chosen_model.predict(X_test)

# Residuals here are predicted minus actual.
residuals = preds - y_test.values

# Save plots
FIG_DIR.mkdir(parents=True, exist_ok=True)

pred_vs_actual_path = FIG_DIR / "predicted_vs_actual_holdout.png"
plot_predicted_vs_actual(
    y_test,
    preds,
    pred_vs_actual_path,
    title=f"Predicted vs Actual Wins (Holdout {latest_year})",
)

resid_scatter_path = FIG_DIR / "residuals_vs_predicted_holdout.png"
plot_residuals_vs_predicted(
    preds,
    residuals,
    resid_scatter_path,
    title=f"Residuals vs Predicted (Holdout {latest_year})",
)

resid_hist_path = FIG_DIR / "residuals_hist_holdout.png"
plot_residuals_hist(
    residuals,
    resid_hist_path,
    title=f"Residuals Distribution (Holdout {latest_year})",
)

coef_path = FIG_DIR / "standardized_coefficients.png"
plot_coefficients(
    chosen_model,
    FEATURES,
    coef_path,
    title=f"Standardized Coefficients ({chosen_model_name})",
)


## Learning curve (underfitting/overfitting)

In [None]:
# Draw the learning curve for the chosen model; the helper saves the figure
# to the given path and returns a table that is stored next to the other results.
lc_figure_path = FIG_DIR / "learning_curve_mae.png"
lc_title = f"Learning Curve (MAE) - {chosen_model_name}"

lc_df = plot_learning_curve(
    chosen_model,
    df[FEATURES],
    df[TARGET],
    df["YEAR"],
    lc_figure_path,
    title=lc_title,
)

lc_csv_path = RESULTS_DIR / "learning_curve_mae.csv"
lc_df.to_csv(lc_csv_path, index=False)
lc_df

In [None]:
# Top errors for holdout season
errors_df = pd.DataFrame({
    "team": test_df["TEAM"].values,
    "season": test_df["YEAR"].values,
    "actual_wins": y_test.values,
    "predicted_wins": preds,
})
errors_df["error"] = errors_df["predicted_wins"] - errors_df["actual_wins"]

over = errors_df.sort_values("error", ascending=False).head(10)
under = errors_df.sort_values("error", ascending=True).head(10)
top_errors = pd.concat([over, under], ignore_index=True)

top_errors.to_csv(RESULTS_DIR / "top_errors.csv", index=False)
top_errors.head(20)

## Holdout summary

In [None]:
# Headline metrics for the chosen model on the holdout season, collected in a
# one-row table so they can be written straight to CSV.
# r2_score, mean_absolute_error and mean_squared_error are imported in the
# notebook's import cell rather than here, so all dependencies are visible up front.
summary = {
    "chosen_model": chosen_model_name,
    "holdout_year": int(latest_year),
    "holdout_R2": float(r2_score(y_test, preds)),
    "holdout_MAE": float(mean_absolute_error(y_test, preds)),
    # RMSE is the square root of the mean squared error.
    "holdout_RMSE": float(np.sqrt(mean_squared_error(y_test, preds))),
}

summary_df = pd.DataFrame([summary])
summary_df.to_csv(RESULTS_DIR / "holdout_summary.csv", index=False)
summary_df