In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Start by loading past Tournament results and my NCAAW probabilities without finals flipped and no clipping applied. Compute the win margins with the right signs (lower TeamID first).

In [None]:
# Historical tournament games, with a signed margin attached on load.
# The margin is expressed from the point of view of the team with the lower
# TeamID: np.sign(LTeamID - WTeamID) is +1 when the winner has the lower ID
# and -1 when the loser does.
data_dir = Path("/kaggle/input/ncaaw-march-mania-2021-spread/WDataFiles_Stage2_Spread/")
tourney_results = (
    pd.read_csv(data_dir / "WNCAATourneyDetailedResults.csv")
    .assign(
        Margin=lambda games: np.sign(games["LTeamID"] - games["WTeamID"])
        * (games["WScore"] - games["LScore"])
    )
)

# Stage-1 win probabilities from the main-competition submission.
prob_df = pd.read_csv("/kaggle/input/2021-ncaaw-solution-39th-place/submit_stage1_elo_tuned.csv")

Check that the margins and especially their signs look good.

In [None]:
# Show the first 20 games with only the columns needed to verify the margin sign by eye.
tourney_results.head(20)[["Season", "WTeamID", "LTeamID", "WScore", "LScore", "Margin"]]

Split the ID from the stage 1 main competition submission file to have Season, T1_TeamID, and T2_TeamID. We will later merge on these keys.

In [None]:
def get_year_t1_t2(ID):
    """Split an underscore-separated ID into a tuple of ints.

    Parameters
    ----------
    ID : str
        Underscore-separated identifier, e.g. ``"2021_3101_3102"``.

    Returns
    -------
    tuple of int
        One int per underscore-separated field, in order; for a three-field
        ID this is ``(year, team1, team2)``.

    Raises
    ------
    ValueError
        If any field cannot be parsed as an integer.
    """
    # Materialize a real tuple: a bare generator expression can only be
    # consumed once and supports neither indexing nor comparison.
    return tuple(int(x) for x in ID.split('_'))


# Split every ID ("<Season>_<T1_TeamID>_<T2_TeamID>") in one vectorized pass.
# Writing single cells with .loc inside an iterrows() loop is extremely slow
# on a full submission file; .str.split(expand=True) yields the same three
# integer columns directly.
id_parts = prob_df["ID"].str.split("_", expand=True).astype(int)
prob_df["Season"] = id_parts[0]
prob_df["T1_TeamID"] = id_parts[1]
prob_df["T2_TeamID"] = id_parts[2]
prob_df.head()

Also add T1/2_TeamID to the tournament results.

In [None]:
# Order the two team IDs of every game so that T1_TeamID is the lower one and
# T2_TeamID the higher one, matching the key layout of the submission IDs.
# A row-wise min/max replaces the per-row iterrows() loop; that loop also
# re-assigned Margin to its own value, a no-op that is dropped here.
team_ids = tourney_results[["WTeamID", "LTeamID"]]
tourney_results["T1_TeamID"] = team_ids.min(axis=1).astype(int)
tourney_results["T2_TeamID"] = team_ids.max(axis=1).astype(int)
tourney_results.head(10)

And then merge the stage1 submission with the tournament results.

In [None]:
# Join the stage-1 probabilities to the historical games on the team-ID pair.
# These merge keys are kept exactly as in the submitted solution; the notebook
# revisits this merge further down, so do not "tidy" it here.
merged = pd.merge(prob_df, tourney_results, on=["T1_TeamID", "T2_TeamID"])
merged.plot(x="Pred", y="Margin", kind="scatter");

Looks noisy but the trend is right. At this point I decided not to find a functional form or fit a spline but thought about taking a windowed average. How large a window? Well, that led me to using KNN and tuning k via GridSearchCV.

In [None]:
# Tune the number of neighbours k over 1..49 with GridSearchCV (default
# cross-validation and scoring), regressing Margin on the win probability.
knn = KNeighborsRegressor()
param_grid = dict(n_neighbors=np.arange(1, 50))
clf = GridSearchCV(knn, param_grid)
clf.fit(merged.Pred.to_numpy().reshape(-1, 1), merged.Margin)
# Show the selected k itself. `best_index_` is only the 0-based position in
# the parameter grid, which is off by one from k because the grid starts at 1.
clf.best_params_

And now use the best model with k=41 to predict. This is what I submitted and gave me first place.

In [None]:
# Convert the stage-2 win probabilities into margins with the tuned KNN model
# and write the result under the same "Pred" column name.
stage2_df = pd.read_csv("/kaggle/input/2021-ncaaw-solution-39th-place/elo_2021-04-05.csv")
# Double-bracket selection yields the (n_samples, 1) feature matrix directly.
pred_margin = clf.best_estimator_.predict(stage2_df[["Pred"]].to_numpy())
stage2_df["Pred"] = pred_margin
stage2_df.loc[:, ["ID", "Pred"]].to_csv(
    "/kaggle/working/elo_calibrated_margin_submitted.csv", index=False
)

Let's check this notebook against my final score.

In [None]:
# Score the predicted margins against the actual 2021 tournament margins.
stage2_result = pd.read_csv("/kaggle/input/2021-ncaaw-solution-39th-place/2021NCAAWTourneyMarginResults.csv")
pred_results_combined = stage2_result.merge(stage2_df, on="ID", how="left")
# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form keeps the
# cell runnable on both old and current versions with the same value.
final_rmse = np.sqrt(mean_squared_error(pred_results_combined.Margin, pred_results_combined.Pred))
print(f"Final score: {final_rmse}")

Notebook score and leaderboard match.

But wait? Have you noticed anything above? I didn't for a few days.

There are way too many tournament games in the merged frame. I forgot to merge on Season, too! I guess I am very lucky that Women's College Basketball is very stable over many years and the noise this mistake introduced is not catastrophic.

So, what would have happened had I noticed my error and corrected it before the submission deadline? Let's see and do the merge correctly:

In [None]:
# Join on the full game key: the team-ID pair plus the Season, so each
# prediction is matched only with the game from its own season.
merged = pd.merge(prob_df, tourney_results, on=["T1_TeamID", "T2_TeamID", "Season"])
merged.plot(x="Pred", y="Margin", kind="scatter");


This already looks much better and less noisy.

In [None]:
# Re-tune k over 1..49 on the current `merged` frame with GridSearchCV
# (default cross-validation and scoring).
knn = KNeighborsRegressor()
param_grid = dict(n_neighbors=np.arange(1, 50))
clf = GridSearchCV(knn, param_grid)
clf.fit(merged.Pred.to_numpy().reshape(-1, 1), merged.Margin)
# Show the selected k itself. `best_index_` is only the 0-based position in
# the parameter grid, which is off by one from k because the grid starts at 1.
clf.best_params_

In [None]:
# Reload the stage-2 probabilities (the earlier cell overwrote "Pred" with
# margins) and convert them with the re-tuned model.
stage2_df = pd.read_csv("/kaggle/input/2021-ncaaw-solution-39th-place/elo_2021-04-05.csv")
# Double-bracket selection yields the (n_samples, 1) feature matrix directly.
pred_margin = clf.best_estimator_.predict(stage2_df[["Pred"]].to_numpy())
stage2_df["Pred"] = pred_margin
stage2_df.loc[:, ["ID", "Pred"]].to_csv(
    "/kaggle/working/elo_calibrated_margin_fix_season_merge.csv", index=False
)

In [None]:
# Score the re-computed margins against the actual tournament margins.
pred_results_combined = stage2_result.merge(stage2_df, on="ID", how="left")
# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form keeps the
# cell runnable on both old and current versions with the same value.
fixed_rmse = np.sqrt(mean_squared_error(pred_results_combined.Margin, pred_results_combined.Pred))
print(f"Fixed score: {fixed_rmse}")

Oops, I could have won more decisively. Well, first place is first place.

And let's have a quick look at the model:

In [None]:
# Evaluate the tuned model on an even grid of win probabilities in [0, 1]
# and draw the resulting probability -> margin curve.
x = np.linspace(0, 1, 101)
pred = clf.best_estimator_.predict(x[:, np.newaxis])

fig, ax = plt.subplots()
ax.plot(x, pred)
ax.set_xlabel("Win Probability")
ax.set_ylabel("Margin")
# Dotted guides at margin 0 and win probability 0.5.
ax.hlines(0, 0, 1, linestyle="dotted", colors="k")
ax.vlines(0.5, -30, 45, linestyle="dotted", colors="k");