# Kaggle March Madness Challenge 2024

Google Cloud and NCAA® have teamed up to bring you this year’s version of the Kaggle machine learning competition. Another year, another chance to anticipate the upsets, call the probabilities, and put your bracketology skills to the leaderboard test. Kagglers will join the millions of fans who attempt to forecast the outcomes of March Madness® during this year's NCAA Division I Men’s and Women’s Basketball Championships. But unlike most fans, you will pick your bracket using a combination of NCAA’s historical data and your computing power, while the ground truth unfolds on national television.

### Preparation
Import packages and load in initial datasets

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn import model_selection 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [37]:
features_men = pd.read_csv("features_men.csv")
features_women = pd.read_csv("features_women.csv")

In [2]:
tourney_results = pd.concat(
    [
        pd.read_csv("MNCAATourneyDetailedResults.csv"),
        pd.read_csv("WNCAATourneyDetailedResults.csv"),
    ],
    ignore_index=True,
)

seeds = pd.concat(
    [
        pd.read_csv("MNCAATourneySeeds.csv"),
        pd.read_csv("WNCAATourneySeeds.csv"),
    ],
    ignore_index=True,
)

regular_results = pd.concat(
    [
        pd.read_csv("MRegularSeasonDetailedResults.csv"),
        pd.read_csv("WRegularSeasonDetailedResults.csv"),
    ],
    ignore_index=True,
)

In [3]:
def prepare_data(df):

    dfswap = df[
        [
            "Season",
            "DayNum",
            "LTeamID",
            "LScore",
            "WTeamID",
            "WScore",
            "WLoc",
            "NumOT",
            "LFGM",
            "LFGA",
            "LFGM3",
            "LFGA3",
            "LFTM",
            "LFTA",
            "LOR",
            "LDR",
            "LAst",
            "LTO",
            "LStl",
            "LBlk",
            "LPF",
            "WFGM",
            "WFGA",
            "WFGM3",
            "WFGA3",
            "WFTM",
            "WFTA",
            "WOR",
            "WDR",
            "WAst",
            "WTO",
            "WStl",
            "WBlk",
            "WPF",
        ]
    ]

    dfswap.loc[df["WLoc"] == "H", "WLoc"] = "A"
    dfswap.loc[df["WLoc"] == "A", "WLoc"] = "H"
    df.columns.values[6] = "location"
    dfswap.columns.values[6] = "location"

    df.columns = [x.replace("W", "T1_").replace("L", "T2_") for x in list(df.columns)]
    dfswap.columns = [
        x.replace("L", "T1_").replace("W", "T2_") for x in list(dfswap.columns)
    ]

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location == "N", "location"] = "0"
    output.loc[output.location == "H", "location"] = "1"
    output.loc[output.location == "A", "location"] = "-1"
    output.location = output.location.astype(int)

    output["PointDiff"] = output["T1_Score"] - output["T2_Score"]

    return output

In [4]:
regular_data = prepare_data(regular_results)
tourney_data = prepare_data(tourney_results)

In [5]:
import numpy as np

boxscore_cols = [
    "T1_FGM",
    "T1_FGA",
    "T1_FGM3",
    "T1_FGA3",
    "T1_OR",
    "T1_Ast",
    "T1_TO",
    "T1_Stl",
    "T1_PF",
    "T2_FGM",
    "T2_FGA",
    "T2_FGM3",
    "T2_FGA3",
    "T2_OR",
    "T2_Ast",
    "T2_TO",
    "T2_Stl",
    "T2_Blk",
    "PointDiff",
]


funcs = [np.mean]

In [6]:
season_statistics = (
    regular_data.groupby(["Season", "T1_TeamID"])[boxscore_cols]
    .agg(funcs)
    .reset_index()
)
season_statistics.columns = [
    "".join(col).strip() for col in season_statistics.columns.values
]

season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()

season_statistics_T1.columns = [
    "T1_" + x.replace("T1_", "").replace("T2_", "opponent_")
    for x in list(season_statistics_T1.columns)
]
season_statistics_T2.columns = [
    "T2_" + x.replace("T1_", "").replace("T2_", "opponent_")
    for x in list(season_statistics_T2.columns)
]
season_statistics_T1.columns.values[0] = "Season"
season_statistics_T2.columns.values[0] = "Season"

In [7]:
last14days_stats_T1 = regular_data.loc[regular_data.DayNum > 118].reset_index(drop=True)
last14days_stats_T1["win"] = np.where(last14days_stats_T1["PointDiff"] > 0, 1, 0)
last14days_stats_T1 = (
    last14days_stats_T1.groupby(["Season", "T1_TeamID"])["win"]
    .mean()
    .reset_index(name="T1_win_ratio_14d")
)

last14days_stats_T2 = regular_data.loc[regular_data.DayNum > 118].reset_index(drop=True)
last14days_stats_T2["win"] = np.where(last14days_stats_T2["PointDiff"] < 0, 1, 0)
last14days_stats_T2 = (
    last14days_stats_T2.groupby(["Season", "T2_TeamID"])["win"]
    .mean()
    .reset_index(name="T2_win_ratio_14d")
)

In [8]:
import statsmodels.api as sm

regular_season_effects = regular_data[
    ["Season", "T1_TeamID", "T2_TeamID", "PointDiff"]
].copy()
regular_season_effects["T1_TeamID"] = regular_season_effects["T1_TeamID"].astype(str)
regular_season_effects["T2_TeamID"] = regular_season_effects["T2_TeamID"].astype(str)
regular_season_effects["win"] = np.where(regular_season_effects["PointDiff"] > 0, 1, 0)
march_madness = pd.merge(
    seeds[["Season", "TeamID"]], seeds[["Season", "TeamID"]], on="Season"
)
march_madness.columns = ["Season", "T1_TeamID", "T2_TeamID"]
march_madness.T1_TeamID = march_madness.T1_TeamID.astype(str)
march_madness.T2_TeamID = march_madness.T2_TeamID.astype(str)
regular_season_effects = pd.merge(
    regular_season_effects, march_madness, on=["Season", "T1_TeamID", "T2_TeamID"]
)


def team_quality(season):
    formula = "win~-1+T1_TeamID+T2_TeamID"
    glm = sm.GLM.from_formula(
        formula=formula,
        data=regular_season_effects.loc[regular_season_effects.Season == season, :],
        family=sm.families.Binomial(),
    ).fit()

    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ["TeamID", "quality"]
    quality["Season"] = season
    # quality['quality'] = np.exp(quality['quality'])
    quality = quality.loc[quality.TeamID.str.contains("T1_")].reset_index(drop=True)
    quality["TeamID"] = quality["TeamID"].apply(lambda x: x[10:14]).astype(int)
    return quality


glm_quality = pd.concat(
    [
        team_quality(2010),
        team_quality(2011),
        team_quality(2012),
        team_quality(2013),
        team_quality(2014),
        team_quality(2015),
        team_quality(2016),
        team_quality(2017),
        team_quality(2018),
        team_quality(2019),
        ##team_quality(2020),
        team_quality(2021),
        team_quality(2022),
        team_quality(2023),
        team_quality(2024),
    ]
).reset_index(drop=True)

glm_quality_T1 = glm_quality.copy()
glm_quality_T2 = glm_quality.copy()
glm_quality_T1.columns = ["T1_TeamID", "T1_quality", "Season"]
glm_quality_T2.columns = ["T2_TeamID", "T2_quality", "Season"]

In [9]:
seeds["seed"] = seeds["Seed"].apply(lambda x: int(x[1:3]))

seeds_T1 = seeds[["Season", "TeamID", "seed"]].copy()
seeds_T2 = seeds[["Season", "TeamID", "seed"]].copy()
seeds_T1.columns = ["Season", "T1_TeamID", "T1_seed"]
seeds_T2.columns = ["Season", "T2_TeamID", "T2_seed"]

In [10]:
tourney_data = tourney_data[
    ["Season", "DayNum", "T1_TeamID", "T1_Score", "T2_TeamID", "T2_Score"]
]

tourney_data = pd.merge(
    tourney_data, season_statistics_T1, on=["Season", "T1_TeamID"], how="left"
)
tourney_data = pd.merge(
    tourney_data, season_statistics_T2, on=["Season", "T2_TeamID"], how="left"
)

tourney_data = pd.merge(
    tourney_data, last14days_stats_T1, on=["Season", "T1_TeamID"], how="left"
)
tourney_data = pd.merge(
    tourney_data, last14days_stats_T2, on=["Season", "T2_TeamID"], how="left"
)

tourney_data = pd.merge(
    tourney_data, glm_quality_T1, on=["Season", "T1_TeamID"], how="left"
)
tourney_data = pd.merge(
    tourney_data, glm_quality_T2, on=["Season", "T2_TeamID"], how="left"
)

tourney_data = pd.merge(tourney_data, seeds_T1, on=["Season", "T1_TeamID"], how="left")
tourney_data = pd.merge(tourney_data, seeds_T2, on=["Season", "T2_TeamID"], how="left")

tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]
tourney_data["Score_diff"] = tourney_data["T1_Score"] - tourney_data["T2_Score"]

In [11]:
import itertools

tourney_2024 = pd.read_csv("2024_tourney_seeds.csv")
tourney_2024["seed"] = tourney_2024["Seed"].apply(lambda x: int(x[1:3]))

tourney_2024_mens = tourney_2024.query('Tournament == "M"')
tourney_2024_womens = tourney_2024.query('Tournament == "W"')

comb_mens = pd.DataFrame(
    data=list(itertools.combinations(tourney_2024_mens["TeamID"], 2))
)
comb_womens = pd.DataFrame(
    data=list(itertools.combinations(tourney_2024_womens["TeamID"], 2))
)
comb_merged = pd.concat([comb_mens, comb_womens])

comb_merged["T1_TeamID"] = np.minimum(
    np.array(comb_merged[0]), np.array(comb_merged[1])
)
comb_merged["T2_TeamID"] = np.maximum(
    np.array(comb_merged[0]), np.array(comb_merged[1])
)
comb_merged = comb_merged.sort_values(["T1_TeamID", "T2_TeamID"]).reset_index()
comb_merged = comb_merged[["T1_TeamID", "T2_TeamID"]]

comb_merged["Season"] = 2024
comb_merged["ID"] = (
    comb_merged["Season"].astype(str)
    + "_"
    + comb_merged["T1_TeamID"].astype(str)
    + "_"
    + comb_merged["T2_TeamID"].astype(str)
)

sample_sub = comb_merged[["ID", "Season", "T1_TeamID", "T2_TeamID"]]
sample_sub_mens = sample_sub.query("T1_TeamID <= 2000")
sample_sub_womens = sample_sub.query("T1_TeamID >= 3000")

sample_sub.tail()

Unnamed: 0,ID,Season,T1_TeamID,T2_TeamID
4027,2024_3439_3453,2024,3439,3453
4028,2024_3439_3465,2024,3439,3465
4029,2024_3452_3453,2024,3452,3453
4030,2024_3452_3465,2024,3452,3465
4031,2024_3453_3465,2024,3453,3465


In [12]:
pred_xgb = pd.merge(
    sample_sub, season_statistics_T1, on=["Season", "T1_TeamID"], how="left"
)
pred_xgb = pd.merge(
    pred_xgb, season_statistics_T2, on=["Season", "T2_TeamID"], how="left"
)

pred_xgb = pd.merge(pred_xgb, glm_quality_T1, on=["Season", "T1_TeamID"], how="left")

pred_xgb = pd.merge(pred_xgb, glm_quality_T2, on=["Season", "T2_TeamID"], how="left")

pred_xgb = pd.merge(pred_xgb, seeds_T1, on=["Season", "T1_TeamID"], how="left")
pred_xgb = pd.merge(pred_xgb, seeds_T2, on=["Season", "T2_TeamID"], how="left")
pred_xgb = pd.merge(
    pred_xgb, last14days_stats_T1, on=["Season", "T1_TeamID"], how="left"
)
pred_xgb = pd.merge(
    pred_xgb, last14days_stats_T2, on=["Season", "T2_TeamID"], how="left"
)

pred_xgb["Seed_diff"] = pred_xgb["T1_seed"] - pred_xgb["T2_seed"]

In [13]:
y = tourney_data["T1_Score"] - tourney_data["T2_Score"]
y.describe()

count    4284.000000
mean        0.000000
std        17.442357
min       -89.000000
25%       -11.000000
50%         0.000000
75%        11.000000
max        89.000000
dtype: float64

In [14]:
features = (
    list(season_statistics_T1.columns[2:999])
    + list(season_statistics_T2.columns[2:999])
    + list(seeds_T1.columns[2:999])
    + list(seeds_T2.columns[2:999])
    + list(last14days_stats_T1.columns[2:999])
    + list(last14days_stats_T2.columns[2:999])
    + ["Seed_diff"]
    + ["T1_quality", "T2_quality"]
)

len(features)

45

In [15]:
import xgboost as xgb

X = tourney_data[features].values
dtrain = xgb.DMatrix(X, label=y)

In [16]:
def cauchyobj(preds, dtrain):
    labels = dtrain.get_label()
    c = 5000
    x = preds - labels
    grad = x / (x**2 / c**2 + 1)
    hess = -(c**2) * (x**2 - c**2) / (x**2 + c**2) ** 2
    return grad, hess

In [17]:
param = {}
# param['objective'] = 'reg:linear'
param["eval_metric"] = "mae"
param["booster"] = "gbtree"
param["eta"] = 0.02  # change to ~0.02 for final run
param["subsample"] = 0.35
param["colsample_bytree"] = 0.7
param["num_parallel_tree"] = 10  # recommend 10
param["min_child_weight"] = 40
param["gamma"] = 10
param["max_depth"] = 3
param["silent"] = 1

print(param)

{'eval_metric': 'mae', 'booster': 'gbtree', 'eta': 0.02, 'subsample': 0.35, 'colsample_bytree': 0.7, 'num_parallel_tree': 10, 'min_child_weight': 40, 'gamma': 10, 'max_depth': 3, 'silent': 1}


In [18]:
from sklearn.model_selection import KFold

xgb_cv = []
repeat_cv = 10  # recommend 10

for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    xgb_cv.append(
        xgb.cv(
            params=param,
            dtrain=dtrain,
            obj=cauchyobj,
            num_boost_round=3000,
            folds=KFold(n_splits=5, shuffle=True, random_state=i),
            early_stopping_rounds=25,
            verbose_eval=50,
        )
    )

Fold repeater 0
[0]	train-mae:13.52858+0.02169	test-mae:13.53180+0.08869
[50]	train-mae:10.42633+0.04889	test-mae:10.54049+0.16158
[100]	train-mae:9.65129+0.06146	test-mae:9.87255+0.20401
[150]	train-mae:9.36615+0.06340	test-mae:9.68739+0.21569
[200]	train-mae:9.20060+0.06264	test-mae:9.61471+0.21974
[250]	train-mae:9.07688+0.06334	test-mae:9.58294+0.21980
[300]	train-mae:8.97202+0.06581	test-mae:9.56441+0.21884
[350]	train-mae:8.87638+0.06613	test-mae:9.55270+0.21657
[400]	train-mae:8.78646+0.06617	test-mae:9.54684+0.21486
[450]	train-mae:8.70110+0.06686	test-mae:9.54098+0.21294
[500]	train-mae:8.61807+0.06619	test-mae:9.53728+0.21278
[550]	train-mae:8.54030+0.06565	test-mae:9.53705+0.21412
[600]	train-mae:8.46642+0.06557	test-mae:9.53269+0.21523
[649]	train-mae:8.39578+0.06486	test-mae:9.53474+0.21645
Fold repeater 1
[0]	train-mae:13.52843+0.04561	test-mae:13.52993+0.18347
[50]	train-mae:10.42246+0.04497	test-mae:10.54579+0.20334
[100]	train-mae:9.64815+0.04006	test-mae:9.88510+0.187

In [19]:
iteration_counts = [np.argmin(x["test-mae-mean"].values) for x in xgb_cv]
val_mae = [np.min(x["test-mae-mean"].values) for x in xgb_cv]
iteration_counts, val_mae

([624, 334, 481, 537, 614, 457, 428, 481, 414, 409],
 [9.532264137165837,
  9.579495132055154,
  9.542730775555224,
  9.559457688686738,
  9.510969543200638,
  9.545046688242298,
  9.528233641564475,
  9.512955234529484,
  9.589857387525646,
  9.54625527162272])

In [20]:
oof_preds = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    preds = y.copy()
    kfold = KFold(n_splits=5, shuffle=True, random_state=i)
    for train_index, val_index in kfold.split(X, y):
        dtrain_i = xgb.DMatrix(X[train_index], label=y[train_index])
        dval_i = xgb.DMatrix(X[val_index], label=y[val_index])
        model = xgb.train(
            params=param,
            dtrain=dtrain_i,
            num_boost_round=iteration_counts[i],
            verbose_eval=50,
        )
        preds[val_index] = model.predict(dval_i)
    oof_preds.append(np.clip(preds, -30, 30))

Fold repeater 0
Fold repeater 1
Fold repeater 2
Fold repeater 3
Fold repeater 4
Fold repeater 5
Fold repeater 6
Fold repeater 7
Fold repeater 8
Fold repeater 9


In [21]:
from scipy.interpolate import UnivariateSpline


val_cv = []
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i], np.where(y > 0, 1, 0)))
    dat = sorted(dat, key=lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]] = dat[k][1]
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    spline_fit = np.clip(spline_fit, 0.025, 0.975)

In [22]:
Xsub = pred_xgb[features].values
dtest = xgb.DMatrix(Xsub)

In [23]:
sub_models = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    sub_models.append(
        xgb.train(
            params=param,
            dtrain=dtrain,
            num_boost_round=int(iteration_counts[i] * 1.05),
            verbose_eval=50,
        )
    )

Fold repeater 0
Fold repeater 1
Fold repeater 2
Fold repeater 3
Fold repeater 4
Fold repeater 5
Fold repeater 6
Fold repeater 7
Fold repeater 8
Fold repeater 9


In [24]:
sub_preds = []
for i in range(repeat_cv):
    sub_preds.append(
        np.clip(
            spline_model[i](np.clip(sub_models[i].predict(dtest), -30, 30)),
            0.025,
            0.975,
        )
    )

pred_xgb["Pred_xgb"] = pd.DataFrame(sub_preds).mean(axis=0)

In [25]:
# pred_lr.head()

In [26]:
pred_xgb.head()

Unnamed: 0,ID,Season,T1_TeamID,T2_TeamID,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,T1_ORmean,T1_Astmean,...,T2_opponent_Blkmean,T2_PointDiffmean,T1_quality,T2_quality,T1_seed,T2_seed,T1_win_ratio_14d,T2_win_ratio_14d,Seed_diff,Pred_xgb
0,2024_1103_1104,2024,1103,1104,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,...,4.3125,9.6875,-547208600000000.0,1356707000000000.0,14,4,0.6,0.333333,10,0.191282
1,2024_1103_1112,2024,1103,1112,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,...,3.757576,15.69697,-547208600000000.0,2455506000000000.0,14,2,0.6,0.5,12,0.025556
2,2024_1103_1120,2024,1103,1120,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,...,3.235294,15.294118,-547208600000000.0,2757983000000000.0,14,4,0.6,1.0,10,0.141067
3,2024_1103_1124,2024,1103,1124,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,...,3.4375,8.8125,-547208600000000.0,1347774000000000.0,14,3,0.6,0.5,11,0.154102
4,2024_1103_1140,2024,1103,1140,25.5625,56.34375,7.5,23.40625,8.1875,12.3125,...,2.212121,11.939394,-547208600000000.0,531801500000000.0,14,6,0.6,0.5,8,0.251481


In [27]:
# preds = pred_lr.copy()
# preds = pd.merge(preds, pred_xgb[['ID', 'Pred_xgb']], on = "ID", how = "left")
# preds['Pred'] = 0.2*preds['Pred_lr'] + 0.8*preds['Pred_xgb']
preds = pred_xgb
preds["Pred"] = preds["Pred_xgb"]
# Connecticut win
# preds["Pred"] = preds["Pred"].where(preds["T1_TeamID"] != 1163, 1)
# preds["Pred"] = preds["Pred"].where(preds["T2_TeamID"] != 1163, 0)

# # South Carolina win
# preds["Pred"] = preds["Pred"].where(preds["T1_TeamID"] != 3376, 1)
# preds["Pred"] = preds["Pred"].where(preds["T2_TeamID"] != 3376, 0)

preds = preds[["ID", "Pred"]]
preds.tail()

Unnamed: 0,ID,Pred
4027,2024_3439_3453,0.606959
4028,2024_3439_3465,0.923077
4029,2024_3452_3453,0.604403
4030,2024_3452_3465,0.869065
4031,2024_3453_3465,0.800089


In [28]:
round_slots = pd.read_csv("MNCAATourneySlots.csv")
round_slots = round_slots[round_slots["Season"] == 2024]
round_slots = round_slots[round_slots["Slot"].str.contains("R")]

seeds = pd.read_csv("2024_tourney_seeds.csv")
seeds_m = seeds[seeds["Tournament"] == "M"]
seeds_w = seeds[seeds["Tournament"] == "W"]

preds["ID"] = preds["ID"].str.split("_")

In [29]:
def prepare_data(seeds, preds):
    # Function preparing the data for the simulation
    seed_dict = seeds.set_index("Seed")["TeamID"].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}
    probas_dict = {}

    for teams, proba in zip(preds["ID"], preds["Pred"]):
        team1, team2 = teams[1], teams[2]

        probas_dict.setdefault(team1, {})[team2] = proba
        probas_dict.setdefault(team2, {})[team1] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, random_values, sim=True):
    """
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities.
    - random_values (array-like): Array with precomputed random-values.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    """
    winners = []
    slots = []

    for slot, strong, weak, random_val in zip(
        round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed, random_values
    ):
        team1, team2 = seeds[strong], seeds[weak]

        # Get the probability of team_1 winning
        proba = probas[str(team1)][str(team2)]

        if sim:
            # Randomly determine the winner based on the probability
            winner = team1 if random_val < proba else team2
        else:
            # Determine the winner based on the higher probability
            winner = [team1, team2][np.argmax([proba, 1 - proba])]

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True):
    """
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    """
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Precompute random-values
    random_values = np.random.random(size=(brackets, len(round_slots)))

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(
            round_slots,
            seed_dict,
            inverted_seed_dict,
            probas_dict,
            random_values[b - 1],
            sim,
        )

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({"Bracket": bracket, "Slot": slots, "Team": results})

    return result_df

In [30]:
from tqdm import tqdm


n_brackets = 100000
result_m = run_simulation(
    brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=True
)
result_m["Tournament"] = "M"
result_w = run_simulation(
    brackets=n_brackets, seeds=seeds_w, preds=preds, round_slots=round_slots, sim=True
)
result_w["Tournament"] = "W"
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ["RowId"]

100%|██████████| 100000/100000 [00:05<00:00, 17792.66it/s]
100%|██████████| 100000/100000 [00:05<00:00, 17635.34it/s]


In [31]:
submission.query('Slot == "R6CH" & Tournament == "M"').groupby(
    "Team"
).count().sort_values("Bracket", ascending=False).head(20)

Unnamed: 0_level_0,Bracket,Slot,Tournament
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Z01,17833,17833,17833
W01,13003,13003,13003
X02,11391,11391,11391
W02,9515,9515,9515
Y01,7398,7398,7398
X01,5046,5046,5046
Y02,4402,4402,4402
Z02,3332,3332,3332
Y03,2831,2831,2831
W03,2503,2503,2503


In [32]:
submission.query('Slot == "R6CH" & Tournament == "W"').groupby(
    "Team"
).count().sort_values("Bracket", ascending=False).head(20)

Unnamed: 0_level_0,Bracket,Slot,Tournament
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
W01,44033,44033,44033
Y01,17701,17701,17701
X01,17416,17416,17416
Y03,4485,4485,4485
Z03,2675,2675,2675
Z02,2529,2529,2529
Z01,2218,2218,2218
W02,2090,2090,2090
Y02,2007,2007,2007
X02,1712,1712,1712


In [33]:
submission.to_csv("submission_final_.csv")
submission

Unnamed: 0_level_0,Bracket,Slot,Team,Tournament
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,R1W1,W01,M
1,1,R1W2,W02,M
2,1,R1W3,W03,M
3,1,R1W4,W04,M
4,1,R1W5,W05,M
...,...,...,...,...
12599995,100000,R4Y1,Y01,W
12599996,100000,R4Z1,Z03,W
12599997,100000,R5WX,W01,W
12599998,100000,R5YZ,Y01,W


In [34]:
import pandas as pd


# Assuming you have a DataFrame called 'df' with columns 'submission Slot' and 'Team'
grouped_df_no_connecticut = (
    submission.groupby("Slot")["Team"].apply(lambda x: x.mode()[0]).reset_index()
)

grouped_df_no_connecticut

Unnamed: 0,Slot,Team
0,R1W1,W01
1,R1W2,W02
2,R1W3,W03
3,R1W4,W04
4,R1W5,W05
...,...,...
58,R4Y1,Y01
59,R4Z1,Z01
60,R5WX,W01
61,R5YZ,Y01


In [35]:
grouped_df_no_connecticut.to_csv("grouped_df.csv", index=False)

End