# Women's Tournament Scoring

Using the model-generated brackets, find the candidate brackets that perform best against a hypothetical public pool.

### Setup Data

In [1]:
# Tournament season (year) whose simulation and seeding data this notebook uses.
season = 2024

# TeamIDs of the play-in losers; these teams are removed from the seeding data.
# In order: Sacred Heart, Columbia, Auburn, Holy Cross.
playin_losers = (3357, 3162, 3120, 3221)

season

2024

In [2]:
import pickle

# Load the pre-computed simulation bundle for the selected season.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must only ever come from this project's own preprocessing step.
with open(f'../data/preprocessed/womens_simulation/{season}_womens_simulation.pkl', 'rb') as f:
    simulation_data = pickle.load(f)

# Each entry is iterated and passed through dict_to_tuple further down, so each
# is expected to be a sequence of dicts (one dict per bracket / simulation).
# results: simulated tournaments, used later as the ground truth when scoring.
results = simulation_data['results']
# candidates: the brackets we choose from (presumably the model-generated ones — confirm).
candidates = simulation_data['candidates']
# public: brackets standing in for the public pool (presumably sampled from public pick rates — confirm).
public = simulation_data['public']

# Drop the container now that the three entries have been pulled out.
del simulation_data

'Done'

'Done'

In [3]:
def dict_to_tuple(d):
    """Return the values of ``d`` as a tuple, ordered by ascending key."""
    ordered_values = [d[key] for key in sorted(d)]
    return tuple(ordered_values)

In [4]:
import numpy as np

# One row per simulated tournament; columns follow the sorted dictionary keys.
# Assumes every result dict shares the same key set — TODO confirm upstream.
results_array = np.array([dict_to_tuple(result) for result in results])

results_array.shape

(30000, 63)

In [5]:
# One row per candidate bracket; columns follow the sorted dictionary keys.
candidates_array = np.array([dict_to_tuple(candidate) for candidate in candidates])

candidates_array.shape

(40000, 63)

In [6]:
# One row per public bracket; columns follow the sorted dictionary keys.
public_array = np.array([dict_to_tuple(public_bracket) for public_bracket in public])

public_array.shape

(20000, 63)

### Score Brackets

In [7]:
# Points for a correct pick in each of the 63 game columns: 32 games worth 10,
# then 16 worth 20, 8 worth 40, 4 worth 80, 2 worth 160 and 1 worth 320.
# NOTE(review): this relies on the bracket columns being ordered round by round
# (as sorted keys such as 'R1W1' ... 'R6CH' would be) — confirm against the data.
round_scores = np.repeat([10, 20, 40, 80, 160, 320], [32, 16, 8, 4, 2, 1])


def score_brackets(brackets, ground_truth):
    """
    Score every bracket in ``brackets`` against a single reference bracket.

    ``brackets`` is a 2-D array with one bracket per row and ``ground_truth`` is
    one bracket compared column by column. A matching pick earns that column's
    value from ``round_scores``; the result holds one total per row.
    """
    correct_picks = brackets == ground_truth
    points = correct_picks * round_scores
    return points.sum(axis=1)

In [8]:
# tqdm.auto selects the notebook or console progress bar like tqdm.autonotebook,
# but without emitting the TqdmExperimentalWarning on import.
from tqdm.auto import tqdm

# Score the whole public pool against each simulated tournament in turn.
# After the transpose, each row is a public bracket and each column is a
# tournament simulation.
public_scores = np.array(
    [score_brackets(public_array, result) for result in tqdm(results_array)]
).transpose()

public_scores.shape

  from tqdm.autonotebook import tqdm


  0%|          | 0/30000 [00:00<?, ?it/s]

(20000, 30000)

In [9]:
# Score all candidate brackets against each simulated tournament in turn.
# Rows index candidate brackets, columns index tournament simulations.
candidates_scores = np.array(
    [score_brackets(candidates_array, simulated_result) for simulated_result in tqdm(results_array)]
).T

candidates_scores.shape

  0%|          | 0/30000 [00:00<?, ?it/s]

(40000, 30000)

In [10]:
def get_best_brackets(pool_size: int, number_of_brackets: int, first_payout: int, second_payout: int, third_payout: int):
    """
    Greedily pick the best candidate brackets for a pool of the given size.

    For every simulated tournament (a column of the notebook-level
    ``public_scores``), the scores needed to place third, second and first are
    estimated as the ``1 - 3/pool_size``, ``1 - 2/pool_size`` and
    ``1 - 1/pool_size`` quantiles of the public scores. A candidate (a row of
    the notebook-level ``candidates_scores``) earns the payout of the highest
    place whose threshold it strictly exceeds.

    Brackets are picked one at a time. After each pick, every simulation in
    which that pick earned a prize is removed, so subsequent picks hedge the
    earlier ones by targeting the simulations that are not yet covered.

    Parameters
    ----------
    pool_size : number of entries in the pool; must be at least 3.
    number_of_brackets : maximum number of brackets to select.
    first_payout, second_payout, third_payout : prizes for 1st, 2nd and 3rd place.

    Returns
    -------
    list of ``(index, prize)`` tuples in selection order. ``index`` is the row
    of the bracket in ``candidates_scores``. ``prize`` is its mean prize over
    the simulations still remaining when it was picked, i.e. conditional on no
    earlier pick having won. The list is shorter than ``number_of_brackets``
    when the earlier picks already cover every simulation.

    Raises
    ------
    ValueError
        If ``pool_size`` is smaller than 3 (the place quantiles would fall
        outside [0, 1]).
    """
    if pool_size < 3:
        raise ValueError(f'pool_size must be at least 3 to pay out three places, got {pool_size}')

    # score needed, per simulation, to finish in each paid place
    third_place = np.quantile(public_scores, q=1-3/pool_size, axis=0)
    second_place = np.quantile(public_scores, q=1-2/pool_size, axis=0)
    first_place = np.quantile(public_scores, q=1-1/pool_size, axis=0)

    # The increments stack: beating all three thresholds sums to first_payout.
    candidates_prizes = (
        (candidates_scores > third_place)*(third_payout) +
        (candidates_scores > second_place)*(second_payout - third_payout) +
        (candidates_scores > first_place)*(first_payout - second_payout)
    )

    best_brackets = []  # (index into candidates, mean prize) in selection order
    for _ in tqdm(range(number_of_brackets)):
        if candidates_prizes.shape[1] == 0:
            # Every simulation is already won by an earlier pick; the mean over
            # zero columns would be NaN and argmax would pick row 0 arbitrarily.
            break

        # Only columns are ever deleted, so row indexes keep matching candidates.
        mean_prizes = candidates_prizes.mean(axis=1)
        best_available_bracket = mean_prizes.argmax()  # best bracket given the previous selections
        best_brackets.append((best_available_bracket, mean_prizes[best_available_bracket]))

        # hedging: remove any ground truths that the best available bracket won in
        ignore_indexes = np.flatnonzero(candidates_prizes[best_available_bracket] != 0)
        candidates_prizes = np.delete(candidates_prizes, ignore_indexes, axis=1)

    return best_brackets

In [26]:
# Select up to 10 mutually hedging brackets for a 10,000-entry pool.
# NOTE(review): the three payouts sum to 100, so they are presumably percentages
# of the prize pot rather than currency amounts — confirm.
bb = get_best_brackets(
    pool_size=10_000, 
    number_of_brackets=10, 
    first_payout=85, 
    second_payout=12, 
    third_payout=3,
)

# list of (index into candidates, mean prize) tuples, in selection order
bb

  0%|          | 0/10 [00:00<?, ?it/s]

[(19226, 0.5099666666666667),
 (28878, 0.47581188519067513),
 (715, 0.4562707951381816),
 (10590, 0.45838747179101413),
 (730, 0.4166580266113703),
 (37736, 0.3725012189176012),
 (30498, 0.32242449752709673),
 (34494, 0.3098929820223925),
 (36133, 0.29665411664417135),
 (4363, 0.28299660289647777)]

### Display Brackets

In [12]:
import pandas as pd

pd.set_option('display.max_columns', 100)

# Kaggle team table (TeamID, TeamName). Forward slashes keep the relative path
# valid on every operating system; backslashes only resolve on Windows.
df_teams = pd.read_csv('../data/unprocessed/kaggle/WTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M
...,...,...
371,3474,Queens NC
372,3475,Southern Indiana
373,3476,Stonehill
374,3477,TX A&M Commerce


In [13]:
# Kaggle seed table. Forward slashes keep the relative path valid on every
# operating system; backslashes only resolve on Windows.
df_seeds = pd.read_csv('../data/unprocessed/kaggle/WNCAATourneySeeds.csv')

# keep only the season under analysis
df_seeds = df_seeds.loc[df_seeds['Season'] == season, :].reset_index(drop=True)

# Split the raw seed string: the first character is the region, the digits are
# the seed number, and a trailing 'a'/'b' marks a play-in slot.
df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
# Raw string avoids the invalid '\d' escape; expand=False returns a Series
# rather than a one-column DataFrame.
df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)

# drop the play-in losers so that each play-in slot is held by a single team
df_seeds = df_seeds.loc[~df_seeds['TeamID'].isin(playin_losers), :].reset_index(drop=True)

# zero-padded label such as 'W01', built from the region letter and the seed number
df_seeds.insert(1, 'Region Seed', df_seeds['Region'] + df_seeds['Seed'].astype(str).str.zfill(2))

# A duplicated label would silently overwrite a team when this column is used
# as a dictionary key, so fail loudly if playin_losers is stale for this season.
assert df_seeds['Region Seed'].is_unique, 'duplicate region seeds: update playin_losers for this season'

df_seeds

Unnamed: 0,Season,Region Seed,Seed,Region,Play In,TeamID
0,2024,W01,1,W,False,3376
1,2024,W02,2,W,False,3323
2,2024,W03,3,W,False,3333
3,2024,W04,4,W,False,3231
4,2024,W05,5,W,False,3328
...,...,...,...,...,...,...
59,2024,Z12,12,Z,True,3435
60,2024,Z13,13,Z,False,3267
61,2024,Z14,14,Z,False,3238
62,2024,Z15,15,Z,False,3263


In [14]:
# TeamID -> team name, for every team in the teams table.
id_to_team = {
    team_id: team_name
    for team_id, team_name in zip(df_teams['TeamID'], df_teams['TeamName'])
}

# 'Region Seed' label -> name of the team holding that seed this season.
region_seed_to_team = {
    region_seed: team_name
    for region_seed, team_name in zip(df_seeds['Region Seed'], df_seeds['TeamID'].map(id_to_team))
}

len(region_seed_to_team)

64

In [15]:
def display_bracket(bracket):
    """
    Print the picked winner of every game in ``bracket``, round by round.

    ``bracket`` maps a game key (e.g. ``'R1W1'``, ``'R5WX'``, ``'R6CH'``) to the
    region-seed label of the picked winner; labels are translated to team names
    through the notebook-level ``region_seed_to_team`` lookup. The first four
    rounds are grouped by region, the last two are listed by matchup.
    """
    separator = '-'*30
    regions = ('W', 'X', 'Y', 'Z')

    # Resolve every pick up front so an unknown label fails before any output.
    team_by_game = {}
    for game, region_seed in bracket.items():
        team_by_game[game] = region_seed_to_team[region_seed]

    def show_winner(game):
        print(f'{game}: {team_by_game[game]}')

    # (heading, game-key prefix, slots in display order)
    regional_rounds = (
        ('ROUND OF 64 WINNERS', 'R1', (1, 8, 5, 4, 6, 3, 7, 2)),
        ('ROUND OF 32 WINNERS', 'R2', (1, 4, 3, 2)),
        ('ROUND OF 16 WINNERS', 'R3', (1, 2)),
        ('ELITE EIGHT WINNERS', 'R4', (1, )),
    )
    # (heading, game-key prefix, matchup suffixes)
    national_rounds = (
        ('FINAL FOUR WINNERS', 'R5', ('WX', 'YZ')),
        ('FINALS WINNER', 'R6', ('CH', )),
    )

    print(separator)
    print()

    for heading, prefix, slots in regional_rounds:
        print(heading)
        print()
        for region in regions:
            print(f'REGION {region}')
            for slot in slots:
                show_winner(f'{prefix}{region}{slot}')
            print()
        print(separator)
        print()

    for heading, prefix, matchups in national_rounds:
        print(heading)
        print()
        for matchup in matchups:
            show_winner(f'{prefix}{matchup}')
            print()
        print(separator)
        print()

In [27]:
# Print each selected bracket in selection order; the prize estimate is not shown.
for candidate_index, _mean_prize in bb:
    display_bracket(candidates[candidate_index])

------------------------------

ROUND OF 64 WINNERS

REGION W
R1W1: South Carolina
R1W8: Michigan St
R1W5: Oklahoma
R1W4: Indiana
R1W6: Nebraska
R1W3: Oregon St
R1W7: Mississippi
R1W2: Notre Dame

REGION X
R1X1: Texas
R1X8: Alabama
R1X5: Utah
R1X4: Gonzaga
R1X6: Tennessee
R1X3: NC State
R1X7: Iowa St
R1X2: Stanford

REGION Y
R1Y1: Iowa
R1Y8: West Virginia
R1Y5: Colorado
R1Y4: Kansas St
R1Y6: Louisville
R1Y3: LSU
R1Y7: Creighton
R1Y2: UCLA

REGION Z
R1Z1: USC
R1Z8: Kansas
R1Z5: Baylor
R1Z4: Virginia Tech
R1Z6: Arizona
R1Z3: Connecticut
R1Z7: Duke
R1Z2: Ohio St

------------------------------

ROUND OF 32 WINNERS

REGION W
R2W1: South Carolina
R2W4: Indiana
R2W3: Oregon St
R2W2: Notre Dame

REGION X
R2X1: Texas
R2X4: Gonzaga
R2X3: NC State
R2X2: Stanford

REGION Y
R2Y1: Iowa
R2Y4: Colorado
R2Y3: LSU
R2Y2: UCLA

REGION Z
R2Z1: USC
R2Z4: Virginia Tech
R2Z3: Connecticut
R2Z2: Ohio St

------------------------------

ROUND OF 16 WINNERS

REGION W
R3W1: South Carolina
R3W2: Oregon St

REGION 