In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

!pip install tensorflow_decision_forests
# TF-DF requires Tensorflow < 2.15 or tf_keras
!pip install tf_keras
!pip install wurlitzer

import tensorflow as tf
import tensorflow_decision_forests as tfdf
import tf_keras

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow~=2.16.1 (from tensorflow_decision_forests)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting tf-keras~=2.16 (from tensorflow_decision_forests)
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow~=2.16.1->tensorflow_decision_forests)
  Downl

In [2]:
# Path prefix for the input CSVs. It is joined to file names by plain string
# concatenation (DATA_PATH + 'name.csv'), so the trailing slash is required.
# NOTE(review): this is a relative path, presumably a mounted Google Drive in
# Colab -- confirm the kernel's working directory before running.
DATA_PATH = 'drive/MyDrive/march_madness/2024/march-machine-learning-mania-2024/'

In [4]:
# Load the men's and women's team tables, stack them, and discard the
# D1-season columns (the drop runs on the combined frame, after stacking).
team_files = ('MTeams.csv', 'WTeams.csv')
df_teams = (
    pd.concat([pd.read_csv(DATA_PATH + fname) for fname in team_files])
    .drop(columns=['FirstD1Season', 'LastD1Season'])
)

# Load the 2024 tournament seeds and attach each team's name via TeamID.
# A left join keeps every seed row even if a TeamID has no match in df_teams.
df_slots = pd.merge(
    pd.read_csv(DATA_PATH + '2024_tourney_seeds.csv'),
    df_teams,
    on='TeamID',
    how='left',
)
print(df_slots.shape)
df_slots.head(5)

(128, 4)


Unnamed: 0,Tournament,Seed,TeamID,TeamName
0,M,W01,1163,Connecticut
1,M,W02,1235,Iowa St
2,M,W03,1228,Illinois
3,M,W04,1120,Auburn
4,M,W05,1361,San Diego St


In [5]:
# Load the sample submission file; later cells add team IDs to it.
submission_path = DATA_PATH + 'sample_submission.csv'
df_submission = pd.read_csv(submission_path)
print(df_submission.shape)
df_submission.head(5)

(126, 5)


Unnamed: 0,RowId,Tournament,Bracket,Slot,Team
0,0,M,1,R1W1,W01
1,1,M,1,R1W8,W08
2,2,M,1,R1W5,W05
3,3,M,1,R1W4,W04
4,4,M,1,R1W6,W06


In [6]:
# Build the bracket rounds
genders = ['M', 'W']
regions = ['W', 'X', 'Y', 'Z']

# Slots after round 1 have no real teams yet, so only their names are needed.
# Order: R2 (4 per region), R3 (2 per region), R4 (1 per region),
# R5 (WX, YZ), then R6CH.
later_slots = (
    [f'R2{r}{g:d}' for r in regions for g in range(1, 5)]
    + [f'R3{r}{g:d}' for r in regions for g in range(1, 3)]
    + [f'R4{r}1' for r in regions]
    + [f'R5{r}' for r in ['WX', 'YZ']]
    + ['R6CH']
)

bracket, tournament = [], []
team1names, team2names = [], []
team1ids, team2ids = [], []

for gender in genders:
  slots = df_slots[df_slots['Tournament'] == gender]
  n_before = len(bracket)

  # Round 1: within each region, seed k is paired with seed 17-k.
  for r in regions:
    for k in range(1, 9):
      top = slots[slots['Seed'] == f'{r}{k:02d}']
      bottom = slots[slots['Seed'] == f'{r}{17 - k:02d}']

      # Resolve all four values before appending so a missing seed
      # (IndexError on .values[0]) leaves the lists untouched for this game.
      name1 = top['TeamName'].values[0]
      name2 = bottom['TeamName'].values[0]
      id1 = top['TeamID'].values[0]
      id2 = bottom['TeamID'].values[0]

      bracket.append(f'R1{r}{k:d}')
      team1names.append(name1)
      team2names.append(name2)
      team1ids.append(id1)
      team2ids.append(id2)

  # Later rounds: placeholder teams 'A'/'B' with IDs 1000/2000.
  n_later = len(later_slots)
  bracket.extend(later_slots)
  team1names.extend(['A'] * n_later)
  team2names.extend(['B'] * n_later)
  team1ids.extend([1000] * n_later)
  team2ids.extend([2000] * n_later)

  # One tournament label per slot added for this gender.
  tournament.extend([gender] * (len(bracket) - n_before))

df_bracket = pd.DataFrame({
    'Tournament': tournament,
    'Slot': bracket,
    'TeamAID': team1ids,
    'TeamBID': team2ids,
})
print(df_bracket.shape)
df_bracket.head(5)

(126, 4)


Unnamed: 0,Tournament,Slot,TeamAID,TeamBID
0,M,R1W1,1163,1391
1,M,R1W2,1235,1355
2,M,R1W3,1228,1287
3,M,R1W4,1120,1463
4,M,R1W5,1361,1412


In [7]:
# Attach TeamAID/TeamBID to each submission row by matching on
# (Tournament, Slot). Inner join: rows without a matching bracket slot drop out.
# NOTE(review): this overwrites df_submission, so re-running the cell merges
# again and pandas will suffix the duplicated TeamAID/TeamBID columns.
df_submission = pd.merge(
    df_submission,
    df_bracket,
    how='inner',
    on=['Tournament', 'Slot'],
)
print(df_submission.shape)
df_submission.head(5)

(126, 7)


Unnamed: 0,RowId,Tournament,Bracket,Slot,Team,TeamAID,TeamBID
0,0,M,1,R1W1,W01,1163,1391
1,1,M,1,R1W8,W08,1194,1321
2,2,M,1,R1W5,W05,1361,1412
3,3,M,1,R1W4,W04,1120,1463
4,4,M,1,R1W6,W06,1140,1182


In [8]:
def get_game_data(teamA, teamB, teams, features):
  """Assemble the feature row(s) for a matchup between two teams.

  Args:
    teamA: TeamID value of the first team.
    teamB: TeamID value of the second team.
    teams: DataFrame with a `TeamID` column and a `TeamName` column
      (the name column is dropped before joining).
    features: DataFrame keyed by `TeamID` that provides `Season`, `Seed`,
      `WinRatio14`, `OffEff`, `DefEff`, `EFT` and `WinRatio`.

  Returns:
    DataFrame whose columns are, in order: `Season`, `TeamIDA`, `TeamIDB`,
    `SeedA`, `SeedB`, then the five A-side stats followed by the five
    B-side stats. `Season` is taken from team A's feature row.
  """
  stat_cols = ['Seed', 'OffEff', 'DefEff', 'EFT', 'WinRatio', 'WinRatio14']

  def _side(team_id, suffix):
    # Select the team, left-join its features, and tag the columns with the side.
    selected = teams.loc[teams.TeamID == team_id].drop(columns=['TeamName'])
    joined = selected.merge(features, on='TeamID', how='left')
    renames = {'TeamID': 'TeamID' + suffix}
    renames.update({col: col + suffix for col in stat_cols})
    return joined.rename(columns=renames)

  side_a = _side(teamA, 'A')
  # Only side A keeps Season, so the combined frame has a single Season column.
  side_b = _side(teamB, 'B').drop(columns=['Season'])

  # Column-wise concat aligns on the index; both sides come out of merge()
  # with a fresh 0-based index, so the rows line up positionally.
  combined = pd.concat([side_a, side_b], axis=1)

  ordered = ['Season', 'TeamIDA', 'TeamIDB', 'SeedA', 'SeedB']
  ordered += ['WinRatio14A', 'OffEffA', 'DefEffA', 'EFTA', 'WinRatioA']
  ordered += ['WinRatio14B', 'OffEffB', 'DefEffB', 'EFTB', 'WinRatioB']
  return combined[ordered]

In [11]:
# Load the 2024 feature table; the file sits one directory above DATA_PATH.
features_path = DATA_PATH + '../features_2024.csv'
df_features = pd.read_csv(features_path)
print(df_features.shape)
df_features.head(5)

(722, 8)


Unnamed: 0,Season,TeamID,Seed,WinRatio14,OffEff,DefEff,EFT,WinRatio
0,2024,1101,17.0,0.333333,101.185469,105.019952,0.471068,0.451613
1,2024,1102,17.0,0.0,108.731169,119.933816,0.537471,0.290323
2,2024,1103,14.0,0.6,110.85874,102.551863,0.520358,0.6875
3,2024,1104,4.0,0.333333,124.222709,110.197276,0.565917,0.65625
4,2024,1105,17.0,0.5,98.146361,107.635701,0.457754,0.333333


In [None]:
# some fake probability predictions, these will come from the model
# NOTE(review): the generator is created without a seed, so `preds` and every
# simulated outcome below change from run to run.
rng = np.random.default_rng()
preds = rng.uniform(size=32)

print('-- Round of 64 --')
genders = ['M', 'W']
for gender in genders:
  # Submission rows belonging to this tournament only.
  tmp = df_submission[df_submission['Tournament'] == gender]
  for i in range(tmp.shape[0]):
    slot = tmp.iloc[i,:]['Slot']
    # Only slots whose name starts with 'R1' reach the seed lookup below.
    if slot.startswith('R1'):



      # NOTE(review): `get_seeds` and `teams` are not defined in any cell
      # visible here; unless they come from elsewhere this raises NameError
      # on a fresh kernel -- confirm where they are meant to be built.
      s1, s2 = get_seeds(teams[0], teams[1], df_slots)

    # NOTE(review): this branch is indented one level shallower than the seed
    # lookup, so it runs for every slot, not just 'R1' slots. It also reads
    # `g`, `j` and `round_of_32`, none of which are assigned in the visible
    # notebook -- this cell looks unfinished; verify before relying on it.
    # teams[0] wins when its predicted probability beats a fresh uniform draw.
    if preds[g] >= rng.uniform(): # draw from a distribution to determine who wins

        # Even g fills the first entry of pairing j; odd g fills the second
        # entry and then advances j to the next pairing.
        if g % 2 == 0:
            round_of_32[j][0] = teams[0]
        elif g % 2 == 1:
            round_of_32[j][1] = teams[0]
            j += 1

        print('%s (%s) beats %s (%s) with probability: %.4f' % (teams[0], s1, teams[1], s2, preds[g]))

    else:

        # Same bookkeeping as above, but teams[1] advances; the printed
        # probability is the complement, 1 - preds[g].
        if g % 2 == 0:
            round_of_32[j][0] = teams[1]
        elif g % 2 == 1:
            round_of_32[j][1] = teams[1]
            j += 1

        print('%s (%s) beats %s (%s) with probability: %.4f' % (teams[1], s2, teams[0], s1, 1-preds[g]))


-- Round of 64 --

W Region:
Purdue (W01) beats F Dickinson (W16) with probability: 0.5059
Marquette (W02) beats Vermont (W15) with probability: 0.4937
Montana St (W14) beats Kansas St (W03) with probability: 0.6273
Louisiana (W13) beats Tennessee (W04) with probability: 0.9140
Duke (W05) beats Oral Roberts (W12) with probability: 0.2938
Kentucky (W06) beats Providence (W11) with probability: 0.8946
Michigan St (W07) beats USC (W10) with probability: 0.8997
FL Atlantic (W09) beats Memphis (W08) with probability: 0.7779

X Region:
SE Missouri St (X16) beats Alabama (X01) with probability: 0.3602
Arizona (X02) beats Princeton (X15) with probability: 0.6451
Baylor (X03) beats UC Santa Barbara (X14) with probability: 0.4265
Virginia (X04) beats Furman (X13) with probability: 0.3924
San Diego St (X05) beats Col Charleston (X12) with probability: 0.4155
NC State (X11) beats Creighton (X06) with probability: 0.5744
Missouri (X07) beats Utah St (X10) with probability: 0.4337
West Virginia (X09