# Get 2024 Men's Data

Get tournament matchup matrix for 2024

In [1]:
season = 2024
playin_losers = (  # remove play-in losers from seeding data
    1224,  # Howard
    1438,  # Virginia
    1286,  # Montana St
    1129,  # Boise St
)

model_path = '../model/mens_20240316'

season

2024

### Previous Tournament Results

In [2]:
import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.read_csv(r'..\data\preprocessed\kaggle\tournament_results.csv')

df = df.loc[df['Season'] == season, :].reset_index(drop=True)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2024,1101,Abilene Chr,-1.0,-0.333333
1,2024,1102,Air Force,-1.0,-1.000000
2,2024,1103,Akron,-1.0,-0.666667
3,2024,1104,Alabama,2.0,1.333333
4,2024,1105,Alabama A&M,-1.0,-1.000000
...,...,...,...,...,...
373,2024,1474,Queens NC,-1.0,-1.000000
374,2024,1475,Southern Indiana,-1.0,-1.000000
375,2024,1476,Stonehill,-1.0,-1.000000
376,2024,1477,TX A&M Commerce,-1.0,-1.000000


### Barttorvik Ratings

In [3]:
df_barttorvik = pd.read_csv(r'..\data\preprocessed\barttorvik\barttorvik.csv')

df_barttorvik = df_barttorvik.loc[df_barttorvik['Season'] == season, :].reset_index(drop=True)

df_barttorvik

Unnamed: 0,Season,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2024,Houston,0.882353,119.1,85.7,33.4,0.9778,49.7,44.0,13.7,24.7,36.9,29.9,39.0,63.4,10.4
1,2024,Connecticut,0.911765,126.8,93.9,32.9,0.9692,57.1,45.1,14.9,16.2,36.5,33.3,32.5,64.6,10.8
2,2024,Purdue,0.878788,126.1,94.8,31.3,0.9640,56.0,47.7,16.5,14.0,37.4,42.8,23.0,67.7,10.9
3,2024,Iowa St.,0.794118,113.5,86.6,26.9,0.9572,51.9,47.1,15.7,25.7,31.6,36.1,35.2,67.6,6.7
4,2024,Auburn,0.794118,120.7,92.1,28.6,0.9571,54.1,43.4,14.9,18.2,32.9,38.2,41.0,69.9,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,Stonehill,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0
358,2024,St. Francis PA,0.266667,93.1,118.0,-24.9,0.0620,47.2,53.0,21.2,17.1,32.9,32.6,35.4,65.5,-18.6
359,2024,IUPUI,0.187500,92.1,116.9,-24.8,0.0610,46.5,58.2,21.3,18.5,30.0,33.2,33.4,67.3,-21.6
360,2024,Coppin St.,0.068966,85.0,111.2,-26.2,0.0437,42.1,51.3,22.9,21.8,27.0,31.1,38.3,66.4,-23.0


In [4]:
df_spellings = pd.read_csv(
    r'..\data\unprocessed\kaggle\MTeamSpellings.csv', 
    encoding='cp1252'  # fixes issue with fancy quotes
)

df_spellings.loc[df_spellings.shape[0]] = ['fdu', 1192]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101
...,...,...
1161,youngstown st.,1464
1162,youngstown state,1464
1163,youngstown-st,1464
1164,youngstown-state,1464


In [5]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
from tqdm.autonotebook import tqdm

team_spellings = df_spellings['TeamNameSpelling'].unique()
barttorvik_teams = df_barttorvik['TEAM'].unique()

df_match = pd.DataFrame(
    [
        [
            barttorvik_team,
            *process.extract(
                barttorvik_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for barttorvik_team in tqdm(barttorvik_teams)
    ],
    columns=['Barttorvik Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,Barttorvik Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,Queens,queens nc,80
2,St. Thomas,st thomas mn,86
3,UT Rio Grande Valley,texas rio grande valley,88
4,Texas A&M Commerce,tx a&m commerce,91
5,Cal St. Bakersfield,cal state bakersfield,92
6,Mississippi Valley St.,mississippi valley state,93
7,Southeast Missouri St.,southeast missouri state,93
8,Texas A&M Corpus Chris,texas a&m-corpus christi,96
9,Pacific,pacific,100


In [6]:
barttorvik_to_spelling = dict(zip(df_match['Barttorvik Team'], df_match['Team Spelling']))
spelling_to_id = dict(zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID']))

df_barttorvik.insert(1, 'TeamID', df_barttorvik['TEAM'].map(barttorvik_to_spelling).map(spelling_to_id))

df_barttorvik

Unnamed: 0,Season,TeamID,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2024,1222,Houston,0.882353,119.1,85.7,33.4,0.9778,49.7,44.0,13.7,24.7,36.9,29.9,39.0,63.4,10.4
1,2024,1163,Connecticut,0.911765,126.8,93.9,32.9,0.9692,57.1,45.1,14.9,16.2,36.5,33.3,32.5,64.6,10.8
2,2024,1345,Purdue,0.878788,126.1,94.8,31.3,0.9640,56.0,47.7,16.5,14.0,37.4,42.8,23.0,67.7,10.9
3,2024,1235,Iowa St.,0.794118,113.5,86.6,26.9,0.9572,51.9,47.1,15.7,25.7,31.6,36.1,35.2,67.6,6.7
4,2024,1120,Auburn,0.794118,120.7,92.1,28.6,0.9571,54.1,43.4,14.9,18.2,32.9,38.2,41.0,69.9,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,1476,Stonehill,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0
358,2024,1384,St. Francis PA,0.266667,93.1,118.0,-24.9,0.0620,47.2,53.0,21.2,17.1,32.9,32.6,35.4,65.5,-18.6
359,2024,1237,IUPUI,0.187500,92.1,116.9,-24.8,0.0610,46.5,58.2,21.3,18.5,30.0,33.2,33.4,67.3,-21.6
360,2024,1164,Coppin St.,0.068966,85.0,111.2,-26.2,0.0437,42.1,51.3,22.9,21.8,27.0,31.1,38.3,66.4,-23.0


In [7]:
df = pd.merge(
    df,
    df_barttorvik.drop(columns=['TEAM']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2024,1101,Abilene Chr,-1.0,-0.333333,0.470588,100.3,104.5,-4.2,0.3859,47.1,51.6,17.7,20.2,25.9,37.9,39.9,68.5,-11.2
1,2024,1102,Air Force,-1.0,-1.000000,0.290323,106.5,111.6,-5.1,0.3697,53.8,54.2,18.8,17.6,23.8,29.6,39.1,61.9,-13.3
2,2024,1103,Akron,-1.0,-0.666667,0.705882,105.0,101.6,3.4,0.5940,52.0,48.6,17.2,16.6,29.5,33.6,29.0,65.8,-4.5
3,2024,1104,Alabama,2.0,1.333333,0.656250,125.1,102.1,23.0,0.9115,56.3,49.9,16.0,15.6,34.9,35.2,39.6,72.6,3.3
4,2024,1105,Alabama A&M,-1.0,-1.000000,0.352941,92.5,107.4,-14.9,0.1528,45.8,48.9,22.5,19.6,31.9,47.7,45.9,70.6,-17.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,2024,1474,Queens NC,-1.0,-1.000000,0.424242,103.4,111.0,-7.6,0.3069,49.9,54.1,15.7,15.4,27.6,33.7,29.4,72.8,-13.6
374,2024,1475,Southern Indiana,-1.0,-1.000000,0.250000,96.4,111.7,-15.3,0.1552,45.6,49.5,19.4,16.6,27.0,37.6,48.6,69.1,-19.2
375,2024,1476,Stonehill,-1.0,-1.000000,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0
376,2024,1477,TX A&M Commerce,-1.0,-1.000000,0.393939,94.3,111.5,-17.2,0.1262,46.0,52.4,16.7,18.3,24.3,30.8,39.2,66.2,-14.0


### Barttorvik Previous Seasons

In [8]:
df_barttorvik_prev = pd.read_csv(r'..\data\preprocessed\barttorvik_full_season\barttorvik_full_season.csv')

df_barttorvik_prev = df_barttorvik_prev.loc[df_barttorvik_prev['Season'] == season, :].reset_index(drop=True)

df_barttorvik_prev

Unnamed: 0,Season,TEAM,Past Year BARTHAG,Past 4 Years BARTHAG
0,2024,Abilene Christian,0.4420,0.588625
1,2024,Air Force,0.5665,0.370875
2,2024,Akron,0.6479,0.648850
3,2024,Alabama,0.9548,0.884050
4,2024,Alabama A&M,0.2121,0.130000
...,...,...,...,...
362,2024,Wright St.,0.4449,0.569925
363,2024,Wyoming,0.5360,0.540900
364,2024,Xavier,0.8891,0.835725
365,2024,Yale,0.7534,0.684333


In [9]:
team_spellings = df_spellings['TeamNameSpelling'].unique()
barttorvik_prev_teams = df_barttorvik_prev['TEAM'].unique()

df_match = pd.DataFrame(
    [
        [
            barttorvik_prev_team,
            *process.extract(
                barttorvik_prev_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for barttorvik_prev_team in tqdm(barttorvik_prev_teams)
    ],
    columns=['Barttorvik Prev Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/367 [00:00<?, ?it/s]

Unnamed: 0,Barttorvik Prev Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,Queens,queens nc,80
2,St. Thomas,st thomas mn,86
3,UT Rio Grande Valley,texas rio grande valley,88
4,Texas A&M Commerce,tx a&m commerce,91
5,Winston Salem St.,winston-salem-state,91
6,Cal St. Bakersfield,cal state bakersfield,92
7,Southeast Missouri St.,southeast missouri state,93
8,Mississippi Valley St.,mississippi valley state,93
9,Texas A&M Corpus Chris,texas a&m-corpus christi,96


In [10]:
barttorvik_prev_to_spelling = dict(zip(df_match['Barttorvik Prev Team'], df_match['Team Spelling']))

df_barttorvik_prev.insert(1, 'TeamID', df_barttorvik_prev['TEAM'].map(barttorvik_prev_to_spelling).map(spelling_to_id))

df_barttorvik_prev

Unnamed: 0,Season,TeamID,TEAM,Past Year BARTHAG,Past 4 Years BARTHAG
0,2024,1101,Abilene Christian,0.4420,0.588625
1,2024,1102,Air Force,0.5665,0.370875
2,2024,1103,Akron,0.6479,0.648850
3,2024,1104,Alabama,0.9548,0.884050
4,2024,1105,Alabama A&M,0.2121,0.130000
...,...,...,...,...,...
362,2024,1460,Wright St.,0.4449,0.569925
363,2024,1461,Wyoming,0.5360,0.540900
364,2024,1462,Xavier,0.8891,0.835725
365,2024,1463,Yale,0.7534,0.684333


In [11]:
df = pd.merge(
    df,
    df_barttorvik_prev.drop(columns=['TEAM']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG
0,2024,1101,Abilene Chr,-1.0,-0.333333,0.470588,100.3,104.5,-4.2,0.3859,47.1,51.6,17.7,20.2,25.9,37.9,39.9,68.5,-11.2,0.4420,0.588625
1,2024,1102,Air Force,-1.0,-1.000000,0.290323,106.5,111.6,-5.1,0.3697,53.8,54.2,18.8,17.6,23.8,29.6,39.1,61.9,-13.3,0.5665,0.370875
2,2024,1103,Akron,-1.0,-0.666667,0.705882,105.0,101.6,3.4,0.5940,52.0,48.6,17.2,16.6,29.5,33.6,29.0,65.8,-4.5,0.6479,0.648850
3,2024,1104,Alabama,2.0,1.333333,0.656250,125.1,102.1,23.0,0.9115,56.3,49.9,16.0,15.6,34.9,35.2,39.6,72.6,3.3,0.9548,0.884050
4,2024,1105,Alabama A&M,-1.0,-1.000000,0.352941,92.5,107.4,-14.9,0.1528,45.8,48.9,22.5,19.6,31.9,47.7,45.9,70.6,-17.5,0.2121,0.130000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,2024,1474,Queens NC,-1.0,-1.000000,0.424242,103.4,111.0,-7.6,0.3069,49.9,54.1,15.7,15.4,27.6,33.7,29.4,72.8,-13.6,0.4299,
375,2024,1475,Southern Indiana,-1.0,-1.000000,0.250000,96.4,111.7,-15.3,0.1552,45.6,49.5,19.4,16.6,27.0,37.6,48.6,69.1,-19.2,0.2337,
376,2024,1476,Stonehill,-1.0,-1.000000,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0,0.1809,
377,2024,1477,TX A&M Commerce,-1.0,-1.000000,0.393939,94.3,111.5,-17.2,0.1262,46.0,52.4,16.7,18.3,24.3,30.8,39.2,66.2,-14.0,0.1968,


In [12]:
df.loc[df['Past Year BARTHAG'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG
8,2024,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,
17,2024,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,
20,2024,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,
27,2024,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,
33,2024,1134,Brooklyn,-1.0,-1.0,,,,,,,,,,,,,,,,
46,2024,1147,Centenary,-1.0,-1.0,,,,,,,,,,,,,,,,
114,2024,1215,Hardin-Simmons,-1.0,-1.0,,,,,,,,,,,,,,,,
136,2024,1236,PFW,-1.0,-1.0,0.647059,103.2,104.1,-0.9,0.4757,53.8,51.5,14.5,21.9,22.8,30.6,32.6,70.7,-7.9,,
189,2024,1289,Morris Brown,-1.0,-1.0,,,,,,,,,,,,,,,,
202,2024,1302,NE Illinois,-1.0,-1.0,,,,,,,,,,,,,,,,


### My Rankings

In [13]:
df_rankings = pd.concat(
    (
        pd.read_csv(fr'..\data\preprocessed\my_rankings\my_rankings_{season}.csv')
        .assign(Season=season)
        for season in range(season, season + 1) if season != 2020
    ),
    ignore_index=True
)  # probably not necessary but I will keep the loop

df_rankings.insert(0, 'Season', df_rankings.pop('Season'))

df_rankings.drop(columns=['Strength'], inplace=True)

# df_rankings = df_rankings.loc[df_rankings['Season'] == season, :].reset_index(drop=True)

df_rankings

Unnamed: 0,Season,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,Purdue,2.865430,0.278715,1.227945,0.949231,68.623502
1,2024,Connecticut,2.807860,0.291167,1.233900,0.942733,66.314773
2,2024,Houston,2.710147,0.324843,1.170705,0.845862,64.748225
3,2024,North Carolina,2.394173,0.229677,1.157857,0.928179,71.641470
4,2024,Auburn,2.265589,0.269172,1.191466,0.922294,70.601135
...,...,...,...,...,...,...,...
357,2024,Virginia Military Institute,-2.524350,-0.226728,0.889057,1.115785,75.051956
358,2024,IUPUI,-2.530265,-0.252894,0.921051,1.173945,68.400487
359,2024,Coppin State,-2.551242,-0.250414,0.847140,1.097554,67.666157
360,2024,Detroit Mercy,-2.783646,-0.198613,0.954155,1.152768,68.011051


In [14]:
my_teams = df_rankings['Team'].unique()

df_match = pd.DataFrame(
    [
        [
            my_team,
            *process.extract(
                my_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for my_team in tqdm(my_teams)
    ],
    columns=['My Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,My Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Howard,howard,100
5,Fresno State,fresno state,100
6,North Carolina Central,north carolina central,100
7,Georgia State,georgia state,100
8,Pennsylvania,pennsylvania,100
9,Canisius,canisius,100


In [15]:
ranking_to_spelling = dict(zip(df_match['My Team'], df_match['Team Spelling']))

df_rankings.insert(1, 'TeamID', df_rankings['Team'].map(ranking_to_spelling).map(spelling_to_id))

df_rankings

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,1345,Purdue,2.865430,0.278715,1.227945,0.949231,68.623502
1,2024,1163,Connecticut,2.807860,0.291167,1.233900,0.942733,66.314773
2,2024,1222,Houston,2.710147,0.324843,1.170705,0.845862,64.748225
3,2024,1314,North Carolina,2.394173,0.229677,1.157857,0.928179,71.641470
4,2024,1120,Auburn,2.265589,0.269172,1.191466,0.922294,70.601135
...,...,...,...,...,...,...,...,...
357,2024,1440,Virginia Military Institute,-2.524350,-0.226728,0.889057,1.115785,75.051956
358,2024,1237,IUPUI,-2.530265,-0.252894,0.921051,1.173945,68.400487
359,2024,1164,Coppin State,-2.551242,-0.250414,0.847140,1.097554,67.666157
360,2024,1178,Detroit Mercy,-2.783646,-0.198613,0.954155,1.152768,68.011051


In [16]:
df_rankings.loc[df_rankings['TeamID'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo


In [17]:
df = pd.merge(
    df,
    df_rankings.drop(columns=['Team']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,1101,Abilene Chr,-1.0,-0.333333,0.470588,100.3,104.5,-4.2,0.3859,47.1,51.6,17.7,20.2,25.9,37.9,39.9,68.5,-11.2,0.4420,0.588625,-0.211809,-0.038666,1.006279,1.044945,69.602134
1,2024,1102,Air Force,-1.0,-1.000000,0.290323,106.5,111.6,-5.1,0.3697,53.8,54.2,18.8,17.6,23.8,29.6,39.1,61.9,-13.3,0.5665,0.370875,-0.952703,-0.080433,1.042874,1.123307,63.258668
2,2024,1103,Akron,-1.0,-0.666667,0.705882,105.0,101.6,3.4,0.5940,52.0,48.6,17.2,16.6,29.5,33.6,29.0,65.8,-4.5,0.6479,0.648850,1.003653,0.057068,1.062449,1.005381,67.048069
3,2024,1104,Alabama,2.0,1.333333,0.656250,125.1,102.1,23.0,0.9115,56.3,49.9,16.0,15.6,34.9,35.2,39.6,72.6,3.3,0.9548,0.884050,1.688042,0.231507,1.235296,1.003788,73.717312
4,2024,1105,Alabama A&M,-1.0,-1.000000,0.352941,92.5,107.4,-14.9,0.1528,45.8,48.9,22.5,19.6,31.9,47.7,45.9,70.6,-17.5,0.2121,0.130000,-1.283926,-0.152056,0.932386,1.084442,71.239757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,2024,1474,Queens NC,-1.0,-1.000000,0.424242,103.4,111.0,-7.6,0.3069,49.9,54.1,15.7,15.4,27.6,33.7,29.4,72.8,-13.6,0.4299,,-0.594255,-0.078541,1.022795,1.101336,73.901220
375,2024,1475,Southern Indiana,-1.0,-1.000000,0.250000,96.4,111.7,-15.3,0.1552,45.6,49.5,19.4,16.6,27.0,37.6,48.6,69.1,-19.2,0.2337,,-1.921659,-0.142108,0.954198,1.096305,69.811962
376,2024,1476,Stonehill,-1.0,-1.000000,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0,0.1809,,-2.444949,-0.214238,0.914357,1.128596,69.240004
377,2024,1477,TX A&M Commerce,-1.0,-1.000000,0.393939,94.3,111.5,-17.2,0.1262,46.0,52.4,16.7,18.3,24.3,30.8,39.2,66.2,-14.0,0.1968,,-1.301190,-0.161617,0.935604,1.097221,67.459737


In [18]:
df.loc[df['Rating'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
8,2024,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
17,2024,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
20,2024,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
27,2024,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
33,2024,1134,Brooklyn,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
46,2024,1147,Centenary,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
114,2024,1215,Hardin-Simmons,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
115,2024,1216,Hartford,-1.0,-0.666667,,,,,,,,,,,,,,,0.0418,0.2644,,,,,
189,2024,1289,Morris Brown,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
202,2024,1302,NE Illinois,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,


### Starters

In [19]:
df_starters = pd.concat(
    (
        pd.read_csv(fr'..\data\preprocessed\starters\starters_{season}.csv')
        .assign(Season=season)
        for season in range(season, season + 1) if season != 2020
    ),
    ignore_index=True
)

df_starters.insert(0, 'Season', df_starters.pop('Season'))

df_starters.rename(columns={'Rating': 'Starters'}, inplace=True)

df_starters

Unnamed: 0,Season,Team,Starters
0,2024,Connecticut,0.575063
1,2024,Purdue,0.566599
2,2024,Houston,0.521213
3,2024,Auburn,0.516003
4,2024,North Carolina,0.468077
...,...,...,...
357,2024,Stonehill,-0.420181
358,2024,Virginia Military Institute,-0.426399
359,2024,Buffalo,-0.445081
360,2024,Mississippi Valley State,-0.469334


In [20]:
starters_teams = df_starters['Team'].unique()

df_match = pd.DataFrame(
    [
        [
            starters_team,
            *process.extract(
                starters_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for starters_team in tqdm(starters_teams)
    ],
    columns=['Starters Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,Starters Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Binghamton,binghamton,100
5,Maryland-Baltimore County,maryland-baltimore county,100
6,East Carolina,east carolina,100
7,Cal State Northridge,cal state northridge,100
8,Cal State Bakersfield,cal state bakersfield,100
9,Canisius,canisius,100


In [21]:
starters_to_spelling = dict(zip(df_match['Starters Team'], df_match['Team Spelling']))

df_starters.insert(1, 'TeamID', df_starters['Team'].map(starters_to_spelling).map(spelling_to_id))

df_starters

Unnamed: 0,Season,TeamID,Team,Starters
0,2024,1163,Connecticut,0.575063
1,2024,1345,Purdue,0.566599
2,2024,1222,Houston,0.521213
3,2024,1120,Auburn,0.516003
4,2024,1314,North Carolina,0.468077
...,...,...,...,...
357,2024,1476,Stonehill,-0.420181
358,2024,1440,Virginia Military Institute,-0.426399
359,2024,1138,Buffalo,-0.445081
360,2024,1290,Mississippi Valley State,-0.469334


In [22]:
df_starters.loc[df_starters['TeamID'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Starters


In [23]:
df = pd.merge(
    df,
    df_starters.drop(columns=['Team']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters
0,2024,1101,Abilene Chr,-1.0,-0.333333,0.470588,100.3,104.5,-4.2,0.3859,47.1,51.6,17.7,20.2,25.9,37.9,39.9,68.5,-11.2,0.4420,0.588625,-0.211809,-0.038666,1.006279,1.044945,69.602134,0.071105
1,2024,1102,Air Force,-1.0,-1.000000,0.290323,106.5,111.6,-5.1,0.3697,53.8,54.2,18.8,17.6,23.8,29.6,39.1,61.9,-13.3,0.5665,0.370875,-0.952703,-0.080433,1.042874,1.123307,63.258668,-0.195481
2,2024,1103,Akron,-1.0,-0.666667,0.705882,105.0,101.6,3.4,0.5940,52.0,48.6,17.2,16.6,29.5,33.6,29.0,65.8,-4.5,0.6479,0.648850,1.003653,0.057068,1.062449,1.005381,67.048069,0.247122
3,2024,1104,Alabama,2.0,1.333333,0.656250,125.1,102.1,23.0,0.9115,56.3,49.9,16.0,15.6,34.9,35.2,39.6,72.6,3.3,0.9548,0.884050,1.688042,0.231507,1.235296,1.003788,73.717312,0.300010
4,2024,1105,Alabama A&M,-1.0,-1.000000,0.352941,92.5,107.4,-14.9,0.1528,45.8,48.9,22.5,19.6,31.9,47.7,45.9,70.6,-17.5,0.2121,0.130000,-1.283926,-0.152056,0.932386,1.084442,71.239757,-0.077493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,2024,1474,Queens NC,-1.0,-1.000000,0.424242,103.4,111.0,-7.6,0.3069,49.9,54.1,15.7,15.4,27.6,33.7,29.4,72.8,-13.6,0.4299,,-0.594255,-0.078541,1.022795,1.101336,73.901220,-0.063682
375,2024,1475,Southern Indiana,-1.0,-1.000000,0.250000,96.4,111.7,-15.3,0.1552,45.6,49.5,19.4,16.6,27.0,37.6,48.6,69.1,-19.2,0.2337,,-1.921659,-0.142108,0.954198,1.096305,69.811962,-0.295791
376,2024,1476,Stonehill,-1.0,-1.000000,0.129032,90.4,114.2,-23.8,0.0638,46.7,52.7,19.5,16.6,22.5,22.6,29.4,68.1,-22.0,0.1809,,-2.444949,-0.214238,0.914357,1.128596,69.240004,-0.420181
377,2024,1477,TX A&M Commerce,-1.0,-1.000000,0.393939,94.3,111.5,-17.2,0.1262,46.0,52.4,16.7,18.3,24.3,30.8,39.2,66.2,-14.0,0.1968,,-1.301190,-0.161617,0.935604,1.097221,67.459737,-0.120318


In [24]:
df.loc[df['Starters'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters
8,2024,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
17,2024,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
20,2024,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
27,2024,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
33,2024,1134,Brooklyn,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
46,2024,1147,Centenary,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
114,2024,1215,Hardin-Simmons,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
115,2024,1216,Hartford,-1.0,-0.666667,,,,,,,,,,,,,,,0.0418,0.2644,,,,,,
189,2024,1289,Morris Brown,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
202,2024,1302,NE Illinois,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,


### Map to Matchups

In [25]:
# df_seeds = pd.read_csv(fr'..\data\unprocessed\kaggle\{season}_tourney_seeds.csv')

# df_seeds = df_seeds.loc[df_seeds['Tournament'] == 'M', :].reset_index(drop=True)

# df_seeds.rename(columns={'Seed': 'Region Seed'}, inplace=True)
# df_seeds.insert(2, 'Region', df_seeds['Region Seed'].str[0])
# df_seeds.insert(3, 'Seed', df_seeds['Region Seed'].str.extract('(\d+)').astype(int))

# df_seeds

In [26]:
df_seeds = pd.read_csv(r'..\data\unprocessed\kaggle\MNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] == season, :].reset_index(drop=True)

df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
df_seeds['Seed'] = df_seeds['Seed'].str.extract('(\d+)').astype(int)

df_seeds = df_seeds.loc[~df_seeds['TeamID'].isin(playin_losers), :].reset_index(drop=True)

df_seeds

Unnamed: 0,Season,Seed,Region,Play In,TeamID
0,2024,1,W,False,1163
1,2024,2,W,False,1235
2,2024,3,W,False,1228
3,2024,4,W,False,1120
4,2024,5,W,False,1361
...,...,...,...,...,...
59,2024,12,Z,False,1241
60,2024,13,Z,False,1436
61,2024,14,Z,False,1324
62,2024,15,Z,False,1443


In [27]:
id_to_region = dict(zip(df_seeds['TeamID'], df_seeds['Region']))
id_to_seed = dict(zip(df_seeds['TeamID'], df_seeds['Seed']))

df_mod = pd.DataFrame(
    [
        (team_a, team_b) 
        for team_a in df_seeds['TeamID'].unique() 
        for team_b in df_seeds['TeamID'].unique() 
        if team_a != team_b
    ],
    columns=['Team A ID', 'Team B ID']
)

df_mod.insert(0, 'Season', season)
df_mod['Team A Region'] = df_mod['Team A ID'].map(id_to_region)
df_mod['Team B Region'] = df_mod['Team B ID'].map(id_to_region)
df_mod['Team A Seed'] = df_mod['Team A ID'].map(id_to_seed)
df_mod['Team B Seed'] = df_mod['Team B ID'].map(id_to_seed)

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Team A Region,Team B Region,Team A Seed,Team B Seed
0,2024,1163,1235,W,W,1,2
1,2024,1163,1228,W,W,1,3
2,2024,1163,1120,W,W,1,4
3,2024,1163,1361,W,W,1,5
4,2024,1163,1140,W,W,1,6
...,...,...,...,...,...,...,...
4027,2024,1255,1301,Z,Z,16,11
4028,2024,1255,1241,Z,Z,16,12
4029,2024,1255,1436,Z,Z,16,13
4030,2024,1255,1324,Z,Z,16,14


Calculate round of matchup

In [28]:
same_region = df_mod['Team A Region'] == df_mod['Team B Region']

# round_0_condition = (df_mod['team0_playin'] == 1) & (df_mod['team1_playin'] == 1)  # no play-in games in this data

round_1_condition = df_mod['Team A Seed'] + df_mod['Team B Seed'] == 17

round_2_condition = (
    (df_mod['Team A Seed'].isin([1, 16]) & df_mod['Team B Seed'].isin([8, 9])) | 
    (df_mod['Team A Seed'].isin([8, 9]) & df_mod['Team B Seed'].isin([1, 16])) |
    (df_mod['Team A Seed'].isin([5, 12]) & df_mod['Team B Seed'].isin([4, 13])) | 
    (df_mod['Team A Seed'].isin([4, 13]) & df_mod['Team B Seed'].isin([5, 12])) |
    (df_mod['Team A Seed'].isin([6, 11]) & df_mod['Team B Seed'].isin([3, 14])) | 
    (df_mod['Team A Seed'].isin([3, 14]) & df_mod['Team B Seed'].isin([6, 11])) |
    (df_mod['Team A Seed'].isin([7, 10]) & df_mod['Team B Seed'].isin([2, 15])) | 
    (df_mod['Team A Seed'].isin([2, 15]) & df_mod['Team B Seed'].isin([7, 10]))
)

round_3_condition = (
    (df_mod['Team A Seed'].isin([1, 16, 8, 9]) & df_mod['Team B Seed'].isin([5, 12, 4, 13])) | 
    (df_mod['Team A Seed'].isin([5, 12, 4, 13]) & df_mod['Team B Seed'].isin([1, 16, 8, 9])) |
    (df_mod['Team A Seed'].isin([6, 11, 3, 14]) & df_mod['Team B Seed'].isin([7, 10, 2, 15])) | 
    (df_mod['Team A Seed'].isin([7, 10, 2, 15]) & df_mod['Team B Seed'].isin([6, 11, 3, 14]))
)

round_4_condition = (
    (df_mod['Team A Seed'].isin([1, 16, 8, 9, 5, 12, 4, 13]) & df_mod['Team B Seed'].isin([6, 11, 3, 14, 7, 10, 2, 15])) | 
    (df_mod['Team A Seed'].isin([6, 11, 3, 14, 7, 10, 2, 15]) & df_mod['Team B Seed'].isin([1, 16, 8, 9, 5, 12, 4, 13]))
)

round_5_condition = (
    (df_mod['Team A Region'].isin(['W']) & df_mod['Team B Region'].isin(['X'])) | 
    (df_mod['Team A Region'].isin(['X']) & df_mod['Team B Region'].isin(['W'])) |
    (df_mod['Team A Region'].isin(['Y']) & df_mod['Team B Region'].isin(['Z'])) | 
    (df_mod['Team A Region'].isin(['Z']) & df_mod['Team B Region'].isin(['Y']))
)

round_6_condition = (
    (df_mod['Team A Region'].isin(['W', 'X']) & df_mod['Team B Region'].isin(['Y', 'Z'])) | 
    (df_mod['Team A Region'].isin(['Y', 'Z']) & df_mod['Team B Region'].isin(['W', 'X'])) 
)

round_6_condition

0       False
1       False
2       False
3       False
4       False
        ...  
4027    False
4028    False
4029    False
4030    False
4031    False
Length: 4032, dtype: bool

In [29]:
df_mod['Round'] = -1

df_mod.loc[round_6_condition, 'Round'] = 6

df_mod.loc[round_5_condition, 'Round'] = 5

df_mod.loc[round_4_condition & same_region, 'Round'] = 4

df_mod.loc[round_3_condition & same_region, 'Round'] = 3

df_mod.loc[round_2_condition & same_region, 'Round'] = 2

df_mod.loc[round_1_condition & same_region, 'Round'] = 1

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Team A Region,Team B Region,Team A Seed,Team B Seed,Round
0,2024,1163,1235,W,W,1,2,4
1,2024,1163,1228,W,W,1,3,4
2,2024,1163,1120,W,W,1,4,3
3,2024,1163,1361,W,W,1,5,3
4,2024,1163,1140,W,W,1,6,4
...,...,...,...,...,...,...,...,...
4027,2024,1255,1301,Z,Z,16,11,4
4028,2024,1255,1241,Z,Z,16,12,3
4029,2024,1255,1436,Z,Z,16,13,3
4030,2024,1255,1324,Z,Z,16,14,4


Get team names

In [30]:
df_teams = pd.read_csv(r'..\data\unprocessed\kaggle\MTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2024
1,1102,Air Force,1985,2024
2,1103,Akron,1985,2024
3,1104,Alabama,1985,2024
4,1105,Alabama A&M,2000,2024
...,...,...,...,...
373,1474,Queens NC,2023,2024
374,1475,Southern Indiana,2023,2024
375,1476,Stonehill,2023,2024
376,1477,TX A&M Commerce,2023,2024


In [31]:
id_to_team = dict(zip(df_teams['TeamID'], df_teams['TeamName']))

df_mod.insert(df_mod.columns.get_loc('Team A ID') + 1, 'Team A', df_mod['Team A ID'].map(id_to_team))
df_mod.insert(df_mod.columns.get_loc('Team B ID') + 1, 'Team B', df_mod['Team B ID'].map(id_to_team))

df_mod

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Team A Region,Team B Region,Team A Seed,Team B Seed,Round
0,2024,1163,Connecticut,1235,Iowa St,W,W,1,2,4
1,2024,1163,Connecticut,1228,Illinois,W,W,1,3,4
2,2024,1163,Connecticut,1120,Auburn,W,W,1,4,3
3,2024,1163,Connecticut,1361,San Diego St,W,W,1,5,3
4,2024,1163,Connecticut,1140,BYU,W,W,1,6,4
...,...,...,...,...,...,...,...,...,...,...
4027,2024,1255,Longwood,1301,NC State,Z,Z,16,11,4
4028,2024,1255,Longwood,1241,James Madison,Z,Z,16,12,3
4029,2024,1255,Longwood,1436,Vermont,Z,Z,16,13,3
4030,2024,1255,Longwood,1324,Oakland,Z,Z,16,14,4


Map features

In [32]:
team_a_features = pd.merge(
    df_mod[['Season', 'Team A ID']],
    df.drop(columns=['Team']),
    how='left',
    left_on=['Season', 'Team A ID'],
    right_on=['Season', 'TeamID'],
).drop(columns=['Season', 'Team A ID', 'TeamID'])

team_b_features = pd.merge(
    df_mod[['Season', 'Team B ID']],
    df.drop(columns=['Team']),
    how='left',
    left_on=['Season', 'Team B ID'],
    right_on=['Season', 'TeamID'],
).drop(columns=['Season', 'Team B ID', 'TeamID'])

df_features = team_a_features - team_b_features

df_features['Team A ADJOE Team B ADJDE'] = team_a_features['ADJOE'] + team_b_features['ADJDE']
df_features['Team B ADJOE Team A ADJDE'] = team_b_features['ADJOE'] + team_a_features['ADJDE']

df_features['Team A Offense Team B Defense'] = team_a_features['Adjusted Offense'] + team_b_features['Adjusted Defense']
df_features['Team B Offense Team A Defense'] = team_b_features['Adjusted Offense'] + team_a_features['Adjusted Defense']

df_features['Team A BARTHAG'] = team_a_features['BARTHAG']
df_features['Team B BARTHAG'] = team_b_features['BARTHAG']

df_features

Unnamed: 0,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters,Team A ADJOE Team B ADJDE,Team B ADJOE Team A ADJDE,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A BARTHAG,Team B BARTHAG
0,6.0,1.666667,0.117647,13.3,7.3,6.0,0.0120,5.2,-2.0,-0.8,-9.5,4.9,-2.8,-2.7,-3.0,4.1,0.0964,0.157650,0.725351,0.023661,0.093921,0.070260,-2.376070,0.172470,213.4,207.4,2.106374,2.082713,0.9692,0.9572
1,6.0,1.333333,0.147059,1.5,-7.2,8.7,0.0477,3.3,-2.9,-0.2,3.9,0.3,-4.5,5.4,-5.3,4.9,0.1427,0.001425,0.711147,0.074763,0.022712,-0.052051,-4.841077,0.139383,227.9,219.2,2.228685,2.153922,0.9692,0.9215
2,5.0,1.666667,0.117647,6.1,1.8,4.3,0.0121,3.0,1.7,0.0,-2.0,3.6,-4.9,-8.5,-5.3,5.4,0.0814,0.031200,0.542271,0.021995,0.042434,0.020439,-4.286362,0.059060,218.9,214.6,2.156195,2.134200,0.9692,0.9571
3,1.0,0.333333,0.205882,15.0,0.2,14.8,0.0859,7.3,-2.1,-1.1,-1.8,3.8,-4.2,-0.8,-1.5,8.0,0.0438,-0.015400,0.890085,0.105198,0.113882,0.008684,-1.013223,0.219970,220.5,205.7,2.167950,2.062752,0.9692,0.8833
4,7.0,2.666667,0.214795,6.4,-5.2,11.6,0.0655,2.0,-2.9,-0.3,0.2,3.7,8.0,0.4,-4.4,8.2,0.2066,0.057750,1.415374,0.067263,0.048577,-0.018686,-4.404484,0.313872,225.9,214.3,2.195319,2.128057,0.9692,0.9037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,-1.0,0.000000,0.006536,-10.0,3.6,-13.6,-0.3181,-1.3,0.8,4.3,1.7,8.1,7.0,4.2,-1.1,-7.9,-0.3104,-0.393425,-0.880603,-0.095675,-0.071767,0.023908,-1.223391,-0.179739,205.3,218.9,2.060952,2.156627,0.4768,0.7949
4028,0.0,0.333333,-0.294118,-8.0,4.4,-12.4,-0.3005,-5.0,4.6,3.3,-0.2,4.7,5.3,1.0,-2.9,-10.2,-0.1287,-0.077275,-1.740782,-0.112773,-0.070767,0.042006,-3.150093,-0.430280,204.5,216.9,2.042854,2.155628,0.4768,0.7773
4029,-1.0,-0.333333,-0.205882,-0.6,6.2,-6.8,-0.1828,-2.4,4.6,4.2,4.5,14.3,9.8,10.2,3.5,-7.5,-0.1705,-0.304825,-1.419985,-0.057724,-0.016290,0.041435,3.254886,-0.305112,202.7,209.5,2.043426,2.101150,0.4768,0.6596
4030,0.0,0.333333,-0.058824,-3.8,0.3,-4.1,-0.1091,-2.7,0.7,1.8,3.3,6.1,8.6,9.3,0.1,-4.5,0.2547,0.028275,-0.570137,-0.022311,-0.029909,-0.007598,-0.222742,-0.225405,208.6,212.7,2.092459,2.114770,0.4768,0.5859


In [33]:
df_mod[df_features.columns] = df_features

df_mod

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Team A Region,Team B Region,Team A Seed,Team B Seed,Round,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters,Team A ADJOE Team B ADJDE,Team B ADJOE Team A ADJDE,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A BARTHAG,Team B BARTHAG
0,2024,1163,Connecticut,1235,Iowa St,W,W,1,2,4,6.0,1.666667,0.117647,13.3,7.3,6.0,0.0120,5.2,-2.0,-0.8,-9.5,4.9,-2.8,-2.7,-3.0,4.1,0.0964,0.157650,0.725351,0.023661,0.093921,0.070260,-2.376070,0.172470,213.4,207.4,2.106374,2.082713,0.9692,0.9572
1,2024,1163,Connecticut,1228,Illinois,W,W,1,3,4,6.0,1.333333,0.147059,1.5,-7.2,8.7,0.0477,3.3,-2.9,-0.2,3.9,0.3,-4.5,5.4,-5.3,4.9,0.1427,0.001425,0.711147,0.074763,0.022712,-0.052051,-4.841077,0.139383,227.9,219.2,2.228685,2.153922,0.9692,0.9215
2,2024,1163,Connecticut,1120,Auburn,W,W,1,4,3,5.0,1.666667,0.117647,6.1,1.8,4.3,0.0121,3.0,1.7,0.0,-2.0,3.6,-4.9,-8.5,-5.3,5.4,0.0814,0.031200,0.542271,0.021995,0.042434,0.020439,-4.286362,0.059060,218.9,214.6,2.156195,2.134200,0.9692,0.9571
3,2024,1163,Connecticut,1361,San Diego St,W,W,1,5,3,1.0,0.333333,0.205882,15.0,0.2,14.8,0.0859,7.3,-2.1,-1.1,-1.8,3.8,-4.2,-0.8,-1.5,8.0,0.0438,-0.015400,0.890085,0.105198,0.113882,0.008684,-1.013223,0.219970,220.5,205.7,2.167950,2.062752,0.9692,0.8833
4,2024,1163,Connecticut,1140,BYU,W,W,1,6,4,7.0,2.666667,0.214795,6.4,-5.2,11.6,0.0655,2.0,-2.9,-0.3,0.2,3.7,8.0,0.4,-4.4,8.2,0.2066,0.057750,1.415374,0.067263,0.048577,-0.018686,-4.404484,0.313872,225.9,214.3,2.195319,2.128057,0.9692,0.9037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,2024,1255,Longwood,1301,NC State,Z,Z,16,11,4,-1.0,0.000000,0.006536,-10.0,3.6,-13.6,-0.3181,-1.3,0.8,4.3,1.7,8.1,7.0,4.2,-1.1,-7.9,-0.3104,-0.393425,-0.880603,-0.095675,-0.071767,0.023908,-1.223391,-0.179739,205.3,218.9,2.060952,2.156627,0.4768,0.7949
4028,2024,1255,Longwood,1241,James Madison,Z,Z,16,12,3,0.0,0.333333,-0.294118,-8.0,4.4,-12.4,-0.3005,-5.0,4.6,3.3,-0.2,4.7,5.3,1.0,-2.9,-10.2,-0.1287,-0.077275,-1.740782,-0.112773,-0.070767,0.042006,-3.150093,-0.430280,204.5,216.9,2.042854,2.155628,0.4768,0.7773
4029,2024,1255,Longwood,1436,Vermont,Z,Z,16,13,3,-1.0,-0.333333,-0.205882,-0.6,6.2,-6.8,-0.1828,-2.4,4.6,4.2,4.5,14.3,9.8,10.2,3.5,-7.5,-0.1705,-0.304825,-1.419985,-0.057724,-0.016290,0.041435,3.254886,-0.305112,202.7,209.5,2.043426,2.101150,0.4768,0.6596
4030,2024,1255,Longwood,1324,Oakland,Z,Z,16,14,4,0.0,0.333333,-0.058824,-3.8,0.3,-4.1,-0.1091,-2.7,0.7,1.8,3.3,6.1,8.6,9.3,0.1,-4.5,0.2547,0.028275,-0.570137,-0.022311,-0.029909,-0.007598,-0.222742,-0.225405,208.6,212.7,2.092459,2.114770,0.4768,0.5859


In [34]:
df_mod.insert(1, 'Round', df_mod.pop('Round'))

df_mod.drop(columns=['Team A Region', 'Team B Region', 'Team A Seed', 'Team B Seed'], inplace=True)

df_mod

Unnamed: 0,Season,Round,Team A ID,Team A,Team B ID,Team B,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters,Team A ADJOE Team B ADJDE,Team B ADJOE Team A ADJDE,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A BARTHAG,Team B BARTHAG
0,2024,4,1163,Connecticut,1235,Iowa St,6.0,1.666667,0.117647,13.3,7.3,6.0,0.0120,5.2,-2.0,-0.8,-9.5,4.9,-2.8,-2.7,-3.0,4.1,0.0964,0.157650,0.725351,0.023661,0.093921,0.070260,-2.376070,0.172470,213.4,207.4,2.106374,2.082713,0.9692,0.9572
1,2024,4,1163,Connecticut,1228,Illinois,6.0,1.333333,0.147059,1.5,-7.2,8.7,0.0477,3.3,-2.9,-0.2,3.9,0.3,-4.5,5.4,-5.3,4.9,0.1427,0.001425,0.711147,0.074763,0.022712,-0.052051,-4.841077,0.139383,227.9,219.2,2.228685,2.153922,0.9692,0.9215
2,2024,3,1163,Connecticut,1120,Auburn,5.0,1.666667,0.117647,6.1,1.8,4.3,0.0121,3.0,1.7,0.0,-2.0,3.6,-4.9,-8.5,-5.3,5.4,0.0814,0.031200,0.542271,0.021995,0.042434,0.020439,-4.286362,0.059060,218.9,214.6,2.156195,2.134200,0.9692,0.9571
3,2024,3,1163,Connecticut,1361,San Diego St,1.0,0.333333,0.205882,15.0,0.2,14.8,0.0859,7.3,-2.1,-1.1,-1.8,3.8,-4.2,-0.8,-1.5,8.0,0.0438,-0.015400,0.890085,0.105198,0.113882,0.008684,-1.013223,0.219970,220.5,205.7,2.167950,2.062752,0.9692,0.8833
4,2024,4,1163,Connecticut,1140,BYU,7.0,2.666667,0.214795,6.4,-5.2,11.6,0.0655,2.0,-2.9,-0.3,0.2,3.7,8.0,0.4,-4.4,8.2,0.2066,0.057750,1.415374,0.067263,0.048577,-0.018686,-4.404484,0.313872,225.9,214.3,2.195319,2.128057,0.9692,0.9037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,2024,4,1255,Longwood,1301,NC State,-1.0,0.000000,0.006536,-10.0,3.6,-13.6,-0.3181,-1.3,0.8,4.3,1.7,8.1,7.0,4.2,-1.1,-7.9,-0.3104,-0.393425,-0.880603,-0.095675,-0.071767,0.023908,-1.223391,-0.179739,205.3,218.9,2.060952,2.156627,0.4768,0.7949
4028,2024,3,1255,Longwood,1241,James Madison,0.0,0.333333,-0.294118,-8.0,4.4,-12.4,-0.3005,-5.0,4.6,3.3,-0.2,4.7,5.3,1.0,-2.9,-10.2,-0.1287,-0.077275,-1.740782,-0.112773,-0.070767,0.042006,-3.150093,-0.430280,204.5,216.9,2.042854,2.155628,0.4768,0.7773
4029,2024,3,1255,Longwood,1436,Vermont,-1.0,-0.333333,-0.205882,-0.6,6.2,-6.8,-0.1828,-2.4,4.6,4.2,4.5,14.3,9.8,10.2,3.5,-7.5,-0.1705,-0.304825,-1.419985,-0.057724,-0.016290,0.041435,3.254886,-0.305112,202.7,209.5,2.043426,2.101150,0.4768,0.6596
4030,2024,4,1255,Longwood,1324,Oakland,0.0,0.333333,-0.058824,-3.8,0.3,-4.1,-0.1091,-2.7,0.7,1.8,3.3,6.1,8.6,9.3,0.1,-4.5,0.2547,0.028275,-0.570137,-0.022311,-0.029909,-0.007598,-0.222742,-0.225405,208.6,212.7,2.092459,2.114770,0.4768,0.5859


Check that data follows same format as the data that the model was trained on

In [35]:
df_mod_training = pd.read_csv(f'{model_path}/df_mod.csv')

assert all(df_mod_training.drop(columns=['Result']).columns == df_mod.columns), 'Columns do not match'

'Columns Match'

'Columns Match'

### Get Model Predictions

In [36]:
import pickle

with open(f'{model_path}/model.pkl', 'rb') as f:
    mod = pickle.load(f)

mod

  from pandas import MultiIndex, Int64Index


XGBClassifier(alpha=4.6265797003339335, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.031875184219462616, enable_categorical=False,
              eta=0.04653777482812567, eval_metric='logloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              lambda=0.09456529660602847, learning_rate=0.0465377755,
              max_delta_s...
                                    'TORD': 1, 'Team A ADJOE Team B ADJDE': 1,
                                    'Team A BARTHAG': 1,
                                    'Team A Offense Team B Defense': 1,
                                    'Team B ADJOE Team A ADJDE': -1,
                                    'Team B BARTHAG': -1,
                                    'Team B Offense Team A Defense': -1,
                                    'WAB': 1, 'WIN%': 1},
              n_estimators=1000, n_jobs=12, num_parallel_tree=1,
              pr

In [37]:
X = df_mod.drop(columns=['Season', 'Team A ID', 'Team A', 'Team B ID', 'Team B'])

predictions = mod.predict_proba(X)[:, 1]

predictions

array([0.6441993 , 0.7458986 , 0.6478119 , ..., 0.17606908, 0.4042524 ,
       0.38650185], dtype=float32)

Turn predictions into matchup matrix

In [38]:
df_matrix = (
    df_mod[['Team A ID', 'Team B ID']]
    .assign(Prediction=predictions)
    .pivot(
        index=['Team A ID'], 
        columns=['Team B ID'],
        values='Prediction',
    )
)

df_matrix

Team B ID,1103,1104,1112,1120,1124,1140,1155,1158,1159,1160,1161,1163,1166,1173,1179,1181,1182,1194,1196,1211,1212,1213,1222,1228,1235,1241,1242,1246,1253,1255,1266,1270,1277,1280,1287,1301,1304,1305,1307,1314,1321,1324,1332,1345,1355,1359,1361,1376,1388,1389,1391,1395,1397,1400,1401,1403,1412,1429,1436,1443,1447,1450,1458,1463
Team A ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
1103,,0.176790,0.090028,0.074159,0.195362,0.132939,0.280361,0.289806,0.548561,0.227984,0.236360,0.086893,0.111813,0.201963,0.242538,0.117890,0.382621,0.231476,0.215826,0.142201,0.869855,0.345742,0.060471,0.159184,0.115376,0.270459,0.208254,0.189641,0.655920,0.719028,0.115252,0.332945,0.162969,0.201532,0.499685,0.325997,0.202913,0.214542,0.157164,0.114380,0.178537,0.563374,0.230352,0.130908,0.612800,0.360795,0.175329,0.243099,0.089081,0.701028,0.757816,0.152885,0.116804,0.183268,0.198510,0.216794,0.507673,0.265551,0.394329,0.526639,0.820614,0.211518,0.199211,0.348686
1104,0.828383,,0.387664,0.413528,0.611440,0.499307,0.686835,0.703139,0.814302,0.642160,0.640564,0.279104,0.469207,0.643628,0.668955,0.510066,0.778374,0.647326,0.640909,0.474240,0.892249,0.800497,0.197115,0.501395,0.377304,0.721380,0.567538,0.589026,0.878223,0.869422,0.412204,0.779750,0.555424,0.647704,0.847758,0.752897,0.607601,0.695128,0.612634,0.468112,0.592664,0.869722,0.692056,0.344198,0.875938,0.766108,0.640286,0.689326,0.463103,0.868878,0.895797,0.540494,0.461689,0.596938,0.640538,0.627206,0.861976,0.622501,0.769161,0.870167,0.867566,0.662195,0.611267,0.777496
1112,0.914462,0.640672,,0.545499,0.628720,0.664019,0.779242,0.833616,0.900499,0.778998,0.818933,0.383411,0.669939,0.797700,0.836291,0.697503,0.886911,0.825511,0.774170,0.654017,0.920994,0.849280,0.332170,0.626896,0.546344,0.803044,0.720705,0.665464,0.915196,0.916969,0.598263,0.837748,0.686808,0.790129,0.890403,0.880650,0.780663,0.856893,0.777282,0.623532,0.775663,0.907453,0.860147,0.510139,0.907039,0.861930,0.729689,0.853239,0.631441,0.914914,0.925096,0.764297,0.587705,0.728620,0.836344,0.799928,0.888134,0.813620,0.869038,0.928581,0.910723,0.797350,0.788116,0.873999
1120,0.929420,0.614952,0.463310,,0.630095,0.712210,0.756068,0.845612,0.891561,0.733903,0.776168,0.378562,0.618439,0.802420,0.757693,0.605906,0.882161,0.783838,0.811301,0.595038,0.929986,0.877391,0.319606,0.640185,0.457911,0.835171,0.680001,0.658092,0.935050,0.902011,0.597625,0.862112,0.704317,0.771981,0.900890,0.842113,0.762434,0.812584,0.760241,0.675186,0.777774,0.890368,0.796393,0.398026,0.882200,0.878745,0.694889,0.835602,0.586147,0.918616,0.891808,0.780716,0.559967,0.685459,0.826428,0.770873,0.879466,0.815892,0.874472,0.903818,0.914237,0.804159,0.796619,0.875978
1124,0.811254,0.409346,0.396261,0.400915,,0.571801,0.634981,0.785512,0.855650,0.636887,0.685797,0.205899,0.478848,0.626117,0.665755,0.459823,0.785113,0.602153,0.610543,0.493036,0.899981,0.770831,0.148455,0.517229,0.428723,0.729029,0.578010,0.486303,0.865301,0.882620,0.446824,0.713474,0.532856,0.661209,0.821016,0.730260,0.644919,0.733087,0.547648,0.499624,0.555609,0.846015,0.681382,0.286070,0.874104,0.785927,0.638984,0.639047,0.454052,0.853172,0.889163,0.568016,0.375739,0.515042,0.637613,0.598550,0.866669,0.709444,0.774171,0.810997,0.890030,0.682400,0.564746,0.793685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,0.506261,0.138681,0.076121,0.100225,0.199283,0.109199,0.208402,0.267687,0.467977,0.161030,0.168983,0.079537,0.102776,0.173464,0.211167,0.115286,0.276611,0.194947,0.206882,0.119076,0.802922,0.276059,0.066534,0.168420,0.081838,0.209484,0.142252,0.152540,0.578936,0.635186,0.093321,0.319635,0.149060,0.171024,0.464700,0.260362,0.165371,0.200556,0.104347,0.121067,0.214451,0.559317,0.233450,0.105150,0.546142,0.244273,0.126526,0.248838,0.112098,0.701258,0.709706,0.161855,0.104517,0.157039,0.247281,0.177672,0.461548,0.225071,0.406175,,0.776692,0.191737,0.156361,0.294959
1447,0.190306,0.135804,0.092863,0.089212,0.112423,0.096695,0.112412,0.121589,0.194667,0.125509,0.124399,0.098429,0.103499,0.145708,0.130599,0.110420,0.151333,0.120364,0.129318,0.090972,0.489977,0.104238,0.070994,0.125672,0.077963,0.113486,0.127625,0.113024,0.306210,0.316319,0.092434,0.152911,0.096334,0.105741,0.208254,0.132586,0.119693,0.114280,0.103696,0.113172,0.121502,0.222578,0.120999,0.098369,0.255706,0.153295,0.094523,0.145890,0.093530,0.403383,0.433899,0.110949,0.099983,0.090326,0.119711,0.132432,0.176291,0.126555,0.166197,0.224999,,0.126556,0.139762,0.131501
1450,0.803011,0.358897,0.212602,0.200635,0.329352,0.309346,0.501843,0.704233,0.780132,0.484071,0.510572,0.115313,0.286417,0.531938,0.536270,0.322276,0.697574,0.525467,0.466240,0.268680,0.921100,0.651582,0.077534,0.396932,0.216589,0.541008,0.417273,0.448028,0.896973,0.888506,0.242661,0.722444,0.502319,0.594259,0.801287,0.650896,0.498082,0.573374,0.453682,0.378721,0.460877,0.851008,0.557835,0.225909,0.853150,0.716038,0.489953,0.542512,0.293329,0.867665,0.861573,0.418373,0.272149,0.396428,0.544652,0.439419,0.771595,0.555489,0.721415,0.806338,0.877539,,0.367969,0.637851
1458,0.817267,0.417535,0.231085,0.208494,0.460460,0.432533,0.578194,0.709516,0.813067,0.585403,0.625477,0.136172,0.415863,0.586316,0.624918,0.424918,0.759203,0.567875,0.588125,0.376939,0.914373,0.751021,0.147998,0.424375,0.372485,0.673193,0.505201,0.472315,0.870670,0.904384,0.319258,0.774676,0.521878,0.580098,0.804528,0.715644,0.646560,0.699745,0.506987,0.351192,0.582522,0.872196,0.682131,0.227419,0.871760,0.746594,0.476344,0.651927,0.336740,0.869893,0.869317,0.559399,0.329380,0.528713,0.628721,0.564505,0.836423,0.655891,0.770041,0.850299,0.865873,0.647936,,0.698208


In [39]:
df_matrix_display = df_matrix.copy()

df_matrix_display.columns = df_matrix_display.columns.map(id_to_team)
df_matrix_display.index = df_matrix_display.index.map(id_to_team)

df_matrix_display

Team B ID,Akron,Alabama,Arizona,Auburn,Baylor,BYU,Clemson,Col Charleston,Colgate,Colorado,Colorado St,Connecticut,Creighton,Dayton,Drake,Duke,Duquesne,FL Atlantic,Florida,Gonzaga,Grambling,Grand Canyon,Houston,Illinois,Iowa St,James Madison,Kansas,Kentucky,Long Beach St,Longwood,Marquette,McNeese St,Michigan St,Mississippi St,Morehead St,NC State,Nebraska,Nevada,New Mexico,North Carolina,Northwestern,Oakland,Oregon,Purdue,S Dakota St,Samford,San Diego St,South Carolina,St Mary's CA,St Peter's,Stetson,TCU,Tennessee,Texas,Texas A&M,Texas Tech,UAB,Utah St,Vermont,WKU,Wagner,Washington St,Wisconsin,Yale
Team A ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
Akron,,0.176790,0.090028,0.074159,0.195362,0.132939,0.280361,0.289806,0.548561,0.227984,0.236360,0.086893,0.111813,0.201963,0.242538,0.117890,0.382621,0.231476,0.215826,0.142201,0.869855,0.345742,0.060471,0.159184,0.115376,0.270459,0.208254,0.189641,0.655920,0.719028,0.115252,0.332945,0.162969,0.201532,0.499685,0.325997,0.202913,0.214542,0.157164,0.114380,0.178537,0.563374,0.230352,0.130908,0.612800,0.360795,0.175329,0.243099,0.089081,0.701028,0.757816,0.152885,0.116804,0.183268,0.198510,0.216794,0.507673,0.265551,0.394329,0.526639,0.820614,0.211518,0.199211,0.348686
Alabama,0.828383,,0.387664,0.413528,0.611440,0.499307,0.686835,0.703139,0.814302,0.642160,0.640564,0.279104,0.469207,0.643628,0.668955,0.510066,0.778374,0.647326,0.640909,0.474240,0.892249,0.800497,0.197115,0.501395,0.377304,0.721380,0.567538,0.589026,0.878223,0.869422,0.412204,0.779750,0.555424,0.647704,0.847758,0.752897,0.607601,0.695128,0.612634,0.468112,0.592664,0.869722,0.692056,0.344198,0.875938,0.766108,0.640286,0.689326,0.463103,0.868878,0.895797,0.540494,0.461689,0.596938,0.640538,0.627206,0.861976,0.622501,0.769161,0.870167,0.867566,0.662195,0.611267,0.777496
Arizona,0.914462,0.640672,,0.545499,0.628720,0.664019,0.779242,0.833616,0.900499,0.778998,0.818933,0.383411,0.669939,0.797700,0.836291,0.697503,0.886911,0.825511,0.774170,0.654017,0.920994,0.849280,0.332170,0.626896,0.546344,0.803044,0.720705,0.665464,0.915196,0.916969,0.598263,0.837748,0.686808,0.790129,0.890403,0.880650,0.780663,0.856893,0.777282,0.623532,0.775663,0.907453,0.860147,0.510139,0.907039,0.861930,0.729689,0.853239,0.631441,0.914914,0.925096,0.764297,0.587705,0.728620,0.836344,0.799928,0.888134,0.813620,0.869038,0.928581,0.910723,0.797350,0.788116,0.873999
Auburn,0.929420,0.614952,0.463310,,0.630095,0.712210,0.756068,0.845612,0.891561,0.733903,0.776168,0.378562,0.618439,0.802420,0.757693,0.605906,0.882161,0.783838,0.811301,0.595038,0.929986,0.877391,0.319606,0.640185,0.457911,0.835171,0.680001,0.658092,0.935050,0.902011,0.597625,0.862112,0.704317,0.771981,0.900890,0.842113,0.762434,0.812584,0.760241,0.675186,0.777774,0.890368,0.796393,0.398026,0.882200,0.878745,0.694889,0.835602,0.586147,0.918616,0.891808,0.780716,0.559967,0.685459,0.826428,0.770873,0.879466,0.815892,0.874472,0.903818,0.914237,0.804159,0.796619,0.875978
Baylor,0.811254,0.409346,0.396261,0.400915,,0.571801,0.634981,0.785512,0.855650,0.636887,0.685797,0.205899,0.478848,0.626117,0.665755,0.459823,0.785113,0.602153,0.610543,0.493036,0.899981,0.770831,0.148455,0.517229,0.428723,0.729029,0.578010,0.486303,0.865301,0.882620,0.446824,0.713474,0.532856,0.661209,0.821016,0.730260,0.644919,0.733087,0.547648,0.499624,0.555609,0.846015,0.681382,0.286070,0.874104,0.785927,0.638984,0.639047,0.454052,0.853172,0.889163,0.568016,0.375739,0.515042,0.637613,0.598550,0.866669,0.709444,0.774171,0.810997,0.890030,0.682400,0.564746,0.793685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WKU,0.506261,0.138681,0.076121,0.100225,0.199283,0.109199,0.208402,0.267687,0.467977,0.161030,0.168983,0.079537,0.102776,0.173464,0.211167,0.115286,0.276611,0.194947,0.206882,0.119076,0.802922,0.276059,0.066534,0.168420,0.081838,0.209484,0.142252,0.152540,0.578936,0.635186,0.093321,0.319635,0.149060,0.171024,0.464700,0.260362,0.165371,0.200556,0.104347,0.121067,0.214451,0.559317,0.233450,0.105150,0.546142,0.244273,0.126526,0.248838,0.112098,0.701258,0.709706,0.161855,0.104517,0.157039,0.247281,0.177672,0.461548,0.225071,0.406175,,0.776692,0.191737,0.156361,0.294959
Wagner,0.190306,0.135804,0.092863,0.089212,0.112423,0.096695,0.112412,0.121589,0.194667,0.125509,0.124399,0.098429,0.103499,0.145708,0.130599,0.110420,0.151333,0.120364,0.129318,0.090972,0.489977,0.104238,0.070994,0.125672,0.077963,0.113486,0.127625,0.113024,0.306210,0.316319,0.092434,0.152911,0.096334,0.105741,0.208254,0.132586,0.119693,0.114280,0.103696,0.113172,0.121502,0.222578,0.120999,0.098369,0.255706,0.153295,0.094523,0.145890,0.093530,0.403383,0.433899,0.110949,0.099983,0.090326,0.119711,0.132432,0.176291,0.126555,0.166197,0.224999,,0.126556,0.139762,0.131501
Washington St,0.803011,0.358897,0.212602,0.200635,0.329352,0.309346,0.501843,0.704233,0.780132,0.484071,0.510572,0.115313,0.286417,0.531938,0.536270,0.322276,0.697574,0.525467,0.466240,0.268680,0.921100,0.651582,0.077534,0.396932,0.216589,0.541008,0.417273,0.448028,0.896973,0.888506,0.242661,0.722444,0.502319,0.594259,0.801287,0.650896,0.498082,0.573374,0.453682,0.378721,0.460877,0.851008,0.557835,0.225909,0.853150,0.716038,0.489953,0.542512,0.293329,0.867665,0.861573,0.418373,0.272149,0.396428,0.544652,0.439419,0.771595,0.555489,0.721415,0.806338,0.877539,,0.367969,0.637851
Wisconsin,0.817267,0.417535,0.231085,0.208494,0.460460,0.432533,0.578194,0.709516,0.813067,0.585403,0.625477,0.136172,0.415863,0.586316,0.624918,0.424918,0.759203,0.567875,0.588125,0.376939,0.914373,0.751021,0.147998,0.424375,0.372485,0.673193,0.505201,0.472315,0.870670,0.904384,0.319258,0.774676,0.521878,0.580098,0.804528,0.715644,0.646560,0.699745,0.506987,0.351192,0.582522,0.872196,0.682131,0.227419,0.871760,0.746594,0.476344,0.651927,0.336740,0.869893,0.869317,0.559399,0.329380,0.528713,0.628721,0.564505,0.836423,0.655891,0.770041,0.850299,0.865873,0.647936,,0.698208


In [40]:
df_matrix.to_csv(f'../data/preprocessed/year_data/{season}_mens_matchup_matrix.csv', index=True)
df_matrix_display.to_csv(f'../data/preprocessed/year_data/{season}_mens_matchup_matrix_display.csv', index=True)

'Done'

'Done'