# Get 2024 Women's Data

Get tournament matchup matrix for 2024

In [1]:
# Season that every later cell filters its inputs on.
season = 2024

# Model path for this run (not referenced by the cells that follow in this view).
model_path = '../model/womens_20240316'

# TeamIDs excluded from the seeding data because they lost a play-in game:
# Sacred Heart, Columbia, Auburn, Holy Cross (in that order).
playin_losers = (3357, 3162, 3120, 3221)

season

2024

### Previous Tournament Results

In [2]:
import pandas as pd

# Show up to 100 columns so the wide merged frames below are not truncated.
pd.set_option('display.max_columns', 100)

# Base frame: one row per team and season with past tournament results.
# Forward slashes resolve on every OS (the previous backslash path only worked
# on Windows) and match the other forward-slash paths in this notebook.
df = pd.read_csv('../data/preprocessed/kaggle/womens_tournament_results.csv')

# Keep only the configured season; every later merge is keyed on Season + TeamID.
df = df.loc[df['Season'] == season, :].reset_index(drop=True)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2024,3101,Abilene Chr,-1.0,-1.0
1,2024,3102,Air Force,-1.0,-1.0
2,2024,3103,Akron,-1.0,-1.0
3,2024,3104,Alabama,0.0,0.0
4,2024,3105,Alabama A&M,-1.0,-1.0
...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0
372,2024,3475,Southern Indiana,-1.0,-1.0
373,2024,3476,Stonehill,-1.0,-1.0
374,2024,3477,TX A&M Commerce,-1.0,-1.0


### My Rankings

In [3]:
# Load my ratings for the configured season.
# The file is read directly: the previous one-element generator reused the name
# `season` as its loop variable, shadowing the config value for no benefit.
# Forward slashes keep the relative path valid on every OS.
df_rankings = (
    pd.read_csv(f'../data/preprocessed/womens_my_rankings/womens_my_rankings_{season}.csv')
    .assign(Season=season)       # the per-season file carries no Season column of its own
    .drop(columns=['Strength'])  # dropped before merging into the base frame
)

# Move Season to the front so the key columns lead the frame.
df_rankings.insert(0, 'Season', df_rankings.pop('Season'))

df_rankings

Unnamed: 0,Season,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,South Carolina,5.160799,0.555587,1.204419,0.648832,74.651972
1,2024,Texas,4.091071,0.465907,1.180972,0.715065,72.083925
2,2024,Southern California,3.873833,0.383044,1.129590,0.746547,70.976459
3,2024,Iowa,3.872645,0.454576,1.248377,0.793801,77.224499
4,2024,UCLA,3.860473,0.438698,1.150020,0.711321,72.116413
...,...,...,...,...,...,...,...
355,2024,South Carolina State,-3.466935,-0.377753,0.688609,1.066362,68.762264
356,2024,McNeese State,-3.481032,-0.323796,0.789428,1.113224,75.464490
357,2024,Saint Francis (PA),-3.491812,-0.411055,0.703848,1.114903,68.600916
358,2024,Stonehill,-3.520234,-0.376881,0.687036,1.063917,70.836254


In [4]:
# Kaggle's spelling -> TeamID lookup table.
# Forward slashes keep the relative path valid on every OS (backslashes only
# resolved on Windows).
df_spellings = pd.read_csv(
    '../data/unprocessed/kaggle/WTeamSpellings.csv',
    encoding='cp1252'  # fixes issue with fancy quotes
)

# Append one extra alias row at the next free integer label: 'fdu' -> 3192.
# NOTE(review): presumably Fairleigh Dickinson - confirm against the teams file.
df_spellings.loc[df_spellings.shape[0]] = ['fdu', 3192]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,3394
1,a&m-corpus christi,3394
2,abilene chr,3101
3,abilene christian,3101
4,abilene-christian,3101
...,...,...
1154,youngstown st.,3464
1155,youngstown state,3464
1156,youngstown-st,3464
1157,youngstown-state,3464


In [5]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
# tqdm.auto selects the notebook/console progress bar just like
# tqdm.autonotebook, but does not emit TqdmExperimentalWarning on import.
from tqdm.auto import tqdm

# Candidate spellings (reused by every matching cell below) and the names to resolve.
team_spellings = df_spellings['TeamNameSpelling'].unique()
my_teams = df_rankings['Team'].unique()

# For each of my team names, take the single best-scoring spelling under
# token_sort_ratio and keep its first two fields (spelling, score).
df_match = pd.DataFrame(
    [
        [
            my_team,
            *process.extract(
                my_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for my_team in tqdm(my_teams)
    ],
    columns=['My Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

# Weakest matches sort first so questionable pairings can be eyeballed.
df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,My Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Quinnipiac,quinnipiac,100
5,Jacksonville State,jacksonville state,100
6,Little Rock,little rock,100
7,UC San Diego,uc san diego,100
8,Samford,samford,100
9,Stetson,stetson,100


In [6]:
# Two-step lookup: my team name -> best-matching spelling -> TeamID.
ranking_to_spelling = dict(zip(df_match['My Team'], df_match['Team Spelling']))
# spelling_to_id is reused by the later matching sections.
spelling_to_id = dict(zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID']))

matched_spelling = df_rankings['Team'].map(ranking_to_spelling)
matched_team_id = matched_spelling.map(spelling_to_id)

# Place TeamID right after Season.
df_rankings.insert(1, 'TeamID', matched_team_id)

df_rankings

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,3376,South Carolina,5.160799,0.555587,1.204419,0.648832,74.651972
1,2024,3400,Texas,4.091071,0.465907,1.180972,0.715065,72.083925
2,2024,3425,Southern California,3.873833,0.383044,1.129590,0.746547,70.976459
3,2024,3234,Iowa,3.872645,0.454576,1.248377,0.793801,77.224499
4,2024,3417,UCLA,3.860473,0.438698,1.150020,0.711321,72.116413
...,...,...,...,...,...,...,...,...
355,2024,3354,South Carolina State,-3.466935,-0.377753,0.688609,1.066362,68.762264
356,2024,3270,McNeese State,-3.481032,-0.323796,0.789428,1.113224,75.464490
357,2024,3384,Saint Francis (PA),-3.491812,-0.411055,0.703848,1.114903,68.600916
358,2024,3476,Stonehill,-3.520234,-0.376881,0.687036,1.063917,70.836254


In [7]:
# Ranked teams that did not resolve to a TeamID (an empty frame means all matched).
df_rankings[df_rankings['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo


In [8]:
# Attach my rating columns to the base frame. Left join keeps every base row;
# the ranking file's own Team label is dropped in favour of the base frame's.
rankings_features = df_rankings.drop(columns=['Team'])
df = df.merge(rankings_features, how='left', on=['Season', 'TeamID'])

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2024,3101,Abilene Chr,-1.0,-1.0,-0.830212,-0.031372,0.937686,0.969058,69.481120
1,2024,3102,Air Force,-1.0,-1.0,-0.704032,-0.043371,0.884385,0.927756,71.640484
2,2024,3103,Akron,-1.0,-1.0,-1.122986,-0.108223,0.857709,0.965932,68.472906
3,2024,3104,Alabama,0.0,0.0,2.284074,0.269193,1.065067,0.795875,71.092516
4,2024,3105,Alabama A&M,-1.0,-1.0,-1.101003,-0.135259,0.815961,0.951220,69.685934
...,...,...,...,...,...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0,-3.001847,-0.269803,0.755798,1.025601,74.371602
372,2024,3475,Southern Indiana,-1.0,-1.0,0.508777,0.021083,0.933588,0.912505,71.833311
373,2024,3476,Stonehill,-1.0,-1.0,-3.520234,-0.376881,0.687036,1.063917,70.836254
374,2024,3477,TX A&M Commerce,-1.0,-1.0,-0.857273,-0.139022,0.861505,1.000526,77.398001


In [9]:
# Base-frame teams that picked up no rating from the merge.
df[df['Rating'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
8,2024,3109,Alliant Intl,-1.0,-1.0,,,,,
17,2024,3118,Armstrong St,-1.0,-1.0,,,,,
20,2024,3121,Augusta,-1.0,-1.0,,,,,
27,2024,3128,Birmingham So,-1.0,-1.0,,,,,
33,2024,3134,Brooklyn,-1.0,-1.0,,,,,
46,2024,3147,Centenary,-1.0,-1.0,,,,,
113,2024,3215,Hardin-Simmons,-1.0,-1.0,,,,,
114,2024,3216,Hartford,-1.0,-1.0,,,,,
187,2024,3289,Morris Brown,-1.0,-1.0,,,,,
200,2024,3302,NE Illinois,-1.0,-1.0,,,,,


### Previous Rankings

In [10]:
# Prior-season ratings (past year and past four years) for every season.
# The path now starts at '../data' like every other input in this notebook; the
# previous 'data\...' path was relative to the notebook folder itself and used
# Windows-only backslashes.
df_prev = pd.read_csv(
    '../data/preprocessed/womens_my_rankings_full_season/womens_my_rankings_full_season.csv'
)

# Keep only the configured season.
df_prev = df_prev.loc[df_prev['Season'] == season, :].reset_index(drop=True)

df_prev

Unnamed: 0,Season,Team,Past Year Rating,Past 4 Years Ratings
0,2024,South Carolina,5.991322,5.069536
1,2024,Louisiana State,5.173495,2.781984
2,2024,Indiana,4.695399,3.460692
3,2024,Connecticut,4.409110,4.009288
4,2024,Iowa,4.369270,3.116985
...,...,...,...,...
356,2024,Texas Southern,-3.813642,-1.357948
357,2024,Navy,-4.039280,-2.028372
358,2024,Mississippi Valley State,-4.365660,-3.570037
359,2024,Saint Peter's,-4.708909,-2.127877


In [11]:
# Resolve each previous-rankings team name to its best-scoring spelling.
prev_teams = df_prev['Team'].unique()

prev_match_rows = []
for prev_team in tqdm(prev_teams):
    # Top hit only; keep its first two fields (spelling, score).
    top_hit = process.extract(
        prev_team,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    prev_match_rows.append([prev_team, *top_hit[:2]])

df_match = pd.DataFrame(
    prev_match_rows,
    columns=['Prev Team', 'Team Spelling', 'Match Score']
)
# Weakest matches first for manual review.
df_match = df_match.sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/361 [00:00<?, ?it/s]

Unnamed: 0,Prev Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,St. Thomas,st thomas mn,86
4,Kansas City,mo kansas city,88
5,Texas A&M-Commerce,tx a&m commerce,91
6,Sam Houston,sam houston,100
7,Texas-Rio Grande Valley,texas rio grande valley,100
8,Appalachian State,appalachian state,100
9,Fairfield,fairfield,100


In [12]:
# Previous-rankings name -> matched spelling -> TeamID (via spelling_to_id).
prev_to_spelling = dict(zip(df_match['Prev Team'], df_match['Team Spelling']))

prev_spelling = df_prev['Team'].map(prev_to_spelling)
# Place TeamID right after Season.
df_prev.insert(1, 'TeamID', prev_spelling.map(spelling_to_id))

df_prev

Unnamed: 0,Season,TeamID,Team,Past Year Rating,Past 4 Years Ratings
0,2024,3376,South Carolina,5.991322,5.069536
1,2024,3261,Louisiana State,5.173495,2.781984
2,2024,3231,Indiana,4.695399,3.460692
3,2024,3163,Connecticut,4.409110,4.009288
4,2024,3234,Iowa,4.369270,3.116985
...,...,...,...,...,...
356,2024,3411,Texas Southern,-3.813642,-1.357948
357,2024,3298,Navy,-4.039280,-2.028372
358,2024,3290,Mississippi Valley State,-4.365660,-3.570037
359,2024,3389,Saint Peter's,-4.708909,-2.127877


In [13]:
# Previous-rankings rows that did not resolve to a TeamID.
df_prev[df_prev['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Rating,Past 4 Years Ratings


In [14]:
# Left-join the prior-season rating columns onto the base frame by Season + TeamID.
prev_features = df_prev.drop(columns=['Team'])
df = df.merge(prev_features, how='left', on=['Season', 'TeamID'])

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings
0,2024,3101,Abilene Chr,-1.0,-1.0,-0.830212,-0.031372,0.937686,0.969058,69.481120,-0.554004,-0.154706
1,2024,3102,Air Force,-1.0,-1.0,-0.704032,-0.043371,0.884385,0.927756,71.640484,-0.909421,-0.652319
2,2024,3103,Akron,-1.0,-1.0,-1.122986,-0.108223,0.857709,0.965932,68.472906,0.084110,0.026817
3,2024,3104,Alabama,0.0,0.0,2.284074,0.269193,1.065067,0.795875,71.092516,2.474049,1.954984
4,2024,3105,Alabama A&M,-1.0,-1.0,-1.101003,-0.135259,0.815961,0.951220,69.685934,-1.388675,-0.736966
...,...,...,...,...,...,...,...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0,-3.001847,-0.269803,0.755798,1.025601,74.371602,-2.865331,-2.865331
372,2024,3475,Southern Indiana,-1.0,-1.0,0.508777,0.021083,0.933588,0.912505,71.833311,-1.419952,-1.419952
373,2024,3476,Stonehill,-1.0,-1.0,-3.520234,-0.376881,0.687036,1.063917,70.836254,-2.567315,-2.567315
374,2024,3477,TX A&M Commerce,-1.0,-1.0,-0.857273,-0.139022,0.861505,1.000526,77.398001,-1.253832,-1.253832


In [15]:
# Base-frame teams with no four-year rating after the merge.
df[df['Past 4 Years Ratings'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings
8,2024,3109,Alliant Intl,-1.0,-1.0,,,,,,,
17,2024,3118,Armstrong St,-1.0,-1.0,,,,,,,
20,2024,3121,Augusta,-1.0,-1.0,,,,,,,
27,2024,3128,Birmingham So,-1.0,-1.0,,,,,,,
33,2024,3134,Brooklyn,-1.0,-1.0,,,,,,,
46,2024,3147,Centenary,-1.0,-1.0,,,,,,,
113,2024,3215,Hardin-Simmons,-1.0,-1.0,,,,,,,
187,2024,3289,Morris Brown,-1.0,-1.0,,,,,,,
200,2024,3302,NE Illinois,-1.0,-1.0,,,,,,,
225,2024,3327,Okla City,-1.0,-1.0,,,,,,,


### Starters

In [16]:
# Load the starters rating for the configured season.
# The file is read directly: the previous one-element generator reused the name
# `season` as its loop variable, shadowing the config value for no benefit.
# Forward slashes keep the relative path valid on every OS.
df_starters = (
    pd.read_csv(f'../data/preprocessed/womens_starters/womens_starters_{season}.csv')
    .assign(Season=season)                  # the per-season file carries no Season column
    .rename(columns={'Rating': 'Starters'})  # avoid clashing with the 'Rating' column already in df
)

# Move Season to the front so the key columns lead the frame.
df_starters.insert(0, 'Season', df_starters.pop('Season'))

df_starters

Unnamed: 0,Season,Team,Starters
0,2024,South Carolina,0.934991
1,2024,Texas,0.713534
2,2024,Southern California,0.697763
3,2024,Gonzaga,0.689250
4,2024,UCLA,0.687168
...,...,...,...
355,2024,Pepperdine,-0.509862
356,2024,Western Carolina,-0.519138
357,2024,South Carolina State,-0.539390
358,2024,McNeese State,-0.569402


In [17]:
# Resolve each starters team name to its best-scoring spelling.
starters_teams = df_starters['Team'].unique()

starters_match_rows = []
for starters_team in tqdm(starters_teams):
    # Top hit only; keep its first two fields (spelling, score).
    top_hit = process.extract(
        starters_team,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    starters_match_rows.append([starters_team, *top_hit[:2]])

df_match = pd.DataFrame(
    starters_match_rows,
    columns=['Starters Team', 'Team Spelling', 'Match Score']
)
# Weakest matches first for manual review.
df_match = df_match.sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,Starters Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,South Carolina,south carolina,100
5,Alabama A&M,alabama a&m,100
6,Lehigh,lehigh,100
7,South Carolina Upstate,south carolina upstate,100
8,Air Force,air force,100
9,Army,army,100


In [18]:
# Starters name -> matched spelling -> TeamID (via spelling_to_id).
starters_to_spelling = dict(zip(df_match['Starters Team'], df_match['Team Spelling']))

starters_spelling = df_starters['Team'].map(starters_to_spelling)
# Place TeamID right after Season.
df_starters.insert(1, 'TeamID', starters_spelling.map(spelling_to_id))

df_starters

Unnamed: 0,Season,TeamID,Team,Starters
0,2024,3376,South Carolina,0.934991
1,2024,3400,Texas,0.713534
2,2024,3425,Southern California,0.697763
3,2024,3211,Gonzaga,0.689250
4,2024,3417,UCLA,0.687168
...,...,...,...,...
355,2024,3337,Pepperdine,-0.509862
356,2024,3441,Western Carolina,-0.519138
357,2024,3354,South Carolina State,-0.539390
358,2024,3270,McNeese State,-0.569402


In [19]:
# Starters rows that did not resolve to a TeamID.
df_starters[df_starters['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Starters


In [20]:
# Left-join the Starters column onto the base frame by Season + TeamID.
starters_features = df_starters.drop(columns=['Team'])
df = df.merge(starters_features, how='left', on=['Season', 'TeamID'])

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters
0,2024,3101,Abilene Chr,-1.0,-1.0,-0.830212,-0.031372,0.937686,0.969058,69.481120,-0.554004,-0.154706,-0.068097
1,2024,3102,Air Force,-1.0,-1.0,-0.704032,-0.043371,0.884385,0.927756,71.640484,-0.909421,-0.652319,-0.134415
2,2024,3103,Akron,-1.0,-1.0,-1.122986,-0.108223,0.857709,0.965932,68.472906,0.084110,0.026817,-0.151045
3,2024,3104,Alabama,0.0,0.0,2.284074,0.269193,1.065067,0.795875,71.092516,2.474049,1.954984,0.379178
4,2024,3105,Alabama A&M,-1.0,-1.0,-1.101003,-0.135259,0.815961,0.951220,69.685934,-1.388675,-0.736966,-0.135499
...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0,-3.001847,-0.269803,0.755798,1.025601,74.371602,-2.865331,-2.865331,-0.471524
372,2024,3475,Southern Indiana,-1.0,-1.0,0.508777,0.021083,0.933588,0.912505,71.833311,-1.419952,-1.419952,0.235514
373,2024,3476,Stonehill,-1.0,-1.0,-3.520234,-0.376881,0.687036,1.063917,70.836254,-2.567315,-2.567315,-0.444307
374,2024,3477,TX A&M Commerce,-1.0,-1.0,-0.857273,-0.139022,0.861505,1.000526,77.398001,-1.253832,-1.253832,-0.004320


In [21]:
# Base-frame teams with no Starters value after the merge.
df[df['Starters'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters
8,2024,3109,Alliant Intl,-1.0,-1.0,,,,,,,,
17,2024,3118,Armstrong St,-1.0,-1.0,,,,,,,,
20,2024,3121,Augusta,-1.0,-1.0,,,,,,,,
27,2024,3128,Birmingham So,-1.0,-1.0,,,,,,,,
33,2024,3134,Brooklyn,-1.0,-1.0,,,,,,,,
46,2024,3147,Centenary,-1.0,-1.0,,,,,,,,
113,2024,3215,Hardin-Simmons,-1.0,-1.0,,,,,,,,
114,2024,3216,Hartford,-1.0,-1.0,,,,,,-5.55522,-3.738838,
187,2024,3289,Morris Brown,-1.0,-1.0,,,,,,,,
200,2024,3302,NE Illinois,-1.0,-1.0,,,,,,,,


### Box Score Stats

In [22]:
# Box score stats for all seasons.
# Forward slashes keep the relative path valid on every OS (backslashes only
# resolved on Windows).
df_stats = pd.read_csv('../data/preprocessed/womens_box_score_stats/womens_box_score_stats.csv')

# Keep only the configured season.
df_stats = df_stats.loc[df_stats['Season'] == season, :].reset_index(drop=True)

df_stats

Unnamed: 0,Season,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR
0,2024,Abilene Christian,0.407407,0.478077,0.490781,21.232178,14.880053,20.625826,20.122552
1,2024,Air Force,0.433333,0.438547,0.484612,19.250295,12.406726,16.749504,29.145055
2,2024,Akron,0.379310,0.437227,0.477698,21.734627,13.822299,19.185600,19.213234
3,2024,Alabama,0.718750,0.496489,0.428012,19.892904,15.069715,22.813074,18.450030
4,2024,Alabama A&M,0.466667,0.409847,0.441112,23.103184,16.513021,21.612872,24.870078
...,...,...,...,...,...,...,...,...,...
355,2024,Wright State,0.516129,0.477860,0.488562,18.043420,9.488515,21.044587,21.790551
356,2024,Wyoming,0.517241,0.507442,0.464675,20.152826,9.383333,17.073582,17.922837
357,2024,Xavier,0.035714,0.420328,0.513675,25.643612,8.332792,13.540131,21.670300
358,2024,Yale,0.296296,0.422138,0.493458,19.933187,15.534023,17.563822,25.764921


In [23]:
# Resolve each box-score team name to its best-scoring spelling.
stats_teams = df_stats['Team'].unique()

stats_match_rows = []
for stats_team in tqdm(stats_teams):
    # Top hit only; keep its first two fields (spelling, score).
    top_hit = process.extract(
        stats_team,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    stats_match_rows.append([stats_team, *top_hit[:2]])

df_match = pd.DataFrame(
    stats_match_rows,
    columns=['Stats Team', 'Team Spelling', 'Match Score']
)
# Weakest matches first for manual review.
df_match = df_match.sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,Stats Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Radford,radford,100
5,Quinnipiac,quinnipiac,100
6,Queens (NC),queens nc,100
7,Purdue Fort Wayne,purdue fort wayne,100
8,Purdue,purdue,100
9,Providence,providence,100


In [24]:
# Box-score name -> matched spelling -> TeamID (via spelling_to_id).
stats_to_spelling = dict(zip(df_match['Stats Team'], df_match['Team Spelling']))

stats_spelling = df_stats['Team'].map(stats_to_spelling)
# Place TeamID right after Season.
df_stats.insert(1, 'TeamID', stats_spelling.map(spelling_to_id))

df_stats

Unnamed: 0,Season,TeamID,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR
0,2024,3101,Abilene Christian,0.407407,0.478077,0.490781,21.232178,14.880053,20.625826,20.122552
1,2024,3102,Air Force,0.433333,0.438547,0.484612,19.250295,12.406726,16.749504,29.145055
2,2024,3103,Akron,0.379310,0.437227,0.477698,21.734627,13.822299,19.185600,19.213234
3,2024,3104,Alabama,0.718750,0.496489,0.428012,19.892904,15.069715,22.813074,18.450030
4,2024,3105,Alabama A&M,0.466667,0.409847,0.441112,23.103184,16.513021,21.612872,24.870078
...,...,...,...,...,...,...,...,...,...,...
355,2024,3460,Wright State,0.516129,0.477860,0.488562,18.043420,9.488515,21.044587,21.790551
356,2024,3461,Wyoming,0.517241,0.507442,0.464675,20.152826,9.383333,17.073582,17.922837
357,2024,3462,Xavier,0.035714,0.420328,0.513675,25.643612,8.332792,13.540131,21.670300
358,2024,3463,Yale,0.296296,0.422138,0.493458,19.933187,15.534023,17.563822,25.764921


In [25]:
# Box-score rows that did not resolve to a TeamID.
df_stats[df_stats['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR


In [26]:
# Left-join the box score columns onto the base frame by Season + TeamID.
stats_features = df_stats.drop(columns=['Team'])
df = df.merge(stats_features, how='left', on=['Season', 'TeamID'])

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR
0,2024,3101,Abilene Chr,-1.0,-1.0,-0.830212,-0.031372,0.937686,0.969058,69.481120,-0.554004,-0.154706,-0.068097,0.407407,0.478077,0.490781,21.232178,14.880053,20.625826,20.122552
1,2024,3102,Air Force,-1.0,-1.0,-0.704032,-0.043371,0.884385,0.927756,71.640484,-0.909421,-0.652319,-0.134415,0.433333,0.438547,0.484612,19.250295,12.406726,16.749504,29.145055
2,2024,3103,Akron,-1.0,-1.0,-1.122986,-0.108223,0.857709,0.965932,68.472906,0.084110,0.026817,-0.151045,0.379310,0.437227,0.477698,21.734627,13.822299,19.185600,19.213234
3,2024,3104,Alabama,0.0,0.0,2.284074,0.269193,1.065067,0.795875,71.092516,2.474049,1.954984,0.379178,0.718750,0.496489,0.428012,19.892904,15.069715,22.813074,18.450030
4,2024,3105,Alabama A&M,-1.0,-1.0,-1.101003,-0.135259,0.815961,0.951220,69.685934,-1.388675,-0.736966,-0.135499,0.466667,0.409847,0.441112,23.103184,16.513021,21.612872,24.870078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0,-3.001847,-0.269803,0.755798,1.025601,74.371602,-2.865331,-2.865331,-0.471524,0.153846,0.385506,0.484467,22.763053,13.889463,19.832072,33.929720
372,2024,3475,Southern Indiana,-1.0,-1.0,0.508777,0.021083,0.933588,0.912505,71.833311,-1.419952,-1.419952,0.235514,0.785714,0.511738,0.420331,21.255438,12.245669,20.934225,20.347761
373,2024,3476,Stonehill,-1.0,-1.0,-3.520234,-0.376881,0.687036,1.063917,70.836254,-2.567315,-2.567315,-0.444307,0.133333,0.391396,0.478187,25.925883,12.777546,16.302205,15.915717
374,2024,3477,TX A&M Commerce,-1.0,-1.0,-0.857273,-0.139022,0.861505,1.000526,77.398001,-1.253832,-1.253832,-0.004320,0.448276,0.430530,0.444913,16.663597,9.834779,18.572694,19.689437


In [27]:
# Base-frame teams with no box score stats after the merge.
df[df['Team Win%'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR
8,2024,3109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,
17,2024,3118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,
20,2024,3121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,
27,2024,3128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,
33,2024,3134,Brooklyn,-1.0,-1.0,,,,,,,,,,,,,,,
46,2024,3147,Centenary,-1.0,-1.0,,,,,,,,,,,,,,,
113,2024,3215,Hardin-Simmons,-1.0,-1.0,,,,,,,,,,,,,,,
114,2024,3216,Hartford,-1.0,-1.0,,,,,,-5.55522,-3.738838,,,,,,,,
187,2024,3289,Morris Brown,-1.0,-1.0,,,,,,,,,,,,,,,
200,2024,3302,NE Illinois,-1.0,-1.0,,,,,,,,,,,,,,,


### Strength of Schedule

In [28]:
# Strength-of-schedule columns for all seasons, narrowed to the configured one.
sos_all_seasons = pd.read_csv('../data/preprocessed/womens_sos/womens_sos.csv')

in_season = sos_all_seasons['Season'] == season
df_sos = sos_all_seasons[in_season].reset_index(drop=True)

df_sos

Unnamed: 0,Season,Team,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2024,Abilene Christian,-0.155903,-0.093325,-0.037989,0.055572,-0.178976,-0.142405,-0.070618,-0.008943
1,2024,Air Force,-0.120852,0.019438,0.037034,0.087410,-0.058140,-0.057003,-0.037699,0.064128
2,2024,Akron,-0.167112,-0.107156,-0.030833,0.008959,-0.169275,-0.131431,-0.093892,-0.029154
3,2024,Alabama,0.034188,0.203797,0.258514,0.313399,0.154640,0.218686,0.241354,0.283660
4,2024,Alabama A&M,-0.193538,-0.113545,-0.080474,-0.046221,-0.253979,-0.179348,-0.072093,-0.049082
...,...,...,...,...,...,...,...,...,...,...
355,2024,Wyoming,-0.045482,-0.009130,0.067609,0.122949,0.011834,0.013679,0.024055,0.109136
356,2024,Xavier,-0.211365,-0.211365,-0.211365,-0.211365,-0.169275,0.015767,0.062506,0.123413
357,2024,Yale,-0.149080,-0.132917,-0.090508,0.008445,-0.220355,-0.101207,0.005871,0.062506
358,2024,Youngstown State,-0.114245,-0.078582,-0.043807,0.028306,-0.225318,-0.155507,-0.096020,-0.007541


In [29]:
# Resolve each strength-of-schedule team name to its best-scoring spelling.
sos_teams = df_sos['Team'].unique()

sos_match_rows = []
for sos_team in tqdm(sos_teams):
    # Top hit only; keep its first two fields (spelling, score).
    top_hit = process.extract(
        sos_team,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    sos_match_rows.append([sos_team, *top_hit[:2]])

df_match = pd.DataFrame(
    sos_match_rows,
    columns=['SOS Team', 'Team Spelling', 'Match Score']
)
# Weakest matches first for manual review.
df_match = df_match.sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,SOS Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Radford,radford,100
5,Quinnipiac,quinnipiac,100
6,Queens (NC),queens nc,100
7,Purdue Fort Wayne,purdue fort wayne,100
8,Purdue,purdue,100
9,Providence,providence,100


In [30]:
# SOS name -> matched spelling -> TeamID (via spelling_to_id).
sos_to_spelling = dict(zip(df_match['SOS Team'], df_match['Team Spelling']))

sos_spelling = df_sos['Team'].map(sos_to_spelling)
# Place TeamID right after Season.
df_sos.insert(1, 'TeamID', sos_spelling.map(spelling_to_id))

df_sos

Unnamed: 0,Season,TeamID,Team,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2024,3101,Abilene Christian,-0.155903,-0.093325,-0.037989,0.055572,-0.178976,-0.142405,-0.070618,-0.008943
1,2024,3102,Air Force,-0.120852,0.019438,0.037034,0.087410,-0.058140,-0.057003,-0.037699,0.064128
2,2024,3103,Akron,-0.167112,-0.107156,-0.030833,0.008959,-0.169275,-0.131431,-0.093892,-0.029154
3,2024,3104,Alabama,0.034188,0.203797,0.258514,0.313399,0.154640,0.218686,0.241354,0.283660
4,2024,3105,Alabama A&M,-0.193538,-0.113545,-0.080474,-0.046221,-0.253979,-0.179348,-0.072093,-0.049082
...,...,...,...,...,...,...,...,...,...,...,...
355,2024,3461,Wyoming,-0.045482,-0.009130,0.067609,0.122949,0.011834,0.013679,0.024055,0.109136
356,2024,3462,Xavier,-0.211365,-0.211365,-0.211365,-0.211365,-0.169275,0.015767,0.062506,0.123413
357,2024,3463,Yale,-0.149080,-0.132917,-0.090508,0.008445,-0.220355,-0.101207,0.005871,0.062506
358,2024,3464,Youngstown State,-0.114245,-0.078582,-0.043807,0.028306,-0.225318,-0.155507,-0.096020,-0.007541


In [31]:
# Left-join the strength-of-schedule columns onto the base frame by Season + TeamID.
sos_features = df_sos.drop(columns=['Team'])
df = df.merge(sos_features, how='left', on=['Season', 'TeamID'])

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2024,3101,Abilene Chr,-1.0,-1.0,-0.830212,-0.031372,0.937686,0.969058,69.481120,-0.554004,-0.154706,-0.068097,0.407407,0.478077,0.490781,21.232178,14.880053,20.625826,20.122552,-0.155903,-0.093325,-0.037989,0.055572,-0.178976,-0.142405,-0.070618,-0.008943
1,2024,3102,Air Force,-1.0,-1.0,-0.704032,-0.043371,0.884385,0.927756,71.640484,-0.909421,-0.652319,-0.134415,0.433333,0.438547,0.484612,19.250295,12.406726,16.749504,29.145055,-0.120852,0.019438,0.037034,0.087410,-0.058140,-0.057003,-0.037699,0.064128
2,2024,3103,Akron,-1.0,-1.0,-1.122986,-0.108223,0.857709,0.965932,68.472906,0.084110,0.026817,-0.151045,0.379310,0.437227,0.477698,21.734627,13.822299,19.185600,19.213234,-0.167112,-0.107156,-0.030833,0.008959,-0.169275,-0.131431,-0.093892,-0.029154
3,2024,3104,Alabama,0.0,0.0,2.284074,0.269193,1.065067,0.795875,71.092516,2.474049,1.954984,0.379178,0.718750,0.496489,0.428012,19.892904,15.069715,22.813074,18.450030,0.034188,0.203797,0.258514,0.313399,0.154640,0.218686,0.241354,0.283660
4,2024,3105,Alabama A&M,-1.0,-1.0,-1.101003,-0.135259,0.815961,0.951220,69.685934,-1.388675,-0.736966,-0.135499,0.466667,0.409847,0.441112,23.103184,16.513021,21.612872,24.870078,-0.193538,-0.113545,-0.080474,-0.046221,-0.253979,-0.179348,-0.072093,-0.049082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,2024,3474,Queens NC,-1.0,-1.0,-3.001847,-0.269803,0.755798,1.025601,74.371602,-2.865331,-2.865331,-0.471524,0.153846,0.385506,0.484467,22.763053,13.889463,19.832072,33.929720,-0.206568,-0.179608,-0.168206,-0.160606,-0.377753,-0.223739,-0.165485,-0.102438
372,2024,3475,Southern Indiana,-1.0,-1.0,0.508777,0.021083,0.933588,0.912505,71.833311,-1.419952,-1.419952,0.235514,0.785714,0.511738,0.420331,21.255438,12.245669,20.934225,20.347761,-0.105384,-0.094363,-0.059392,-0.045482,-0.174800,-0.079891,0.033734,0.107770
373,2024,3476,Stonehill,-1.0,-1.0,-3.520234,-0.376881,0.687036,1.063917,70.836254,-2.567315,-2.567315,-0.444307,0.133333,0.391396,0.478187,25.925883,12.777546,16.302205,15.915717,-0.388245,-0.326649,-0.256831,-0.210287,-0.365436,-0.289327,-0.217838,-0.143779
374,2024,3477,TX A&M Commerce,-1.0,-1.0,-0.857273,-0.139022,0.861505,1.000526,77.398001,-1.253832,-1.253832,-0.004320,0.448276,0.430530,0.444913,16.663597,9.834779,18.572694,19.689437,-0.143846,-0.067119,-0.039699,0.039367,-0.144542,-0.138210,-0.067119,-0.011028


In [32]:
# Base-frame teams with no strength-of-schedule values after the merge.
df[df['0.5 Win'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
8,2024,3109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
17,2024,3118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
20,2024,3121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
27,2024,3128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
33,2024,3134,Brooklyn,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
46,2024,3147,Centenary,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
113,2024,3215,Hardin-Simmons,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
114,2024,3216,Hartford,-1.0,-1.0,,,,,,-5.55522,-3.738838,,,,,,,,,,,,,,,,
187,2024,3289,Morris Brown,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,
200,2024,3302,NE Illinois,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,


### Map to Matchups

Get seeding

In [33]:
# Tournament seeds for all seasons.
# Forward slashes keep the relative path valid on every OS (backslashes only
# resolved on Windows).
df_seeds = pd.read_csv('../data/unprocessed/kaggle/WNCAATourneySeeds.csv')

# Keep only the configured season.
df_seeds = df_seeds.loc[df_seeds['Season'] == season, :].reset_index(drop=True)

# Split the raw seed string into its parts. Both inserts target position 2, so
# Region ends up ahead of Play In.
df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))  # trailing a/b suffix
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])                    # leading character
# Raw string avoids the invalid '\d' escape warning; expand=False returns a
# Series rather than a one-column DataFrame.
df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)

# Drop the configured play-in losers from the seeding data.
df_seeds = df_seeds.loc[~df_seeds['TeamID'].isin(playin_losers), :].reset_index(drop=True)

df_seeds

Unnamed: 0,Season,Seed,Region,Play In,TeamID
0,2024,1,W,False,3376
1,2024,2,W,False,3323
2,2024,3,W,False,3333
3,2024,4,W,False,3231
4,2024,5,W,False,3328
...,...,...,...,...,...
59,2024,12,Z,True,3435
60,2024,13,Z,False,3267
61,2024,14,Z,False,3238
62,2024,15,Z,False,3263


In [34]:
# TeamID lookups for the seeded field.
id_to_region = dict(zip(df_seeds['TeamID'], df_seeds['Region']))
id_to_seed = dict(zip(df_seeds['TeamID'], df_seeds['Seed']))

# Every ordered pair of distinct seeded teams, so each matchup appears in both
# orientations (A vs B and B vs A).
seeded_ids = df_seeds['TeamID'].unique()
ordered_pairs = [
    (team_a, team_b)
    for team_a in seeded_ids
    for team_b in seeded_ids
    if team_a != team_b
]

df_mod = pd.DataFrame(ordered_pairs, columns=['Team A ID', 'Team B ID'])
df_mod.insert(0, 'Season', season)

# Annotate both sides of each matchup with region and seed.
team_a_ids = df_mod['Team A ID']
team_b_ids = df_mod['Team B ID']
df_mod['Team A Region'] = team_a_ids.map(id_to_region)
df_mod['Team B Region'] = team_b_ids.map(id_to_region)
df_mod['Team A Seed'] = team_a_ids.map(id_to_seed)
df_mod['Team B Seed'] = team_b_ids.map(id_to_seed)

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Team A Region,Team B Region,Team A Seed,Team B Seed
0,2024,3376,3323,W,W,1,2
1,2024,3376,3333,W,W,1,3
2,2024,3376,3231,W,W,1,4
3,2024,3376,3328,W,W,1,5
4,2024,3376,3304,W,W,1,6
...,...,...,...,...,...,...,...
4027,2024,3394,3112,Z,Z,16,11
4028,2024,3394,3435,Z,Z,16,12
4029,2024,3394,3267,Z,Z,16,13
4030,2024,3394,3238,Z,Z,16,14


Calculate round of matchup

In [35]:
def _either_order(values_a, values_b, first_group, second_group):
    """Mask of rows where one side is in ``first_group`` and the other in ``second_group``.

    Both orientations count, because every matchup is listed as A vs B and B vs A.
    """
    return (
        (values_a.isin(first_group) & values_b.isin(second_group)) |
        (values_a.isin(second_group) & values_b.isin(first_group))
    )


seed_a, seed_b = df_mod['Team A Seed'], df_mod['Team B Seed']
region_a, region_b = df_mod['Team A Region'], df_mod['Team B Region']

same_region = region_a == region_b

# No round-0 mask is built: there are no play-in games in this data.

# Round 1: the two seeds add up to 17.
round_1_condition = seed_a + seed_b == 17

# Round 2: the listed seed pairs against each other.
round_2_condition = (
    _either_order(seed_a, seed_b, [1, 16], [8, 9]) |
    _either_order(seed_a, seed_b, [5, 12], [4, 13]) |
    _either_order(seed_a, seed_b, [6, 11], [3, 14]) |
    _either_order(seed_a, seed_b, [7, 10], [2, 15])
)

# Round 3: the listed four-seed groups against each other.
round_3_condition = (
    _either_order(seed_a, seed_b, [1, 16, 8, 9], [5, 12, 4, 13]) |
    _either_order(seed_a, seed_b, [6, 11, 3, 14], [7, 10, 2, 15])
)

# Round 4: the two eight-seed groups against each other.
round_4_condition = _either_order(
    seed_a, seed_b,
    [1, 16, 8, 9, 5, 12, 4, 13],
    [6, 11, 3, 14, 7, 10, 2, 15],
)

# Round 5: region W against X, or Y against Z.
round_5_condition = (
    _either_order(region_a, region_b, ['W'], ['X']) |
    _either_order(region_a, region_b, ['Y'], ['Z'])
)

# Round 6: a W/X team against a Y/Z team.
round_6_condition = _either_order(region_a, region_b, ['W', 'X'], ['Y', 'Z'])

round_6_condition

0       False
1       False
2       False
3       False
4       False
        ...  
4027    False
4028    False
4029    False
4030    False
4031    False
Length: 4032, dtype: bool

In [36]:
# Start from a -1 sentinel, then stamp each round's rows in turn.
df_mod['Round'] = -1

# Rounds 6 and 5 are decided by region alone; rounds 4 to 1 additionally require
# both teams to share a region. Entries are applied in order, so a later entry
# overwrites an earlier one on any row that both masks select.
round_masks = (
    (6, round_6_condition),
    (5, round_5_condition),
    (4, round_4_condition & same_region),
    (3, round_3_condition & same_region),
    (2, round_2_condition & same_region),
    (1, round_1_condition & same_region),
)

for round_number, round_mask in round_masks:
    df_mod.loc[round_mask, 'Round'] = round_number

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Team A Region,Team B Region,Team A Seed,Team B Seed,Round
0,2024,3376,3323,W,W,1,2,4
1,2024,3376,3333,W,W,1,3,4
2,2024,3376,3231,W,W,1,4,3
3,2024,3376,3328,W,W,1,5,3
4,2024,3376,3304,W,W,1,6,4
...,...,...,...,...,...,...,...,...
4027,2024,3394,3112,Z,Z,16,11,4
4028,2024,3394,3435,Z,Z,16,12,3
4029,2024,3394,3267,Z,Z,16,13,3
4030,2024,3394,3238,Z,Z,16,14,4


In [37]:
# Replace the per-team seed/region columns with a single feature:
# Team A's seed minus Team B's seed.
df_mod = (
    df_mod
    .assign(Seed=df_mod['Team A Seed'].sub(df_mod['Team B Seed']))
    .drop(columns=['Team A Region', 'Team B Region', 'Team A Seed', 'Team B Seed'])
)

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Round,Seed
0,2024,3376,3323,4,-1
1,2024,3376,3333,4,-2
2,2024,3376,3231,3,-3
3,2024,3376,3328,3,-4
4,2024,3376,3304,4,-5
...,...,...,...,...,...
4027,2024,3394,3112,4,5
4028,2024,3394,3435,3,4
4029,2024,3394,3267,3,3
4030,2024,3394,3238,4,2


Get Head-to-Head

In [38]:
# Read the head-to-head table and keep only the rows for the season being predicted.
df_h2h = (
    pd.read_csv('../data/preprocessed/women_h2h/women_h2h.csv')
    .loc[lambda frame: frame['Season'] == season]
    .reset_index(drop=True)
)

df_h2h

Unnamed: 0,Season,Team A,Team B,Head to Head,Common Opps
0,2024,Abilene Christian,California Baptist,0.000000,-0.406504
1,2024,Abilene Christian,Grand Canyon,-0.888889,-0.234234
2,2024,Abilene Christian,Louisiana Tech,-0.800000,-0.444444
3,2024,Abilene Christian,Navy,0.800000,
4,2024,Abilene Christian,Rice,-0.800000,0.000000
...,...,...,...,...,...
65631,2024,Youngstown State,Weber State,,0.000000
65632,2024,Youngstown State,Western Kentucky,,0.400000
65633,2024,Youngstown State,Wichita State,,0.000000
65634,2024,Youngstown State,Wisconsin,,-0.400000


In [39]:
# Every distinct name appearing in the 'Team A' column of the head-to-head table.
h2h_teams = df_h2h['Team A'].unique()

# For each head-to-head name, keep the single best-scoring candidate from
# team_spellings (compared with token_sort_ratio) together with its score.
match_rows = []
for h2h_team in tqdm(h2h_teams):
    best_match = process.extract(
        h2h_team,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    match_rows.append([h2h_team, *best_match[:2]])

df_match = pd.DataFrame(
    match_rows,
    columns=['Head to Head Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

# Weakest matches sort first, so the head of the frame is where to look for mismatches.
df_match.head(25)

  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,Head to Head Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Radford,radford,100
5,Quinnipiac,quinnipiac,100
6,Queens (NC),queens nc,100
7,Purdue Fort Wayne,purdue fort wayne,100
8,Purdue,purdue,100
9,Providence,providence,100


In [40]:
# Head-to-head name -> matched spelling, taken from the fuzzy-match table.
h2h_to_spelling = df_match.set_index('Head to Head Team')['Team Spelling'].to_dict()

# Translate each name column to a team ID (name -> spelling -> ID) and place the
# new ID column immediately before the name column it was derived from.
for side in ('A', 'B'):
    name_column = f'Team {side}'
    team_ids = df_h2h[name_column].map(h2h_to_spelling).map(spelling_to_id)
    df_h2h.insert(df_h2h.columns.get_loc(name_column), f'Team {side} ID', team_ids)

df_h2h

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Head to Head,Common Opps
0,2024,3101,Abilene Christian,3465,California Baptist,0.000000,-0.406504
1,2024,3101,Abilene Christian,3213,Grand Canyon,-0.888889,-0.234234
2,2024,3101,Abilene Christian,3256,Louisiana Tech,-0.800000,-0.444444
3,2024,3101,Abilene Christian,3298,Navy,0.800000,
4,2024,3101,Abilene Christian,3349,Rice,-0.800000,0.000000
...,...,...,...,...,...,...,...
65631,2024,3464,Youngstown State,3451,Weber State,,0.000000
65632,2024,3464,Youngstown State,3443,Western Kentucky,,0.400000
65633,2024,3464,Youngstown State,3455,Wichita State,,0.000000
65634,2024,3464,Youngstown State,3458,Wisconsin,,-0.400000


In [41]:
# Attach the head-to-head and common-opponent features to every matchup.
# A left merge keeps matchups that have no head-to-head row (their features become NaN).
df_mod_with_h2h = pd.merge(
    df_mod,
    df_h2h[['Season', 'Team A ID', 'Team B ID', 'Head to Head', 'Common Opps']],
    how='left',
    on=['Season', 'Team A ID', 'Team B ID'],
)

# If df_h2h holds more than one row for the same (Season, Team A ID, Team B ID),
# the left merge duplicates that matchup. Fail here with a clear message instead
# of letting duplicated matchups flow into the later feature and pivot steps.
assert len(df_mod_with_h2h) == len(df_mod), (
    'Head-to-head merge changed the number of matchups: '
    f'{len(df_mod)} before, {len(df_mod_with_h2h)} after '
    '(duplicate Season / Team A ID / Team B ID keys in df_h2h?)'
)

df_mod = df_mod_with_h2h

df_mod

Unnamed: 0,Season,Team A ID,Team B ID,Round,Seed,Head to Head,Common Opps
0,2024,3376,3323,4,-1,0.8,0.190476
1,2024,3376,3333,4,-2,,0.000000
2,2024,3376,3231,3,-3,,0.000000
3,2024,3376,3328,3,-4,,0.615385
4,2024,3376,3304,4,-5,,0.000000
...,...,...,...,...,...,...,...
4027,2024,3394,3112,4,5,,
4028,2024,3394,3435,3,4,,-0.800000
4029,2024,3394,3267,3,3,,0.000000
4030,2024,3394,3238,4,2,,-0.266667


Get team names

In [42]:
# Team ID -> team name lookup table.
# Forward slashes keep the relative path usable on every OS (a backslash-only path
# resolves on Windows alone) and match the other forward-slash paths in this notebook.
df_teams = pd.read_csv('../data/unprocessed/kaggle/WTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M
...,...,...
371,3474,Queens NC
372,3475,Southern Indiana
373,3476,Stonehill
374,3477,TX A&M Commerce


In [43]:
# Team ID -> team name lookup.
id_to_team = df_teams.set_index('TeamID')['TeamName'].to_dict()

# Put a readable team-name column directly after each ID column.
for side in ('A', 'B'):
    id_column = f'Team {side} ID'
    df_mod.insert(
        df_mod.columns.get_loc(id_column) + 1,
        f'Team {side}',
        df_mod[id_column].map(id_to_team),
    )

df_mod

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Round,Seed,Head to Head,Common Opps
0,2024,3376,South Carolina,3323,Notre Dame,4,-1,0.8,0.190476
1,2024,3376,South Carolina,3333,Oregon St,4,-2,,0.000000
2,2024,3376,South Carolina,3231,Indiana,3,-3,,0.000000
3,2024,3376,South Carolina,3328,Oklahoma,3,-4,,0.615385
4,2024,3376,South Carolina,3304,Nebraska,4,-5,,0.000000
...,...,...,...,...,...,...,...,...,...
4027,2024,3394,TAM C. Christi,3112,Arizona,4,5,,
4028,2024,3394,TAM C. Christi,3435,Vanderbilt,3,4,,-0.800000
4029,2024,3394,TAM C. Christi,3267,Marshall,3,3,,0.000000
4030,2024,3394,TAM C. Christi,3238,Jackson St,4,2,,-0.266667


Map features

In [44]:
def _team_features(id_column):
    """Return the per-team feature rows of `df` for the team named by `id_column`.

    Parameters
    ----------
    id_column : str
        Column of `df_mod` holding the team ID to look up ('Team A ID' or 'Team B ID').

    Returns
    -------
    pandas.DataFrame
        One row per `df_mod` row, containing the columns of `df` without
        'Season', 'Team' and 'TeamID'.

    Raises
    ------
    AssertionError
        If the merge does not return exactly one row per matchup, which happens
        when `df` holds several rows for the same (Season, TeamID).
    """
    features = pd.merge(
        df_mod[['Season', id_column]],
        df.drop(columns=['Team']),
        how='left',
        left_on=['Season', id_column],
        right_on=['Season', 'TeamID'],
    ).drop(columns=['Season', id_column, 'TeamID'])

    # The A and B frames are combined row-by-row below, so each must keep exactly
    # one row per matchup; a duplicated team row would silently misalign them.
    assert len(features) == len(df_mod), (
        f'Feature merge on {id_column!r} returned {len(features)} rows '
        f'for {len(df_mod)} matchups (duplicate Season / TeamID rows in df?)'
    )

    return features


team_a_features = _team_features('Team A ID')
team_b_features = _team_features('Team B ID')

# Every shared feature becomes a Team A minus Team B difference.
df_features = team_a_features - team_b_features

# Cross terms: each side's adjusted offense added to the other side's adjusted defense.
df_features['Team A Offense Team B Defense'] = team_a_features['Adjusted Offense'] + team_b_features['Adjusted Defense']
df_features['Team B Offense Team A Defense'] = team_b_features['Adjusted Offense'] + team_a_features['Adjusted Defense']

# Keep each team's own rating alongside the rating difference.
df_features['Team A Rating'] = team_a_features['Rating']
df_features['Team B Rating'] = team_b_features['Rating']

df_features

Unnamed: 0,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A Rating,Team B Rating
0,2.0,3.666667,1.677515,0.150893,0.111859,-0.039034,-0.873797,2.188902,2.775358,0.384301,0.187500,0.047505,-0.056051,-1.061853,3.408742,-2.134985,-0.799940,0.080383,-0.022304,0.063816,0.000000,0.320890,0.320890,0.313564,0.266887,1.892285,1.741391,5.160799,3.483285
1,5.0,5.000000,1.856021,0.209952,0.099539,-0.110414,5.711093,4.535940,3.049327,0.463122,0.225806,0.014910,-0.036465,-0.586543,5.439675,1.698984,0.722661,0.087003,-0.020147,0.032381,0.063557,0.313534,0.228939,0.172543,0.116888,1.963664,1.753712,5.160799,3.304778
2,3.0,2.666667,1.934765,0.193224,0.058844,-0.134380,3.952402,1.295923,1.608844,0.545906,0.172414,-0.036193,-0.087811,-0.531228,7.925315,-0.891497,-3.188423,0.081475,0.011523,0.111555,0.047679,0.327992,0.324371,0.318939,0.158716,1.987630,1.794407,5.160799,3.226035
3,3.0,4.333333,2.528010,0.275367,0.148206,-0.127161,-3.364445,2.695256,3.058868,0.492657,0.290323,0.060606,-0.078779,-2.942566,1.600584,3.652996,-4.678140,0.041119,0.021010,0.044151,0.036348,0.627680,0.366476,0.291587,0.283052,1.980412,1.705045,5.160799,2.632790
4,5.0,5.333333,2.699518,0.266825,0.130864,-0.135962,3.675839,3.838510,3.253950,0.426569,0.333333,0.053260,-0.097758,-1.848205,1.090692,-1.072356,-2.473125,0.081475,0.047013,0.075100,0.047679,0.539362,0.384151,0.330857,0.266719,1.989212,1.722387,5.160799,2.461281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,-2.0,-3.333333,-1.610451,-0.276814,-0.173857,0.102957,0.636050,-3.193604,-3.467110,-0.148683,0.172454,-0.035215,-0.076014,3.724479,2.868778,4.973892,-2.520881,-0.222198,-0.309172,-0.344231,-0.408894,-0.233159,-0.308589,-0.261297,-0.324203,1.646356,1.923170,0.525438,2.135889
4028,0.0,0.000000,-1.637236,-0.214683,-0.139077,0.075606,1.648309,-0.988002,-1.252461,-0.113464,-0.005974,-0.021660,-0.046688,2.866825,-1.548830,6.540606,1.890322,-0.216424,-0.215977,-0.265637,-0.233168,-0.264084,-0.246651,-0.242095,-0.242446,1.673707,1.888390,0.525438,2.162675
4029,0.0,0.000000,-1.368758,-0.160079,-0.170530,-0.010450,-6.820722,-0.252116,-0.385017,-0.129930,-0.096296,-0.062453,-0.083221,2.648865,-2.178235,5.303570,-4.467634,-0.133362,-0.116771,-0.106718,-0.164571,-0.028962,0.033898,0.093609,0.036574,1.759763,1.919843,0.525438,1.894196
4030,0.0,-0.666667,-0.972468,-0.102030,-0.083433,0.018598,1.390612,-1.220787,-1.294450,-0.102088,-0.089400,0.006565,0.024867,2.288782,-3.901513,-4.138111,-4.680749,0.037251,0.004374,0.021326,-0.084047,-0.254938,-0.225853,-0.196715,-0.249219,1.730715,1.832746,0.525438,1.497907


In [45]:
# Copy every engineered feature column onto the matchup frame under the same column names.
# NOTE(review): this relies on df_features rows lining up with df_mod rows (same row index) - confirm if an earlier step changes the index.
df_mod[df_features.columns] = df_features

df_mod

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Round,Seed,Head to Head,Common Opps,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A Rating,Team B Rating
0,2024,3376,South Carolina,3323,Notre Dame,4,-1,0.8,0.190476,2.0,3.666667,1.677515,0.150893,0.111859,-0.039034,-0.873797,2.188902,2.775358,0.384301,0.187500,0.047505,-0.056051,-1.061853,3.408742,-2.134985,-0.799940,0.080383,-0.022304,0.063816,0.000000,0.320890,0.320890,0.313564,0.266887,1.892285,1.741391,5.160799,3.483285
1,2024,3376,South Carolina,3333,Oregon St,4,-2,,0.000000,5.0,5.000000,1.856021,0.209952,0.099539,-0.110414,5.711093,4.535940,3.049327,0.463122,0.225806,0.014910,-0.036465,-0.586543,5.439675,1.698984,0.722661,0.087003,-0.020147,0.032381,0.063557,0.313534,0.228939,0.172543,0.116888,1.963664,1.753712,5.160799,3.304778
2,2024,3376,South Carolina,3231,Indiana,3,-3,,0.000000,3.0,2.666667,1.934765,0.193224,0.058844,-0.134380,3.952402,1.295923,1.608844,0.545906,0.172414,-0.036193,-0.087811,-0.531228,7.925315,-0.891497,-3.188423,0.081475,0.011523,0.111555,0.047679,0.327992,0.324371,0.318939,0.158716,1.987630,1.794407,5.160799,3.226035
3,2024,3376,South Carolina,3328,Oklahoma,3,-4,,0.615385,3.0,4.333333,2.528010,0.275367,0.148206,-0.127161,-3.364445,2.695256,3.058868,0.492657,0.290323,0.060606,-0.078779,-2.942566,1.600584,3.652996,-4.678140,0.041119,0.021010,0.044151,0.036348,0.627680,0.366476,0.291587,0.283052,1.980412,1.705045,5.160799,2.632790
4,2024,3376,South Carolina,3304,Nebraska,4,-5,,0.000000,5.0,5.333333,2.699518,0.266825,0.130864,-0.135962,3.675839,3.838510,3.253950,0.426569,0.333333,0.053260,-0.097758,-1.848205,1.090692,-1.072356,-2.473125,0.081475,0.047013,0.075100,0.047679,0.539362,0.384151,0.330857,0.266719,1.989212,1.722387,5.160799,2.461281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,2024,3394,TAM C. Christi,3112,Arizona,4,5,,,-2.0,-3.333333,-1.610451,-0.276814,-0.173857,0.102957,0.636050,-3.193604,-3.467110,-0.148683,0.172454,-0.035215,-0.076014,3.724479,2.868778,4.973892,-2.520881,-0.222198,-0.309172,-0.344231,-0.408894,-0.233159,-0.308589,-0.261297,-0.324203,1.646356,1.923170,0.525438,2.135889
4028,2024,3394,TAM C. Christi,3435,Vanderbilt,3,4,,-0.800000,0.0,0.000000,-1.637236,-0.214683,-0.139077,0.075606,1.648309,-0.988002,-1.252461,-0.113464,-0.005974,-0.021660,-0.046688,2.866825,-1.548830,6.540606,1.890322,-0.216424,-0.215977,-0.265637,-0.233168,-0.264084,-0.246651,-0.242095,-0.242446,1.673707,1.888390,0.525438,2.162675
4029,2024,3394,TAM C. Christi,3267,Marshall,3,3,,0.000000,0.0,0.000000,-1.368758,-0.160079,-0.170530,-0.010450,-6.820722,-0.252116,-0.385017,-0.129930,-0.096296,-0.062453,-0.083221,2.648865,-2.178235,5.303570,-4.467634,-0.133362,-0.116771,-0.106718,-0.164571,-0.028962,0.033898,0.093609,0.036574,1.759763,1.919843,0.525438,1.894196
4030,2024,3394,TAM C. Christi,3238,Jackson St,4,2,,-0.266667,0.0,-0.666667,-0.972468,-0.102030,-0.083433,0.018598,1.390612,-1.220787,-1.294450,-0.102088,-0.089400,0.006565,0.024867,2.288782,-3.901513,-4.138111,-4.680749,0.037251,0.004374,0.021326,-0.084047,-0.254938,-0.225853,-0.196715,-0.249219,1.730715,1.832746,0.525438,1.497907


Check that the data follows the same format as the data the model was trained on

In [46]:
# Training-time frame saved next to the model; apart from its 'Result' column it
# defines the column names and order the prediction frame must reproduce.
df_mod_training = pd.read_csv(f'{model_path}/df_mod.csv')

expected_columns = df_mod_training.drop(columns=['Result']).columns.tolist()
actual_columns = df_mod.columns.tolist()

# Compare plain lists rather than Index objects: an element-wise `==` between two
# Index objects of different lengths raises ValueError before the assertion message
# can be shown, whereas list equality handles a length mismatch and checks order too.
assert expected_columns == actual_columns, (
    'Columns do not match: '
    f'missing={[col for col in expected_columns if col not in actual_columns]}, '
    f'unexpected={[col for col in actual_columns if col not in expected_columns]}'
)

'Columns Match'

'Columns Match'

### Get Model Predictions

In [47]:
import pickle

# Restore the fitted classifier stored under model_path.
# NOTE: unpickling can execute arbitrary code, so only load model files from a trusted source.
with open(f'{model_path}/model.pkl', 'rb') as model_file:
    mod = pickle.load(model_file)

mod

  from pandas import MultiIndex, Int64Index


XGBClassifier(alpha=0.14137785435818806, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.024349339363753667, enable_categorical=False,
              eta=0.010757573288552503, eval_metric='logloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              lambda=1.726453598996863, learning_rate=0.0107575729,
              max_delta_s...
                                    'Rating': 1, 'Seed': -1, 'Starters': 1,
                                    'Team A Offense Team B Defense': 1,
                                    'Team A Rating': 1,
                                    'Team B Offense Team A Defense': -1,
                                    'Team B Rating': -1, 'Team EFG%': 1,
                                    'Team FTR': 1, 'Team ORBR': 1,
                                    'Team TOR': -1, ...},
              n_estimators=1000, n_jobs=12, num_parallel_tree=1,
       

In [48]:
# Identifier columns are not model inputs; everything else in df_mod is a feature.
identifier_columns = ['Season', 'Team A ID', 'Team A', 'Team B ID', 'Team B']

X = df_mod.drop(columns=identifier_columns)

# predict_proba returns one column per class; keep column 1 for each matchup row.
class_probabilities = mod.predict_proba(X)
predictions = class_probabilities[:, 1]

predictions

array([0.89475393, 0.92130953, 0.898935  , ..., 0.27448252, 0.19005434,
       0.37895563], dtype=float32)

In [49]:
# Pair every prediction with its (Team A ID, Team B ID) matchup ...
df_predictions = df_mod[['Team A ID', 'Team B ID']].copy()
df_predictions['Prediction'] = predictions

# ... then reshape to a matrix: one row per Team A ID, one column per Team B ID.
df_matrix = df_predictions.pivot(
    index=['Team A ID'],
    columns=['Team B ID'],
    values='Prediction',
)

df_matrix

Team B ID,3104,3112,3124,3151,3160,3163,3166,3179,3180,3181,3186,3193,3195,3199,3211,3231,3234,3235,3238,3242,3243,3245,3257,3261,3263,3266,3267,3268,3276,3277,3279,3292,3301,3304,3313,3314,3323,3326,3328,3333,3339,3342,3343,3349,3350,3355,3376,3390,3393,3394,3397,3400,3401,3404,3414,3417,3424,3425,3428,3435,3439,3452,3453,3465
Team A ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
3104,,0.521184,0.347804,0.845126,0.250675,0.101080,0.380383,0.699852,0.925676,0.404095,0.832038,0.768824,0.630972,0.453428,0.256026,0.240352,0.114150,0.437386,0.824498,0.517816,0.281354,0.862755,0.371704,0.150066,0.909535,0.544749,0.847411,0.361256,0.574312,0.468190,0.415579,0.654988,0.205879,0.482425,0.909899,0.477075,0.173332,0.203785,0.478682,0.292676,0.868428,0.971064,0.474391,0.898960,0.638667,0.660787,0.024604,0.146368,0.438078,0.948260,0.469361,0.171866,0.647185,0.953701,0.903293,0.163649,0.598642,0.208284,0.267357,0.660744,0.307658,0.427203,0.717745,0.904074
3112,0.477344,,0.331136,0.840633,0.297390,0.098005,0.423056,0.748483,0.943858,0.394052,0.836875,0.797877,0.695347,0.524942,0.297484,0.286420,0.155751,0.492103,0.823959,0.548234,0.287015,0.912708,0.322086,0.172629,0.919414,0.598075,0.867872,0.471737,0.567239,0.494023,0.436722,0.622254,0.259438,0.447645,0.880990,0.474341,0.190819,0.213300,0.408628,0.247886,0.889932,0.966642,0.498033,0.903418,0.680303,0.632908,0.037568,0.179224,0.470503,0.949630,0.405006,0.167244,0.624448,0.954706,0.910215,0.147374,0.559064,0.156113,0.406660,0.681619,0.274489,0.390448,0.746872,0.874859
3124,0.652978,0.659506,,0.929609,0.442287,0.192494,0.605474,0.802819,0.968277,0.645152,0.918279,0.848649,0.758043,0.632431,0.496476,0.393174,0.189289,0.561117,0.912978,0.706138,0.443938,0.940537,0.478953,0.309574,0.960341,0.732502,0.903784,0.553748,0.712977,0.658833,0.700559,0.768428,0.400755,0.625928,0.940427,0.631152,0.335975,0.350932,0.550915,0.412694,0.932795,0.976943,0.672441,0.944655,0.811523,0.776252,0.060435,0.262238,0.623866,0.967238,0.573242,0.316506,0.777282,0.966158,0.957429,0.245482,0.741917,0.316162,0.437016,0.740714,0.439934,0.676922,0.831170,0.938884
3151,0.150750,0.155360,0.067746,,0.061217,0.030585,0.093395,0.207331,0.691655,0.093231,0.349582,0.285181,0.224864,0.105663,0.065868,0.044719,0.028496,0.176310,0.469457,0.165594,0.065438,0.619395,0.090137,0.034779,0.589135,0.162391,0.347589,0.098561,0.178657,0.113730,0.128612,0.186929,0.049727,0.131426,0.666136,0.127699,0.044669,0.042566,0.088467,0.076042,0.417571,0.939272,0.116811,0.563212,0.169017,0.245131,0.018210,0.029026,0.147821,0.774530,0.107235,0.035490,0.182853,0.820359,0.664687,0.035136,0.221535,0.053285,0.055186,0.159928,0.059495,0.098371,0.256384,0.464767
3160,0.752506,0.701303,0.558338,0.937877,,0.276365,0.706403,0.834906,0.964217,0.680658,0.928876,0.887042,0.818318,0.720530,0.543535,0.494662,0.302870,0.679713,0.940728,0.677175,0.553016,0.951482,0.535063,0.477424,0.955726,0.742203,0.913576,0.613433,0.751757,0.662192,0.725364,0.781370,0.376043,0.697753,0.955757,0.669940,0.431768,0.388586,0.666719,0.496586,0.930709,0.973464,0.709527,0.960398,0.846444,0.817831,0.082131,0.279669,0.653113,0.969401,0.667435,0.284810,0.824719,0.968737,0.954623,0.258008,0.787519,0.460750,0.517382,0.830675,0.485833,0.660561,0.849071,0.946706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,0.339104,0.319329,0.255686,0.833759,0.168091,0.054151,0.301576,0.511483,0.927389,0.270354,0.775962,0.641790,0.473161,0.320910,0.171472,0.184116,0.071823,0.378934,0.756970,0.425249,0.257226,0.827675,0.232086,0.112074,0.845883,0.409321,0.767428,0.260729,0.367173,0.354177,0.276671,0.467289,0.123574,0.299480,0.859290,0.322145,0.118175,0.108296,0.307109,0.149732,0.808941,0.959817,0.317879,0.838821,0.455278,0.433231,0.019508,0.072967,0.348004,0.921435,0.234085,0.071139,0.479711,0.957660,0.849914,0.059182,0.486037,0.119486,0.173977,,0.214202,0.286497,0.592927,0.842414
3439,0.692128,0.728256,0.560401,0.938994,0.516299,0.228211,0.633083,0.812597,0.972332,0.657870,0.923296,0.903538,0.770672,0.632246,0.551590,0.475672,0.267774,0.687456,0.945929,0.772464,0.573178,0.958841,0.631234,0.328696,0.966588,0.755712,0.925997,0.607825,0.770900,0.728034,0.707536,0.801199,0.455274,0.689594,0.964814,0.707200,0.337106,0.400119,0.615254,0.479567,0.955250,0.980677,0.682199,0.967247,0.861769,0.763732,0.082886,0.275462,0.735339,0.974550,0.591655,0.315822,0.831045,0.976038,0.967087,0.319559,0.811937,0.412625,0.505397,0.791473,,0.621158,0.860625,0.946466
3452,0.571014,0.598257,0.317264,0.900176,0.338586,0.153017,0.530177,0.765255,0.951096,0.512846,0.879606,0.855972,0.684696,0.542708,0.433238,0.281080,0.179561,0.606706,0.894466,0.683581,0.346597,0.916520,0.390688,0.217216,0.928981,0.688790,0.854119,0.495867,0.741072,0.553059,0.570101,0.736722,0.308184,0.580858,0.940773,0.565596,0.286842,0.315676,0.525694,0.372008,0.913864,0.961326,0.531940,0.926542,0.734215,0.719268,0.045703,0.188680,0.505111,0.952410,0.515475,0.181460,0.748449,0.966702,0.943993,0.202195,0.671206,0.236975,0.365956,0.705336,0.377976,,0.794705,0.910908
3453,0.284604,0.251995,0.173385,0.740580,0.152940,0.054117,0.232065,0.426554,0.874650,0.221867,0.661315,0.598126,0.375509,0.260757,0.174915,0.109885,0.054455,0.233003,0.636768,0.250974,0.166435,0.775474,0.130207,0.072111,0.794056,0.357147,0.617686,0.145856,0.305952,0.259292,0.262437,0.369262,0.115364,0.235964,0.797737,0.195905,0.079070,0.079942,0.177487,0.160281,0.746859,0.964975,0.222821,0.759159,0.453660,0.434746,0.025935,0.065284,0.254402,0.883502,0.207023,0.068036,0.390004,0.925336,0.820359,0.081205,0.332090,0.097331,0.157043,0.405151,0.141238,0.207829,,0.705875


In [50]:
# Human-readable copy of the matrix: team IDs on both axes are replaced by team names.
# df_matrix itself keeps its ID labels.
df_matrix_display = (
    df_matrix
    .set_axis(df_matrix.index.map(id_to_team), axis=0)
    .set_axis(df_matrix.columns.map(id_to_team), axis=1)
)

df_matrix_display

Team B ID,Alabama,Arizona,Baylor,Chattanooga,Colorado,Connecticut,Creighton,Drake,Drexel,Duke,E Washington,Fairfield,FL Gulf Coast,Florida St,Gonzaga,Indiana,Iowa,Iowa St,Jackson St,Kansas,Kansas St,Kent,Louisville,LSU,Maine,Marquette,Marshall,Maryland,Michigan,Michigan St,Mississippi,MTSU,NC State,Nebraska,Norfolk St,North Carolina,Notre Dame,Ohio St,Oklahoma,Oregon St,Portland,Presbyterian,Princeton,Rice,Richmond,S Dakota St,South Carolina,Stanford,Syracuse,TAM C. Christi,Tennessee,Texas,Texas A&M,TN Martin,UC Irvine,UCLA,UNLV,USC,Utah,Vanderbilt,Virginia Tech,West Virginia,WI Green Bay,Cal Baptist
Team A ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
Alabama,,0.521184,0.347804,0.845126,0.250675,0.101080,0.380383,0.699852,0.925676,0.404095,0.832038,0.768824,0.630972,0.453428,0.256026,0.240352,0.114150,0.437386,0.824498,0.517816,0.281354,0.862755,0.371704,0.150066,0.909535,0.544749,0.847411,0.361256,0.574312,0.468190,0.415579,0.654988,0.205879,0.482425,0.909899,0.477075,0.173332,0.203785,0.478682,0.292676,0.868428,0.971064,0.474391,0.898960,0.638667,0.660787,0.024604,0.146368,0.438078,0.948260,0.469361,0.171866,0.647185,0.953701,0.903293,0.163649,0.598642,0.208284,0.267357,0.660744,0.307658,0.427203,0.717745,0.904074
Arizona,0.477344,,0.331136,0.840633,0.297390,0.098005,0.423056,0.748483,0.943858,0.394052,0.836875,0.797877,0.695347,0.524942,0.297484,0.286420,0.155751,0.492103,0.823959,0.548234,0.287015,0.912708,0.322086,0.172629,0.919414,0.598075,0.867872,0.471737,0.567239,0.494023,0.436722,0.622254,0.259438,0.447645,0.880990,0.474341,0.190819,0.213300,0.408628,0.247886,0.889932,0.966642,0.498033,0.903418,0.680303,0.632908,0.037568,0.179224,0.470503,0.949630,0.405006,0.167244,0.624448,0.954706,0.910215,0.147374,0.559064,0.156113,0.406660,0.681619,0.274489,0.390448,0.746872,0.874859
Baylor,0.652978,0.659506,,0.929609,0.442287,0.192494,0.605474,0.802819,0.968277,0.645152,0.918279,0.848649,0.758043,0.632431,0.496476,0.393174,0.189289,0.561117,0.912978,0.706138,0.443938,0.940537,0.478953,0.309574,0.960341,0.732502,0.903784,0.553748,0.712977,0.658833,0.700559,0.768428,0.400755,0.625928,0.940427,0.631152,0.335975,0.350932,0.550915,0.412694,0.932795,0.976943,0.672441,0.944655,0.811523,0.776252,0.060435,0.262238,0.623866,0.967238,0.573242,0.316506,0.777282,0.966158,0.957429,0.245482,0.741917,0.316162,0.437016,0.740714,0.439934,0.676922,0.831170,0.938884
Chattanooga,0.150750,0.155360,0.067746,,0.061217,0.030585,0.093395,0.207331,0.691655,0.093231,0.349582,0.285181,0.224864,0.105663,0.065868,0.044719,0.028496,0.176310,0.469457,0.165594,0.065438,0.619395,0.090137,0.034779,0.589135,0.162391,0.347589,0.098561,0.178657,0.113730,0.128612,0.186929,0.049727,0.131426,0.666136,0.127699,0.044669,0.042566,0.088467,0.076042,0.417571,0.939272,0.116811,0.563212,0.169017,0.245131,0.018210,0.029026,0.147821,0.774530,0.107235,0.035490,0.182853,0.820359,0.664687,0.035136,0.221535,0.053285,0.055186,0.159928,0.059495,0.098371,0.256384,0.464767
Colorado,0.752506,0.701303,0.558338,0.937877,,0.276365,0.706403,0.834906,0.964217,0.680658,0.928876,0.887042,0.818318,0.720530,0.543535,0.494662,0.302870,0.679713,0.940728,0.677175,0.553016,0.951482,0.535063,0.477424,0.955726,0.742203,0.913576,0.613433,0.751757,0.662192,0.725364,0.781370,0.376043,0.697753,0.955757,0.669940,0.431768,0.388586,0.666719,0.496586,0.930709,0.973464,0.709527,0.960398,0.846444,0.817831,0.082131,0.279669,0.653113,0.969401,0.667435,0.284810,0.824719,0.968737,0.954623,0.258008,0.787519,0.460750,0.517382,0.830675,0.485833,0.660561,0.849071,0.946706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vanderbilt,0.339104,0.319329,0.255686,0.833759,0.168091,0.054151,0.301576,0.511483,0.927389,0.270354,0.775962,0.641790,0.473161,0.320910,0.171472,0.184116,0.071823,0.378934,0.756970,0.425249,0.257226,0.827675,0.232086,0.112074,0.845883,0.409321,0.767428,0.260729,0.367173,0.354177,0.276671,0.467289,0.123574,0.299480,0.859290,0.322145,0.118175,0.108296,0.307109,0.149732,0.808941,0.959817,0.317879,0.838821,0.455278,0.433231,0.019508,0.072967,0.348004,0.921435,0.234085,0.071139,0.479711,0.957660,0.849914,0.059182,0.486037,0.119486,0.173977,,0.214202,0.286497,0.592927,0.842414
Virginia Tech,0.692128,0.728256,0.560401,0.938994,0.516299,0.228211,0.633083,0.812597,0.972332,0.657870,0.923296,0.903538,0.770672,0.632246,0.551590,0.475672,0.267774,0.687456,0.945929,0.772464,0.573178,0.958841,0.631234,0.328696,0.966588,0.755712,0.925997,0.607825,0.770900,0.728034,0.707536,0.801199,0.455274,0.689594,0.964814,0.707200,0.337106,0.400119,0.615254,0.479567,0.955250,0.980677,0.682199,0.967247,0.861769,0.763732,0.082886,0.275462,0.735339,0.974550,0.591655,0.315822,0.831045,0.976038,0.967087,0.319559,0.811937,0.412625,0.505397,0.791473,,0.621158,0.860625,0.946466
West Virginia,0.571014,0.598257,0.317264,0.900176,0.338586,0.153017,0.530177,0.765255,0.951096,0.512846,0.879606,0.855972,0.684696,0.542708,0.433238,0.281080,0.179561,0.606706,0.894466,0.683581,0.346597,0.916520,0.390688,0.217216,0.928981,0.688790,0.854119,0.495867,0.741072,0.553059,0.570101,0.736722,0.308184,0.580858,0.940773,0.565596,0.286842,0.315676,0.525694,0.372008,0.913864,0.961326,0.531940,0.926542,0.734215,0.719268,0.045703,0.188680,0.505111,0.952410,0.515475,0.181460,0.748449,0.966702,0.943993,0.202195,0.671206,0.236975,0.365956,0.705336,0.377976,,0.794705,0.910908
WI Green Bay,0.284604,0.251995,0.173385,0.740580,0.152940,0.054117,0.232065,0.426554,0.874650,0.221867,0.661315,0.598126,0.375509,0.260757,0.174915,0.109885,0.054455,0.233003,0.636768,0.250974,0.166435,0.775474,0.130207,0.072111,0.794056,0.357147,0.617686,0.145856,0.305952,0.259292,0.262437,0.369262,0.115364,0.235964,0.797737,0.195905,0.079070,0.079942,0.177487,0.160281,0.746859,0.964975,0.222821,0.759159,0.453660,0.434746,0.025935,0.065284,0.254402,0.883502,0.207023,0.068036,0.390004,0.925336,0.820359,0.081205,0.332090,0.097331,0.157043,0.405151,0.141238,0.207829,,0.705875


In [51]:
# Write both versions of the matchup matrix (ID-labelled and name-labelled), keeping the row labels.
for matrix, csv_path in (
    (df_matrix, f'../data/preprocessed/womens_year_data/{season}_womens_matchup_matrix.csv'),
    (df_matrix_display, f'../data/preprocessed/womens_year_data/{season}_womens_matchup_matrix_display.csv'),
):
    matrix.to_csv(csv_path, index=True)

'Done'

'Done'