# Get Women's Features

Compile all feature engineering into a model-ready dataframe. 

### Previous Tournament Results

In [1]:
import pandas as pd

# Allow up to 100 columns to be shown when a dataframe is displayed.
pd.set_option('display.max_columns', 100)

# Load the previous-tournament-result features and leave out the 2020 and 2024 seasons.
df = (
    pd.read_csv(r'..\data\preprocessed\kaggle\womens_tournament_results.csv')
    .loc[lambda results: ~results['Season'].isin([2020, 2024])]
    .reset_index(drop=True)
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2012,3101,Abilene Chr,-1.0,-1.0
1,2012,3102,Air Force,-1.0,-1.0
2,2012,3103,Akron,-1.0,-1.0
3,2012,3104,Alabama,-1.0,-1.0
4,2012,3105,Alabama A&M,-1.0,-1.0
...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0
4132,2023,3475,Southern Indiana,-1.0,-1.0
4133,2023,3476,Stonehill,-1.0,-1.0
4134,2023,3477,TX A&M Commerce,-1.0,-1.0


### My Rankings

In [2]:
# Stack the per-season ranking files for 2012-2023 (2020 is skipped),
# tagging every row with the season its file belongs to.
df_rankings = pd.concat(
    [
        pd.read_csv(
            fr'..\data\preprocessed\womens_my_rankings\womens_my_rankings_{season}.csv'
        ).assign(Season=season)
        for season in range(2012, 2024)
        if season != 2020
    ],
    ignore_index=True,
)

# Move 'Season' to the first column position.
df_rankings.insert(0, 'Season', df_rankings.pop('Season'))

# 'Strength' is not carried into the feature table.
df_rankings = df_rankings.drop(columns=['Strength'])

df_rankings

Unnamed: 0,Season,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,Baylor,5.279253,0.542235,1.177240,0.635006,71.857692
1,2012,Notre Dame,4.785555,0.532910,1.157420,0.624510,73.949315
2,2012,Connecticut,4.755495,0.607852,1.175693,0.567841,69.774220
3,2012,Stanford,4.530613,0.445364,1.147792,0.702428,69.743591
4,2012,Delaware,4.078766,0.287002,1.058902,0.771900,67.928649
...,...,...,...,...,...,...,...
3837,2023,Texas Southern,-3.845266,-0.242638,0.778726,1.021364,75.246052
3838,2023,Navy,-4.046983,-0.332233,0.715587,1.047820,71.192033
3839,2023,Mississippi Valley State,-4.377937,-0.374314,0.709382,1.083696,73.554934
3840,2023,Saint Peter's,-4.740745,-0.346663,0.676954,1.023618,71.031943


In [3]:
# cp1252 decoding fixes an issue with fancy quotes in the spellings file.
df_spellings = pd.read_csv(r'..\data\unprocessed\kaggle\WTeamSpellings.csv', encoding='cp1252')

# Append one extra spelling ('fdu' -> TeamID 3192) under the next integer row label.
df_spellings.loc[len(df_spellings)] = ['fdu', 3192]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,3394
1,a&m-corpus christi,3394
2,abilene chr,3101
3,abilene christian,3101
4,abilene-christian,3101
...,...,...
1154,youngstown st.,3464
1155,youngstown state,3464
1156,youngstown-st,3464
1157,youngstown-state,3464


In [4]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
# tqdm.auto selects the same notebook/console progress bar as tqdm.autonotebook
# but does not emit the experimental-feature warning on import.
from tqdm.auto import tqdm

# Distinct candidate spellings and distinct team names used in the rankings.
team_spellings = df_spellings['TeamNameSpelling'].unique()
my_teams = df_rankings['Team'].unique()

# For every ranking team name keep only the single best fuzzy match:
# process.extract(..., limit=1)[0] is the top candidate and its first two
# entries are (matched spelling, match score).
df_match = pd.DataFrame(
    [
        [
            my_team,
            *process.extract(
                my_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for my_team in tqdm(my_teams)
    ],
    columns=['My Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

# Sorted ascending by score, so the weakest matches are shown first for inspection.
df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,My Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Savannah State Tigers,savannah state,80
4,St. Thomas,st thomas mn,86
5,Kansas City,mo kansas city,88
6,Texas A&M-Commerce,tx a&m commerce,91
7,Baylor,baylor,100
8,Youngstown State,youngstown state,100
9,Georgia State,georgia state,100


In [5]:
# Ranking team name -> best-matching spelling.
ranking_to_spelling = {
    team_name: spelling
    for team_name, spelling in zip(df_match['My Team'], df_match['Team Spelling'])
}
# Spelling -> TeamID, taken from the spellings table.
spelling_to_id = {
    spelling: team_id
    for spelling, team_id in zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID'])
}

# Resolve each ranking row to a TeamID in two hops (name -> spelling -> ID)
# and place the result as the second column.
df_rankings.insert(
    1,
    'TeamID',
    df_rankings['Team']
    .map(ranking_to_spelling)
    .map(spelling_to_id),
)

df_rankings

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,3124,Baylor,5.279253,0.542235,1.177240,0.635006,71.857692
1,2012,3323,Notre Dame,4.785555,0.532910,1.157420,0.624510,73.949315
2,2012,3163,Connecticut,4.755495,0.607852,1.175693,0.567841,69.774220
3,2012,3390,Stanford,4.530613,0.445364,1.147792,0.702428,69.743591
4,2012,3174,Delaware,4.078766,0.287002,1.058902,0.771900,67.928649
...,...,...,...,...,...,...,...,...
3837,2023,3411,Texas Southern,-3.845266,-0.242638,0.778726,1.021364,75.246052
3838,2023,3298,Navy,-4.046983,-0.332233,0.715587,1.047820,71.192033
3839,2023,3290,Mississippi Valley State,-4.377937,-0.374314,0.709382,1.083696,73.554934
3840,2023,3389,Saint Peter's,-4.740745,-0.346663,0.676954,1.023618,71.031943


In [6]:
# Ranking rows whose team name did not resolve to a TeamID.
df_rankings[df_rankings['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo


In [7]:
# Left-join the ranking columns onto the feature table by season and team ID.
# The rankings' own 'Team' column is dropped first, so only 'Team' from df remains.
df = df.merge(
    df_rankings.drop(columns='Team'),
    on=['Season', 'TeamID'],
    how='left',
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,
1,2012,3102,Air Force,-1.0,-1.0,-1.831277,-0.217915,0.732056,0.949972,74.575075
2,2012,3103,Akron,-1.0,-1.0,-0.152271,-0.004095,0.932742,0.936837,77.106955
3,2012,3104,Alabama,-1.0,-1.0,-0.134440,0.014331,0.843149,0.828818,74.567105
4,2012,3105,Alabama A&M,-1.0,-1.0,-0.401164,-0.079746,0.830830,0.910576,68.810632
...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756


In [8]:
# Feature rows where 'Rating' is missing after the merge.
df[df['Rating'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,
8,2012,3109,Alliant Intl,-1.0,-1.0,,,,,
17,2012,3118,Armstrong St,-1.0,-1.0,,,,,
20,2012,3121,Augusta,-1.0,-1.0,,,,,
27,2012,3128,Birmingham So,-1.0,-1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4024,2023,3366,Savannah St,-1.0,-1.0,,,,,
4090,2023,3432,Utica,-1.0,-1.0,,,,,
4102,2023,3445,W Salem St,-1.0,-1.0,,,,,
4103,2023,3446,W Texas A&M,-1.0,-1.0,,,,,


### Previous Rankings

In [9]:
# Load the past-year / past-4-years rating table.
# NOTE(review): unlike the other loads in this notebook, this path has no leading
# '..\' - confirm it resolves to the intended file from the notebook's working directory.
df_prev = pd.read_csv(
    r'data\preprocessed\womens_my_rankings_full_season\womens_my_rankings_full_season.csv'
)

df_prev

Unnamed: 0,Season,Team,Past Year Rating,Past 4 Years Ratings
0,2012,Connecticut,5.099069,5.576978
1,2012,Stanford,4.731230,4.639040
2,2012,Baylor,4.546418,3.658880
3,2012,Texas A&M,4.457265,3.738744
4,2012,Tennessee,4.306993,4.172733
...,...,...,...,...
4531,2024,Texas Southern,-3.813642,-1.357948
4532,2024,Navy,-4.039280,-2.028372
4533,2024,Mississippi Valley State,-4.365660,-3.570037
4534,2024,Saint Peter's,-4.708909,-2.127877


In [10]:
# Distinct team names appearing in the previous-rankings table.
prev_teams = df_prev['Team'].unique()

# One row per team: (team name, best fuzzy-matched spelling, match score).
df_match = (
    pd.DataFrame(
        [
            (prev_team,) + tuple(
                process.extract(prev_team, team_spellings, scorer=token_sort_ratio, limit=1)[0][:2]
            )
            for prev_team in tqdm(prev_teams)
        ],
        columns=['Prev Team', 'Team Spelling', 'Match Score'],
    )
    .sort_values(by='Match Score', ignore_index=True)
)

# Ascending sort puts the weakest matches at the top for inspection.
df_match.head(25)

  0%|          | 0/363 [00:00<?, ?it/s]

Unnamed: 0,Prev Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Centenary (LA) Gents,centenary (la),80
4,Savannah State Tigers,savannah state,80
5,St. Thomas,st thomas mn,86
6,Kansas City,mo kansas city,88
7,Texas A&M-Commerce,tx a&m commerce,91
8,Connecticut,connecticut,100
9,Grambling,grambling,100


In [11]:
# Previous-rankings team name -> best-matching spelling.
prev_to_spelling = {
    team_name: spelling
    for team_name, spelling in zip(df_match['Prev Team'], df_match['Team Spelling'])
}

# Name -> spelling -> TeamID, inserted as the second column.
df_prev.insert(
    1,
    'TeamID',
    df_prev['Team']
    .map(prev_to_spelling)
    .map(spelling_to_id),
)

df_prev

Unnamed: 0,Season,TeamID,Team,Past Year Rating,Past 4 Years Ratings
0,2012,3163,Connecticut,5.099069,5.576978
1,2012,3390,Stanford,4.731230,4.639040
2,2012,3124,Baylor,4.546418,3.658880
3,2012,3401,Texas A&M,4.457265,3.738744
4,2012,3397,Tennessee,4.306993,4.172733
...,...,...,...,...,...
4531,2024,3411,Texas Southern,-3.813642,-1.357948
4532,2024,3298,Navy,-4.039280,-2.028372
4533,2024,3290,Mississippi Valley State,-4.365660,-3.570037
4534,2024,3389,Saint Peter's,-4.708909,-2.127877


In [12]:
# Previous-rankings rows whose team name did not resolve to a TeamID.
df_prev[df_prev['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Rating,Past 4 Years Ratings


In [13]:
# Left-join the past-rating columns onto the feature table by season and team ID.
df = df.merge(
    df_prev.drop(columns='Team'),
    on=['Season', 'TeamID'],
    how='left',
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,
1,2012,3102,Air Force,-1.0,-1.0,-1.831277,-0.217915,0.732056,0.949972,74.575075,-1.410331,-2.079717
2,2012,3103,Akron,-1.0,-1.0,-0.152271,-0.004095,0.932742,0.936837,77.106955,-0.030574,-0.523468
3,2012,3104,Alabama,-1.0,-1.0,-0.134440,0.014331,0.843149,0.828818,74.567105,0.935387,0.158792
4,2012,3105,Alabama A&M,-1.0,-1.0,-0.401164,-0.079746,0.830830,0.910576,68.810632,-0.902314,-1.288895
...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379,,
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011,,
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488,,
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756,,


In [14]:
# Feature rows where 'Past 4 Years Ratings' is missing after the merge.
df[df['Past 4 Years Ratings'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,
8,2012,3109,Alliant Intl,-1.0,-1.0,,,,,,,
17,2012,3118,Armstrong St,-1.0,-1.0,,,,,,,
20,2012,3121,Augusta,-1.0,-1.0,,,,,,,
27,2012,3128,Birmingham So,-1.0,-1.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379,,
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011,,
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488,,
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756,,


### Starters

In [15]:
# Stack the per-season starters files for 2012-2023 (2020 is skipped),
# tagging every row with the season its file belongs to.
df_starters = pd.concat(
    [
        pd.read_csv(
            fr'..\data\preprocessed\womens_starters\womens_starters_{season}.csv'
        ).assign(Season=season)
        for season in range(2012, 2024)
        if season != 2020
    ],
    ignore_index=True,
)

# Move 'Season' to the first column position.
df_starters.insert(0, 'Season', df_starters.pop('Season'))

# Rename so this column does not collide with the 'Rating' feature already in df.
df_starters = df_starters.rename(columns={'Rating': 'Starters'})

df_starters

Unnamed: 0,Season,Team,Starters
0,2012,Baylor,1.022474
1,2012,Notre Dame,0.911295
2,2012,Connecticut,0.898498
3,2012,Stanford,0.872734
4,2012,Delaware,0.787463
...,...,...,...
3837,2023,Charleston Southern,-0.625213
3838,2023,Navy,-0.666668
3839,2023,Mississippi Valley State,-0.698719
3840,2023,Saint Peter's,-0.732723


In [16]:
# Distinct team names appearing in the starters table.
starters_teams = df_starters['Team'].unique()

# One row per team: (team name, best fuzzy-matched spelling, match score).
df_match = (
    pd.DataFrame(
        [
            (starters_team,) + tuple(
                process.extract(starters_team, team_spellings, scorer=token_sort_ratio, limit=1)[0][:2]
            )
            for starters_team in tqdm(starters_teams)
        ],
        columns=['Starters Team', 'Team Spelling', 'Match Score'],
    )
    .sort_values(by='Match Score', ignore_index=True)
)

# Ascending sort puts the weakest matches at the top for inspection.
df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,Starters Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Baylor,baylor,100
5,Western Illinois,western illinois,100
6,Texas State,texas state,100
7,Southern Illinois,southern illinois,100
8,Mississippi,mississippi,100
9,Southeastern Louisiana,southeastern louisiana,100


In [17]:
# Starters team name -> best-matching spelling.
starters_to_spelling = {
    team_name: spelling
    for team_name, spelling in zip(df_match['Starters Team'], df_match['Team Spelling'])
}

# Name -> spelling -> TeamID, inserted as the second column.
df_starters.insert(
    1,
    'TeamID',
    df_starters['Team']
    .map(starters_to_spelling)
    .map(spelling_to_id),
)

df_starters

Unnamed: 0,Season,TeamID,Team,Starters
0,2012,3124,Baylor,1.022474
1,2012,3323,Notre Dame,0.911295
2,2012,3163,Connecticut,0.898498
3,2012,3390,Stanford,0.872734
4,2012,3174,Delaware,0.787463
...,...,...,...,...
3837,2023,3149,Charleston Southern,-0.625213
3838,2023,3298,Navy,-0.666668
3839,2023,3290,Mississippi Valley State,-0.698719
3840,2023,3389,Saint Peter's,-0.732723


In [18]:
# Starters rows whose team name did not resolve to a TeamID.
df_starters[df_starters['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Starters


In [19]:
# Left-join the 'Starters' column onto the feature table by season and team ID.
df = df.merge(
    df_starters.drop(columns='Team'),
    on=['Season', 'TeamID'],
    how='left',
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,
1,2012,3102,Air Force,-1.0,-1.0,-1.831277,-0.217915,0.732056,0.949972,74.575075,-1.410331,-2.079717,-0.194364
2,2012,3103,Akron,-1.0,-1.0,-0.152271,-0.004095,0.932742,0.936837,77.106955,-0.030574,-0.523468,0.022943
3,2012,3104,Alabama,-1.0,-1.0,-0.134440,0.014331,0.843149,0.828818,74.567105,0.935387,0.158792,-0.022134
4,2012,3105,Alabama A&M,-1.0,-1.0,-0.401164,-0.079746,0.830830,0.910576,68.810632,-0.902314,-1.288895,0.063638
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379,,,-0.480801
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011,,,-0.034042
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488,,,-0.323604
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756,,,-0.072719


In [20]:
# Feature rows where 'Starters' is missing after the merge.
df[df['Starters'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,
8,2012,3109,Alliant Intl,-1.0,-1.0,,,,,,,,
17,2012,3118,Armstrong St,-1.0,-1.0,,,,,,,,
20,2012,3121,Augusta,-1.0,-1.0,,,,,,,,
27,2012,3128,Birmingham So,-1.0,-1.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4024,2023,3366,Savannah St,-1.0,-1.0,,,,,,,,
4090,2023,3432,Utica,-1.0,-1.0,,,,,,,,
4102,2023,3445,W Salem St,-1.0,-1.0,,,,,,,,
4103,2023,3446,W Texas A&M,-1.0,-1.0,,,,,,,,


### Box Score Stats

In [21]:
# Load the per-season, per-team box score statistics.
df_stats = pd.read_csv(
    r'..\data\preprocessed\womens_box_score_stats\womens_box_score_stats.csv'
)

df_stats

Unnamed: 0,Season,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR
0,2012,Air Force,0.206897,0.359092,0.465672,28.432136,24.020646,15.950771,27.764713,26.582950
1,2012,Akron,0.437500,0.479016,0.450917,24.421918,22.137917,13.804245,23.486284,24.877060
2,2012,Alabama,0.387097,0.385596,0.452268,23.786649,26.323043,16.400917,23.213107,22.586530
3,2012,Alabama A&M,0.571429,0.399033,0.398839,22.398825,22.936621,19.312337,28.515040,21.120411
4,2012,Alabama State,0.461538,0.375943,0.395007,27.683449,22.872486,19.446039,26.945086,32.327736
...,...,...,...,...,...,...,...,...,...,...
4188,2023,Wright State,0.225806,0.498593,0.534849,17.012553,17.144642,6.267888,13.380253,20.606656
4189,2023,Wyoming,0.666667,0.509020,0.450510,19.795051,16.780303,12.286629,21.591279,21.086182
4190,2023,Xavier,0.233333,0.412094,0.473729,23.822086,22.387582,11.589644,15.655137,22.642044
4191,2023,Yale,0.481481,0.420978,0.460989,23.695352,21.984882,16.213456,18.227896,26.639276


In [22]:
# Distinct team names appearing in the box score stats table.
stats_teams = df_stats['Team'].unique()

# One row per team: (team name, best fuzzy-matched spelling, match score).
df_match = (
    pd.DataFrame(
        [
            (stats_team,) + tuple(
                process.extract(stats_team, team_spellings, scorer=token_sort_ratio, limit=1)[0][:2]
            )
            for stats_team in tqdm(stats_teams)
        ],
        columns=['Stats Team', 'Team Spelling', 'Match Score'],
    )
    .sort_values(by='Match Score', ignore_index=True)
)

# Ascending sort puts the weakest matches at the top for inspection.
df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,Stats Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Savannah State Tigers,savannah state,80
4,St. Thomas,st thomas mn,86
5,Kansas City,mo kansas city,88
6,Texas A&M-Commerce,tx a&m commerce,91
7,Saint Mary's (CA),saint mary's (ca),100
8,Saint Louis,saint louis,100
9,Saint Joseph's,saint joseph's,100


In [23]:
# Stats team name -> best-matching spelling.
stats_to_spelling = {
    team_name: spelling
    for team_name, spelling in zip(df_match['Stats Team'], df_match['Team Spelling'])
}

# Name -> spelling -> TeamID, inserted as the second column.
df_stats.insert(
    1,
    'TeamID',
    df_stats['Team']
    .map(stats_to_spelling)
    .map(spelling_to_id),
)

df_stats

Unnamed: 0,Season,TeamID,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR
0,2012,3102,Air Force,0.206897,0.359092,0.465672,28.432136,24.020646,15.950771,27.764713,26.582950
1,2012,3103,Akron,0.437500,0.479016,0.450917,24.421918,22.137917,13.804245,23.486284,24.877060
2,2012,3104,Alabama,0.387097,0.385596,0.452268,23.786649,26.323043,16.400917,23.213107,22.586530
3,2012,3105,Alabama A&M,0.571429,0.399033,0.398839,22.398825,22.936621,19.312337,28.515040,21.120411
4,2012,3106,Alabama State,0.461538,0.375943,0.395007,27.683449,22.872486,19.446039,26.945086,32.327736
...,...,...,...,...,...,...,...,...,...,...,...
4188,2023,3460,Wright State,0.225806,0.498593,0.534849,17.012553,17.144642,6.267888,13.380253,20.606656
4189,2023,3461,Wyoming,0.666667,0.509020,0.450510,19.795051,16.780303,12.286629,21.591279,21.086182
4190,2023,3462,Xavier,0.233333,0.412094,0.473729,23.822086,22.387582,11.589644,15.655137,22.642044
4191,2023,3463,Yale,0.481481,0.420978,0.460989,23.695352,21.984882,16.213456,18.227896,26.639276


In [24]:
# Box score rows whose team name did not resolve to a TeamID.
df_stats[df_stats['TeamID'].isna()]

Unnamed: 0,Season,TeamID,Team,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR


In [25]:
# Left-join the box score columns onto the feature table by season and team ID.
df = df.merge(
    df_stats.drop(columns='Team'),
    on=['Season', 'TeamID'],
    how='left',
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,
1,2012,3102,Air Force,-1.0,-1.0,-1.831277,-0.217915,0.732056,0.949972,74.575075,-1.410331,-2.079717,-0.194364,0.206897,0.359092,0.465672,28.432136,24.020646,15.950771,27.764713,26.582950
2,2012,3103,Akron,-1.0,-1.0,-0.152271,-0.004095,0.932742,0.936837,77.106955,-0.030574,-0.523468,0.022943,0.437500,0.479016,0.450917,24.421918,22.137917,13.804245,23.486284,24.877060
3,2012,3104,Alabama,-1.0,-1.0,-0.134440,0.014331,0.843149,0.828818,74.567105,0.935387,0.158792,-0.022134,0.387097,0.385596,0.452268,23.786649,26.323043,16.400917,23.213107,22.586530
4,2012,3105,Alabama A&M,-1.0,-1.0,-0.401164,-0.079746,0.830830,0.910576,68.810632,-0.902314,-1.288895,0.063638,0.571429,0.399033,0.398839,22.398825,22.936621,19.312337,28.515040,21.120411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379,,,-0.480801,0.192308,0.385667,0.505617,20.778791,22.290950,11.663618,21.776474,30.733245
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011,,,-0.034042,0.346154,0.435186,0.455503,26.855660,23.368938,12.329459,26.337568,26.833733
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488,,,-0.323604,0.285714,0.441719,0.474679,21.297962,16.598459,12.889947,17.881752,17.626117
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756,,,-0.072719,0.406250,0.428839,0.481760,22.458422,21.238237,15.431194,26.566158,17.033695


In [26]:
# Feature rows where 'Team Win%' is missing after the merge.
df[df['Team Win%'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,
8,2012,3109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,
17,2012,3118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,
20,2012,3121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,
27,2012,3128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4024,2023,3366,Savannah St,-1.0,-1.0,,,,,,,,,,,,,,,,
4090,2023,3432,Utica,-1.0,-1.0,,,,,,,,,,,,,,,,
4102,2023,3445,W Salem St,-1.0,-1.0,,,,,,,,,,,,,,,,
4103,2023,3446,W Texas A&M,-1.0,-1.0,,,,,,,,,,,,,,,,


### Strength of Schedule

In [27]:
# Load the per-season, per-team strength-of-schedule table.
df_sos = pd.read_csv(
    '../data/preprocessed/womens_sos/womens_sos.csv'
)

df_sos

Unnamed: 0,Season,Team,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2012,Air Force,-0.152163,-0.083404,-0.035436,-0.006093,-0.104840,-0.049391,-0.023207,0.015430
1,2012,Akron,-0.100213,-0.021749,0.063472,0.081430,-0.182061,-0.056845,-0.035750,0.048377
2,2012,Alabama,-0.106952,0.003164,0.191008,0.291233,-0.052181,0.001447,0.082468,0.232509
3,2012,Alabama A&M,-0.157796,-0.136646,-0.080710,-0.063227,-0.165046,-0.149575,-0.128260,-0.035382
4,2012,Alabama State,-0.157796,-0.114468,-0.081206,-0.035382,-0.266922,-0.148533,-0.136646,-0.080228
...,...,...,...,...,...,...,...,...,...,...
3837,2021,Virginia,-0.072068,-0.072068,-0.072068,-0.072068,0.010475,0.023774,0.043723,0.143050
3838,2022,Delaware State,-0.493152,-0.493152,-0.493152,-0.493152,-0.362972,-0.256526,-0.169604,-0.103276
3839,2022,Morehead State,-0.336325,-0.336325,-0.336325,-0.336325,-0.243651,-0.167443,-0.109045,-0.042146
3840,2023,Hartford Hawks,-0.521199,-0.521199,-0.521199,-0.521199,-0.374166,-0.374166,-0.278828,-0.203193


In [28]:
# Distinct team names appearing in the strength-of-schedule table.
sos_teams = df_sos['Team'].unique()

# One row per team: (team name, best fuzzy-matched spelling, match score).
df_match = (
    pd.DataFrame(
        [
            (sos_team,) + tuple(
                process.extract(sos_team, team_spellings, scorer=token_sort_ratio, limit=1)[0][:2]
            )
            for sos_team in tqdm(sos_teams)
        ],
        columns=['SOS Team', 'Team Spelling', 'Match Score'],
    )
    .sort_values(by='Match Score', ignore_index=True)
)

# Ascending sort puts the weakest matches at the top for inspection.
df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,SOS Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Savannah State Tigers,savannah state,80
4,St. Thomas,st thomas mn,86
5,Kansas City,mo kansas city,88
6,Texas A&M-Commerce,tx a&m commerce,91
7,Southeastern Louisiana,southeastern louisiana,100
8,Sam Houston,sam houston,100
9,Saint Peter's,saint peter's,100


In [29]:
# SOS team name -> best-matching spelling.
sos_to_spelling = {
    team_name: spelling
    for team_name, spelling in zip(df_match['SOS Team'], df_match['Team Spelling'])
}

# Name -> spelling -> TeamID, inserted as the second column.
df_sos.insert(
    1,
    'TeamID',
    df_sos['Team']
    .map(sos_to_spelling)
    .map(spelling_to_id),
)

df_sos

Unnamed: 0,Season,TeamID,Team,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2012,3102,Air Force,-0.152163,-0.083404,-0.035436,-0.006093,-0.104840,-0.049391,-0.023207,0.015430
1,2012,3103,Akron,-0.100213,-0.021749,0.063472,0.081430,-0.182061,-0.056845,-0.035750,0.048377
2,2012,3104,Alabama,-0.106952,0.003164,0.191008,0.291233,-0.052181,0.001447,0.082468,0.232509
3,2012,3105,Alabama A&M,-0.157796,-0.136646,-0.080710,-0.063227,-0.165046,-0.149575,-0.128260,-0.035382
4,2012,3106,Alabama State,-0.157796,-0.114468,-0.081206,-0.035382,-0.266922,-0.148533,-0.136646,-0.080228
...,...,...,...,...,...,...,...,...,...,...,...
3837,2021,3438,Virginia,-0.072068,-0.072068,-0.072068,-0.072068,0.010475,0.023774,0.043723,0.143050
3838,2022,3175,Delaware State,-0.493152,-0.493152,-0.493152,-0.493152,-0.362972,-0.256526,-0.169604,-0.103276
3839,2022,3287,Morehead State,-0.336325,-0.336325,-0.336325,-0.336325,-0.243651,-0.167443,-0.109045,-0.042146
3840,2023,3216,Hartford Hawks,-0.521199,-0.521199,-0.521199,-0.521199,-0.374166,-0.374166,-0.278828,-0.203193


In [30]:
# Left-join the strength-of-schedule columns onto the feature table by season and team ID.
df = df.merge(
    df_sos.drop(columns='Team'),
    on=['Season', 'TeamID'],
    how='left',
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
1,2012,3102,Air Force,-1.0,-1.0,-1.831277,-0.217915,0.732056,0.949972,74.575075,-1.410331,-2.079717,-0.194364,0.206897,0.359092,0.465672,28.432136,24.020646,15.950771,27.764713,26.582950,-0.152163,-0.083404,-0.035436,-0.006093,-0.104840,-0.049391,-0.023207,0.015430
2,2012,3103,Akron,-1.0,-1.0,-0.152271,-0.004095,0.932742,0.936837,77.106955,-0.030574,-0.523468,0.022943,0.437500,0.479016,0.450917,24.421918,22.137917,13.804245,23.486284,24.877060,-0.100213,-0.021749,0.063472,0.081430,-0.182061,-0.056845,-0.035750,0.048377
3,2012,3104,Alabama,-1.0,-1.0,-0.134440,0.014331,0.843149,0.828818,74.567105,0.935387,0.158792,-0.022134,0.387097,0.385596,0.452268,23.786649,26.323043,16.400917,23.213107,22.586530,-0.106952,0.003164,0.191008,0.291233,-0.052181,0.001447,0.082468,0.232509
4,2012,3105,Alabama A&M,-1.0,-1.0,-0.401164,-0.079746,0.830830,0.910576,68.810632,-0.902314,-1.288895,0.063638,0.571429,0.399033,0.398839,22.398825,22.936621,19.312337,28.515040,21.120411,-0.157796,-0.136646,-0.080710,-0.063227,-0.165046,-0.149575,-0.128260,-0.035382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4131,2023,3474,Queens NC,-1.0,-1.0,-2.868351,-0.267052,0.761421,1.028474,73.226379,,,-0.480801,0.192308,0.385667,0.505617,20.778791,22.290950,11.663618,21.776474,30.733245,-0.253598,-0.112864,-0.086276,-0.068550,-0.301791,-0.253598,-0.152857,-0.037971
4132,2023,3475,Southern Indiana,-1.0,-1.0,-1.395469,-0.147884,0.789814,0.937699,73.553011,,,-0.034042,0.346154,0.435186,0.455503,26.855660,23.368938,12.329459,26.337568,26.833733,-0.206222,-0.169012,-0.117394,-0.076389,-0.244053,-0.211420,-0.078625,-0.045575
4133,2023,3476,Stonehill,-1.0,-1.0,-2.582334,-0.244864,0.817570,1.062434,68.209488,,,-0.323604,0.285714,0.441719,0.474679,21.297962,16.598459,12.889947,17.881752,17.626117,-0.262417,-0.242003,-0.210151,-0.177714,-0.319257,-0.269535,-0.225129,-0.105584
4134,2023,3477,TX A&M Commerce,-1.0,-1.0,-1.251000,-0.111823,0.865579,0.977402,73.473756,,,-0.072719,0.406250,0.428839,0.481760,22.458422,21.238237,15.431194,26.566158,17.033695,-0.187062,-0.142405,-0.060628,-0.033892,-0.222821,-0.151336,-0.034487,-0.000496


In [31]:
# Feature rows where '0.5 Win' is missing after the merge.
df[df['0.5 Win'].isna()]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss
0,2012,3101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
8,2012,3109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
17,2012,3118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
20,2012,3121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
27,2012,3128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4024,2023,3366,Savannah St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
4090,2023,3432,Utica,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
4102,2023,3445,W Salem St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,
4103,2023,3446,W Texas A&M,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,


### Map to Matchups

In [32]:
# Load tournament game results, keep only the identifying and score columns,
# and restrict to the 2012 season onward.
df_mod = (
    pd.read_csv(r'..\data\unprocessed\kaggle\WNCAATourneyDetailedResults.csv')
    .loc[:, ['Season', 'DayNum', 'WTeamID', 'LTeamID', 'WScore', 'LScore']]
    .loc[lambda games: games['Season'] >= 2012]
    .reset_index(drop=True)
)

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WScore,LScore
0,2012,138,3116,3173,72,55
1,2012,138,3163,3341,83,47
2,2012,138,3177,3140,59,55
3,2012,138,3211,3353,86,73
4,2012,138,3243,3343,67,64
...,...,...,...,...,...,...
696,2023,147,3376,3268,86,75
697,2023,147,3439,3326,84,74
698,2023,151,3234,3376,77,73
699,2023,151,3261,3439,79,72


Get seeding

In [33]:
# Load tournament seeds and restrict to the 2012 season onward.
df_seeds = pd.read_csv(r'..\data\unprocessed\kaggle\WNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] >= 2012, :].reset_index(drop=True)

# Split the raw seed string into its parts:
#   - 'Play In': True when the string ends in 'a' or 'b'
#   - 'Region':  the first character
#   - 'Seed':    the run of digits, as an integer
# 'Play In' and 'Region' must be derived before 'Seed' is overwritten.
df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
# Raw string avoids the invalid '\d' escape warning; expand=False yields a Series
# rather than a one-column DataFrame.
df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)

# Rebuild a normalized label: region letter + zero-padded two-digit seed (e.g. 'W01').
df_seeds.insert(1, 'Region Seed', df_seeds['Region'] + df_seeds['Seed'].astype(str).str.zfill(2))

df_seeds

Unnamed: 0,Season,Region Seed,Seed,Region,Play In,TeamID
0,2012,W01,1,W,False,3124
1,2012,W02,2,W,False,3397
2,2012,W03,3,W,False,3174
3,2012,W04,4,W,False,3210
4,2012,W05,5,W,False,3207
...,...,...,...,...,...,...
707,2023,Z12,12,Z,False,3405
708,2023,Z13,13,Z,False,3387
709,2023,Z14,14,Z,False,3241
710,2023,Z15,15,Z,False,3436


Remove play-ins

In [34]:
# Attach the winning team's region, seed and play-in flag to each game.
df_mod = df_mod.merge(
    df_seeds.loc[:, ['Season', 'Region', 'Seed', 'Play In', 'TeamID']].rename(
        columns={
            'TeamID': 'WTeamID',
            'Region': 'WTeamRegion',
            'Seed': 'WTeamSeed',
            'Play In': 'WTeamPlayIn',
        }
    ),
    on=['Season', 'WTeamID'],
    how='left',
)

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WScore,LScore,WTeamRegion,WTeamSeed,WTeamPlayIn
0,2012,138,3116,3173,72,55,Z,6,False
1,2012,138,3163,3341,83,47,Y,1,False
2,2012,138,3177,3140,59,55,W,7,False
3,2012,138,3211,3353,86,73,Y,11,False
4,2012,138,3243,3343,67,64,Y,8,False
...,...,...,...,...,...,...,...,...,...
696,2023,147,3376,3268,86,75,W,1,False
697,2023,147,3439,3326,84,74,Z,1,False
698,2023,151,3234,3376,77,73,X,2,False
699,2023,151,3261,3439,79,72,Y,3,False


In [35]:
# Attach the losing team's region, seed and play-in flag to each game.
df_mod = df_mod.merge(
    df_seeds.loc[:, ['Season', 'Region', 'Seed', 'Play In', 'TeamID']].rename(
        columns={
            'TeamID': 'LTeamID',
            'Region': 'LTeamRegion',
            'Seed': 'LTeamSeed',
            'Play In': 'LTeamPlayIn',
        }
    ),
    on=['Season', 'LTeamID'],
    how='left',
)

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WScore,LScore,WTeamRegion,WTeamSeed,WTeamPlayIn,LTeamRegion,LTeamSeed,LTeamPlayIn
0,2012,138,3116,3173,72,55,Z,6,False,Z,11,False
1,2012,138,3163,3341,83,47,Y,1,False,Y,16,False
2,2012,138,3177,3140,59,55,W,7,False,W,10,False
3,2012,138,3211,3353,86,73,Y,11,False,Y,6,False
4,2012,138,3243,3343,67,64,Y,8,False,Y,9,False
...,...,...,...,...,...,...,...,...,...,...,...,...
696,2023,147,3376,3268,86,75,W,1,False,W,2,False
697,2023,147,3439,3326,84,74,Z,1,False,Z,3,False
698,2023,151,3234,3376,77,73,X,2,False,W,1,False
699,2023,151,3261,3439,79,72,Y,3,False,Z,1,False


In [36]:
# Drop play-in games: a game is excluded when both teams are in the same region
# and both carry the play-in flag.
df_mod = (
    df_mod
    .loc[
        lambda games: ~(
            (games['WTeamRegion'] == games['LTeamRegion'])
            & (games['WTeamPlayIn'])
            & (games['LTeamPlayIn'])
        )
    ]
    .reset_index(drop=True)
)

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,WScore,LScore,WTeamRegion,WTeamSeed,WTeamPlayIn,LTeamRegion,LTeamSeed,LTeamPlayIn
0,2012,138,3116,3173,72,55,Z,6,False,Z,11,False
1,2012,138,3163,3341,83,47,Y,1,False,Y,16,False
2,2012,138,3177,3140,59,55,W,7,False,W,10,False
3,2012,138,3211,3353,86,73,Y,11,False,Y,6,False
4,2012,138,3243,3343,67,64,Y,8,False,Y,9,False
...,...,...,...,...,...,...,...,...,...,...,...,...
688,2023,147,3376,3268,86,75,W,1,False,W,2,False
689,2023,147,3439,3326,84,74,Z,1,False,Z,3,False
690,2023,151,3234,3376,77,73,X,2,False,W,1,False
691,2023,151,3261,3439,79,72,Y,3,False,Z,1,False


Remap to Team A / Team B format

In [37]:
# Every game is emitted twice: first from the winner's point of view
# (Result = 1, winner is Team A), then from the loser's point of view
# (Result = -1, loser is Team A). The second half is stacked below the first.
games = df_mod
n_games = games.shape[0]

df_mod = pd.DataFrame({
    'Season': [*games['Season']] * 2,
    'Result': [1] * n_games + [-1] * n_games,
    'Team A ID': [*games['WTeamID'], *games['LTeamID']],
    'Team B ID': [*games['LTeamID'], *games['WTeamID']],
    'Team A Region': [*games['WTeamRegion'], *games['LTeamRegion']],
    'Team B Region': [*games['LTeamRegion'], *games['WTeamRegion']],
    'Team A Seed': [*games['WTeamSeed'], *games['LTeamSeed']],
    'Team B Seed': [*games['LTeamSeed'], *games['WTeamSeed']],
})

df_mod

Unnamed: 0,Season,Result,Team A ID,Team B ID,Team A Region,Team B Region,Team A Seed,Team B Seed
0,2012,1,3116,3173,Z,Z,6,11
1,2012,1,3163,3341,Y,Y,1,16
2,2012,1,3177,3140,W,W,7,10
3,2012,1,3211,3353,Y,Y,11,6
4,2012,1,3243,3343,Y,Y,8,9
...,...,...,...,...,...,...,...,...
1381,2023,-1,3268,3376,W,W,2,1
1382,2023,-1,3326,3439,Z,Z,3,1
1383,2023,-1,3376,3234,W,X,1,2
1384,2023,-1,3439,3261,Z,Y,1,3


Get round of matchup

In [38]:
seed_a, seed_b = df_mod['Team A Seed'], df_mod['Team B Seed']
region_a, region_b = df_mod['Team A Region'], df_mod['Team B Region']


def _in_either_order(values_a, values_b, group_1, group_2):
    """True where one side falls in `group_1` and the other in `group_2`, whichever side is which."""
    return (
        (values_a.isin(group_1) & values_b.isin(group_2)) |
        (values_a.isin(group_2) & values_b.isin(group_1))
    )


same_region = region_a == region_b

# No round-0 (play-in) condition is defined: there are no play-in games in this data.

# Round 1: the two seeds always add up to 17 (1 v 16, 2 v 15, ...).
round_1_condition = seed_a + seed_b == 17

# Round 2: winners of adjacent round-1 games meet.
round_2_condition = (
    _in_either_order(seed_a, seed_b, [1, 16], [8, 9]) |
    _in_either_order(seed_a, seed_b, [5, 12], [4, 13]) |
    _in_either_order(seed_a, seed_b, [6, 11], [3, 14]) |
    _in_either_order(seed_a, seed_b, [7, 10], [2, 15])
)

# Round 3: the two seed groups of four within each half of the region meet.
round_3_condition = (
    _in_either_order(seed_a, seed_b, [1, 16, 8, 9], [5, 12, 4, 13]) |
    _in_either_order(seed_a, seed_b, [6, 11, 3, 14], [7, 10, 2, 15])
)

# Round 4: one half of the region's seeds against the other half.
round_4_condition = _in_either_order(
    seed_a, seed_b,
    [1, 16, 8, 9, 5, 12, 4, 13],
    [6, 11, 3, 14, 7, 10, 2, 15],
)

# Round 5: region W against X, or region Y against Z.
round_5_condition = (
    _in_either_order(region_a, region_b, ['W'], ['X']) |
    _in_either_order(region_a, region_b, ['Y'], ['Z'])
)

# Round 6: a W/X team against a Y/Z team.
round_6_condition = _in_either_order(region_a, region_b, ['W', 'X'], ['Y', 'Z'])

round_6_condition

0       False
1       False
2       False
3       False
4       False
        ...  
1381    False
1382    False
1383    False
1384    False
1385     True
Length: 1386, dtype: bool

In [39]:
# -1 marks a matchup that none of the round conditions recognise.
df_mod['Round'] = -1

# Applied top to bottom, so a later entry overwrites an earlier one where both match.
# The seed-based rounds (1-4) only count when both teams share a region.
round_masks = [
    (6, round_6_condition),
    (5, round_5_condition),
    (4, round_4_condition & same_region),
    (3, round_3_condition & same_region),
    (2, round_2_condition & same_region),
    (1, round_1_condition & same_region),
]

for round_number, round_mask in round_masks:
    df_mod.loc[round_mask, 'Round'] = round_number

df_mod['Round'].describe()

count    1386.000000
mean        1.904762
std         1.191858
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         6.000000
Name: Round, dtype: float64

In [40]:
# Replace the per-team seed and region columns with one feature:
# Seed = Team A seed minus Team B seed.
df_mod = (
    df_mod
    .assign(Seed=df_mod['Team A Seed'] - df_mod['Team B Seed'])
    .drop(columns=['Team A Region', 'Team B Region', 'Team A Seed', 'Team B Seed'])
)

df_mod

Unnamed: 0,Season,Result,Team A ID,Team B ID,Round,Seed
0,2012,1,3116,3173,1,-5
1,2012,1,3163,3341,1,-15
2,2012,1,3177,3140,1,-3
3,2012,1,3211,3353,1,5
4,2012,1,3243,3343,1,-1
...,...,...,...,...,...,...
1381,2023,-1,3268,3376,4,1
1382,2023,-1,3326,3439,4,2
1383,2023,-1,3376,3234,5,-1
1384,2023,-1,3439,3261,5,-2


Get Head-to-Head

In [41]:
# Precomputed head-to-head / common-opponent table, one row per (Season, Team A, Team B).
h2h_csv = '../data/preprocessed/women_h2h/women_h2h.csv'

df_h2h = pd.read_csv(h2h_csv)

df_h2h

Unnamed: 0,Season,Team A,Team B,Head to Head,Common Opps
0,2012,Air Force,Alabama,-0.800000,
1,2012,Air Force,Boise State,-0.888889,-0.160000
2,2012,Air Force,Colorado State,-0.888889,-0.288288
3,2012,Air Force,Denver,-0.800000,-0.941176
4,2012,Air Force,Jackson State,0.800000,0.266667
...,...,...,...,...,...
670803,2023,Youngstown State,Wagner,,0.444444
670804,2023,Youngstown State,West Virginia,,0.000000
670805,2023,Youngstown State,Western Carolina,,0.533333
670806,2023,Youngstown State,Wisconsin,,0.000000


In [42]:
def _closest_spelling(team_name):
    """Return the first two fields (spelling, score) of the single best match for `team_name`."""
    best_match = process.extract(
        team_name,
        team_spellings,
        scorer=token_sort_ratio,
        limit=1
    )[0]
    return best_match[:2]


h2h_teams = df_h2h['Team A'].unique()

# One row per head-to-head team name: the name, its closest known spelling, and the score.
match_rows = [
    [h2h_team, *_closest_spelling(h2h_team)]
    for h2h_team in tqdm(h2h_teams)
]

# Sorted ascending so the weakest (most suspicious) matches come first for eyeballing.
df_match = pd.DataFrame(
    match_rows,
    columns=['Head to Head Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/362 [00:00<?, ?it/s]

Unnamed: 0,Head to Head Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Savannah State Tigers,savannah state,80
4,St. Thomas,st thomas mn,86
5,Kansas City,mo kansas city,88
6,Texas A&M-Commerce,tx a&m commerce,91
7,Saint Mary's (CA),saint mary's (ca),100
8,Saint Louis,saint louis,100
9,Saint Joseph's,saint joseph's,100


In [43]:
# Head-to-head team name -> matched spelling (from the fuzzy match table).
h2h_to_spelling = dict(zip(df_match['Head to Head Team'], df_match['Team Spelling']))

# Translate each name to its spelling and then to a team ID, placing the new
# ID column immediately before the name column it was derived from.
for name_column, id_column in (('Team A', 'Team A ID'), ('Team B', 'Team B ID')):
    team_ids = df_h2h[name_column].map(h2h_to_spelling).map(spelling_to_id)
    df_h2h.insert(df_h2h.columns.get_loc(name_column), id_column, team_ids)

df_h2h

Unnamed: 0,Season,Team A ID,Team A,Team B ID,Team B,Head to Head,Common Opps
0,2012,3102,Air Force,3104,Alabama,-0.800000,
1,2012,3102,Air Force,3129,Boise State,-0.888889,-0.160000
2,2012,3102,Air Force,3161,Colorado State,-0.888889,-0.288288
3,2012,3102,Air Force,3176,Denver,-0.800000,-0.941176
4,2012,3102,Air Force,3238,Jackson State,0.800000,0.266667
...,...,...,...,...,...,...,...
670803,2023,3464,Youngstown State,3447,Wagner,,0.444444
670804,2023,3464,Youngstown State,3452,West Virginia,,0.000000
670805,2023,3464,Youngstown State,3441,Western Carolina,,0.533333
670806,2023,3464,Youngstown State,3458,Wisconsin,,0.000000


In [44]:
# Attach the head-to-head and common-opponent features to each matchup.
# Matchups with no entry in df_h2h keep NaN in both columns (left merge).
df_mod_h2h = pd.merge(
    df_mod,
    df_h2h[['Season', 'Team A ID', 'Team B ID', 'Head to Head', 'Common Opps']],
    how='left',
    on=['Season', 'Team A ID', 'Team B ID'],
)

# A left merge repeats a matchup once per matching row on the right, so repeated
# (Season, Team A ID, Team B ID) keys in df_h2h (e.g. two source names mapped to
# the same team ID) would silently add rows. Fail loudly instead.
assert len(df_mod_h2h) == len(df_mod), (
    'head-to-head merge changed the number of matchups; '
    'df_h2h has duplicate (Season, Team A ID, Team B ID) keys'
)

df_mod = df_mod_h2h

df_mod

Unnamed: 0,Season,Result,Team A ID,Team B ID,Round,Seed,Head to Head,Common Opps
0,2012,1,3116,3173,1,-5,,0.000000
1,2012,1,3163,3341,1,-15,,0.666667
2,2012,1,3177,3140,1,-3,,-0.153846
3,2012,1,3211,3353,1,5,,
4,2012,1,3243,3343,1,-1,,-0.307692
...,...,...,...,...,...,...,...,...
1381,2023,-1,3268,3376,4,1,-0.8,0.000000
1382,2023,-1,3326,3439,4,2,,0.000000
1383,2023,-1,3376,3234,5,-1,,0.592593
1384,2023,-1,3439,3261,5,-2,,0.153846


Get team names

In [45]:
# Team ID -> team name lookup table.
# Forward slashes resolve on Windows as well as POSIX systems, and match the
# other relative paths used in this part of the notebook.
df_teams = pd.read_csv('../data/unprocessed/kaggle/WTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M
...,...,...
371,3474,Queens NC
372,3475,Southern Indiana
373,3476,Stonehill
374,3477,TX A&M Commerce


In [46]:
# Team ID -> display name.
id_to_team = dict(zip(df_teams['TeamID'], df_teams['TeamName']))

# Put each readable team name directly after its ID column.
for id_column, name_column in (('Team A ID', 'Team A'), ('Team B ID', 'Team B')):
    team_names = df_mod[id_column].map(id_to_team)
    df_mod.insert(df_mod.columns.get_loc(id_column) + 1, name_column, team_names)

df_mod

Unnamed: 0,Season,Result,Team A ID,Team A,Team B ID,Team B,Round,Seed,Head to Head,Common Opps
0,2012,1,3116,Arkansas,3173,Dayton,1,-5,,0.000000
1,2012,1,3163,Connecticut,3341,Prairie View,1,-15,,0.666667
2,2012,1,3177,DePaul,3140,BYU,1,-3,,-0.153846
3,2012,1,3211,Gonzaga,3353,Rutgers,1,5,,
4,2012,1,3243,Kansas St,3343,Princeton,1,-1,,-0.307692
...,...,...,...,...,...,...,...,...,...,...
1381,2023,-1,3268,Maryland,3376,South Carolina,4,1,-0.8,0.000000
1382,2023,-1,3326,Ohio St,3439,Virginia Tech,4,2,,0.000000
1383,2023,-1,3376,South Carolina,3234,Iowa,5,-1,,0.592593
1384,2023,-1,3439,Virginia Tech,3261,LSU,5,-2,,0.153846


Map features

In [47]:
def _team_features(id_column):
    """Look up the per-season features in `df` for the team in `df_mod[id_column]`.

    Returns a frame holding only the feature columns (the Season and ID key
    columns are dropped after the merge), with one row per row of `df_mod`
    in the same order.
    """
    features = pd.merge(
        df_mod[['Season', id_column]],
        df.drop(columns=['Team']),
        how='left',
        left_on=['Season', id_column],
        right_on=['Season', 'TeamID'],
    ).drop(columns=['Season', id_column, 'TeamID'])

    # The Team A and Team B frames are combined positionally below, so each must
    # line up row-for-row with df_mod. Repeated (Season, TeamID) rows in `df`
    # would add rows here and shift every later matchup.
    assert len(features) == len(df_mod), (
        f'feature lookup on {id_column!r} changed the number of rows; '
        'df has duplicate (Season, TeamID) rows'
    )

    return features


team_a_features = _team_features('Team A ID')
team_b_features = _team_features('Team B ID')

# Every base feature becomes the Team A value minus the Team B value.
df_features = team_a_features - team_b_features

# Cross terms: each side's adjusted offense added to the other side's adjusted defense.
df_features['Team A Offense Team B Defense'] = team_a_features['Adjusted Offense'] + team_b_features['Adjusted Defense']
df_features['Team B Offense Team A Defense'] = team_b_features['Adjusted Offense'] + team_a_features['Adjusted Defense']

# Keep the absolute ratings too, not just their difference.
df_features['Team A Rating'] = team_a_features['Rating']
df_features['Team B Rating'] = team_b_features['Rating']

df_features

Unnamed: 0,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A Rating,Team B Rating
0,-1.0,-0.750000,-0.089508,0.026757,0.001189,-0.025567,-8.387438,-0.551002,-0.663068,-0.147548,-0.051168,-0.013769,-0.005223,-0.255442,2.215458,2.440872,-1.447517,-2.717337,0.049103,0.057933,0.080154,0.186366,-0.098768,-0.004212,0.068672,0.031637,1.730252,1.703495,2.321899,2.411407
1,4.0,2.750000,5.420883,0.688562,0.339859,-0.348703,0.854502,4.666967,5.646929,0.945633,0.362659,0.114824,-0.103051,-2.108403,1.200900,2.702770,-0.994398,-5.867203,0.325851,0.377205,0.371813,0.551623,0.447986,0.516449,0.609848,0.568292,2.092237,1.403675,4.755495,-0.665388
2,3.0,1.000000,0.263877,0.023684,0.068715,0.045031,2.795567,1.327217,1.278952,0.001981,-0.118952,0.002505,0.059121,0.348861,1.341508,-1.196064,-2.637275,-1.103772,0.065701,0.045488,0.088543,0.067362,0.158003,0.159586,0.152590,0.111489,1.830027,1.806343,2.604955,2.341077
3,2.0,1.000000,-0.217967,-0.047588,0.052572,0.100160,5.078021,0.741975,-0.351774,-0.057039,0.129032,0.046226,0.050877,-3.790769,-1.060401,1.929341,-0.756823,-4.345191,-0.121741,-0.121117,-0.068439,-0.019417,-0.172513,-0.130744,-0.066982,-0.129448,1.740460,1.788048,2.460832,2.678799
4,0.0,-0.250000,-0.452299,-0.061093,-0.048631,0.012462,-7.006088,-0.186343,1.565799,-0.117382,-0.263393,-0.042403,0.027494,-1.713529,-2.541942,-5.754640,-4.428173,2.920438,0.249388,0.195220,0.105794,0.142998,0.091003,0.024551,0.045774,-0.025010,1.666190,1.727284,2.081415,2.533715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,-4.0,-2.333333,-2.297466,-0.255286,-0.102635,0.152651,3.881919,-2.546332,-0.770498,-0.277584,-0.193548,-0.013409,0.129295,0.102629,6.190148,-9.255655,-1.139981,1.043045,0.006993,-0.011435,0.016908,-0.002886,-0.399127,-0.372879,-0.310015,-0.200165,1.739039,1.994326,3.815571,6.113037
1382,2.0,0.000000,-0.503128,-0.053538,-0.040638,0.012900,7.036740,0.042043,-0.186680,-0.183432,-0.089718,0.002137,0.056640,-0.951253,4.935148,-1.456589,-1.964807,1.444126,-0.087140,-0.010175,-0.005974,0.065653,0.036137,0.090983,0.116520,0.096686,1.871238,1.924775,3.614788,4.117916
1383,5.0,2.000000,2.073065,0.200165,0.015299,-0.184866,-4.516919,2.531813,1.369934,0.225738,0.187500,-0.074863,-0.107958,-0.624248,0.320985,12.831391,-1.071930,-3.378402,0.012387,0.006406,-0.000341,0.020598,0.411789,0.384592,0.337157,0.265864,2.026540,1.826376,6.113037,4.039972
1384,-1.0,0.333333,-0.377149,-0.055700,-0.018698,0.037002,-5.960647,-0.325898,0.360466,-0.096830,-0.062366,0.009444,0.016518,-0.528286,-2.999539,-7.044113,-4.320422,0.524095,0.108699,0.067741,0.053432,0.025771,-0.146194,-0.158831,-0.177786,-0.167504,1.874874,1.930573,4.117916,4.495065


In [48]:
# Copy every engineered feature column onto the matchup table (aligned on the row index).
feature_columns = list(df_features.columns)
df_mod[feature_columns] = df_features

df_mod

Unnamed: 0,Season,Result,Team A ID,Team A,Team B ID,Team B,Round,Seed,Head to Head,Common Opps,Past Year Tournament Result,Past 4 Years Tournament Results,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Past Year Rating,Past 4 Years Ratings,Starters,Team Win%,Team EFG%,Opponent EFG%,Team TOR,Opponent TOR,Team ORBR,Team FTR,Opponent FTR,0.5 Win,0.75 Win,0.9 Win,1.0 Win,0.0 Loss,0.1 Loss,0.25 Loss,0.5 Loss,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A Rating,Team B Rating
0,2012,1,3116,Arkansas,3173,Dayton,1,-5,,0.000000,-1.0,-0.750000,-0.089508,0.026757,0.001189,-0.025567,-8.387438,-0.551002,-0.663068,-0.147548,-0.051168,-0.013769,-0.005223,-0.255442,2.215458,2.440872,-1.447517,-2.717337,0.049103,0.057933,0.080154,0.186366,-0.098768,-0.004212,0.068672,0.031637,1.730252,1.703495,2.321899,2.411407
1,2012,1,3163,Connecticut,3341,Prairie View,1,-15,,0.666667,4.0,2.750000,5.420883,0.688562,0.339859,-0.348703,0.854502,4.666967,5.646929,0.945633,0.362659,0.114824,-0.103051,-2.108403,1.200900,2.702770,-0.994398,-5.867203,0.325851,0.377205,0.371813,0.551623,0.447986,0.516449,0.609848,0.568292,2.092237,1.403675,4.755495,-0.665388
2,2012,1,3177,DePaul,3140,BYU,1,-3,,-0.153846,3.0,1.000000,0.263877,0.023684,0.068715,0.045031,2.795567,1.327217,1.278952,0.001981,-0.118952,0.002505,0.059121,0.348861,1.341508,-1.196064,-2.637275,-1.103772,0.065701,0.045488,0.088543,0.067362,0.158003,0.159586,0.152590,0.111489,1.830027,1.806343,2.604955,2.341077
3,2012,1,3211,Gonzaga,3353,Rutgers,1,5,,,2.0,1.000000,-0.217967,-0.047588,0.052572,0.100160,5.078021,0.741975,-0.351774,-0.057039,0.129032,0.046226,0.050877,-3.790769,-1.060401,1.929341,-0.756823,-4.345191,-0.121741,-0.121117,-0.068439,-0.019417,-0.172513,-0.130744,-0.066982,-0.129448,1.740460,1.788048,2.460832,2.678799
4,2012,1,3243,Kansas St,3343,Princeton,1,-1,,-0.307692,0.0,-0.250000,-0.452299,-0.061093,-0.048631,0.012462,-7.006088,-0.186343,1.565799,-0.117382,-0.263393,-0.042403,0.027494,-1.713529,-2.541942,-5.754640,-4.428173,2.920438,0.249388,0.195220,0.105794,0.142998,0.091003,0.024551,0.045774,-0.025010,1.666190,1.727284,2.081415,2.533715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,2023,-1,3268,Maryland,3376,South Carolina,4,1,-0.8,0.000000,-4.0,-2.333333,-2.297466,-0.255286,-0.102635,0.152651,3.881919,-2.546332,-0.770498,-0.277584,-0.193548,-0.013409,0.129295,0.102629,6.190148,-9.255655,-1.139981,1.043045,0.006993,-0.011435,0.016908,-0.002886,-0.399127,-0.372879,-0.310015,-0.200165,1.739039,1.994326,3.815571,6.113037
1382,2023,-1,3326,Ohio St,3439,Virginia Tech,4,2,,0.000000,2.0,0.000000,-0.503128,-0.053538,-0.040638,0.012900,7.036740,0.042043,-0.186680,-0.183432,-0.089718,0.002137,0.056640,-0.951253,4.935148,-1.456589,-1.964807,1.444126,-0.087140,-0.010175,-0.005974,0.065653,0.036137,0.090983,0.116520,0.096686,1.871238,1.924775,3.614788,4.117916
1383,2023,-1,3376,South Carolina,3234,Iowa,5,-1,,0.592593,5.0,2.000000,2.073065,0.200165,0.015299,-0.184866,-4.516919,2.531813,1.369934,0.225738,0.187500,-0.074863,-0.107958,-0.624248,0.320985,12.831391,-1.071930,-3.378402,0.012387,0.006406,-0.000341,0.020598,0.411789,0.384592,0.337157,0.265864,2.026540,1.826376,6.113037,4.039972
1384,2023,-1,3439,Virginia Tech,3261,LSU,5,-2,,0.153846,-1.0,0.333333,-0.377149,-0.055700,-0.018698,0.037002,-5.960647,-0.325898,0.360466,-0.096830,-0.062366,0.009444,0.016518,-0.528286,-2.999539,-7.044113,-4.320422,0.524095,0.108699,0.067741,0.053432,0.025771,-0.146194,-0.158831,-0.177786,-0.167504,1.874874,1.930573,4.117916,4.495065


In [49]:
# Write the model-ready table to disk without the row index.
model_data_csv = '../data/preprocessed/womens_model_data/womens_model_data.csv'
df_mod.to_csv(model_data_csv, index=False)

'Done'

'Done'