# Team Features

Compile all feature engineering into a model-ready dataframe. 

### Previous Tournament Results

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.read_csv(r'..\data\preprocessed\kaggle\tournament_results.csv')

df = df.loc[~df['Season'].isin([2020, 2024]), :].reset_index(drop=True)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2012,1101,Abilene Chr,-1.0,-1.0
1,2012,1102,Air Force,-1.0,-1.0
2,2012,1103,Akron,0.0,-0.5
3,2012,1104,Alabama,-1.0,-1.0
4,2012,1105,Alabama A&M,-1.0,-1.0
...,...,...,...,...,...
4153,2023,1474,Queens NC,-1.0,-1.0
4154,2023,1475,Southern Indiana,-1.0,-1.0
4155,2023,1476,Stonehill,-1.0,-1.0
4156,2023,1477,TX A&M Commerce,-1.0,-1.0


In [2]:
# df = pd.merge(
#     df,
#     df_tournament_results,
#     how='left',
#     on=['Season', 'TeamID', 'Team']
# )

# df

### Ordinal Rankings

Omitted

In [3]:
# import pandas as pd

# pd.set_option('display.max_columns', 100)

# df = pd.read_csv(r'..\data\preprocessed\kaggle\ordinals.csv')

# df = df.loc[df['Season'] != 2020, :].reset_index(drop=True)

# df

### Barttorvik Ratings

In [4]:
df_barttorvik = pd.read_csv(r'..\data\preprocessed\barttorvik\barttorvik.csv')

df_barttorvik

Unnamed: 0,Season,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2012,Kentucky,0.941176,119.7,88.5,31.2,0.9702,53.4,41.6,17.2,18.1,38.4,40.0,25.6,66.1,11.3
1,2012,Ohio St.,0.794118,115.5,85.5,30.0,0.9695,52.5,46.3,17.4,22.5,35.7,37.0,28.6,68.1,7.7
2,2012,Kansas,0.818182,114.7,88.1,26.6,0.9542,54.0,43.9,19.6,20.7,34.9,41.1,34.3,67.9,8.2
3,2012,Michigan St.,0.794118,112.7,86.7,26.0,0.9532,52.7,43.0,19.8,19.7,37.2,39.0,34.2,66.1,8.6
4,2012,North Carolina,0.852941,115.8,89.6,26.2,0.9502,50.0,45.1,16.2,18.5,40.5,38.1,22.0,72.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3863,2023,Florida A&M,0.185185,88.2,108.1,-19.9,0.0877,45.0,51.1,24.0,18.0,27.6,31.4,44.6,65.8,-16.3
3864,2023,IUPUI,0.100000,94.0,117.2,-23.2,0.0728,49.7,55.0,23.2,16.7,29.0,32.2,36.0,67.8,-22.5
3865,2023,Green Bay,0.093750,90.5,114.1,-23.6,0.0651,46.0,54.8,21.1,16.1,21.7,31.5,31.6,65.9,-23.1
3866,2023,Hartford,0.080000,88.7,116.7,-28.0,0.0412,47.7,55.4,22.7,17.6,25.2,24.1,27.7,66.2,-20.0


In [5]:
df_spellings = pd.read_csv(
    r'..\data\unprocessed\kaggle\MTeamSpellings.csv', 
    encoding='cp1252'  # fixes issue with fancy quotes
)

df_spellings.loc[df_spellings.shape[0]] = ['fdu', 1192]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101
...,...,...
1161,youngstown st.,1464
1162,youngstown state,1464
1163,youngstown-st,1464
1164,youngstown-state,1464


In [6]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
from tqdm.autonotebook import tqdm

team_spellings = df_spellings['TeamNameSpelling'].unique()
barttorvik_teams = df_barttorvik['TEAM'].unique()

df_match = pd.DataFrame(
    [
        [
            barttorvik_team,
            *process.extract(
                barttorvik_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for barttorvik_team in tqdm(barttorvik_teams)
    ],
    columns=['Barttorvik Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/365 [00:00<?, ?it/s]

Unnamed: 0,Barttorvik Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,Queens,queens nc,80
2,St. Thomas,st thomas mn,86
3,UT Rio Grande Valley,texas rio grande valley,88
4,Texas A&M Commerce,tx a&m commerce,91
5,Cal St. Bakersfield,cal state bakersfield,92
6,Southeast Missouri St.,southeast missouri state,93
7,Mississippi Valley St.,mississippi valley state,93
8,Texas A&M Corpus Chris,texas a&m-corpus christi,96
9,Bethune Cookman,bethune-cookman,100


In [7]:
barttorvik_to_spelling = dict(zip(df_match['Barttorvik Team'], df_match['Team Spelling']))

len(barttorvik_to_spelling)

365

In [8]:
spelling_to_id = dict(zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID']))

len(spelling_to_id)

1166

In [9]:
df_barttorvik.insert(1, 'TeamID', df_barttorvik['TEAM'].map(barttorvik_to_spelling).map(spelling_to_id))

df_barttorvik

Unnamed: 0,Season,TeamID,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2012,1246,Kentucky,0.941176,119.7,88.5,31.2,0.9702,53.4,41.6,17.2,18.1,38.4,40.0,25.6,66.1,11.3
1,2012,1326,Ohio St.,0.794118,115.5,85.5,30.0,0.9695,52.5,46.3,17.4,22.5,35.7,37.0,28.6,68.1,7.7
2,2012,1242,Kansas,0.818182,114.7,88.1,26.6,0.9542,54.0,43.9,19.6,20.7,34.9,41.1,34.3,67.9,8.2
3,2012,1277,Michigan St.,0.794118,112.7,86.7,26.0,0.9532,52.7,43.0,19.8,19.7,37.2,39.0,34.2,66.1,8.6
4,2012,1314,North Carolina,0.852941,115.8,89.6,26.2,0.9502,50.0,45.1,16.2,18.5,40.5,38.1,22.0,72.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3863,2023,1197,Florida A&M,0.185185,88.2,108.1,-19.9,0.0877,45.0,51.1,24.0,18.0,27.6,31.4,44.6,65.8,-16.3
3864,2023,1237,IUPUI,0.100000,94.0,117.2,-23.2,0.0728,49.7,55.0,23.2,16.7,29.0,32.2,36.0,67.8,-22.5
3865,2023,1453,Green Bay,0.093750,90.5,114.1,-23.6,0.0651,46.0,54.8,21.1,16.1,21.7,31.5,31.6,65.9,-23.1
3866,2023,1216,Hartford,0.080000,88.7,116.7,-28.0,0.0412,47.7,55.4,22.7,17.6,25.2,24.1,27.7,66.2,-20.0


In [10]:
df_barttorvik.loc[df_barttorvik['TeamID'].isna(), :]

Unnamed: 0,Season,TeamID,TEAM,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB


In [11]:
df = pd.merge(
    df,
    # df_barttorvik[[
    #     'Season', 
    #     'TeamID',
    #     'WIN%',
    #     'ADJOE',
    #     'ADJDE',
    #     'BARTHAG',
    #     'ADJ T.',
    # ]],
    df_barttorvik.drop(columns=['TEAM']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,
1,2012,1102,Air Force,-1.0,-1.0,0.407407,98.5,100.0,-1.5,0.4564,51.1,48.4,20.6,21.5,19.8,39.4,38.2,62.3,-7.6
2,2012,1103,Akron,0.0,-0.5,0.636364,105.1,96.9,8.2,0.7161,51.5,46.4,21.0,20.6,34.8,40.0,34.0,67.8,-1.7
3,2012,1104,Alabama,-1.0,-1.0,0.656250,105.0,88.1,16.9,0.8829,49.0,43.4,20.4,21.4,33.9,36.5,38.6,63.1,1.6
4,2012,1105,Alabama A&M,-1.0,-1.0,0.192308,88.4,111.1,-22.7,0.0677,45.1,49.2,23.9,20.6,30.4,35.5,52.9,67.9,-17.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4153,2023,1474,Queens NC,-1.0,-1.0,0.500000,105.6,108.5,-2.9,0.4217,51.2,53.0,17.6,15.7,29.7,35.9,25.4,70.6,-9.4
4154,2023,1475,Southern Indiana,-1.0,-1.0,0.448276,101.1,110.6,-9.5,0.2621,50.5,51.8,18.3,16.7,32.6,33.4,40.3,70.1,-12.6
4155,2023,1476,Stonehill,-1.0,-1.0,0.433333,94.1,107.1,-13.0,0.1835,50.9,50.2,19.7,19.8,19.1,32.2,27.3,67.4,-12.6
4156,2023,1477,TX A&M Commerce,-1.0,-1.0,0.375000,98.6,111.5,-12.9,0.1953,52.4,52.8,18.9,17.7,26.5,28.3,37.9,66.3,-15.4


### Barttorvik Previous Seasons

In [12]:
df_barttorvik_prev = pd.read_csv(r'..\data\preprocessed\barttorvik_full_season\barttorvik_full_season.csv')

df_barttorvik_prev

Unnamed: 0,Season,TEAM,Past Year BARTHAG,Past 4 Years BARTHAG
0,2012,Abilene Christian,,
1,2012,Air Force,0.5782,0.463200
2,2012,Akron,0.6049,0.678300
3,2012,Alabama,0.8419,0.782050
4,2012,Alabama A&M,0.1283,0.102125
...,...,...,...,...
4399,2023,Wright St.,0.4613,0.625775
4400,2023,Wyoming,0.7661,0.456400
4401,2023,Xavier,0.8204,0.805150
4402,2023,Yale,0.5447,0.667467


In [13]:
team_spellings = df_spellings['TeamNameSpelling'].unique()
barttorvik_prev_teams = df_barttorvik_prev['TEAM'].unique()

df_match = pd.DataFrame(
    [
        [
            barttorvik_prev_team,
            *process.extract(
                barttorvik_prev_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for barttorvik_prev_team in tqdm(barttorvik_prev_teams)
    ],
    columns=['Barttorvik Prev Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/367 [00:00<?, ?it/s]

Unnamed: 0,Barttorvik Prev Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,Queens,queens nc,80
2,St. Thomas,st thomas mn,86
3,UT Rio Grande Valley,texas rio grande valley,88
4,Texas A&M Commerce,tx a&m commerce,91
5,Winston Salem St.,winston-salem-state,91
6,Cal St. Bakersfield,cal state bakersfield,92
7,Southeast Missouri St.,southeast missouri state,93
8,Mississippi Valley St.,mississippi valley state,93
9,Texas A&M Corpus Chris,texas a&m-corpus christi,96


In [14]:
barttorvik_prev_to_spelling = dict(zip(df_match['Barttorvik Prev Team'], df_match['Team Spelling']))

df_barttorvik_prev.insert(1, 'TeamID', df_barttorvik_prev['TEAM'].map(barttorvik_prev_to_spelling).map(spelling_to_id))

df_barttorvik_prev

Unnamed: 0,Season,TeamID,TEAM,Past Year BARTHAG,Past 4 Years BARTHAG
0,2012,1101,Abilene Christian,,
1,2012,1102,Air Force,0.5782,0.463200
2,2012,1103,Akron,0.6049,0.678300
3,2012,1104,Alabama,0.8419,0.782050
4,2012,1105,Alabama A&M,0.1283,0.102125
...,...,...,...,...,...
4399,2023,1460,Wright St.,0.4613,0.625775
4400,2023,1461,Wyoming,0.7661,0.456400
4401,2023,1462,Xavier,0.8204,0.805150
4402,2023,1463,Yale,0.5447,0.667467


In [15]:
df = pd.merge(
    df,
    df_barttorvik_prev.drop(columns=['TEAM']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,
1,2012,1102,Air Force,-1.0,-1.0,0.407407,98.5,100.0,-1.5,0.4564,51.1,48.4,20.6,21.5,19.8,39.4,38.2,62.3,-7.6,0.5782,0.463200
2,2012,1103,Akron,0.0,-0.5,0.636364,105.1,96.9,8.2,0.7161,51.5,46.4,21.0,20.6,34.8,40.0,34.0,67.8,-1.7,0.6049,0.678300
3,2012,1104,Alabama,-1.0,-1.0,0.656250,105.0,88.1,16.9,0.8829,49.0,43.4,20.4,21.4,33.9,36.5,38.6,63.1,1.6,0.8419,0.782050
4,2012,1105,Alabama A&M,-1.0,-1.0,0.192308,88.4,111.1,-22.7,0.0677,45.1,49.2,23.9,20.6,30.4,35.5,52.9,67.9,-17.4,0.1283,0.102125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4164,2023,1474,Queens NC,-1.0,-1.0,0.500000,105.6,108.5,-2.9,0.4217,51.2,53.0,17.6,15.7,29.7,35.9,25.4,70.6,-9.4,,
4165,2023,1475,Southern Indiana,-1.0,-1.0,0.448276,101.1,110.6,-9.5,0.2621,50.5,51.8,18.3,16.7,32.6,33.4,40.3,70.1,-12.6,,
4166,2023,1476,Stonehill,-1.0,-1.0,0.433333,94.1,107.1,-13.0,0.1835,50.9,50.2,19.7,19.8,19.1,32.2,27.3,67.4,-12.6,,
4167,2023,1477,TX A&M Commerce,-1.0,-1.0,0.375000,98.6,111.5,-12.9,0.1953,52.4,52.8,18.9,17.7,26.5,28.3,37.9,66.3,-15.4,,


In [16]:
df.loc[df['Past Year BARTHAG'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,
8,2012,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,
17,2012,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,
20,2012,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,
27,2012,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4164,2023,1474,Queens NC,-1.0,-1.0,0.500000,105.6,108.5,-2.9,0.4217,51.2,53.0,17.6,15.7,29.7,35.9,25.4,70.6,-9.4,,
4165,2023,1475,Southern Indiana,-1.0,-1.0,0.448276,101.1,110.6,-9.5,0.2621,50.5,51.8,18.3,16.7,32.6,33.4,40.3,70.1,-12.6,,
4166,2023,1476,Stonehill,-1.0,-1.0,0.433333,94.1,107.1,-13.0,0.1835,50.9,50.2,19.7,19.8,19.1,32.2,27.3,67.4,-12.6,,
4167,2023,1477,TX A&M Commerce,-1.0,-1.0,0.375000,98.6,111.5,-12.9,0.1953,52.4,52.8,18.9,17.7,26.5,28.3,37.9,66.3,-15.4,,


### My Rankings

In [17]:
df_rankings = pd.concat(
    (
        pd.read_csv(fr'..\data\preprocessed\my_rankings\my_rankings_{season}.csv')
        .assign(Season=season)
        for season in range(2012, 2024) if season != 2020
    ),
    ignore_index=True
)

df_rankings.insert(0, 'Season', df_rankings.pop('Season'))

df_rankings.drop(columns=['Strength'], inplace=True)

df_rankings

Unnamed: 0,Season,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,Kentucky,3.416144,0.297875,1.166730,0.868855,66.950603
1,2012,Syracuse,3.133934,0.247351,1.132201,0.884851,67.141144
2,2012,North Carolina,2.990982,0.242541,1.124441,0.881900,73.343437
3,2012,Michigan State,2.856820,0.265908,1.119664,0.853757,66.464932
4,2012,Murray State,2.808001,0.139555,1.073767,0.934212,66.784035
...,...,...,...,...,...,...,...
3862,2023,Presbyterian,-2.195092,-0.150432,0.936036,1.086467,65.320811
3863,2023,IUPUI,-2.406697,-0.210480,0.938439,1.148919,67.983313
3864,2023,Hartford Hawks,-2.518344,-0.260419,0.880985,1.141404,67.079234
3865,2023,Green Bay,-2.533588,-0.247050,0.888355,1.135405,66.383482


In [18]:
my_teams = df_rankings['Team'].unique()

df_match = pd.DataFrame(
    [
        [
            my_team,
            *process.extract(
                my_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for my_team in tqdm(my_teams)
    ],
    columns=['My Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/364 [00:00<?, ?it/s]

Unnamed: 0,My Team,Team Spelling,Match Score
0,Hartford Hawks,hartford,73
1,St. Francis (NY) Terriers,st francis (ny),74
2,Houston Christian,houston chr,79
3,Savannah State Tigers,savannah state,80
4,St. Thomas,st thomas mn,86
5,Kansas City,mo kansas city,88
6,Texas A&M-Commerce,tx a&m commerce,91
7,Appalachian State,appalachian state,100
8,Arizona State,arizona state,100
9,North Dakota,north dakota,100


In [19]:
ranking_to_spelling = dict(zip(df_match['My Team'], df_match['Team Spelling']))

len(ranking_to_spelling)

364

In [20]:
df_rankings.insert(1, 'TeamID', df_rankings['Team'].map(ranking_to_spelling).map(spelling_to_id))

df_rankings

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,1246,Kentucky,3.416144,0.297875,1.166730,0.868855,66.950603
1,2012,1393,Syracuse,3.133934,0.247351,1.132201,0.884851,67.141144
2,2012,1314,North Carolina,2.990982,0.242541,1.124441,0.881900,73.343437
3,2012,1277,Michigan State,2.856820,0.265908,1.119664,0.853757,66.464932
4,2012,1293,Murray State,2.808001,0.139555,1.073767,0.934212,66.784035
...,...,...,...,...,...,...,...,...
3862,2023,1342,Presbyterian,-2.195092,-0.150432,0.936036,1.086467,65.320811
3863,2023,1237,IUPUI,-2.406697,-0.210480,0.938439,1.148919,67.983313
3864,2023,1216,Hartford Hawks,-2.518344,-0.260419,0.880985,1.141404,67.079234
3865,2023,1453,Green Bay,-2.533588,-0.247050,0.888355,1.135405,66.383482


In [21]:
df_rankings.loc[df_rankings['TeamID'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo


In [22]:
df = pd.merge(
    df,
    df_rankings.drop(columns=['Team']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
1,2012,1102,Air Force,-1.0,-1.0,0.407407,98.5,100.0,-1.5,0.4564,51.1,48.4,20.6,21.5,19.8,39.4,38.2,62.3,-7.6,0.5782,0.463200,-0.152520,-0.020081,0.979958,1.000039,62.835529
2,2012,1103,Akron,0.0,-0.5,0.636364,105.1,96.9,8.2,0.7161,51.5,46.4,21.0,20.6,34.8,40.0,34.0,67.8,-1.7,0.6049,0.678300,1.341049,0.095792,1.045707,0.949915,68.483451
3,2012,1104,Alabama,-1.0,-1.0,0.656250,105.0,88.1,16.9,0.8829,49.0,43.4,20.4,21.4,33.9,36.5,38.6,63.1,1.6,0.8419,0.782050,1.585085,0.154575,1.034248,0.879673,63.967881
4,2012,1105,Alabama A&M,-1.0,-1.0,0.192308,88.4,111.1,-22.7,0.0677,45.1,49.2,23.9,20.6,30.4,35.5,52.9,67.9,-17.4,0.1283,0.102125,-2.376490,-0.191533,0.890904,1.082437,68.157471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4164,2023,1474,Queens NC,-1.0,-1.0,0.500000,105.6,108.5,-2.9,0.4217,51.2,53.0,17.6,15.7,29.7,35.9,25.4,70.6,-9.4,,,0.053745,-0.017864,1.058141,1.076004,70.263382
4165,2023,1475,Southern Indiana,-1.0,-1.0,0.448276,101.1,110.6,-9.5,0.2621,50.5,51.8,18.3,16.7,32.6,33.4,40.3,70.1,-12.6,,,-0.608620,-0.058034,1.020948,1.078982,70.370070
4166,2023,1476,Stonehill,-1.0,-1.0,0.433333,94.1,107.1,-13.0,0.1835,50.9,50.2,19.7,19.8,19.1,32.2,27.3,67.4,-12.6,,,-0.931737,-0.112451,0.948106,1.060557,67.604692
4167,2023,1477,TX A&M Commerce,-1.0,-1.0,0.375000,98.6,111.5,-12.9,0.1953,52.4,52.8,18.9,17.7,26.5,28.3,37.9,66.3,-15.4,,,-1.093229,-0.094676,0.998594,1.093269,66.657934


In [23]:
df.loc[df['Rating'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
8,2012,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
17,2012,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
20,2012,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
27,2012,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,2023,1366,Savannah St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
4122,2023,1432,Utica,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
4135,2023,1445,W Salem St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,
4136,2023,1446,W Texas A&M,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,


### Starters

In [24]:
df_starters = pd.concat(
    (
        pd.read_csv(fr'..\data\preprocessed\starters\starters_{season}.csv')
        .assign(Season=season)
        for season in range(2012, 2024) if season != 2020
    ),
    ignore_index=True
)

df_starters.insert(0, 'Season', df_starters.pop('Season'))

df_starters.rename(columns={'Rating': 'Starters'}, inplace=True)

df_starters

Unnamed: 0,Season,Team,Starters
0,2012,Murray State,0.566137
1,2012,Michigan State,0.544399
2,2012,Duke,0.525030
3,2012,Wichita State,0.522023
4,2012,Kentucky,0.521597
...,...,...,...
3862,2023,Houston Christian,-0.374878
3863,2023,Presbyterian,-0.390325
3864,2023,Long Island University,-0.401581
3865,2023,Hartford,-0.433945


In [25]:
starters_teams = df_starters['Team'].unique()

df_match = pd.DataFrame(
    [
        [
            starters_team,
            *process.extract(
                starters_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for starters_team in tqdm(starters_teams)
    ],
    columns=['Starters Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  0%|          | 0/364 [00:00<?, ?it/s]

Unnamed: 0,Starters Team,Team Spelling,Match Score
0,Houston Christian,houston chr,79
1,St. Thomas,st thomas mn,86
2,Kansas City,mo kansas city,88
3,Texas A&M-Commerce,tx a&m commerce,91
4,Southeast Missouri State,southeast missouri state,100
5,Louisiana,louisiana,100
6,Appalachian State,appalachian state,100
7,Southern,southern,100
8,Southeastern Louisiana,southeastern louisiana,100
9,North Dakota,north dakota,100


In [26]:
starters_to_spelling = dict(zip(df_match['Starters Team'], df_match['Team Spelling']))

len(starters_to_spelling)

364

In [27]:
df_starters.insert(1, 'TeamID', df_starters['Team'].map(starters_to_spelling).map(spelling_to_id))

df_starters

Unnamed: 0,Season,TeamID,Team,Starters
0,2012,1293,Murray State,0.566137
1,2012,1277,Michigan State,0.544399
2,2012,1181,Duke,0.525030
3,2012,1455,Wichita State,0.522023
4,2012,1246,Kentucky,0.521597
...,...,...,...,...
3862,2023,1223,Houston Christian,-0.374878
3863,2023,1342,Presbyterian,-0.390325
3864,2023,1254,Long Island University,-0.401581
3865,2023,1216,Hartford,-0.433945


In [28]:
df_starters.loc[df_starters['TeamID'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Starters


In [29]:
df = pd.merge(
    df,
    df_starters.drop(columns=['Team']),
    how='left',
    on=['Season', 'TeamID']
)

df

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
1,2012,1102,Air Force,-1.0,-1.0,0.407407,98.5,100.0,-1.5,0.4564,51.1,48.4,20.6,21.5,19.8,39.4,38.2,62.3,-7.6,0.5782,0.463200,-0.152520,-0.020081,0.979958,1.000039,62.835529,-0.065026
2,2012,1103,Akron,0.0,-0.5,0.636364,105.1,96.9,8.2,0.7161,51.5,46.4,21.0,20.6,34.8,40.0,34.0,67.8,-1.7,0.6049,0.678300,1.341049,0.095792,1.045707,0.949915,68.483451,0.347701
3,2012,1104,Alabama,-1.0,-1.0,0.656250,105.0,88.1,16.9,0.8829,49.0,43.4,20.4,21.4,33.9,36.5,38.6,63.1,1.6,0.8419,0.782050,1.585085,0.154575,1.034248,0.879673,63.967881,0.222821
4,2012,1105,Alabama A&M,-1.0,-1.0,0.192308,88.4,111.1,-22.7,0.0677,45.1,49.2,23.9,20.6,30.4,35.5,52.9,67.9,-17.4,0.1283,0.102125,-2.376490,-0.191533,0.890904,1.082437,68.157471,-0.306213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4164,2023,1474,Queens NC,-1.0,-1.0,0.500000,105.6,108.5,-2.9,0.4217,51.2,53.0,17.6,15.7,29.7,35.9,25.4,70.6,-9.4,,,0.053745,-0.017864,1.058141,1.076004,70.263382,0.027291
4165,2023,1475,Southern Indiana,-1.0,-1.0,0.448276,101.1,110.6,-9.5,0.2621,50.5,51.8,18.3,16.7,32.6,33.4,40.3,70.1,-12.6,,,-0.608620,-0.058034,1.020948,1.078982,70.370070,0.016206
4166,2023,1476,Stonehill,-1.0,-1.0,0.433333,94.1,107.1,-13.0,0.1835,50.9,50.2,19.7,19.8,19.1,32.2,27.3,67.4,-12.6,,,-0.931737,-0.112451,0.948106,1.060557,67.604692,-0.113922
4167,2023,1477,TX A&M Commerce,-1.0,-1.0,0.375000,98.6,111.5,-12.9,0.1953,52.4,52.8,18.9,17.7,26.5,28.3,37.9,66.3,-15.4,,,-1.093229,-0.094676,0.998594,1.093269,66.657934,-0.159400


In [30]:
df.loc[df['Starters'].isna(), :]

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters
0,2012,1101,Abilene Chr,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
8,2012,1109,Alliant Intl,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
17,2012,1118,Armstrong St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
20,2012,1121,Augusta,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
27,2012,1128,Birmingham So,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,2023,1366,Savannah St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
4122,2023,1432,Utica,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
4135,2023,1445,W Salem St,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,
4136,2023,1446,W Texas A&M,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,


### Strength of Schedule

Omitted

In [31]:
# df_sos = pd.read_csv('../data/preprocessed/mens_sos/mens_sos.csv')

# df_sos

In [32]:
# sos_teams = df_sos['Team'].unique()

# df_match = pd.DataFrame(
#     [
#         [
#             sos_team,
#             *process.extract(
#                 sos_team,
#                 team_spellings,
#                 scorer=token_sort_ratio,
#                 limit=1
#             )[0][:2]
#         ] for sos_team in tqdm(sos_teams)
#     ],
#     columns=['SOS Team', 'Team Spelling', 'Match Score']
# ).sort_values('Match Score', ignore_index=True)

# df_match.head(25)

In [33]:
# sos_to_spelling = dict(zip(df_match['SOS Team'], df_match['Team Spelling']))

# df_sos.insert(1, 'TeamID', df_sos['Team'].map(sos_to_spelling).map(spelling_to_id))

# df_sos

In [34]:
# df = pd.merge(
#     df,
#     df_sos.drop(columns=['Team']),
#     how='left',
#     on=['Season', 'TeamID']
# )

# df

In [35]:
# df.loc[df['0.5 Win'].isna(), :]

### Map to Matchups

In [36]:
df_mod = pd.read_csv(r'..\data\unprocessed\kaggle\MNCAATourneyDetailedResults.csv')[['Season', 'DayNum', 'WTeamID', 'LTeamID']]

df_mod = df_mod.loc[df_mod['Season'] >= 2012, :].reset_index(drop=True)

# fix 2021 dates
df_mod.loc[
    (df_mod['Season'] == 2021) & 
    (df_mod['DayNum'] < 140), 
    'DayNum'
] = df_mod.loc[
    (df_mod['Season'] == 2021) & 
    (df_mod['DayNum'] < 140), 
    'DayNum'
] - 1

df_mod.loc[
    (df_mod['Season'] == 2021) & 
    (df_mod['DayNum'].between(140, 150)), 
    'DayNum'
] = df_mod.loc[
    (df_mod['Season'] == 2021) & 
    (df_mod['DayNum'].between(140, 150)), 
    'DayNum'
] - 2

# get rid of play-in games
df_mod = df_mod.loc[df_mod['DayNum'] > 135, :].reset_index(drop=True)

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID
0,2012,136,1124,1355
1,2012,136,1160,1424
2,2012,136,1211,1452
3,2012,136,1231,1308
4,2012,136,1235,1163
...,...,...,...,...
687,2023,146,1274,1400
688,2023,146,1361,1166
689,2023,152,1163,1274
690,2023,152,1361,1194


Get round of each game

In [37]:
df_mod['Round'] = 1
df_mod.loc[df_mod['DayNum'].between(138, 139), 'Round'] = 2
df_mod.loc[df_mod['DayNum'].between(140, 144), 'Round'] = 3
df_mod.loc[df_mod['DayNum'].between(145, 149), 'Round'] = 4
df_mod.loc[df_mod['DayNum'].between(150, 153), 'Round'] = 5
df_mod.loc[df_mod['DayNum'] == 154, 'Round'] = 6

df_mod

Unnamed: 0,Season,DayNum,WTeamID,LTeamID,Round
0,2012,136,1124,1355,1
1,2012,136,1160,1424,1
2,2012,136,1211,1452,1
3,2012,136,1231,1308,1
4,2012,136,1235,1163,1
...,...,...,...,...,...
687,2023,146,1274,1400,4
688,2023,146,1361,1166,4
689,2023,152,1163,1274,5
690,2023,152,1361,1194,5


In [38]:
for season in range(2012, 2024):
    for round_ in range(1, 7):
        if df_mod.loc[(df_mod['Season'] == season) & (df_mod['Round'] == round_), :].shape[0] != 2**(6 - round_):
            print(f"{season}, {round_} : {df_mod.loc[(df_mod['Season'] == season) & (df_mod['Round'] == round_), :].shape[0]}")

2020, 1 : 0
2020, 2 : 0
2020, 3 : 0
2020, 4 : 0
2020, 5 : 0
2020, 6 : 0
2021, 1 : 31


Remap to Team A / Team B format

In [39]:
df_mod = pd.DataFrame({
    'Season': list(df_mod['Season'])*2,
    'Round': list(df_mod['Round'])*2,
    'Result': [1 for _ in range(df_mod.shape[0])] + [-1 for _ in range(df_mod.shape[0])],
    'Team A ID': list(df_mod['WTeamID']) + list(df_mod['LTeamID']),
    'Team B ID': list(df_mod['LTeamID']) + list(df_mod['WTeamID']),
})

df_mod

Unnamed: 0,Season,Round,Result,Team A ID,Team B ID
0,2012,1,1,1124,1355
1,2012,1,1,1160,1424
2,2012,1,1,1211,1452
3,2012,1,1,1231,1308
4,2012,1,1,1235,1163
...,...,...,...,...,...
1379,2023,4,-1,1400,1274
1380,2023,4,-1,1166,1361
1381,2023,5,-1,1274,1163
1382,2023,5,-1,1194,1361


Get Head-to-Head (Omitted)

In [40]:
# df_h2h = pd.read_csv('../data/preprocessed/men_h2h/men_h2h.csv')

# df_h2h

In [41]:
# h2h_teams = df_h2h['Team A'].unique()

# df_match = pd.DataFrame(
#     [
#         [
#             h2h_team,
#             *process.extract(
#                 h2h_team,
#                 team_spellings,
#                 scorer=token_sort_ratio,
#                 limit=1
#             )[0][:2]
#         ] for h2h_team in tqdm(h2h_teams)
#     ],
#     columns=['Head to Head Team', 'Team Spelling', 'Match Score']
# ).sort_values('Match Score', ignore_index=True)

# df_match.head(25)

In [42]:
# h2h_to_spelling = dict(zip(df_match['Head to Head Team'], df_match['Team Spelling']))

# df_h2h.insert(df_h2h.columns.get_loc('Team A'), 'Team A ID', df_h2h['Team A'].map(h2h_to_spelling).map(spelling_to_id))

# df_h2h.insert(df_h2h.columns.get_loc('Team B'), 'Team B ID', df_h2h['Team B'].map(h2h_to_spelling).map(spelling_to_id))

# df_h2h

In [43]:
# df_mod = pd.merge(
#     df_mod,
#     df_h2h[['Season', 'Team A ID', 'Team B ID', 'Head to Head', 'Common Opps']],
#     how='left',
#     on=['Season', 'Team A ID', 'Team B ID'],
# )

# df_mod

Get team names

In [44]:
df_teams = pd.read_csv(r'..\data\unprocessed\kaggle\MTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2024
1,1102,Air Force,1985,2024
2,1103,Akron,1985,2024
3,1104,Alabama,1985,2024
4,1105,Alabama A&M,2000,2024
...,...,...,...,...
373,1474,Queens NC,2023,2024
374,1475,Southern Indiana,2023,2024
375,1476,Stonehill,2023,2024
376,1477,TX A&M Commerce,2023,2024


In [45]:
id_to_team = dict(zip(df_teams['TeamID'], df_teams['TeamName']))

df_mod.insert(df_mod.columns.get_loc('Team A ID') + 1, 'Team A', df_mod['Team A ID'].map(id_to_team))
df_mod.insert(df_mod.columns.get_loc('Team B ID') + 1, 'Team B', df_mod['Team B ID'].map(id_to_team))

df_mod

Unnamed: 0,Season,Round,Result,Team A ID,Team A,Team B ID,Team B
0,2012,1,1,1124,Baylor,1355,S Dakota St
1,2012,1,1,1160,Colorado,1424,UNLV
2,2012,1,1,1211,Gonzaga,1452,West Virginia
3,2012,1,1,1231,Indiana,1308,New Mexico St
4,2012,1,1,1235,Iowa St,1163,Connecticut
...,...,...,...,...,...,...,...
1379,2023,4,-1,1400,Texas,1274,Miami FL
1380,2023,4,-1,1166,Creighton,1361,San Diego St
1381,2023,5,-1,1274,Miami FL,1163,Connecticut
1382,2023,5,-1,1194,FL Atlantic,1361,San Diego St


Map features

In [46]:
team_a_features = pd.merge(
    df_mod[['Season', 'Team A ID']],
    df.drop(columns=['Team']),
    how='left',
    left_on=['Season', 'Team A ID'],
    right_on=['Season', 'TeamID'],
).drop(columns=['Season', 'Team A ID', 'TeamID'])

team_b_features = pd.merge(
    df_mod[['Season', 'Team B ID']],
    df.drop(columns=['Team']),
    how='left',
    left_on=['Season', 'Team B ID'],
    right_on=['Season', 'TeamID'],
).drop(columns=['Season', 'Team B ID', 'TeamID'])

df_features = team_a_features - team_b_features

df_features['Team A ADJOE Team B ADJDE'] = team_a_features['ADJOE'] + team_b_features['ADJDE']
df_features['Team B ADJOE Team A ADJDE'] = team_b_features['ADJOE'] + team_a_features['ADJDE']

df_features['Team A Offense Team B Defense'] = team_a_features['Adjusted Offense'] + team_b_features['Adjusted Defense']
df_features['Team B Offense Team A Defense'] = team_b_features['Adjusted Offense'] + team_a_features['Adjusted Defense']

df_features['Team A BARTHAG'] = team_a_features['BARTHAG']
df_features['Team B BARTHAG'] = team_b_features['BARTHAG']

df_features

Unnamed: 0,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters,Team A ADJOE Team B ADJDE,Team B ADJOE Team A ADJDE,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A BARTHAG,Team B BARTHAG
0,0.0,1.250000,0.006629,2.7,-7.1,9.8,0.1420,-1.2,-2.7,5.0,2.1,7.1,-3.4,2.2,0.4,7.4,0.0703,0.429650,1.160555,0.067725,0.002941,-0.064784,0.583427,0.111128,215.1,205.3,2.102834,2.035109,0.9090,0.7670
1,-1.0,-1.000000,-0.093750,-5.1,1.9,-7.0,-0.1138,-3.9,0.9,-0.3,-2.3,-3.3,7.2,-0.9,-4.0,-3.1,-0.0848,-0.167900,-1.103464,-0.086768,-0.061487,0.025281,-3.752108,-0.219484,196.5,203.5,1.942389,2.029157,0.7615,0.8753
2,0.0,-0.750000,0.212702,-0.3,-1.2,0.9,0.0132,4.7,-2.8,0.6,-0.7,-6.4,8.9,-4.7,0.7,2.3,-0.0286,-0.026925,0.690981,0.029671,0.004867,-0.024804,0.670750,0.159773,206.1,205.2,2.033802,2.004131,0.8612,0.8480
3,0.0,0.000000,0.030303,13.0,-0.9,13.9,0.1917,4.0,-0.1,-1.6,-0.1,-5.2,-6.2,1.0,-3.3,7.6,0.1886,0.008725,1.015935,0.104575,0.097325,-0.007250,-3.019422,0.108978,216.5,202.6,2.116883,2.012308,0.9184,0.7267
4,-7.0,-3.250000,0.081439,0.9,1.0,-0.1,-0.0023,2.7,3.9,-0.8,0.7,-4.6,4.3,0.8,3.0,1.4,-0.1929,-0.199375,0.131480,0.008000,0.021394,0.013394,2.322449,-0.032585,205.5,205.6,2.038228,2.030228,0.8553,0.8576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,-2.0,-0.333333,-0.016544,-2.7,-9.8,7.1,0.0873,-2.3,-3.5,0.1,4.7,-3.7,1.5,10.4,0.0,3.8,0.0684,0.154450,0.292643,0.095063,-0.007959,-0.103022,-0.276751,0.033572,216.0,208.9,2.143993,2.048930,0.9296,0.8423
1380,1.0,1.000000,-0.176136,2.5,2.7,-0.2,-0.0056,4.2,-0.2,-1.0,-5.3,-6.2,-6.2,-9.6,1.8,-3.5,-0.0526,0.029250,-0.638693,0.013201,0.032554,0.019353,1.681630,-0.100021,203.9,204.1,2.045875,2.032673,0.9130,0.9186
1381,3.0,0.666667,0.023674,-1.6,8.9,-10.5,-0.1052,1.5,5.8,-2.5,-1.3,-7.2,-1.4,-14.6,1.8,-1.3,-0.0240,-0.096625,-0.229468,-0.101200,-0.013950,0.087249,1.582331,-0.023358,209.8,220.3,2.064703,2.165903,0.8423,0.9475
1382,-1.0,-0.666667,0.093750,0.9,4.7,-3.8,-0.0447,4.9,-1.6,-0.9,-1.3,-1.3,-4.8,-2.7,2.1,-1.0,-0.2767,-0.310225,0.192519,0.009792,0.030394,0.020602,2.299120,0.008468,202.3,206.1,2.043715,2.033922,0.8739,0.9186


In [47]:
df_mod[df_features.columns] = df_features

df_mod

Unnamed: 0,Season,Round,Result,Team A ID,Team A,Team B ID,Team B,Past Year Tournament Result,Past 4 Years Tournament Results,WIN%,ADJOE,ADJDE,ADJEM,BARTHAG,EFG%,EFGD%,TOR,TORD,ORB,FTR,FTRD,ADJ T.,WAB,Past Year BARTHAG,Past 4 Years BARTHAG,Rating,Efficiency Margin,Adjusted Offense,Adjusted Defense,Adjusted Tempo,Starters,Team A ADJOE Team B ADJDE,Team B ADJOE Team A ADJDE,Team A Offense Team B Defense,Team B Offense Team A Defense,Team A BARTHAG,Team B BARTHAG
0,2012,1,1,1124,Baylor,1355,S Dakota St,0.0,1.250000,0.006629,2.7,-7.1,9.8,0.1420,-1.2,-2.7,5.0,2.1,7.1,-3.4,2.2,0.4,7.4,0.0703,0.429650,1.160555,0.067725,0.002941,-0.064784,0.583427,0.111128,215.1,205.3,2.102834,2.035109,0.9090,0.7670
1,2012,1,1,1160,Colorado,1424,UNLV,-1.0,-1.000000,-0.093750,-5.1,1.9,-7.0,-0.1138,-3.9,0.9,-0.3,-2.3,-3.3,7.2,-0.9,-4.0,-3.1,-0.0848,-0.167900,-1.103464,-0.086768,-0.061487,0.025281,-3.752108,-0.219484,196.5,203.5,1.942389,2.029157,0.7615,0.8753
2,2012,1,1,1211,Gonzaga,1452,West Virginia,0.0,-0.750000,0.212702,-0.3,-1.2,0.9,0.0132,4.7,-2.8,0.6,-0.7,-6.4,8.9,-4.7,0.7,2.3,-0.0286,-0.026925,0.690981,0.029671,0.004867,-0.024804,0.670750,0.159773,206.1,205.2,2.033802,2.004131,0.8612,0.8480
3,2012,1,1,1231,Indiana,1308,New Mexico St,0.0,0.000000,0.030303,13.0,-0.9,13.9,0.1917,4.0,-0.1,-1.6,-0.1,-5.2,-6.2,1.0,-3.3,7.6,0.1886,0.008725,1.015935,0.104575,0.097325,-0.007250,-3.019422,0.108978,216.5,202.6,2.116883,2.012308,0.9184,0.7267
4,2012,1,1,1235,Iowa St,1163,Connecticut,-7.0,-3.250000,0.081439,0.9,1.0,-0.1,-0.0023,2.7,3.9,-0.8,0.7,-4.6,4.3,0.8,3.0,1.4,-0.1929,-0.199375,0.131480,0.008000,0.021394,0.013394,2.322449,-0.032585,205.5,205.6,2.038228,2.030228,0.8553,0.8576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,2023,4,-1,1400,Texas,1274,Miami FL,-2.0,-0.333333,-0.016544,-2.7,-9.8,7.1,0.0873,-2.3,-3.5,0.1,4.7,-3.7,1.5,10.4,0.0,3.8,0.0684,0.154450,0.292643,0.095063,-0.007959,-0.103022,-0.276751,0.033572,216.0,208.9,2.143993,2.048930,0.9296,0.8423
1380,2023,4,-1,1166,Creighton,1361,San Diego St,1.0,1.000000,-0.176136,2.5,2.7,-0.2,-0.0056,4.2,-0.2,-1.0,-5.3,-6.2,-6.2,-9.6,1.8,-3.5,-0.0526,0.029250,-0.638693,0.013201,0.032554,0.019353,1.681630,-0.100021,203.9,204.1,2.045875,2.032673,0.9130,0.9186
1381,2023,5,-1,1274,Miami FL,1163,Connecticut,3.0,0.666667,0.023674,-1.6,8.9,-10.5,-0.1052,1.5,5.8,-2.5,-1.3,-7.2,-1.4,-14.6,1.8,-1.3,-0.0240,-0.096625,-0.229468,-0.101200,-0.013950,0.087249,1.582331,-0.023358,209.8,220.3,2.064703,2.165903,0.8423,0.9475
1382,2023,5,-1,1194,FL Atlantic,1361,San Diego St,-1.0,-0.666667,0.093750,0.9,4.7,-3.8,-0.0447,4.9,-1.6,-0.9,-1.3,-1.3,-4.8,-2.7,2.1,-1.0,-0.2767,-0.310225,0.192519,0.009792,0.030394,0.020602,2.299120,0.008468,202.3,206.1,2.043715,2.033922,0.8739,0.9186


In [48]:
df_mod.to_csv('../data/preprocessed/model_data/model_data.csv', index=False)

'Done'

'Done'