In [1]:
import pandas as pd
import numpy as np

### Clean up the ranking data
Rankings are taken from Day 133 for the training seasons (2003–2021, excluding 2020) and from Day 128 for 2022.

Systems: Whitlock (WLK), Pomeroy (POM), Massey (MAS), Sagarin (SAG)

In [2]:
# Build `ranks_clean`: one row per (Season, TeamID) holding the Day-133 ordinal
# rank from each of four rating systems plus their row-wise mean (`AvgRank`).
ranks = pd.read_csv('./data/MMasseyOrdinals.csv')
ranks = ranks[ranks.RankingDayNum == 133].drop('RankingDayNum', axis=1)
WLK = ranks[ranks.SystemName == 'WLK'].drop('SystemName', axis=1)
POM = ranks[ranks.SystemName == 'POM'].drop('SystemName', axis=1)
MAS = ranks[ranks.SystemName == 'MAS'].drop('SystemName', axis=1)
SAG = ranks[ranks.SystemName == 'SAG'].drop('SystemName', axis=1)

# Seasons used for training: 2003-2019 plus 2021. 2020 is skipped
# (presumably because no tournament was played that year).
yrs = np.append(np.arange(2003, 2020), 2021)

ranks_clean = None
for name, system in [('WLK', WLK), ('POM', POM), ('MAS', MAS), ('SAG', SAG)]:
    system = system[system.Season.isin(yrs)].rename(columns={'OrdinalRank': name})
    if ranks_clean is None:
        ranks_clean = system
    else:
        # Join on Season as well as TeamID. Previously Season was carried by the
        # WLK frame only, so a team missing from WLK came out of the outer merge
        # with Season = NaN and was silently lost by later `Season == yr` filters.
        ranks_clean = ranks_clean.merge(system, on=['Season', 'TeamID'], how='outer')

# mean() skips NaN, so a team ranked by only some systems still gets an AvgRank.
ranks_clean['AvgRank'] = ranks_clean.drop(['Season', 'TeamID'], axis=1).mean(axis=1)
ranks_clean.head(5)

Unnamed: 0,Season,TeamID,WLK,POM,MAS,SAG,AvgRank
0,2003.0,1102,165.0,160.0,172.0,149,161.5
1,2003.0,1103,172.0,163.0,163.0,172,167.5
2,2003.0,1104,36.0,33.0,41.0,37,36.75
3,2003.0,1105,310.0,307.0,310.0,312,309.75
4,2003.0,1106,254.0,263.0,270.0,268,263.75


### Clean up the regular season data

In [3]:
# Build `reg_clean`: one row per (Season, TeamID) with per-game averages of the
# box-score columns plus a few season-total derived columns.
reg = pd.read_csv('./data/MRegularSeasonDetailedResults.csv')
reg['WMOV'] = reg['WScore'] - reg['LScore']

# Winner-side / loser-side columns and their side-neutral names ('WScore' -> 'Score').
# 'WLoc' and 'WMOV' also start with 'W' and are therefore part of W_cols.
W_cols = [col for col in reg.columns if col.startswith('W')]
L_cols = [col for col in reg.columns if col.startswith('L')]
W_rename = {col: col[1:] for col in W_cols}
L_rename = {col: col[1:] for col in L_cols}

# Seasons used for training: 2003-2019 plus 2021 (2020 is skipped).
yrs = np.append(np.arange(2003, 2020), 2021)

season_averages = []
season_sums = []
for yr in yrs:
    reg_yr = reg[reg.Season == yr]

    # One row per team per game: the winner's view stacked on the loser's view.
    reg_W = reg_yr.drop(['DayNum', 'WLoc', 'NumOT'] + L_cols, axis=1).rename(columns=W_rename)
    reg_L = reg_yr.drop(['DayNum', 'WLoc', 'NumOT'] + W_cols, axis=1).rename(columns=L_rename)
    reg_W['OppScore'] = reg_yr['LScore']
    reg_L['OppScore'] = reg_yr['WScore']
    # NOTE: 'MOV' exists only on winner rows (WMOV is dropped with W_cols on the
    # loser side), so after the concat it is NaN for losses and the MOV average
    # below is the mean margin over wins only.
    reg_WL = pd.concat([reg_W, reg_L])

    # Aggregate once per team. sort=False keeps teams in order of first
    # appearance, which is the row order the previous per-row loop produced.
    by_team = reg_WL.drop('Season', axis=1).groupby('TeamID', sort=False)
    yr_sums = by_team.sum().astype(float).reset_index()
    yr_avg = by_team.mean().drop('OppScore', axis=1).reset_index()
    yr_sums['Season'] = yr
    yr_avg['Season'] = yr

    yr_sums['TotPoss'] = yr_sums['FGA'] - yr_sums['OR'] + yr_sums['TO'] + (0.4 * yr_sums['FTA'])
    # NOTE: both ratios are possessions per point (TotPoss over points), and
    # DefEff divides the team's own possessions by the opponents' points.
    yr_sums['OffEff'] = yr_sums['TotPoss'] / yr_sums['Score']
    yr_sums['DefEff'] = yr_sums['TotPoss'] / yr_sums['OppScore']
    yr_sums['FT%'] = yr_sums['FTM'] / yr_sums['FTA']

    # Keep only the season-total columns that the averages do not already carry.
    yr_sums = yr_sums.drop(['TeamID', 'Season', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM',
                            'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk',
                            'PF', 'MOV'], axis=1)

    season_averages.append(yr_avg)
    season_sums.append(yr_sums)

# Concatenate once instead of growing the frames inside the loop.
team_averages = pd.concat(season_averages)
team_sums = pd.concat(season_sums)

# Both frames were built in the same team order with identical indexes, so the
# column-wise concat lines the rows up team by team.
reg_clean = pd.concat([team_averages, team_sums], axis=1)
reg_clean

Unnamed: 0,TeamID,Score,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,...,Stl,Blk,PF,MOV,Season,OppScore,TotPoss,OffEff,DefEff,FT%
0,1104,69.285714,24.035714,57.178571,6.357143,19.857143,14.857143,20.928571,13.571429,23.928571,...,6.607143,3.785714,18.035714,13.176471,2003,1820.0,1827.4,0.941959,1.004066,0.709898
1,1272,74.517241,26.275862,60.000000,7.000000,20.068966,14.965517,22.896552,14.068966,25.965517,...,7.379310,5.068966,18.758621,12.695652,2003,1909.0,1997.6,0.924387,1.046412,0.653614
2,1266,78.392857,27.214286,56.250000,5.785714,15.250000,18.178571,23.607143,13.107143,24.071429,...,6.000000,3.642857,18.642857,14.826087,2003,1895.0,1852.4,0.843918,0.977520,0.770045
3,1296,69.612903,24.354839,53.064516,6.290323,16.419355,14.612903,22.387097,13.000000,22.645161,...,7.612903,3.612903,19.806452,10.176471,2003,2164.0,2046.6,0.948378,0.945749,0.652738
4,1400,78.857143,28.000000,62.428571,5.857143,16.785714,17.000000,23.785714,16.178571,26.142857,...,6.392857,3.857143,20.357143,14.818182,2003,1923.0,1937.4,0.877446,1.007488,0.714715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,1394,63.428571,22.000000,53.857143,6.523810,19.571429,12.904762,18.761905,7.190476,22.809524,...,6.666667,2.761905,5.333333,16.500000,2021,1556.0,1481.6,1.112312,0.952185,0.687817
343,1225,61.045455,22.227273,51.500000,5.545455,16.272727,11.045455,15.227273,5.636364,20.636364,...,4.954545,1.954545,6.181818,5.000000,2021,1682.0,1491.0,1.110201,0.886445,0.725373
344,1354,61.222222,22.277778,57.222222,6.833333,21.444444,9.833333,15.888889,8.333333,19.722222,...,5.888889,2.833333,5.055556,6.000000,2021,1414.0,1280.4,1.161887,0.905516,0.618881
345,1175,69.941176,24.176471,61.764706,8.117647,26.117647,13.470588,18.411765,8.000000,22.294118,...,6.352941,3.411765,6.235294,7.000000,2021,1491.0,1329.2,1.117914,0.891482,0.731629


### Clean up the tournament data

In [4]:
# Reduce the tournament results to (Season, WTeamID, LTeamID) for non-play-in games.
trny = pd.read_csv('./data/MNCAATourneyDetailedResults.csv')

# Columns to discard: every winner-/loser-side column except the two team ids.
# 'WMOV' is excluded because it was added to the regular-season frame only and
# does not exist in `trny`. A new list is built instead of calling
# W_cols.remove(...) / L_cols.remove(...), so W_cols and L_cols stay intact and
# this cell can be re-run without raising ValueError.
not_dropped = {'WTeamID', 'LTeamID', 'WMOV'}
trny_drop_cols = [col for col in W_cols + L_cols if col not in not_dropped]

# NOTE(review): assumes play-in games fall on DayNum <= 135 in every season - confirm for 2021.
trny = trny[trny.DayNum > 135] # do not consider play in games
trny = trny.drop(trny_drop_cols + ['DayNum', 'NumOT'], axis=1)
trny

Unnamed: 0,Season,WTeamID,LTeamID
1,2003,1112,1436
2,2003,1113,1272
3,2003,1141,1166
4,2003,1143,1301
5,2003,1163,1140
...,...,...,...
1176,2021,1211,1425
1177,2021,1417,1276
1178,2021,1124,1222
1179,2021,1211,1417


In [5]:
# Load tournament seeds from 2003 onwards and reduce each seed string to its
# integer value by stripping the characters 'W', 'X', 'Y', 'Z', 'a', 'b' from
# both ends.
seeds = pd.read_csv('./data/MNCAATourneySeeds.csv')

# .assign() returns a new frame, so nothing is written into a filtered slice
# (the previous column assignment triggered SettingWithCopyWarning).
# The add / drop / rename sequence keeps the numeric 'Seed' as the last column.
seeds = (
    seeds[seeds.Season >= 2003]
    .assign(SeedsClean=lambda df: df.Seed.str.strip('WXYZab').astype(int))
    .drop('Seed', axis=1)
    .rename(columns={'SeedsClean' : 'Seed'})
)
seeds.head(5)

Unnamed: 0,Season,TeamID,Seed
1154,2003,1328,1
1155,2003,1448,2
1156,2003,1393,3
1157,2003,1257,4
1158,2003,1280,5


### Combine data to give tourney game-by-game data

In [6]:
def _attach_team_features(games, team_col, stats, seeds, ranks, prefix=''):
    """Inner-join one team's stats, seed and average rank onto each game row.

    games    : frame holding the team id column `team_col`.
    team_col : column in `games` identifying the team whose features are added.
    stats, seeds, ranks : single-season frames keyed by 'TeamID'.
    prefix   : prepended to every added column name (e.g. 'Opp_').

    Returns a new frame. Because the merges are inner joins, games whose team is
    missing from any of the three frames are dropped.
    """
    for team_features in (stats, seeds, ranks):
        if prefix:
            team_features = team_features.rename(
                columns=lambda col: col if col == 'TeamID' else prefix + col)
        games = (games.merge(team_features, left_on=team_col, right_on='TeamID')
                      .drop('TeamID', axis=1))
    return games


def _games_from_perspective(games, team_col, opp_col, stats, seeds, ranks, win):
    """Describe each game from `team_col`'s side: opponent features first
    (prefixed 'Opp_'), then the team's own features, then the 'Win' label."""
    rows = _attach_team_features(games, opp_col, stats, seeds, ranks, prefix='Opp_')
    rows = _attach_team_features(rows, team_col, stats, seeds, ranks)
    rows = rows.drop(opp_col, axis=1).rename(columns={team_col : 'TeamID'})
    rows['Win'] = win
    return rows


# Seasons used for training: 2003-2019 plus 2021 (2020 is skipped).
yrs = np.append(np.arange(2003, 2020), 2021)

season_games = []
for yr in yrs:
    trny_yr = trny[trny.Season == yr]
    team_avg_yr = reg_clean[reg_clean.Season == yr].drop('Season', axis=1)
    seeds_yr = seeds[seeds.Season == yr].drop('Season', axis=1)
    # Only the averaged rank is used as a feature; the per-system ranks are dropped.
    ranks_yr = ranks_clean[ranks_clean.Season == yr].drop(
        ['Season', 'WLK', 'POM', 'MAS', 'SAG'], axis=1)

    # Every game contributes two rows: the winner's view (Win = 1) and the
    # loser's view (Win = 0).
    trny_W = _games_from_perspective(trny_yr, 'WTeamID', 'LTeamID',
                                     team_avg_yr, seeds_yr, ranks_yr, win=1)
    trny_L = _games_from_perspective(trny_yr, 'LTeamID', 'WTeamID',
                                     team_avg_yr, seeds_yr, ranks_yr, win=0)
    season_games.extend([trny_W, trny_L])

# Compile all the tourney games (concatenated once, not inside the loop)
trny_clean = pd.concat(season_games)

trny_clean

Unnamed: 0,Season,TeamID,Opp_Score,Opp_FGM,Opp_FGA,Opp_FGM3,Opp_FGA3,Opp_FTM,Opp_FTA,Opp_OR,...,PF,MOV,OppScore,TotPoss,OffEff,DefEff,FT%,Seed,AvgRank,Win
0,2003,1112,67.793103,24.827586,55.862069,5.275862,15.482759,12.862069,19.551724,12.965517,...,17.750000,17.280000,1967.0,2109.0,0.883906,1.072191,0.701429,1,2.50,1
1,2003,1112,77.064516,26.064516,55.451613,7.161290,19.064516,17.774194,24.645161,11.935484,...,17.750000,17.280000,1967.0,2109.0,0.883906,1.072191,0.701429,1,2.50,1
2,2003,1112,80.096774,27.225806,60.677419,8.290323,21.774194,17.354839,22.838710,11.387097,...,17.750000,17.280000,1967.0,2109.0,0.883906,1.072191,0.701429,1,2.50,1
3,2003,1113,74.517241,26.275862,60.000000,7.000000,20.068966,14.965517,22.896552,14.068966,...,19.413793,17.388889,2006.0,1963.0,0.891058,0.978564,0.669737,10,34.50,1
4,2003,1141,79.242424,28.696970,57.454545,7.969697,20.484848,13.878788,20.030303,10.878788,...,20.965517,11.173913,2124.0,2042.0,0.887440,0.961394,0.765753,11,41.50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,2021,1281,74.760000,26.800000,60.680000,8.000000,23.640000,13.160000,17.680000,8.720000,...,5.440000,8.687500,1798.0,1765.6,0.959044,0.981980,0.697588,9,46.00,0
62,2021,1179,74.758621,27.172414,58.172414,6.310345,18.137931,14.103448,21.793103,10.482759,...,5.518519,15.739130,1756.0,1802.2,0.875705,1.026310,0.710784,11,61.25,0
63,2021,1242,74.758621,27.172414,58.172414,6.310345,18.137931,14.103448,21.793103,10.482759,...,3.518519,13.789474,1787.0,1860.8,0.950358,1.041298,0.715447,3,15.00,0
64,2021,1332,74.758621,27.172414,58.172414,6.310345,18.137931,14.103448,21.793103,10.482759,...,5.192308,11.750000,1753.0,1749.4,0.904550,0.997946,0.704433,7,31.50,0


### Choose data to train on and save as csv

In [7]:
# Fixed seed so the shuffled row order is reproducible across runs.
SHUFFLE_SEED = 0

# Create seed differential
trny_clean['SeedDiff'] = trny_clean['Seed'] - trny_clean['Opp_Seed']

# Start with all possible features, minus identifiers and the label.
# 'SeedDiff' is excluded explicitly: the column list used to be read before
# SeedDiff was added, so the first run left it out and a re-run silently
# included it. Excluding it always reproduces the first-run columns.
# NOTE(review): if SeedDiff was meant to be a training feature, remove it from
# `non_features`.
non_features = {'Season', 'TeamID', 'Win', 'SeedDiff'}
features = [col for col in trny_clean.columns if col not in non_features]

training_data = trny_clean[features + ['Win']]
training_data = training_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
training_data.head(5)

Unnamed: 0,Opp_Score,Opp_FGM,Opp_FGA,Opp_FGM3,Opp_FGA3,Opp_FTM,Opp_FTA,Opp_OR,Opp_DR,Opp_Ast,...,PF,MOV,OppScore,TotPoss,OffEff,DefEff,FT%,Seed,AvgRank,Win
0,79.703704,28.074074,61.407407,5.222222,15.185185,18.333333,24.814815,14.0,24.037037,14.037037,...,18.482759,15.666667,1961.0,1978.8,0.870185,1.009077,0.752591,2,9.5,1
1,78.8,28.533333,58.5,7.4,18.566667,14.333333,20.233333,10.366667,27.5,15.0,...,17.516129,12.217391,2075.0,2006.0,0.897539,0.966747,0.704202,12,88.0,0
2,73.636364,26.272727,55.69697,7.454545,20.515152,13.636364,19.606061,11.181818,24.515152,15.575758,...,19.8125,12.454545,2382.0,2332.4,0.915026,0.979177,0.653795,5,29.0,0
3,73.933333,26.533333,54.8,5.2,14.2,15.666667,20.833333,11.1,25.2,16.366667,...,18.344828,13.470588,1956.0,1905.6,0.910029,0.974233,0.698738,8,35.75,1
4,68.09375,23.5625,52.84375,5.3125,15.40625,15.65625,22.53125,12.3125,24.90625,15.8125,...,17.9375,12.166667,2295.0,2192.0,0.910677,0.95512,0.715033,14,96.25,0


In [8]:
# Persist the shuffled training table. `index` is left at its default, so the
# row index is written as the first column of the file.
training_csv_path = './data/training_data_2003-21.csv'
training_data.to_csv(training_csv_path)

### Create a data set of regular season averages for 2022

In [22]:
# Build `data`: one row per team for the 2022 regular season, with per-game
# averages, Day-128 rankings, the team name and season-total derived columns.
yr = 2022

reg = pd.read_csv('./data/MRegularSeasonDetailedResults.csv')
teams = pd.read_csv('./data/MTeams.csv')
reg['WMOV'] = reg['WScore'] - reg['LScore']

ranks = pd.read_csv('./data/MMasseyOrdinals.csv')
ranks = ranks[ranks.RankingDayNum == 128].drop('RankingDayNum', axis=1)
WLK = ranks[ranks.SystemName == 'WLK'].drop('SystemName', axis=1)
POM = ranks[ranks.SystemName == 'POM'].drop('SystemName', axis=1)
MAS = ranks[ranks.SystemName == 'MAS'].drop('SystemName', axis=1)
SAG = ranks[ranks.SystemName == 'SAG'].drop('SystemName', axis=1)

# One row per team with a rank column per system. Joining on Season as well as
# TeamID keeps Season populated for teams that WLK does not rank; previously
# such teams got Season = NaN and were filtered out of the rankings.
ranks_yr = None
for name, system in [('WLK', WLK), ('POM', POM), ('MAS', MAS), ('SAG', SAG)]:
    system = system[system.Season == yr].rename(columns={'OrdinalRank': name})
    if ranks_yr is None:
        ranks_yr = system
    else:
        ranks_yr = ranks_yr.merge(system, on=['Season', 'TeamID'], how='outer')
ranks_yr['AvgRank'] = ranks_yr.drop(['Season', 'TeamID'], axis=1).mean(axis=1)

# Winner-side / loser-side columns and their side-neutral names ('WScore' -> 'Score').
W_cols = [col for col in reg.columns if col.startswith('W')]
L_cols = [col for col in reg.columns if col.startswith('L')]
W_rename = {col: col[1:] for col in W_cols}
L_rename = {col: col[1:] for col in L_cols}

reg_yr = reg[reg.Season == yr]

# One row per team per game: the winner's view stacked on the loser's view.
reg_W = reg_yr.drop(['DayNum', 'WLoc', 'NumOT'] + L_cols, axis=1).rename(columns=W_rename)
reg_L = reg_yr.drop(['DayNum', 'WLoc', 'NumOT'] + W_cols, axis=1).rename(columns=L_rename)
reg_W['OppScore'] = reg_yr['LScore']
reg_L['OppScore'] = reg_yr['WScore']
# NOTE: 'MOV' exists only on winner rows, so the MOV average is over wins only.
reg_WL = pd.concat([reg_W, reg_L])

# Aggregate once per team; sort=False keeps teams in order of first appearance.
by_team = reg_WL.drop('Season', axis=1).groupby('TeamID', sort=False)
yr_sums = by_team.sum().astype(float).reset_index()
yr_avg = by_team.mean().drop('OppScore', axis=1).reset_index()
yr_sums['Season'] = yr

yr_avg = yr_avg.merge(ranks_yr.drop('Season', axis=1),
                      left_on='TeamID', right_on='TeamID')
yr_avg = yr_avg.merge(teams[['TeamName', 'TeamID']],
                      left_on='TeamID',
                      right_on='TeamID',
                      validate='many_to_one')

yr_sums['TotPoss'] = yr_sums['FGA'] - yr_sums['OR'] + yr_sums['TO'] + (0.4 * yr_sums['FTA'])
# NOTE: both ratios are possessions per point (TotPoss over points).
yr_sums['OffEff'] = yr_sums['TotPoss'] / yr_sums['Score']
yr_sums['DefEff'] = yr_sums['TotPoss'] / yr_sums['OppScore']
yr_sums['FT%'] = yr_sums['FTM'] / yr_sums['FTA']

yr_sums = yr_sums.drop(['Score', 'FGM', 'FGA', 'FGM3', 'FGA3',
                        'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk',
                        'PF', 'MOV'], axis=1)

# Join averages and sums on TeamID rather than gluing them together by row
# position: the inner merges above can drop teams, which would otherwise shift
# the rows and attach sums to the wrong team. Teams without a ranking or a name
# are left out. Columns are then put back in the original layout: averages,
# ranks, TeamName, followed by the yr_sums columns (which carry TeamID/Season).
avg_cols = [col for col in yr_avg.columns if col != 'TeamID']
data = yr_avg.merge(yr_sums, on='TeamID', validate='many_to_one')
data = data[avg_cols + yr_sums.columns.tolist()]
data

Unnamed: 0,Score,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,...,SAG,AvgRank,TeamName,TeamID,OppScore,Season,TotPoss,OffEff,DefEff,FT%
0,79.968750,27.656250,62.750000,9.281250,30.093750,15.375000,21.062500,11.625000,24.718750,14.625000,...,23,22.00,Alabama,1104,2445.0,2022,2356.6,0.920907,0.963845,0.729970
1,61.433333,21.866667,57.733333,3.900000,14.400000,13.800000,19.366667,9.466667,23.933333,9.500000,...,338,327.00,Alabama A&M,1105,2010.0,2022,2077.4,1.127184,1.033532,0.712565
2,63.468750,24.406250,54.281250,5.031250,15.218750,9.625000,13.875000,6.812500,21.218750,12.187500,...,320,330.50,American Univ,1110,2270.0,2022,2090.6,1.029345,0.920969,0.693694
3,84.558824,30.441176,61.382353,7.764706,21.911765,15.911765,21.558824,10.441176,28.352941,19.911765,...,2,2.00,Arizona,1112,2296.0,2022,2465.2,0.857461,1.073693,0.738063
4,65.387097,25.000000,60.419355,6.903226,22.483871,8.483871,12.838710,7.548387,24.129032,13.354839,...,79,86.25,Arizona St,1113,2084.0,2022,2157.2,1.064233,1.035125,0.660804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,60.730769,22.961538,56.192308,6.730769,21.692308,8.076923,11.461538,7.115385,20.000000,12.769231,...,352,351.25,Maine,1263,1918.0,2022,1715.2,1.086257,0.894265,0.704698
354,54.275862,19.275862,50.103448,5.758621,18.103448,9.965517,15.620690,5.620690,20.103448,11.206897,...,355,355.50,E Illinois,1183,2070.0,2022,1901.2,1.207878,0.918454,0.637969
355,51.296296,18.851852,50.481481,5.111111,17.481481,8.481481,12.592593,7.851852,19.037037,10.111111,...,356,357.25,IUPUI,1237,1847.0,2022,1700.0,1.227437,0.920411,0.673529
356,60.888889,22.703704,56.962963,5.259259,17.407407,10.222222,14.925926,9.222222,21.592593,11.037037,...,343,349.75,Lamar,1249,2002.0,2022,1824.2,1.109611,0.911189,0.684864


In [23]:
# Persist the 2022 per-team table. `index` is left at its default, so the row
# index is written as the first column of the file.
testing_csv_path = './data/testing_data_2022.csv'
data.to_csv(testing_csv_path)